# Explore the YouTube information bubbles

This notebook helps you gather all the relevant information to explore information bubbles on YouTube. The idea is that you take a list of keywords, videos or channels, find related videos or channels, get all the metadata, comments and video transcripts you need, and use natural language processing techniques to explore and analyze the data. This notebook uses Python 3. There is a similar script available for the R language.

### Here's what still needs to be done:

1. Implement a search functionality so you can build a seed list from keywords, videos or channels. Currently we are using the [YouTubeDataTool](https://tools.digitalmethods.net/netvizz/youtube/).
2. All code needs to be more pythonic. 
3. Filtering options need to be added before compiling the final channel seeds list. 
4. If the channel seed list is large, we need multithreading options to help with data collection.
5. We need to find more clever ways to work around the throttling of the YouTube API.
6. We need download buffers. If an error is thrown, or if we add some data to the seed list, previously downloaded data should be skipped.

In [None]:
import pandas as pd
from __future__ import unicode_literals
import youtube_dl
import codecs
import webvtt
import glob
import csv
import requests
import os.path
import config

from apiclient.discovery import build
from apiclient.errors import HttpError


## Configuration

In [None]:
# Create a config.py file that defines these constants; they are read from it here.

YOUTUBE_API_SERVICE_NAME = config.YOUTUBE_API_SERVICE_NAME  # passed to build() as the service name
YOUTUBE_API_VERSION = config.YOUTUBE_API_VERSION  # passed to build() as the API version
DEVELOPER_KEY = config.DEVELOPER_KEY  # passed to build() as developerKey
PATH_TEMP_RIGHT = config.PATH_TEMP_RIGHT  # path prefix for the right-bubble files
PATH_TEMP_LEFT = config.PATH_TEMP_LEFT  # path prefix for the left-bubble files

# Uncomment exactly one of the two lines below to choose which bubble to investigate.
# The prefixes are joined to the file name with a plain '+', so they presumably
# have to end with a path separator -- verify against config.py.

#seeds = PATH_TEMP_RIGHT + 'right_seeds.csv' #if you want to investigate the right bubble
seeds = PATH_TEMP_LEFT + 'lefty_seeds_v2.csv' #if you want to investigate the left bubble

## Get channel data

In [None]:
# Load the seed list; its 'Id' column is used further down as the list of channel ids.
channel_seeds = pd.read_csv(seeds, encoding='latin-1')

In [None]:
def get_channels(channel_id):
    '''Ask the YouTube API for the channel with
    the given channel_id and return the decoded
    json response.'''

    requested_parts = 'snippet,contentDetails,topicDetails,statistics,brandingSettings'

    client = build(
        YOUTUBE_API_SERVICE_NAME,
        YOUTUBE_API_VERSION,
        developerKey=DEVELOPER_KEY,
    )
    request = client.channels().list(part=requested_parts, id=channel_id)
    return request.execute()


def get_channel_data(response):
    '''Extract the needed variables from a channels().list response.

    Only the first entry of response['items'] is used. Every optional
    field that is missing from that entry is replaced by the string
    'not set' instead of raising, so one absent statistic (for example a
    comment or subscriber count the API does not return) no longer causes
    the whole channel to be dropped by the caller.

    Returns a 12-tuple in this order:
    (channel_id, channel_title, channel_description,
     channel_default_language, channel_country, channel_viewcount,
     channel_commentcount, channel_subscribercount, channel_videocount,
     channel_topic_ids, channel_topic_categories,
     channel_branding_keywords)
    or None when the response contains no items.

    Raises KeyError when the first item has no 'id'.
    '''
    items = response.get('items') or []
    if not items:
        return None

    missing = 'not set'
    channel = items[0]
    snippet = channel.get('snippet', {})
    statistics = channel.get('statistics', {})
    topic_details = channel.get('topicDetails', {})
    branding = channel.get('brandingSettings', {}).get('channel', {})

    return (channel['id'],
            snippet.get('title', missing),
            snippet.get('description', missing),
            snippet.get('defaultLanguage', missing),  # many channels do not set a language
            snippet.get('country', missing),
            statistics.get('viewCount', missing),
            statistics.get('commentCount', missing),
            statistics.get('subscriberCount', missing),
            statistics.get('videoCount', missing),
            topic_details.get('topicIds', missing),
            topic_details.get('topicCategories', missing),
            branding.get('keywords', missing))

In [None]:
# Write the channel data to a CSV file, one row per seed channel.

# NOTE(review): the output always goes to the left-bubble path, whichever
# seed file was selected in the configuration cell -- confirm this is intended.
channels = PATH_TEMP_LEFT + 'channels_left.csv'
count = -1 # position in the seed list; after an error, continue from this index

# The order of these names matches the order of the tuple returned by
# get_channel_data, which is what allows the zip() below.
fieldnames = ['channel_id',
              'channel_title',
              'channel_description',
              'channel_default_language',
              'channel_country',
              'channel_viewcount',
              'channel_commentcount',
              'channel_subscribercount',
              'channel_videocount',
              'channel_topic_ids',
              'channel_topic_categories',
              'channel_branding_keywords'
             ]

# The file is opened in append mode, so only write the header when the file
# is new or empty; otherwise every rerun would add a header row in the data.
write_header = not os.path.isfile(channels) or os.path.getsize(channels) == 0

# newline='' is what the csv module expects; utf-8 matches the default of the
# pd.read_csv(channels) call that reads this file back.
with open(channels, "a", newline='', encoding='utf-8') as csvFile:
    writer = csv.DictWriter(csvFile, fieldnames=fieldnames)
    if write_header:
        writer.writeheader()

    for count, channel in enumerate(channel_seeds['Id']):
        try:
            response = get_channels(channel)
            variabelen = get_channel_data(response)
        except Exception as error:
            # Best effort: report the failing seed and carry on with the next one.
            print('skipping ' + str(channel) + ' at index ' + str(count) + ': ' + repr(error))
            continue

        if variabelen is None:
            print('no channel data for ' + str(channel) + ' at index ' + str(count))
            continue

        writer.writerow(dict(zip(fieldnames, variabelen)))

        channel_title = variabelen[1]
        print('wrote data for ' + str(channel_title) + ' and index is ' + str(count))

## Filter channel data

Filter the channel data to get a relevant seed list. It's better to do some extra work here: it saves you a lot of work (and API calls) later.

In [None]:
# Reload the channel file written above as the new seed list (no filtering is applied here yet).
channel_seeds = pd.read_csv(channels)

## Get video data

In [None]:
def get_videos(channel):
    '''Search the videos of a channel_id and return
    the first page of results (at most 50 videos).'''
    client = build(
        YOUTUBE_API_SERVICE_NAME,
        YOUTUBE_API_VERSION,
        developerKey=DEVELOPER_KEY,
    )

    query = dict(channelId=channel, type='video', part='snippet', maxResults=50)
    first_page = client.search().list(**query).execute()

    print('getting videos for ' + channel)
    return first_page

def get_more_videos(channel, page_token=None):
    '''Takes a channel_id and fetches the next
    page (at most 50 videos) of its search results.

    page_token is the 'nextPageToken' value of the previous response.
    When it is left out, the module-level variable nextPageToken is used,
    which is how existing callers pass the token; a NameError is raised
    if that variable does not exist either.

    Returns the decoded json response.'''
    if page_token is None:
        # Backward compatible fallback on the global set by the calling loop.
        page_token = nextPageToken

    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                    developerKey=DEVELOPER_KEY)

    response = youtube.search().list(
        channelId=channel,
        type='video',
        part='snippet',
        maxResults=50,
        pageToken=page_token
    ).execute()
    print('getting more pages of ' + channel)

    return response

def get_video_metadata(video_id):
    '''Takes a video_id and gets
    the associated metadata.

    The request asks for the snippet, contentDetails, statistics and
    topicDetails parts. topicDetails is included because
    write_video_data_to_file reads metadata['topicDetails']; without it
    the topic columns could never be filled.

    Returns the decoded json response of videos().list.'''

    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                    developerKey=DEVELOPER_KEY)

    response = youtube.videos().list(
        part='snippet,contentDetails,statistics,topicDetails',
        id=video_id
        ).execute()

    return response

def write_video_data_to_file(response, video_file):
    '''Append the videos of a search response to a CSV file.

    For every item in response['items'] the metadata is fetched with
    get_video_metadata, and one row is written per metadata item that
    comes back. Missing counts are written as 0 and other missing fields
    as 'not set'.

    The header row is written only when video_file does not exist yet or
    is empty. This function is called once per result page on the same
    file, so an unconditional header would end up between the data rows.

    response   -- decoded json of a search().list call
    video_file -- path of the CSV file to append to

    Returns the response unchanged.'''

    fieldnames = ['video_published',
                  'video_id',
                  'channel_id',
                  'video_title',
                  'video_description',
                  'channel_title',
                  'video_category_id',
                  'video_tags',
                  'video_duration',
                  'video_view_count',
                  'video_comment_count',
                  'video_likes_count',
                  'video_dislikes_count',
                  'video_topic_ids',
                  'video_topic_categories'
                 ]

    needs_header = (not os.path.isfile(video_file)
                    or os.path.getsize(video_file) == 0)

    # newline='' is what the csv module expects for files it writes to.
    with open(video_file, "a", newline='', encoding='utf-8') as csvFile:
        writer = csv.DictWriter(csvFile, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()

        for video in response.get('items', []):
            search_snippet = video['snippet']
            video_id = video['id']['videoId']
            video_title = search_snippet['title']

            # Fields taken from the search result itself.
            row = {'video_published': search_snippet['publishedAt'],
                   'video_id': video_id,
                   'channel_id': search_snippet['channelId'],
                   'video_title': video_title,
                   'video_description': search_snippet['description'],
                   'channel_title': search_snippet['channelTitle']}

            video_metadata = get_video_metadata(video_id)

            for metadata in video_metadata.get('items', []):
                print('getting metadata for ' + video_title)

                snippet = metadata.get('snippet', {})
                statistics = metadata.get('statistics', {})
                topic_details = metadata.get('topicDetails', {})

                # Category and tags come from the video resource; the values
                # looked up on the search result were always overwritten here.
                row.update({
                    'video_category_id': snippet.get('categoryId', 'not set'),
                    'video_tags': snippet.get('tags', 'not set'),
                    'video_duration': metadata.get('contentDetails', {}).get('duration', 'not set'),
                    'video_view_count': statistics.get('viewCount', 0),
                    'video_comment_count': statistics.get('commentCount', 0),
                    'video_likes_count': statistics.get('likeCount', 0),
                    'video_dislikes_count': statistics.get('dislikeCount', 0),
                    'video_topic_ids': topic_details.get('topicIds', 'not set'),
                    'video_topic_categories': topic_details.get('topicCategories', 'not set'),
                })
                writer.writerow(row)
    return response


In [None]:
# Collect the videos of every channel, page by page, into one CSV file.
#channels = pd.read_csv(config.PATH_TEMP_RIGHT + 'channels.csv')
# NOTE(review): the output path uses the right-bubble prefix while the channel
# cells above use the left-bubble prefix -- confirm which one is intended.
video_file = PATH_TEMP_RIGHT + 'video_file_test.csv'

# NOTE(review): `sample` is not defined in any of the cells shown here; it has
# to be created (with an 'Id' column of channel ids) before running this cell.
for channel in sample['Id']:
    response = get_videos(channel)
    # nextPageToken must stay a module-level name: get_more_videos reads it as a
    # global instead of receiving it as an argument.
    nextPageToken = response.get('nextPageToken', None)
    write_video_data_to_file(response, video_file)
    # Keep requesting pages until a response comes back without a nextPageToken.
    while nextPageToken:
        response = get_more_videos(channel)
        nextPageToken = response.get('nextPageToken', None)
        print('getting more videos for ' + channel)
        write_video_data_to_file(response, video_file)


## Get comment data

In [None]:
def get_comments(videoId):
    '''Request the first page of comment threads
    (top-level comments plus replies) of a video
    and return the decoded json response.'''
    client = build(
        YOUTUBE_API_SERVICE_NAME,
        YOUTUBE_API_VERSION,
        developerKey=DEVELOPER_KEY,
    )

    request = client.commentThreads().list(videoId=videoId, part='snippet,replies')
    return request.execute()

def get_more_comments(videoId, page_token=None):
    '''Request the next page of comment threads of a video.

    page_token is the 'nextPageToken' value of the previous response.
    When it is left out, the module-level variable nextPageToken is used,
    which is how existing callers pass the token; a NameError is raised
    if that variable does not exist either.

    Returns the decoded json response.'''
    if page_token is None:
        # Backward compatible fallback on the global set by the calling loop.
        page_token = nextPageToken

    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                    developerKey=DEVELOPER_KEY)

    response = youtube.commentThreads().list(
        videoId=videoId,
        pageToken=page_token,
        part='snippet,replies'
    ).execute()

    return response

def write_comments(response, comments_file):
    '''Append the top-level comments of a commentThreads response to a CSV file.

    One row is written per item in response['items']. Replies are not
    written. Author and text fields that are missing are written as
    'not set' (an empty string for the text, 0 for the like count).

    The header row is written only when comments_file does not exist yet
    or is empty, because this function is called once per result page on
    the same file.

    response      -- decoded json of a commentThreads().list call
    comments_file -- path of the CSV file to append to
    '''
    # 'comment_time' has to be part of the header: DictWriter raises a
    # ValueError for row keys that are not in fieldnames.
    header = ['video_id',
              'comment_id',
              'author_display_name',
              'author_channel_url',
              'author_channel_id',
              'comment_text',
              'comment_like_count',
              'comment_dislike_count',
              'comment_time']

    needs_header = (not os.path.isfile(comments_file)
                    or os.path.getsize(comments_file) == 0)

    # newline='' is what the csv module expects for files it writes to.
    with open(comments_file, 'a', newline='', encoding='utf-8') as csvFile:
        writer = csv.DictWriter(csvFile, fieldnames=header)
        if needs_header:
            writer.writeheader()

        for data in response.get('items', []):
            thread = data['snippet']
            comment = thread['topLevelComment']['snippet']

            writer.writerow({
                'video_id': thread['videoId'],
                'comment_id': data['id'],
                'author_display_name': comment.get('authorDisplayName', 'not set'),
                'author_channel_url': comment.get('authorChannelUrl', 'not set'),
                'author_channel_id': comment.get('authorChannelId', {}).get('value', 'not set'),
                'comment_text': comment.get('textDisplay', ''),
                'comment_like_count': comment.get('likeCount', 0),
                # NOTE(review): the comment resource does not appear to expose a
                # dislike counter, so this is expected to be 'not set'; the
                # column is kept so the file layout stays as planned.
                'comment_dislike_count': comment.get('dislikeCount', 'not set'),
                'comment_time': comment.get('publishedAt', 'not set'),
            })





In [None]:
comments_file = 'comments.csv'

# Placeholder: replace this with the table of videos whose comments you want,
# e.g. one loaded with pd.read_csv. It needs a 'videoId' column. The empty
# table makes the loop below do nothing instead of failing.
vidlist = pd.DataFrame(columns=['videoId'])

for videoId in vidlist['videoId']:
    try:
        response = get_comments(videoId)
        # nextPageToken must stay a module-level name: existing calls to
        # get_more_comments read it as a global.
        nextPageToken = response.get('nextPageToken', None)
        write_comments(response, comments_file)
        while nextPageToken:
            response = get_more_comments(videoId)
            nextPageToken = response.get('nextPageToken', None)
            write_comments(response, comments_file)
    except Exception as error:
        # Best effort: report the video that failed and move on to the next
        # one, rather than hiding the reason.
        print('skipping comments of ' + str(videoId) + ': ' + repr(error))
        continue
   

## Get related data

In [None]:
# to do: get related channels

def get_recommendations(video_id):
    '''Search for videos related to video_id and return the
    first page of results (at most 50) as decoded json.'''
    # NOTE(review): check that search.list still accepts relatedToVideoId
    # in the API version configured in config.py.
    client = build(
        YOUTUBE_API_SERVICE_NAME,
        YOUTUBE_API_VERSION,
        developerKey=DEVELOPER_KEY,
    )

    query = dict(part='snippet', type='video', relatedToVideoId=video_id, maxResults=50)
    return client.search().list(**query).execute()

def write_recommendations(response, recommendations_file, videoId):
    '''Append the related videos of one source video to a CSV file.

    One row is written per item in response['items']; videoId (the video
    the recommendations belong to) is repeated on every row.

    The file is opened once per call instead of once per row. A header
    row is written when the file does not exist yet or is empty, so that
    reading it back with pandas' default settings does not consume the
    first data row as column names. Nothing is written, and no file is
    created, when the response has no items.

    response             -- decoded json of a search().list call
    recommendations_file -- path of the CSV file to append to
    videoId              -- id of the video the results are related to
    '''
    items = response.get('items') or []
    if not items:
        return

    header = ['videoId', 'targetVideoId', 'publishedAt', 'channelId', 'title', 'description']

    needs_header = (not os.path.isfile(recommendations_file)
                    or os.path.getsize(recommendations_file) == 0)

    # newline='' is what the csv module expects for files it writes to.
    with open(recommendations_file, 'a', newline='', encoding='utf-8') as csvFile:
        writer = csv.DictWriter(csvFile, fieldnames=header)
        if needs_header:
            writer.writeheader()

        for data in items:
            snippet = data['snippet']
            writer.writerow({'videoId': videoId,
                             'targetVideoId': data['id']['videoId'],
                             'publishedAt': snippet['publishedAt'],
                             'channelId': snippet['channelId'],
                             'title': snippet['title'],
                             'description': snippet['description']})


In [None]:
# NOTE(review): hard-coded absolute path; the other cells build their paths
# from the prefixes defined in config.py.
video_file = '/home/dim/Documents/projecten/extremisme/youtube/data/temp/bubble/videos.csv'

# The file is read without a header row; these names are assigned to its columns.
columns=['videoId', 'publishedAt','videoTitle', 'channelId', 'channelTitle']
vidlist = pd.read_csv(video_file, header=None, names=columns)

# Keep only the rows from position 31906 onwards -- presumably the point where
# an earlier run stopped; confirm or adjust before reusing this cell.
vidlist = vidlist[31906:]

In [None]:
# Output file, relative to the current working directory.
recommendations_file = 'recommendations.csv'

# For every video in vidlist, fetch its related videos and append them to the
# file. There is no error handling here: an exception stops the loop.
for videoId in vidlist['videoId']:
    response = get_recommendations(videoId)
    write_recommendations(response, recommendations_file, videoId)

In [None]:
# Load the recommendations file for inspection.
recos = pd.read_csv('recommendations.csv')

In [None]:
# Show the last rows of the table as a quick check.
recos.tail()

## Get transcripts

In [None]:
# Options handed to youtube_dl.YoutubeDL in the next cell.
ydl_opts = {
    'writeautomaticsub': True,  # presumably: save the auto-generated subtitles -- see the youtube-dl docs
    'skip_download': True,  # presumably: do not download the video itself
}

In [None]:
# Run youtube-dl once per video id, using the options in ydl_opts.
# NOTE(review): `sample` is not defined in any of the cells shown here; it has
# to be created (with a 'videoId' column) before running this cell.
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    for video in sample['videoId']:
        # Turn the id into a watch URL; `video` is rebound from the id to the URL.
        video = 'https://www.youtube.com/watch?v=' + str(video)
        ydl.download([video])

In [None]:
# Read every downloaded .vtt subtitle file and store its captions,
# one row per video, in captions.csv.

videoId = []
words = []
transcript = []

# glob does not expand '~', so resolve the home directory first.
vtt_pattern = os.path.expanduser(
    '~/Documents/projecten/extremisme/youtube/data/temp/bubble/*.vtt')

for filename in glob.glob(vtt_pattern):
    ids = os.path.basename(filename)
    # Takes the 11 characters in front of the last 7; presumably the file
    # names end in '<video id>.<2-letter language>.vtt' -- verify.
    ids = ids[-18:-7]
    try:
        words = [caption.text for caption in webvtt.read(filename)]
    except Exception as error:
        # Skip unreadable files entirely. Adding the id without a transcript
        # would pair every later id with the wrong transcript in the zip below.
        print('skipping ' + filename + ': ' + repr(error))
        continue
    videoId.append(ids)
    transcript.append(words)

# The file is rebuilt from all .vtt files on every run, so it is opened in
# 'w' mode and always starts with the header row.
with open('captions.csv', 'w', newline='', encoding='utf-8') as csvfile:
    header = ['videoId', 'transcript']
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(header)
    writer.writerows(zip(videoId, transcript))

## Get language info and translations

In [None]:
# second for videoDescription
#for authentication see: https://cloud.google.com/translate/docs/quickstart-client-libraries

# NOTE(review): translate_client and videos_sample are not defined in any of
# the cells shown here. translate_client is presumably the Client object from
# the quickstart linked above; videos_sample needs a 'videoDescription' column.

lang = []   # detected language of each description
trans = []  # English translation of each description
conf = []   # confidence of each language detection
target = 'en'

for text in videos_sample['videoDescription']:
    # The client object is not callable; translation is done by its
    # translate method.
    translation = translate_client.translate(text, target_language=target)
    language = translate_client.detect_language(text)

    lang.append(language['language'])
    conf.append(language['confidence'])
    trans.append(translation['translatedText'])

videos_sample['language_videoDescription'] = lang
videos_sample['language_videoDescription_confidence'] = conf
videos_sample['english_videoDescription'] = trans