In [20]:
import pandas as pd
import numpy as np
import os
import requests
import json
import matplotlib.pyplot as plt

In [21]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

## Data acquisition

#### Getting a list with videos

List of videos using the YouTube Data API [YouTube Data API](https://tools.digitalmethods.net/netvizz/youtube/mod_videos_list.php)

Querying for the terms: `Global warming`, `Climate change`, `Paris agreement`, `Climate realism`.

#### Getting all comments (including replies) to all videos in the list

Get all comments to a video using the [CommentThreads method of YouTube Developer API](https://developers.google.com/youtube/v3/docs/commentThreads/list)

The API documentation of CommentsThread states that it might not contain all replies: 

>A commentThread resource contains information about a YouTube comment thread, which comprises a top-level comment and replies, if any exist, to that comment. A commentThread resource can represent comments about either a video or a channel.

>Both the top-level comment and the replies are actually comment resources nested inside the commentThread resource. The commentThread resource does not necessarily contain all replies to a comment, and you need to use the comments.list method if you want to retrieve all replies for a particular comment. Also note that some comments do not have replies.

Therefore we use the [Coments list method](https://developers.google.com/youtube/v3/docs/commentThreads/list) to get all replies to a comment. 

In [22]:
API_KEY = 'AIzaSyAGegTsA3vp5N544npMDkbfDwZuqCOjeh0'

In [23]:
data_path = 'videolist_search500_2021_02_07-00_46_57_climate_crisis.tab'

In [24]:
def load_videos(data_path, min_comments_count = 3):
    videos = pd.read_csv(data_path, sep='\t',header=(0))
    #remove entries where commentCount is None
    videos = videos.dropna(how='all', subset=['commentCount'])
    #remove videos where comments count is lesser then minimum
    videos.drop(videos[videos['commentCount'] < min_comments_count].index, inplace = True)
    videos = videos.sort_values(['commentCount'], ascending=[False])  
    return videos

#### Class to load all comments of a video 

In [25]:
class Video_comments:
    
    def __init__(self, api_key):
        self.api_key  = api_key
        #self.video_id = video_id
        self.max_results = 100     
        self.comments_df = None
        self.video_published_at = None
        self.search_term = None
        
    '''load all replies of top level comments and append dataframe witth all top level comments and replies. 
    (appendingt to df and loading replies should be devided to different methods.)'''
    def _add_to_dataframe(self, response):
        for i, main_comment in enumerate(response['items']):
            comment = main_comment['snippet']['topLevelComment']['snippet']

            new_row = pd.Series(data={
                                    'id':main_comment['snippet']['topLevelComment']['id'],
                                    'threadId':main_comment['snippet']['topLevelComment']['id'],
                                    'published_at':comment['publishedAt'] , 
                                    'author_name': comment['authorDisplayName'], 
                                    'text': comment['textOriginal'],
                                    'likeCount':comment['likeCount'],
                                    'replyCount':main_comment['snippet']['totalReplyCount'],
                                    'authorChannelId':comment['authorChannelId']['value'],
                                    'is_reply': 0,
                                    'video_id': comment['videoId'],
                                    'video_published_at':self.video_published_at,
                                    'search_term':self.search_term})

            self.comments_df = self.comments_df.append(new_row, ignore_index=True)

            
            #check if the top level comment has replies. If yey then get these too and add to df
            request_replies = requests.get(f"https://youtube.googleapis.com/youtube/v3/comments?part=snippet&parentId={main_comment['snippet']['topLevelComment']['id']}&key={self.api_key}")
            response_replies = json.loads(request_replies.text)
        
            #if response_replies['items'] > 0 then the main comment has replies
            if(len(response_replies['items']) > 0):
             
                for i, main_reply in enumerate(response_replies['items']):      
                    reply = main_reply['snippet']

                    new_row = pd.Series(data={
                                            'id':reply['parentId'],
                                            'threadId':main_comment['snippet']['topLevelComment']['id'],
                                            'published_at':reply['publishedAt'] , 
                                            'author_name': reply['authorDisplayName'], 
                                            'text': reply['textOriginal'],
                                            'likeCount':reply['likeCount'],
                                            'replyCount': 0,
                                            'authorChannelId':reply['authorChannelId']['value'],
                                            'is_reply': 1,
                                            'video_id': comment['videoId'],
                                            'video_published_at':self.video_published_at,
                                            'search_term':self.search_term})

                    self.comments_df = self.comments_df.append(new_row, ignore_index=True)
    
    '''Load (and append comments dataframe) recursively comments from next page until there are no next page. '''
    def _get_next_page(self, response):     
        request1 = requests.get(f"https://www.googleapis.com/youtube/v3/commentThreads?part=snippet&maxResults={self.max_results}&pageToken={str(response['nextPageToken'])}&videoId={self.video_id}&key={self.api_key}")
        response1 = json.loads(request1.text)
        self._add_to_dataframe(response1)
        
        if ('nextPageToken' in response1.keys()):
            self._get_next_page(response1)
    
    '''Start loading comments. Paginated.'''
    def get_comments(self, video_id, video_published_at, search_term):  
        
        self.search_term = search_term
        self.video_published_at = video_published_at
        self.comments_df = pd.DataFrame({
                            'id':[],
                            'replyCount': [],
                            'likeCount': [],
                            'published_at': [], 
                            'author_name': [],
                            'text': [],
                            'authorChannelId':[],
                            'is_reply': [],
                            'threadId':[],
                            'video_id':[],
                            'video_published_at': [],
                            'search_term':[]}, 
                            columns = [ 'id',
                                        'replyCount',
                                        'likeCount',
                                        'published_at', 
                                        'author_name',
                                        'text',
                                        'authorChannelId',
                                        'is_reply',
                                        'threadId',
                                        'video_id',
                                        'video_published_at',
                                        'search_term'])
        
        self.video_id = video_id
        request  = requests.get(f"https://www.googleapis.com/youtube/v3/commentThreads?part=snippet&maxResults={self.max_results}&videoId={self.video_id}&key={self.api_key}")
        response = json.loads(request.text)     
        #print(len(self.comments_df))
        #print('ADDING FIRST PAGE')
        self._add_to_dataframe(response)
        
        if 'nextPageToken' in response.keys():
            self._get_next_page(response)
        
        self.video_published_at = None
        self.search_term = None
        return self.comments_df

vid_comments = Video_comments('AIzaSyCo58wzF-1eZXXTvb71cUJlzBJ2a9Dt3ms')
comments_df  = vid_comments.get_comments('oo5ca1dMbEc')

Index(['id', 'replyCount', 'likeCount', 'published_at', 'author_name', 'text',
       'authorChannelId', 'is_reply', 'threadId', 'video_id',
       'video_published_at'],
      dtype='object')

In [26]:
"""List with all API keys"""
api keys

In [27]:
videos = load_videos('summery_vid_lists/2021-03-09-15-12-13_master_video_list_below_10000.csv')

In [28]:
def create_comments_csv(videolist_name, API_KEY, max_dowload):
    """
    This method creates a csv files of comments by iterating through the videos in the specified videolist.
    A Google API key needs to be provided.
    
    The final csv is stored at data_raw/{number videos}_videos_{number comments}_comments_{your videlist}.csv
    """
    videos = load_videos('summery_vid_lists/2021-03-09-15-12-13_master_video_list_below_10000.csv')
    key = 0
    vid_comments = Video_comments(api_keys[key])
    totalVideoCount = videos.shape[0]
    counter = 1
    max_download = 10000
    all_comments_df = pd.DataFrame()
    
    for i, video in videos[1:len(videos)].iterrows():
        if((len(all_comments_df) + video.commentCount) < max_download):
            print('video: ',counter,' of ',totalVideoCount,' # of comments: ',video.commentCount)
            comments_df = vid_comments.get_comments(video.videoId, video.publishedAt, video.search_Term)
            all_comments_df = pd.concat([all_comments_df, comments_df], axis=0)
            
            #remove the downloaded video from the list
            videos_index = videos[videos['videoId'] == video.videoId].index 
            videos.drop(videos_index, inplace = True)
            
            print(all_comments_df.shape,'   ',comments_df.shape)
            counter+=1 
        elif(key < len(api_keys)): 
            '''if a new videos comments would exceed the limit with the api keys we have 
            then take the next key from the list and expand the max_download with 10000'''   
            key += 1
            print(len(all_comments_df),' + ',video.commentCount,' > 10K therefore new API key')
            vid_comments = Video_comments(api_keys[key])
            max_download += 10000
            counter = 1
            
    #store the list of remaining videos to download
    if(len(videos) > 0):
        videos.to_csv('summery_vid_lists/' + 'remaining_110321' + str(len(videos)) + '_videos' + '.csv', index = True)
    
    #store the downloaded comments
    all_comments_df.to_csv('data_raw/comments/' + str(counter) + '_videos_' + str(len(all_comments_df)) + '_comments_' + videolist_name[:-4] + '.csv', index = True) 
    return all_comments_df

In [29]:
download = True
if(download):
    df = create_comments_csv(data_path, API_KEY, 10)

video:  1  of  450  # of comments:  9946.0


KeyError: 'items'

In [None]:
df.head()