In [56]:
import pandas as pd
import numpy as np
import os
import requests
import json
from pathlib import Path

In [109]:
# Lift every pandas display limit (row count, column count, total width and
# per-cell text width) so that frames are rendered in full.
for display_option in ('display.max_rows',
                       'display.max_columns',
                       'display.width',
                       'display.max_colwidth'):
    pd.set_option(display_option, None)

## Data acquisition

#### Getting a list with videos

The list of videos was retrieved with the [YouTube Data API](https://tools.digitalmethods.net/netvizz/youtube/mod_videos_list.php).

Querying for the terms: `Global warming`, `Climate change`, `Paris agreement`, `Climate realism`.

#### Getting all comments (including replies) to all videos in the list

Get all comments to a video using the [CommentThreads method of YouTube Developer API](https://developers.google.com/youtube/v3/docs/commentThreads/list)

The API documentation of CommentThreads states that a comment thread might not contain all replies:

>A commentThread resource contains information about a YouTube comment thread, which comprises a top-level comment and replies, if any exist, to that comment. A commentThread resource can represent comments about either a video or a channel.

>Both the top-level comment and the replies are actually comment resources nested inside the commentThread resource. The commentThread resource does not necessarily contain all replies to a comment, and you need to use the comments.list method if you want to retrieve all replies for a particular comment. Also note that some comments do not have replies.

Therefore we use the [Comments list method](https://developers.google.com/youtube/v3/docs/comments/list) to get all replies to a comment.

In [10]:
# Read the YouTube Data API key from the environment instead of hardcoding it:
# a key stored in the notebook is exposed to everyone who can read the file.
# The key that used to be embedded here should be considered leaked and revoked.
# An empty string is used when the variable is unset so that the notebook still
# runs end-to-end as long as no request is actually sent.
API_KEY = os.environ.get('YOUTUBE_API_KEY', '')

#### List of videos containing the term `Paris agreement`

In [14]:
# Tab-separated video list for the `Paris agreement` query
# (relative path, resolved against the current working directory).
data_path = 'videolist_search50_2021_02_08-14_19_10.tab'

In [15]:
def load_videos(data_path, min_comments_count = 3):
    """Load a tab-separated video list and keep the sufficiently commented videos.

    Rows without a `commentCount` value are discarded, as are rows whose
    `commentCount` is below `min_comments_count`. The remaining rows are
    returned sorted by `commentCount`, highest first; the original row labels
    are kept as the index.
    """
    video_table = pd.read_csv(data_path, sep='\t', header=0)

    # keep only the rows that actually carry a comment count
    with_count = video_table[video_table['commentCount'].notna()]

    # keep only the rows that reach the minimum number of comments
    enough_comments = with_count[with_count['commentCount'] >= min_comments_count]

    return enough_comments.sort_values('commentCount', ascending=False)

In [16]:
# Load the video list with the default threshold: videos with fewer than
# 3 comments (or without a comment count) are left out.
videos = load_videos(data_path)

We need to find more videos that have more than a minimum number of comments, e.g. more than 2 comments.

In [17]:
# Summarise how the comments are distributed over the loaded videos.
comment_counts = videos['commentCount']
print(f"We have in total {np.sum(comment_counts)} from comments distributed in {len(videos)} videos containing the term Paris Agreement.")
print(f"Mean comment count: {np.mean(comment_counts)} Median: {np.median(comment_counts)}")

We have in total 44624.0 from comments distributed in 47 videos containing the term Paris Agreement.
Mean comment count: 949.4468085106383 Median: 134.0


In [18]:
# Preview the first five rows of the video list (sorted by commentCount, highest first).
videos.head()

Unnamed: 0,position,channelId,channelTitle,videoId,publishedAt,publishedAtSQL,videoTitle,videoDescription,tags,videoCategoryId,...,dimension,definition,caption,thumbnail_maxres,licensedContent,viewCount,likeCount,dislikeCount,favoriteCount,commentCount
1,2,UC3XTzVzaHQEd30rQbuvCtTQ,LastWeekTonight,5scez5dqtAc,2017-06-05T06:30:00Z,2017-06-05 06:30:00,Paris Agreement: Last Week Tonight with John O...,Donald Trump plans to withdraw the United Stat...,"last week tonight paris agreement,paris accord...",24,...,2d,hd,False,https://i.ytimg.com/vi/5scez5dqtAc/maxresdefau...,1.0,13068762,177317,12663,0,13779.0
14,15,UC2LZO6swZ9SLUEOks3WnsfA,2veritasium,1WKoj-kodBw,2017-06-02T21:17:43Z,2017-06-02 21:17:43,5 Bad Reasons to Ditch the Paris Climate Agree...,I've heard a lot of reasons for withdrawing fr...,"veritasium,paris,donald trump,trump,climate ch...",27,...,2d,hd,True,https://i.ytimg.com/vi/1WKoj-kodBw/maxresdefau...,1.0,637029,36793,5448,0,6309.0
12,13,UCGaVdbSav8xWuFWTadK6loA,vlogbrothers,Sr2J_1J9w3A,2017-06-02T18:18:15Z,2017-06-02 18:18:15,The Paris Accord: What is it? And What Does it...,At the heart of the desire to get America out ...,"climate change,paris agreement,paris accord,pa...",22,...,2d,hd,True,https://i.ytimg.com/vi/Sr2J_1J9w3A/maxresdefau...,1.0,813125,42687,3222,0,4280.0
5,6,UCZWlSUNDvCCS1hBiXV0zKcA,PragerU,47bNzLj5E_Q,2017-01-16T10:38:20Z,2017-01-16 10:38:20,The Paris Climate Agreement Won't Change the C...,The Paris Climate Agreement will cost at least...,"Dennis Prager,PragerU,Prager University,Bjorn ...",27,...,2d,hd,True,https://i.ytimg.com/vi/47bNzLj5E_Q/maxresdefau...,1.0,3469006,20802,7839,0,4192.0
16,17,UCjo1uN-aM3rmBV46xj7l2KA,John Stossel,cVkAsPizAbU,2018-03-19T10:00:30Z,2018-03-19 10:00:30,The Paris Climate Fraud,President Trump is right to withdraw from the ...,"Paris climate accord,Paris Climate Agreement,P...",25,...,2d,hd,True,https://i.ytimg.com/vi/cVkAsPizAbU/maxresdefau...,1.0,853178,33702,1366,0,3640.0


In [116]:
#TODO: Concatenate the video lists and remove duplicates based on videoId
#TODO: Find the number of users who commented on multiple videos

#### Class to load all comments of a video 

In [117]:
class Video_comments:
    """Download all comments (top-level comments and their replies) of a YouTube video.

    Top-level comments are read from the `commentThreads` endpoint and the
    replies of each thread from the `comments` endpoint of the YouTube Data
    API v3. Both endpoints are paginated and every page is followed.

    Attributes
    ----------
    api_key : str
        API key sent with every request.
    max_results : int
        Page size requested from the API (100).
    video_id : str or None
        Id of the video passed to the most recent `get_comments` call.
    comments_df : pandas.DataFrame
        Comments collected by the most recent `get_comments` call, one row per
        comment, with the columns listed in `COLUMNS`. The columns `cleaned`
        and `video_published_at` are not filled by this class.
    """

    # Column order of the frames produced by this class.
    COLUMNS = ['id',
               'threadId',
               'published_at',
               'author_name',
               'text',
               'likeCount',
               'is_reply',
               'cleaned',
               'video_id',
               'video_published_at']

    THREADS_URL = 'https://www.googleapis.com/youtube/v3/commentThreads'
    REPLIES_URL = 'https://youtube.googleapis.com/youtube/v3/comments'

    # Seconds to wait for the API before giving up on a request.
    REQUEST_TIMEOUT = 30

    def __init__(self, api_key):
        self.api_key = api_key
        self.max_results = 100
        self.video_id = None
        self.comments_df = self._empty_frame()

    def _empty_frame(self):
        """Return an empty frame that already has the expected columns."""
        return pd.DataFrame(columns=self.COLUMNS)

    def _get_json(self, url, params):
        """Send a GET request (the API key is added here) and return the decoded JSON body.

        Raises `requests.HTTPError` for error responses (e.g. exhausted quota
        or disabled comments) instead of failing later on a missing key.
        """
        query = dict(params, key=self.api_key)
        http_response = requests.get(url, params=query, timeout=self.REQUEST_TIMEOUT)
        http_response.raise_for_status()
        return http_response.json()

    def _fetch_thread_page(self, page_token=None):
        """Fetch one page of comment threads of `self.video_id`."""
        params = {'part': 'snippet',
                  'maxResults': self.max_results,
                  'videoId': self.video_id}
        if page_token:
            params['pageToken'] = page_token
        return self._get_json(self.THREADS_URL, params)

    def _fetch_replies(self, thread_id):
        """Return all reply resources of one thread, following every result page."""
        replies = []
        page_token = None
        while True:
            params = {'part': 'snippet',
                      'parentId': thread_id,
                      'maxResults': self.max_results}
            if page_token:
                params['pageToken'] = page_token
            response = self._get_json(self.REPLIES_URL, params)
            replies.extend(response.get('items', []))
            page_token = response.get('nextPageToken')
            if not page_token:
                return replies

    def _rows_from_response(self, response):
        """Convert one commentThreads page into row dicts (top-level comments and replies)."""
        rows = []
        for thread in response.get('items', []):
            top_level = thread['snippet']['topLevelComment']
            thread_id = top_level['id']
            comment = top_level['snippet']

            rows.append({'id': thread_id,
                         'threadId': thread_id,
                         'published_at': comment['publishedAt'],
                         'author_name': comment['authorDisplayName'],
                         'text': comment['textOriginal'],
                         'likeCount': comment['likeCount'],
                         'is_reply': 0,
                         'video_id': comment['videoId']})

            # Threads that report zero replies need no extra request; when the
            # count is missing the replies are requested to be on the safe side.
            if thread['snippet'].get('totalReplyCount') == 0:
                continue

            for reply_item in self._fetch_replies(thread_id):
                reply = reply_item['snippet']
                rows.append({'id': reply_item['id'],  # the reply's own id, not its parent's
                             'threadId': thread_id,
                             'published_at': reply['publishedAt'],
                             'author_name': reply['authorDisplayName'],
                             'text': reply['textOriginal'],
                             'likeCount': reply['likeCount'],
                             'is_reply': 1,
                             'video_id': comment['videoId']})
        return rows

    def _add_to_dataframe(self, response):
        """Append the comments of one commentThreads page (and their replies) to `self.comments_df`."""
        rows = self._rows_from_response(response)
        if not rows:
            return

        page_df = pd.DataFrame(rows, columns=self.COLUMNS)
        if self.comments_df.empty:
            self.comments_df = page_df
        else:
            # one concat per page instead of one append per comment
            self.comments_df = pd.concat([self.comments_df, page_df], ignore_index=True)

    def _get_next_page(self, response):
        """Load and append every page that follows `response`, until there is no next page."""
        # Iterative on purpose: the number of pages is not bounded by the recursion limit.
        while 'nextPageToken' in response:
            response = self._fetch_thread_page(response['nextPageToken'])
            self._add_to_dataframe(response)

    def get_comments(self, video_id):
        """Download all comments of `video_id` and return them as a DataFrame.

        Any result of a previous call is discarded first. The returned frame is
        also available as `self.comments_df`; if a request fails part-way, the
        comments collected so far remain in `self.comments_df`.

        Raises
        ------
        requests.HTTPError
            If the API answers with an error status.
        """
        self.video_id = video_id
        self.comments_df = self._empty_frame()

        first_page = self._fetch_thread_page()
        self._add_to_dataframe(first_page)
        self._get_next_page(first_page)

        return self.comments_df

In [120]:
# Set to True to download the comments again. Left at False so that running the
# whole notebook neither sends API requests nor overwrites the saved CSV file.
download = False

if(download):
    vid_comments = Video_comments(API_KEY)
    all_comments_df = pd.DataFrame()

    totalVideoCount = videos.shape[0]
    # upper bound for the number of comments collected in one run
    max_download = 10000
    output_path = Path('data_raw/comments/all_comments.csv')

    # NOTE: only the slice videos[4:5] (a single video) is processed per run.
    for counter, (_, video) in enumerate(videos[4:5].iterrows(), start=1):

        # skip videos whose reported comment count would exceed the download budget
        if((len(all_comments_df) + video.commentCount) < max_download):
            print('video: ',counter,' of ',totalVideoCount,' # of comments: ',video.commentCount)
            comments_df = vid_comments.get_comments(video.videoId)
            print('video: ',counter,' of ',totalVideoCount,' # of comments: ',video.commentCount,'  ',comments_df.shape)

            all_comments_df = pd.concat([all_comments_df, comments_df], axis=0)
            print(all_comments_df.shape,'   ',comments_df.shape)

    # to_csv does not create missing folders, so make sure the target folder exists
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # saving the DataFrame as a CSV file
    all_comments_df.to_csv(output_path, index = True)

video:  1  of  47  # of comments:  3640.0
video:  1  of  47  # of comments:  3640.0    (3320, 10)
(3320, 10)     (3320, 10)


#### Read all comments from data_raw\comments folder into one dataframe

In [64]:
def read_folder(csv_folder):
    """Read every CSV file of a folder into its own DataFrame.

    Parameters
    ----------
    csv_folder : str or pathlib.Path
        Folder that contains the CSV files. Sub-folders are not searched.

    Returns
    -------
    list of pandas.DataFrame
        One frame per `*.csv` file, ordered by file name. The order is fixed
        explicitly because the order of a directory listing is arbitrary and
        would otherwise make the result differ between runs and machines.

    Raises
    ------
    FileNotFoundError
        If `csv_folder` does not exist.
    """
    csv_folder = Path(csv_folder).absolute()
    # Only regular files with a .csv suffix: directories and names that merely
    # end in the letters "csv" are not handed to read_csv.
    csv_files = sorted(
        entry for entry in csv_folder.iterdir()
        if entry.is_file() and entry.suffix.lower() == '.csv'
    )
    return [pd.read_csv(csv_file) for csv_file in csv_files]

In [65]:
# Forward slashes work on every platform (pathlib translates them on Windows).
# A backslash here would be an invalid escape sequence ('\c') and would not be
# treated as a path separator on Linux/macOS.
dfs = read_folder('data_raw/comments')

In [66]:
# Initialise the combined-comments frame as empty before the frames read from CSV are combined.
all_comments_df = pd.DataFrame()

In [67]:
# Drop the index column that to_csv(index=True) wrote into each file; files
# saved without it are accepted as well (errors='ignore').
frames_without_index = []
for df in dfs:
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')
    frames_without_index.append(df)

# Combine and de-duplicate once instead of re-concatenating the growing frame
# for every file. The first occurrence of a duplicated row is kept, so the
# result has the same rows in the same order as merging file by file.
if frames_without_index:
    all_comments_df = (
        pd.concat(frames_without_index)
        .drop_duplicates()
        .reset_index(drop=True)
    )

In [68]:
# (rows, columns) of the combined comments frame.
all_comments_df.shape

(27804, 10)

In [69]:
# Show the comment with the highest like count (argmax yields a position, hence iloc).
all_comments_df.iloc[all_comments_df.likeCount.argmax()]

id                                                 UghiRnPt-t7jungCoAEC
threadId                                           UghiRnPt-t7jungCoAEC
published_at                                       2017-06-01T19:57:05Z
author_name                                                Daniel Casey
text                  "I know we messed up a bit." \n\na bit\n\nyeah...
likeCount                                                        2953.0
is_reply                                                            0.0
cleaned                                                             NaN
video_id                                                    VR1UhhO4UYU
video_published_at                                                  NaN
Name: 3078, dtype: object