In [1]:
import os
import googleapiclient.discovery
import pandas as pd
import requests
import time 
import json
import numpy as np

In [2]:
# Load credentials from json file
with open("/Users/tlipman/.secret/youtube_credentials.json", "r") as file:
    creds = json.load(file)

In [3]:
def get_response(vid_id, page):
    """Get response request for comments of a YouTube video from the Google API with given video ID and page ID"""
    #setting up API auth
    api_service_name = "youtube"
    api_version = "v3"
    api_key = creds['CONSUMER_KEY']
    
    youtube = googleapiclient.discovery.build(
        api_service_name, api_version, developerKey = api_key)
    #setting up our request
    req = youtube.commentThreads().list(
        part="snippet,replies",
        videoId=vid_id,
        maxResults= 100,
        pageToken= page)
    # getting response 
    response = req.execute()
    return response

In [4]:
def get_comments(num_comments, video_id):
    """Pass in number of comments requested & the video_id (end of url excluding timestamp), 
    returns requested comments as pandas DataFrame"""
    # figuring out how many iterations are needed from number of comments requested
    iterations = int(num_comments/100)
    # setting up our dictionary we will later convert to DataFrame
    dict_ = {'text': [], 'vid_id':[], 'likes': [], 'date': [], 'channel_id': [],
             'viewer_rating':[], 'mentions':[], 'comment_id':[]
    }
    # later on we will check if the dataframe is still empty (like we are assigning it here) or if we need to add to a created one
    df_youtube = ''
    # when starting we do not have a next page, but we will re-write this as we iterate 
    nextpage = None
    # start iterating
    for i in range(iterations):
        #wrap it in a try statment so that if we ask for too many comments and get an error it stops & still returns our df
        try:
            # get reponse for page 
            comments = get_response(video_id, nextpage)
            count = 0 
            # getting our coments and pages from our response 
            for item in comments.values():
                count = count + 1 
                if count == 3:
                    nextpage = item
                if count == 5:
                    comments = item
            for i in range(len(comments)):
                # extracting data wanted from comments 
                dict_['text'].append(comments[i]['snippet']['topLevelComment']['snippet']['textOriginal'])
                dict_['vid_id'].append(comments[i]['snippet']['videoId'])
                dict_['likes'].append(comments[i]['snippet']['topLevelComment']['snippet']['likeCount'])
                dict_['date'].append(comments[i]['snippet']['topLevelComment']['snippet']['publishedAt'])
                dict_['channel_id'].append(comments[i]['snippet']['topLevelComment']['snippet']['authorChannelId']['value'])
                dict_['viewer_rating'].append(comments[i]['snippet']['topLevelComment']['snippet']['viewerRating'])
                dict_['comment_id'].append(comments[i]['id'])
                dict_['mentions'].append('')
                # getting the number of replies 
                try:
                    replies = len(comments[i]['replies'])
                except:
                    replies = 0
                # iterating over our replies and adding them to our dataframe 
                for j in range(replies):
                    dict_['text'].append(comments[i]['replies']['comments'][j]['snippet']['textOriginal'])
                    dict_['vid_id'].append(comments[i]['replies']['comments'][j]['snippet']['videoId'])
                    dict_['likes'].append(comments[i]['replies']['comments'][j]['snippet']['likeCount'])
                    dict_['date'].append(comments[i]['replies']['comments'][j]['snippet']['publishedAt'])
                    dict_['channel_id'].append(comments[i]['replies']['comments'][j]['snippet']['authorChannelId']['value'])
                    dict_['viewer_rating'].append(comments[i]['replies']['comments'][j]['snippet']['viewerRating'])              
                    dict_['mentions'].append(comments[i]['replies']['comments'][j]['snippet']['parentId'])
                    dict_['comment_id'].append(comments[i]['replies']['comments'][j]['id'])
        except:
            pass
    # converting our dictionary into a DataFrame and returning it 
    df_youtube = pd.DataFrame.from_dict(dict_)
    return df_youtube

In [5]:
# creating a dictioniary of videos to get the comments from 
video_dict1 = {
    'Spaceship Design': 'V4ddnrBT6hE',
    'The Race to Mars in 2020': 'VQY3qWZMIl8',
    'What If We Try And Colonize Mars?': 'Gcnf5BdLXxw',
    'The VASIMR Engine: How to Get to Mars in 40 Days': 'uqX8wIkjoYg',
    'Bigelow Aerospace is Building the Worlds First Hotel | Answers with Joe': '5nE3UO1kqv0',
    'Why Earth Is A Prison and How To Escape It': 'RVMZxH1TIIQ',
    'Five REAL Possibilities for Interstellar Travel': 'EzZGPCyrpSU',
    'Horizons mission - Soyuz: launch to orbit': 'fr_hXLDLc38',
    'Will Humanity Reach Another Star In Your Lifetime?': '3zMUJwGrn6Q'
}

In [6]:
video_dict2 = {'How Fast Can We Travel In Space?': 'OYAgcS31-p0',
        'What I learned from going blind in space | Chris Hadfield': 'Zo62S0ulqhA',
        'Three Men Lost in Space – The Apollo 13 Disaster': '5OLtteIwwNs',
        'How Far Can We Go? Limits of Humanity.': 'ZL4yYHdDSWs',
        '10 Terrifying Facts About Space': 'UNEFDynNw-Q',
}

In [7]:
video_dict3 = {'Perseverance Rover’s Descent and Touchdown on Mars (Official NASA Video)': '4czjS9h4Fpg',
               'NASA’S Perseverance Rover’s First 360 View of Mars (Official)': 'wE-aQO9XD1g',
               'Perseverance’s First Sounds from Mars': 'ZBFjpnV9-sg',
               'Curiosity Mars Rover Snaps 1.8 Billion-Pixel Panorama (narrated video)': 'X2UaFuJsqxk',
               'The Mars Homes That NASA Awarded $500k': 'LCuZC-CRg4M',
               'An Epic Journey to a Black Hole to Give You Goosebumps': 'RV170sqhm4Q',
               'Mars: What happened to the Spirit Rover?': '7V54LRRJaGk',
               'Preparing to Land Perseverance': 'v7iUb_wDHxk',
               'Testing the Curiosity Rover on Earth': '3-MNAX1jgbA',
               'HASSELL + EOC presents MARS HABITAT': 'AIrH01N9AsE'
}

In [8]:
video_list = []
for value in video_dict1.values():
    video_list.append(value)
video_list

['V4ddnrBT6hE',
 'VQY3qWZMIl8',
 'Gcnf5BdLXxw',
 'uqX8wIkjoYg',
 '5nE3UO1kqv0',
 'RVMZxH1TIIQ',
 'EzZGPCyrpSU',
 'fr_hXLDLc38',
 '3zMUJwGrn6Q']

In [9]:
for value in video_dict2.values():
    video_list.append(value)
video_list

['V4ddnrBT6hE',
 'VQY3qWZMIl8',
 'Gcnf5BdLXxw',
 'uqX8wIkjoYg',
 '5nE3UO1kqv0',
 'RVMZxH1TIIQ',
 'EzZGPCyrpSU',
 'fr_hXLDLc38',
 '3zMUJwGrn6Q',
 'OYAgcS31-p0',
 'Zo62S0ulqhA',
 '5OLtteIwwNs',
 'ZL4yYHdDSWs',
 'UNEFDynNw-Q']

In [10]:
for value in video_dict3.values():
    video_list.append(value)
video_list

['V4ddnrBT6hE',
 'VQY3qWZMIl8',
 'Gcnf5BdLXxw',
 'uqX8wIkjoYg',
 '5nE3UO1kqv0',
 'RVMZxH1TIIQ',
 'EzZGPCyrpSU',
 'fr_hXLDLc38',
 '3zMUJwGrn6Q',
 'OYAgcS31-p0',
 'Zo62S0ulqhA',
 '5OLtteIwwNs',
 'ZL4yYHdDSWs',
 'UNEFDynNw-Q',
 '4czjS9h4Fpg',
 'wE-aQO9XD1g',
 'ZBFjpnV9-sg',
 'X2UaFuJsqxk',
 'LCuZC-CRg4M',
 'RV170sqhm4Q',
 '7V54LRRJaGk',
 'v7iUb_wDHxk',
 '3-MNAX1jgbA',
 'AIrH01N9AsE']

In [11]:
# starting a dataframe for us to add additional comments onto 
df = get_comments(100, 'BI-old7YI4I')

In [12]:
#iterating over our list of videos to get their comments and add them to our starting dataframe 
for video in video_list:
    df = df.append(get_comments(100, video))

In [13]:
len(df)

2733

In [14]:
# getting rid of duplicate entries (if any)
df.drop_duplicates(subset='comment_id', inplace=True)

In [16]:
df.to_csv('youtube_march28', sep='\t', index=False)