# Grabbing comments

In [12]:
token_count = 0

In [13]:
import pandas as pd
import numpy as np
import requests

In [14]:
df = pd.read_csv("IDs.csv")["video_id"][4000:4500]
df

4000    5iUhVa9XBIE
4001    5ig5ikYRQ4w
4002    5igiG6Zowzs
4003    5iknaBfiU30
4004    5ittr92m1Sw
           ...     
4495    6YLIOE56yOA
4496    6YW7Zu9KO98
4497    6YgKUZnUyak
4498    6YuspbP0koQ
4499    6Z0B6BP2tGM
Name: video_id, Length: 500, dtype: object

In [15]:
comments_df = pd.DataFrame(df)
comments_df

Unnamed: 0,video_id
4000,5iUhVa9XBIE
4001,5ig5ikYRQ4w
4002,5igiG6Zowzs
4003,5iknaBfiU30
4004,5ittr92m1Sw
...,...
4495,6YLIOE56yOA
4496,6YW7Zu9KO98
4497,6YgKUZnUyak
4498,6YuspbP0koQ


In [16]:
import re
import string
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from nltk import download
download('words')

def clean_and_filter(df):
    def remove_newline(text):
        text = text.replace('\n', '') 
        return text

    def remove_punctuation(text):
        for punctuation in string.punctuation: 
            text = text.replace(punctuation, '') 
        return text

    def lowercase (text): 
        lowercased = text.lower() 
        return lowercased

    df['comment_clean'] = df.comment.apply(remove_newline)
    df['comment_clean'] = df.comment_clean.apply(remove_punctuation)
    df['comment_clean'] = df.comment_clean.apply(lowercase)
    
    english_words = set(words.words())
    def is_english(text):
        words_in_comment = word_tokenize(text)
        num_words_in_comment = len(words_in_comment)
        num_english_words_in_comment = 0
        for word in words_in_comment:
            if word in english_words:
                num_english_words_in_comment += 1
        english = False
        if num_words_in_comment > 0:
            if num_english_words_in_comment/num_words_in_comment >= 0.3:
                english = True
            return english
        else:
            return False
    
    def english_only(df):
        df['english'] = df['comment_clean'].apply(is_english)
        return df

    df = english_only(df)
    
    def remove_non_english_symbols(text):
        english_pattern = re.compile(r'[^a-zA-Z0-9\s\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF\U00002702-\U000027B0\U000024C2-\U0001F251]')
        cleaned_text = re.sub(english_pattern, '', text)
        return cleaned_text
    
    
    df['comment_clean'] = df.comment_clean.apply(remove_non_english_symbols)
    
    return df[df["english"] == True]

[nltk_data] Downloading package words to
[nltk_data]     /Users/willemduyck/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [17]:
genre_dict = {
    "1":"Film & Animation",
    "2":"Autos & Vehicles",
    "10":"Music",
    "15":"Pets & Animals",
    "17":"Sports",
    "18":"Short Movies",
    "19":"Travel & Events",
    "20":"Gaming",
    "21":"Videoblogging",
    "22":"People & Blogs",
    "23":"Comedy",
    "24":"Entertainment",
    "25":"News & Politics",
    "26":"Howto & Style",
    "27":"Education",
    "28":"Science & Technology",
    "29":"Nonprofits & Activism",
    "30":"Movies",
    "31":"Anime/Animation",
    "32":"Action/Adventure",
    "33":"Classics",
    "34":"Comedy",
    "35":"Documentary",
    "36":"Drama",
    "37":"Family",
    "38":"Foreign",
    "39":"Horror",
    "40":"Sci-Fi/Fantasy",
    "41":"Thriller",
    "42":"Shorts",
    "43":"Shows",
    "44":"Trailers"
}

In [None]:
def fetch_comments_relevance(video_id, api_key):
    url = f"https://www.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId={video_id}&key={api_key}&maxResults=100&order=relevance"
    comments = []
    nextPageToken = None
    iter_number = 0
    token_count = 0
    while len(comments) < 100:
        print(f"Tokens used: {token_count}")
        
        if nextPageToken:
            url = f"https://www.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId={video_id}&key={api_key}&maxResults=100&order=relevance"
            url += f"&pageToken={nextPageToken}"
        
        response = requests.get(url)
#         print(response.status_code)
        token_count += 1
        try:
            data = response.json()
        except ValueError:
            print(f"{url} failed!")
            
        
        error = data.get("error", False)
        if error:
            return [""]
        
        for item in data.get("items", []):
            if item:
                comment = item["snippet"]["topLevelComment"]["snippet"]["textOriginal"]
                author = item["snippet"]["topLevelComment"]["snippet"]["authorDisplayName"]
                likecount = item["snippet"]["topLevelComment"]["snippet"]["likeCount"]
                date = item["snippet"]["topLevelComment"]["snippet"]["publishedAt"]
                replies = item["snippet"]["totalReplyCount"]
                dict_ = {"comment": comment, "author": author, "likecount": likecount, "date": date, "replies": replies}

                if comment is not None and pd.to_datetime(date).year <= 2021:
                    cleaned_comment = clean_and_filter(pd.DataFrame([comment],columns=["comment"]))
                    if not cleaned_comment.empty:
                        if cleaned_comment["english"][0] == True:
                            dict_["comment_clean"] = cleaned_comment["comment_clean"]
                            comments.append(dict_)
        nextPageToken = data.get("nextPageToken", None)
        if not nextPageToken or iter_number >= 20:
            break
        if iter_number == 0:
            if len(comments) < 5:
                break
        iter_number += 1
    
    return comments

In [19]:
def fetch_stats(video_id, api_key):
    url = f"https://www.googleapis.com/youtube/v3/videos?part=statistics&id={video_id}&key={api_key}"
    
    response = requests.get(url)
#     print(response.status_code)
    try:
        data = response.json()
    except ValueError:
        print(f"{url} failed!")
    error = data.get("error",False)
    if not error:
        data = data.get("items",False)
        if data:
            views = data[0]["statistics"]["viewCount"]
            likes = data[0]["statistics"]["likeCount"]
            comments = data[0]["statistics"].get("commentCount",0)
            dict_ = {"views":[views], "likes":[likes], "comments":[comments]}
            return dict_
        return {}
    else:
        return {}

In [20]:
def fetch_details(video_id, api_key):
    url = f"https://www.googleapis.com/youtube/v3/videos?part=snippet&id={video_id}&key={api_key}"
    
    response = requests.get(url)
#     print(response.status_code)
    try:
        data = response.json()
    except ValueError:
        print(f"{url} failed!")
    error = data.get("error",False)
    if not error:
        data = data.get("items",False)
        if data:
            date = data[0]["snippet"]["publishedAt"]
            channel_id = data[0]["snippet"]["channelId"]
            title = data[0]["snippet"]["title"]
            description = data[0]["snippet"]["description"]
            channel_title = data[0]["snippet"]["channelTitle"]
            tags = data[0]["snippet"].get("tags",[""])
            genre = genre_dict[data[0]["snippet"]["categoryId"]]
            language = data[0]["snippet"].get("defaultAudioLanguage","")
            dict_ = {"date":date, "channel_id":channel_id, "title":title, "description":description, "channel_title":channel_title, "tags":[tags], "genre":genre, "language":language}
            return dict_
        return {}
    else:
        return {}

In [21]:
import os

API_KEY = os.environ.get('API_KEY')

In [22]:
def fetch_all_info(video_id, api_key):
    details = pd.DataFrame(fetch_details(video_id, api_key))
#     print(details)
#     print(details["language"][0])
    if details.get("language",[""])[0] in ("en", "en-GB", "en-US"):
        comments_relevance = pd.DataFrame(fetch_comments_relevance(video_id, api_key))
        stats = pd.DataFrame(fetch_stats(video_id, api_key))
        info_all = pd.concat([stats,details],axis=1)
        comments_relevance["video_id"] = video_id
        info_all["video_id"] = video_id
        return comments_relevance, info_all

In [23]:
dict_ = fetch_stats("--0bCF-iK2E", API_KEY)
dict_

{'views': ['1407242'], 'likes': ['24776'], 'comments': ['1438']}

In [24]:
dict_

{'views': ['1407242'], 'likes': ['24776'], 'comments': ['1438']}

In [25]:
pd.DataFrame(dict_)

Unnamed: 0,views,likes,comments
0,1407242,24776,1438


In [26]:
print(f"tokens: {token_count}")

tokens: 0


In [27]:
infos = []
comments = []
for id in comments_df["video_id"]:
    print(id)
    print(len(comments))
    result = fetch_all_info(id, API_KEY)
    if result:
        comment, info = result
        infos.append(info)
        comments.append(comment)
infos = pd.concat(infos)
comments[0]

5iUhVa9XBIE
0
5ig5ikYRQ4w
0
5igiG6Zowzs
0
5iknaBfiU30
0
Tokens used: 0
Tokens used: 1
5ittr92m1Sw
1
Tokens used: 0
Tokens used: 1
Tokens used: 2
5ivztp_7o-Q
2
Tokens used: 0
Tokens used: 1
Tokens used: 2
5iyadx4ZyBU
3
Tokens used: 0
Tokens used: 1
5iyn2q6s1Sk
4
Tokens used: 0
5j3_X2-JgVM
5
Tokens used: 0
Tokens used: 1
5jEcQKDmPEo
6
5jIBgfNSPyw
6
5jRaQBcgJW8
6
5jcHrzZdPyY
6
5jsswPkcdoQ
6
5juBmmXoW7I
6
Tokens used: 0
Tokens used: 1
Tokens used: 2
Tokens used: 3
Tokens used: 4
Tokens used: 5
Tokens used: 6
Tokens used: 7
5jyj5aJaLfc
7
5jykokUl9B0
7
5k-xJ2bphkY
7
Tokens used: 0
Tokens used: 1
5k3mCqBW_XU
8
5k5gVe4H-A0
8
Tokens used: 0
Tokens used: 1
5kT58zvtmfM
9
Tokens used: 0
Tokens used: 1
5kypMDoBndI
10
Tokens used: 0
5lIbc3Zl774
11
5lOl8Z34iM0
11
Tokens used: 0
Tokens used: 1
5lQFTdpUP3k
12
Tokens used: 0
5lQ_BuyZGvY
13
5l_FW9sijjk
13
Tokens used: 0
Tokens used: 1
5lb_Qqm9D0Y
14
Tokens used: 0
Tokens used: 1
5lrMlex9_nk
15
5luRAMe0170
15
5m4u8RVCgKs
15
Tokens used: 0
Tokens used: 1
5

Tokens used: 0
Tokens used: 1
69I_pRhmle0
135
Tokens used: 0
Tokens used: 1
69Nlu9W1X2s
136
Tokens used: 0
Tokens used: 1
69Or111IAhw
137
Tokens used: 0
Tokens used: 1
69T-dFOwe1k
138
69h7JSkw3Ww
138
Tokens used: 0
Tokens used: 1
6A7I5gnYOvM
139
6AG6hi2ojL8
139
6APVbQl2ODI
139
6AYggGaMRTQ
139
Tokens used: 0
Tokens used: 1
6AvkDSkAq-A
140
Tokens used: 0
Tokens used: 1
6B3I3qBs9P0
141
6BAMFGFXLXY
141
6BD9gEhvz0M
141
6BPnIL8dOAk
141
6BbWjYlAehg
141
6Binwq50SmA
141
6Bo3Mcc-1XM
141
6Bo85iulqvo
141
Tokens used: 0
Tokens used: 1
Tokens used: 2
6C-tMvpvIfI
142
Tokens used: 0
Tokens used: 1
6C0AWCrRqCM
143
6C4PQ6mHpl4
143
6CFYIOF89hc
143
6CPXE3ULqKk
143
Tokens used: 0
Tokens used: 1
6C_xFWwsCRE
144
6ClxisPJ1l0
144
Tokens used: 0
Tokens used: 1
6D36z-szpvc
145
Tokens used: 0
6D40yXg6PtQ
146
Tokens used: 0
Tokens used: 1
6D9UCe0iMaU
147
Tokens used: 0
Tokens used: 1
6DB7zczIbNI
148
Tokens used: 0
Tokens used: 1
6DD9-W9z62o
149
Tokens used: 0
Tokens used: 1
6DO6OSq7T9c
150
6DiJaODKUmE
150
6DndjMYB

Unnamed: 0,comment,author,likecount,date,replies,comment_clean,video_id
0,I like the worst part about the whole thing is...,Grim Reaper,1288,2021-07-26T00:12:07Z,14,0 i like the worst part about the whole thi...,5iknaBfiU30
1,Wishing a speedy recovery eddie ❤️,Veshremy Stories,620,2021-07-26T03:31:36Z,3,0 wishing a speedy recovery eddie ❤️ Name: ...,5iknaBfiU30
2,It is incredible how strong that tendon is giv...,Joe Anderson,135,2021-08-11T10:09:37Z,5,0 it is incredible how strong that tendon i...,5iknaBfiU30
3,Eddie is such a funny dude. Commentating on hi...,Andrew Bell,326,2021-07-27T08:23:07Z,1,0 eddie is such a funny dude commentating o...,5iknaBfiU30
4,The way the doc explained the surgery was frea...,92metro1,14,2021-08-08T23:59:11Z,0,0 the way the doc explained the surgery was...,5iknaBfiU30
...,...,...,...,...,...,...,...
191,I wish you a fast recovery. The good part is ...,Juan Daniel Dobre Espinosa,0,2021-07-27T22:10:13Z,0,0 i wish you a fast recovery the good part...,5iknaBfiU30
192,I had multiple surgeries so i know what after ...,ganerdene uurtsaih,0,2021-07-27T04:40:47Z,0,0 i had multiple surgeries so i know what a...,5iknaBfiU30
193,God your braver than me 😂😂 rest up ed be good ...,MW - Creative,1,2021-07-26T17:46:38Z,0,0 god your braver than me 😂😂 rest up ed be ...,5iknaBfiU30
194,Hey big guy don't worry about the delay nobody...,Alexander Shaffer,0,2021-07-27T07:43:44Z,0,0 hey big guy dont worry about the delay no...,5iknaBfiU30


In [28]:
infos

Unnamed: 0,views,likes,comments,date,channel_id,title,description,channel_title,tags,genre,language,video_id
0,814470,39189,4121,2021-07-25T17:01:50Z,UCTEpWW26-LKVvIP9mZkvlRw,I Had Surgery AWAKE! | Detached my bicep,Link to all my products and partnerships:-\nht...,Eddie Hall The Beast,"[Eddie hall, Larry wheels, Brian shaw, worlds ...",People & Blogs,en,5iknaBfiU30
0,1541453,64563,659,2021-07-31T17:07:28Z,UCP8A8blIPLuL2kSSrhKJIhg,ELLE N'AURAIT PAS DU SE MOQUER !,Ma chaîne Vlog : https://www.youtube.com/chann...,Anas-Le Bléd’Art,"[Anas, Bled'Art, Compilation, Drôle, Fail, Mem...",Comedy,en-US,5ittr92m1Sw
0,18497235,920418,10829,2021-12-05T12:00:02Z,UCke6I9N4KfC968-yRcd5YRg,Weak To Strong Mod in Among Us,We add a New modded weak to strong imposter Mo...,SSundee,[],Gaming,en,5ivztp_7o-Q
0,17562478,353306,11013,2021-08-14T07:00:02Z,UCqmld-BIYME2i_ooRTo1EOg,Legends of Two Worlds | MLBB x TRANSFORMERS Ci...,What if Transformers arrived in the Land of Da...,Mobile Legends: Bang Bang,[MobileLegends],Gaming,en-US,5iyadx4ZyBU
0,710491,21801,2751,2021-03-28T21:03:20Z,UCphqjYZxxzjNbONVmY-0J7Q,"Ever Given UPDATE 28 March Operation ""Backtwist""",These 4 GREAT YouTube sources helped me better...,blancolirio,[],Science & Technology,en,5iyn2q6s1Sk
...,...,...,...,...,...,...,...,...,...,...,...,...
0,208491,2306,688,2021-06-20T19:00:34Z,UCW6-BQWFA70Dyyc7ZpZ9Xlg,Highlights: Wales in last 16 despite being bea...,Wales demonstrate great resolve in their 1-0 d...,BBC Sport,"[BBC, BBC Sport, football, Match of the Day, M...",Sports,en-GB,6Y3bYICWURQ
0,7499401,323423,9782,2021-11-11T01:00:04Z,UCyFZMEnm1il5Wv3a6tPscbA,"SEVENTEEN ""2 MINUS 1"" (Live Performance) | Ope...",The South Korean boy band SEVENTEEN recently b...,Genius,"[seventeen, gbwc0, 2 minus 1, 2-1, 2 minus 1 s...",Music,en,6YW7Zu9KO98
0,6535223,288719,42974,2021-04-14T15:00:05Z,UC1sELGmy5jp5fQUugmuYlXQ,A Caves & Cliffs Announcement,Agnes and Henrik share some important news abo...,Minecraft,"[minecraft, minecraft update, caves & cliffs, ...",Gaming,en,6YgKUZnUyak
0,860101,10214,667,2021-06-25T03:41:59Z,UCVhibwHk4WKw4leUt6JfRLg,Re-Watch Vegas Golden Knights vs. Montreal Can...,Join Steve Dangle for Game 6 of the Montreal C...,SPORTSNET,"[Sportsnet, Sports, Canada, game 1, highlights...",Sports,en,6YuspbP0koQ


In [29]:
frame = pd.DataFrame()
for video in comments:
    if video.shape[0] >= 50:
        frame = pd.concat([frame,video])

In [30]:
frame2 = pd.DataFrame()
if type(infos) == type([]):
    for video in infos:
        frame2 = pd.concat([frame2,video])
else:
    frame2 = infos

In [32]:
frame2

Unnamed: 0,views,likes,comments,date,channel_id,title,description,channel_title,tags,genre,language,video_id
0,814470,39189,4121,2021-07-25T17:01:50Z,UCTEpWW26-LKVvIP9mZkvlRw,I Had Surgery AWAKE! | Detached my bicep,Link to all my products and partnerships:-\nht...,Eddie Hall The Beast,"[Eddie hall, Larry wheels, Brian shaw, worlds ...",People & Blogs,en,5iknaBfiU30
0,1541453,64563,659,2021-07-31T17:07:28Z,UCP8A8blIPLuL2kSSrhKJIhg,ELLE N'AURAIT PAS DU SE MOQUER !,Ma chaîne Vlog : https://www.youtube.com/chann...,Anas-Le Bléd’Art,"[Anas, Bled'Art, Compilation, Drôle, Fail, Mem...",Comedy,en-US,5ittr92m1Sw
0,18497235,920418,10829,2021-12-05T12:00:02Z,UCke6I9N4KfC968-yRcd5YRg,Weak To Strong Mod in Among Us,We add a New modded weak to strong imposter Mo...,SSundee,[],Gaming,en,5ivztp_7o-Q
0,17562478,353306,11013,2021-08-14T07:00:02Z,UCqmld-BIYME2i_ooRTo1EOg,Legends of Two Worlds | MLBB x TRANSFORMERS Ci...,What if Transformers arrived in the Land of Da...,Mobile Legends: Bang Bang,[MobileLegends],Gaming,en-US,5iyadx4ZyBU
0,710491,21801,2751,2021-03-28T21:03:20Z,UCphqjYZxxzjNbONVmY-0J7Q,"Ever Given UPDATE 28 March Operation ""Backtwist""",These 4 GREAT YouTube sources helped me better...,blancolirio,[],Science & Technology,en,5iyn2q6s1Sk
...,...,...,...,...,...,...,...,...,...,...,...,...
0,208491,2306,688,2021-06-20T19:00:34Z,UCW6-BQWFA70Dyyc7ZpZ9Xlg,Highlights: Wales in last 16 despite being bea...,Wales demonstrate great resolve in their 1-0 d...,BBC Sport,"[BBC, BBC Sport, football, Match of the Day, M...",Sports,en-GB,6Y3bYICWURQ
0,7499401,323423,9782,2021-11-11T01:00:04Z,UCyFZMEnm1il5Wv3a6tPscbA,"SEVENTEEN ""2 MINUS 1"" (Live Performance) | Ope...",The South Korean boy band SEVENTEEN recently b...,Genius,"[seventeen, gbwc0, 2 minus 1, 2-1, 2 minus 1 s...",Music,en,6YW7Zu9KO98
0,6535223,288719,42974,2021-04-14T15:00:05Z,UC1sELGmy5jp5fQUugmuYlXQ,A Caves & Cliffs Announcement,Agnes and Henrik share some important news abo...,Minecraft,"[minecraft, minecraft update, caves & cliffs, ...",Gaming,en,6YgKUZnUyak
0,860101,10214,667,2021-06-25T03:41:59Z,UCVhibwHk4WKw4leUt6JfRLg,Re-Watch Vegas Golden Knights vs. Montreal Can...,Join Steve Dangle for Game 6 of the Montreal C...,SPORTSNET,"[Sportsnet, Sports, Canada, game 1, highlights...",Sports,en,6YuspbP0koQ


In [33]:
frame

Unnamed: 0,comment,author,likecount,date,replies,comment_clean,video_id
0,I like the worst part about the whole thing is...,Grim Reaper,1288,2021-07-26T00:12:07Z,14,0 i like the worst part about the whole thi...,5iknaBfiU30
1,Wishing a speedy recovery eddie ❤️,Veshremy Stories,620,2021-07-26T03:31:36Z,3,0 wishing a speedy recovery eddie ❤️ Name: ...,5iknaBfiU30
2,It is incredible how strong that tendon is giv...,Joe Anderson,135,2021-08-11T10:09:37Z,5,0 it is incredible how strong that tendon i...,5iknaBfiU30
3,Eddie is such a funny dude. Commentating on hi...,Andrew Bell,326,2021-07-27T08:23:07Z,1,0 eddie is such a funny dude commentating o...,5iknaBfiU30
4,The way the doc explained the surgery was frea...,92metro1,14,2021-08-08T23:59:11Z,0,0 the way the doc explained the surgery was...,5iknaBfiU30
...,...,...,...,...,...,...,...
175,You are brilliant. And admirable.,Rachel Janine Harris,0,2020-12-22T21:19:43Z,0,0 you are brilliant and admirable Name: com...,6Z0B6BP2tGM
176,good work max so impressive!,dr jaymes,0,2021-02-25T19:52:00Z,0,0 good work max so impressive Name: comment...,6Z0B6BP2tGM
177,I’ve met Aled twice should have asked him for ...,Bethy Spenc,11,2020-12-22T17:26:14Z,0,0 ive met aled twice should have asked him ...,6Z0B6BP2tGM
178,I am deffo looking forward to listening to the...,Vi-king production,0,2020-12-25T16:01:44Z,0,0 i am deffo looking forward to listening t...,6Z0B6BP2tGM


In [34]:
frame.value_counts("video_id")

video_id
5txge3Uisfs    199
6I6avlSIxO8    199
5ulsxchKixg    198
60cjZnQDmSQ    198
5sAAPCS_nXY    198
              ... 
6G7ONJsY05A     85
6HaIyP2NNLg     70
61UIx6ZGvDA     61
66x9mklmTOA     58
6DxIcpXnLhw     56
Name: count, Length: 252, dtype: int64

In [35]:
frame.to_csv("comments.csv")

In [36]:
frame2.to_csv("infos.csv")