In [2]:
from googleapiclient.discovery import build
import googleapiclient.discovery
import json
import html


# Videos analyzed:
https://www.youtube.com/watch?v=ceKMnyMYIMo
11958 comments.
# Videos to analyze:
https://www.youtube.com/watch?v=v9WSjE3tIkg

# TO DO!
- Cleaning the data: are there multiple comments from the same user?
- Statistical analysis: what words do the most intensive comments contain?
 + make a subset with at least 500 comments that are most intensive
 + make a subset with 50 comments that are most neutral and try to see if the AI made mistakes; add to it a subset with the 50 comments that are most neutral and try to see if the AI made mistakes.

# Data Ingestion

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#Parameters:
max_results = 5000
api_key = 'INSERT YOUR OWN GOOGLE API KEY HERE'

# Building the request
youtube = build('youtube', 'v3', developerKey=api_key)

In [5]:
try:

    #If it's already downloaded, don't bother the API
    with open('/content/drive/MyDrive/ColabNotebooks for NLP and other ML/StrongInternetLanguage/raw_comments.json', 'r', encoding='utf-8') as f:
        comments_dict = json.load(f)

except FileNotFoundError:
    # Actually requesting:
    video_id = 'ceKMnyMYIMo'
    response = youtube.commentThreads().list(
        part='snippet',
        videoId=video_id,
        maxResults=max_results
    ).execute()
    # Temporary dict to store comments
    comments_dict = {}

    for item in response['items']:
        comment_id = item['id']
        comment_text = item['snippet']['topLevelComment']['snippet']['textDisplay']
        comments_dict[comment_id] = comment_text

    while 'nextPageToken' in response:
        next_page_token = response['nextPageToken']

        # Make another request for the next page of comments
        response = youtube.commentThreads().list(
            part='snippet',
            videoId=video_id,
            maxResults=max_results,
            pageToken=next_page_token
        ).execute()

        # Extract and add comments from the current response to the dictionary
        for item in response['items']:
            #comment_id = item['id']
            author_name = item['snippet']['topLevelComment']['snippet']['authorDisplayName']
            comment_text = item['snippet']['topLevelComment']['snippet']['textDisplay']
            comments_dict[author_name] = comment_text  #this makes it possible for one comment only for each author
            # so that no multiple comments are allowed (they could be spam very easily)
    # The following helps to make sure characters are properly transmitted from html to json
    comments_dict = {author_name: html.unescape(comment) for author_name, comment in comments_dict.items()}

    # Finally save 'em to a json file.
    with open('/content/drive/MyDrive/ColabNotebooks for NLP and other ML/StrongInternetLanguage/raw_comments.json', 'w', encoding='utf-8') as json_file:
        json.dump(comments_dict, json_file, indent=4, ensure_ascii=False)

In [6]:
import re
from sklearn.utils import shuffle
import pandas as pd

df = pd.DataFrame(comments_dict.values(), index=list(comments_dict.keys()))
df.rename(columns = {0:'COMMENT'}, inplace = True)
print(f"In total there are {len(df)} comments. 5 Randomly chosen comments:")
random = shuffle(df)
random.head(5)

In total there are 10047 comments. 5 Randomly chosen comments:


Unnamed: 0,COMMENT
Lee-Ann Hendricks,"Never comment, but I need to right now, ❤"
Liezl Kleynhans,I understand this so well. Damn made so many m...
Emily Page,WHAT THE FUCK
Sofia Dunn,Does anyone else love how's she attacking ever...
Sen,Glad to see this is all a fucking joke to you....


In [7]:
print("First 5 comments posted, and last 5 comments retrieved:")

df.head(5)
df.head(-5)

First 5 comments posted, and last 5 comments retrieved:


Unnamed: 0,COMMENT
UgzIlWb9PViHZ8d7-yZ4AaABAg,"Lmao, Genius 😂"
UgxavzmXkL-n8W-wbK54AaABAg,"You have helped me through so so so much, igno..."
UgzzpNoWplHWBp_vE5R4AaABAg,You really can sing!!!
Ugwr3tf1W7B1MQOg4NR4AaABAg,yo this kinda a bop😜😍
UgwF8bT8AAQzFfLf6hJ4AaABAg,Well it changed my mind about you ❤❤❤
...,...
The Sweet Life Of The Solís’s,Missed you!
Amanda Cooper,I love you so much!!!!!!! We’ve missed you!
Nic 09,Gahhhhh
ritual,oh wow


# Data Cleaning
This includes taking away all punctuation and deleting the comments that are spam (doubles, excessive repetitions from the same user).

In [8]:
import string
punctuation = '"#$%&\'()*+,-./:;<=>@[\]^_`{|}~“'  #still leaving the ?, ! and emojis on purpose - they can be understood by some ML engines.
def remove_punctuations(text):
    for char in punctuation:
        text = text.replace(char, ' ')
    return text
clones = pd.DataFrame()
clones['COMMENT'] = df['COMMENT'].apply(remove_punctuations).str.lower() #applying the function

In [9]:
clones.head()

Unnamed: 0,COMMENT
UgzIlWb9PViHZ8d7-yZ4AaABAg,lmao genius 😂
UgxavzmXkL-n8W-wbK54AaABAg,you have helped me through so so so much igno...
UgzzpNoWplHWBp_vE5R4AaABAg,you really can sing!!!
Ugwr3tf1W7B1MQOg4NR4AaABAg,yo this kinda a bop😜😍
UgwF8bT8AAQzFfLf6hJ4AaABAg,well it changed my mind about you ❤❤❤


In [10]:
import random
maximum = len(clones)
to_pick_from = shuffle(list(range(0, maximum)))

In [11]:
from rsa.pkcs1 import common
#a different dataset with only numbers as author names randomly assigned to processed comments

clones['index'] = to_pick_from
clones.set_index(inplace=True, keys='index')

In [12]:
clones = clones.sort_index(axis=0)

clones.head(15)

Unnamed: 0_level_0,COMMENT
index,Unnamed: 1_level_1
0,oh u are weird
1,for 6 fucking minutes stop trying to be bo bur...
2,we all make mistakes this new generation of ...
3,huh?? what??? girl
4,this whole entire situation is exposing the da...
5,colleen you made kids put their hands down yo...
6,i feel like i’m watching a dramatized episode ...
7,be so fucking serious right now
8,my jaw is on the floor and not in a good way 🫠
9,what 😀 the 😀 actual 😀 fuck 😀


# Saving the first clean data.


In [14]:
clones.to_csv('/content/drive/MyDrive/ColabNotebooks for NLP and other ML/StrongInternetLanguage/clean_data.csv', index=True)

#with open('/content/drive/MyDrive/ColabNotebooks for NLP and other ML/StrongInternetLanguage/clean_data.json', 'w', encoding='utf-8') as json_file:
#    json_file.write(json_data)

# Evaluating data with human interpretation

In [18]:

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
import pandas as pd
import pprint
file_name = '/content/drive/MyDrive/ColabNotebooks for NLP and other ML/StrongInternetLanguage/sample_for_groundtruth.csv'
annotations = '/content/drive/MyDrive/ColabNotebooks for NLP and other ML/StrongInternetLanguage/sample_for_groundtruth.csv/annotations.csv'

class Notes:
    """
    at least four attributes other than initialization
    0- loads the dataframe with comments from the same folder this class is located in
    1- checks the state of progress is made it's uploaded and won't be lost.
    2- prints one text (one comment) and takes the annotation input
    3- iterates over the dataframe and uses #2
    4- saves the progress made.
    """
    def __init__(self):

        self.dataframe = self.__dataloader__()
        self.startingpoint, self.l, self.length = 0, [], 0
        self.__progress__()
        print(f"Starting analysis from comment {self.startingpoint + 1}")

        self.ci = 0
        self.evaluation()
        self.__closing__()

    #0
    def __dataloader__(self):
        try:
            dataframe = pd.read_csv(file_name)
            dataframe.rename(columns={'Unnamed: 0': 'User ID', '0': 'Comments'}, inplace=True)
            print("Comments were loaded. First comment:")
            print(dataframe.head(1))
            return dataframe
        except FileNotFoundError:
            print(f'File "{file_name}" not found!')
            return

    #1
    def __progress__(self):
        try:
            self.l = pd.read_csv(annotations)
            self.startingpoint = self.l['Last comment checked'][0] + 1
            self.length = len(self.l['User ID'])
            # so that the starting point is one position after the last one evaluated.

        except NotADirectoryError:
            print(f"File {annotations} not found...\nCreating an empty dataframe to fill with annotations...")
            result = pd.DataFrame()
            result['Semantic evaluation'] = [] * self.dataframe.shape[0]
            result['User ID'] = self.dataframe['User ID']
            result['Last comment checked'] = 0
            #result.rename(columns={0:'Semantic evaluation'}, inplace=True)
            print("Length of the annotation dataframe:", len(result))
            self.length = len(result)
            self.l = result
            print(self.l.head())
            print("Done.")

    #2
    def ground_truthing(self, text):
        pp = pprint.PrettyPrinter(width=64, depth=1)
        pp.pprint(text)
        inp = str(input("\n\t\tSCORE -->"))
        if inp not in ('0', '1', '2'):
            if inp.strip() == 'exit':
                return float("NaN")
            if inp != 'exit':
                print("Please insert a valid input: exit, or 0, 1, 2.")
                inp = str(input("\n\t\tSCORE --> ")).strip()
                if inp.strip() == 'exit':
                    return float("NaN")

        print()
        return inp

    def evaluation(self):
        print("\n\t\tPRESS 'exit' TO QUIT")
        # ci means comment index
        for ci in range(self.startingpoint, self.length):
            print("asking you")
            res = self.ground_truthing(self.dataframe['Comments'][ci])
            if type(res) == float:
                self.ci = ci
                return
            self.l.loc[ci,'Semantic evaluation'] = int(res)-1
            self.l.loc[ci+1:, 'Semantic evaluation'] = float('NaN')
            # when going from human using a keyboard to an intuitive
            # the semantic evaluation goes from 0, 1, 2 to -1 0 +1
            self.l.loc[0, 'Last comment checked'] = ci # this is the last comment checked

    def __closing__(self):

        print(f"Progress made: {self.ci/len(self.l)}%")
        print("Saving annotations...")
        self.l.to_csv('/content/drive/MyDrive/ColabNotebooks for NLP and other ML/StrongInternetLanguage/annotations.csv', sep=',', index=False, encoding="utf-8")
        print(self.l.head(25))
        print("Saved.")





Notes()




Comments were loaded. First comment:
   User ID          Comments
0     6811  GIRL REALLY?????
File /content/drive/MyDrive/ColabNotebooks for NLP and other ML/StrongInternetLanguage/sample_for_groundtruth.csv/annotations.csv not found...
Creating an empty dataframe to fill with annotations...
Length of the annotation dataframe: 1000
   Semantic evaluation  User ID  Last comment checked
0                  NaN     6811                     0
1                  NaN      278                     0
2                  NaN     3005                     0
3                  NaN      394                     0
4                  NaN     7021                     0
Done.
Starting analysis from comment 1

		PRESS 'exit' TO QUIT
asking you
'GIRL REALLY?????'

		SCORE -->0

asking you
'Idk what she did. But i forgive her.'

		SCORE -->1

asking you
'there’s literally no way this is real'

		SCORE -->exit
Progress made: 0.002%
Saving annotations...
    Semantic evaluation  User ID  Last comment checked
0

<__main__.Notes at 0x786f9e1edf30>