In [1]:
import pandas as pd 
import numpy as np 
from datetime import datetime
import nltk
from nltk.tokenize import word_tokenize
import csv
import re

In [2]:
# Raise both pandas display limits so wide/long frames are shown without truncation.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 10000)

In [3]:
# found on stackoverflow @ https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
def deEmojify(text):
    """
    Return `text` with every run of characters from the listed emoji
    code-point ranges deleted. Characters outside those ranges are untouched.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    emoji_run = "[" + emoji_ranges + "]+"
    return re.sub(emoji_run, r'', text, flags=re.UNICODE)

In [4]:
def reset(df):
    """
    Reset the index of `df` in place to 0..len(df)-1, discarding the old index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to renumber; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, so the call can be used either as a statement
        (`reset(df)`) or as an expression (`df = reset(df)`).
    """
    # reset_index(..., inplace=True) returns None, so returning its result
    # (as the previous version did) always handed None back to the caller.
    df.reset_index(drop=True, inplace=True)
    return df

def remove_comments(df, column, list_to_remove):
    """
    Drop every row whose `column` value contains any of the given terms.

    Matching is case-insensitive (both the cell text and the terms are
    lower-cased for the comparison) and is a plain substring test, so a term
    also matches when it appears inside a longer word. Cells that are not
    strings (e.g. NaN) are never removed. The stored text itself is not altered.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is modified in place.
    column : str
        Name of the column holding the text to search.
    list_to_remove : iterable of str
        Terms whose presence marks a row for removal.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with matching rows dropped and its index reset
        to 0..len(df)-1.
    """
    # Lower-case the terms too; previously a term containing a capital letter
    # could never match because only the cell text was lower-cased.
    needles = [item.lower() for item in list_to_remove]

    # Make labels equal to positions so the positions collected below are
    # valid labels for drop(), even if the incoming index had gaps/duplicates.
    df.reset_index(drop=True, inplace=True)

    def _is_flagged(value):
        # Non-string cells (NaN, numbers) are kept.
        if not isinstance(value, str):
            return False
        lowered = value.lower()
        return any(needle in lowered for needle in needles)

    # One pass over the column instead of one pass per term.
    flagged = [pos for pos, value in enumerate(df[column]) if _is_flagged(value)]
    df.drop(index=flagged, inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

In [5]:
# Load the raw comment export into the working frame.
df = pd.read_csv(filepath_or_buffer='data/3.29.youtube.csv')

In [6]:
# Keep only the columns used below, as an independent copy of the loaded frame.
df = df.loc[
    :,
    ['video_id', 'text', 'likes', 'date',
     'channel_id', 'viewer_rating', 'mentions', 'comment_id'],
].copy()

In [7]:
# Keep the first row for each comment_id, then renumber the rows from 0.
df = (
    df
    .drop_duplicates(subset='comment_id')
    .reset_index(drop=True)
)

In [8]:
# Convert the date column to real datetimes in one vectorized call.
# The raw strings carry a 'T' between date and time and a trailing 'Z'
# (both are stripped here, exactly as before), but the year is now parsed as
# four digits (%Y) instead of chopping the century off and relying on %y.
# The column ends up with a datetime64 dtype rather than Python objects,
# and missing dates become NaT instead of raising.
df['date'] = pd.to_datetime(
    df['date'].str.replace('T', ' ', regex=False).str.replace('Z', '', regex=False),
    format='%Y-%m-%d %H:%M:%S',
)

In [9]:
# Name the axis explicitly: passing it positionally (`drop(label, 1)`) is
# deprecated in pandas 1.3+ and raises a TypeError in pandas 2.
df = df.drop(columns='viewer_rating')

In [10]:
# Discard rows that have no comment text.
df.dropna(subset=['text'], axis='index', inplace=True)

In [11]:
# Renumber rows after the dropna. drop=True discards the old labels instead of
# inserting them as a stray 'index' column (which also made this cell raise
# "cannot insert index, already exists" when run a second time).
df.reset_index(drop=True, inplace=True)

In [12]:
# Lower-case terms; any comment containing one of them is removed below.
# Spelling variants (e.g. 'studder', 'acadamy') are listed as separate search terms.
text = [
    'elmer', 'fudd', 'impediment', 'mawrs', 'retard', 'voiceover',
    'lisp', 'studder', 'accent', 'antichrist', 'sjw', 'dwopping',
    'arthursday', 'bitch', 'junior high', 'shout out', 'shoutout',
    'acadamy', 'middle school', 'high school', 'secondary school',
    'academy',
]

In [13]:
# Remove every comment that contains one of the listed terms.
df = remove_comments(df, column='text', list_to_remove=text)

In [14]:
def clean_string(string):
    """
    Normalize one comment to lower-case ASCII text with punctuation removed.

    Steps, in order:
      1. delete straight and curly apostrophes (so "don't" -> "dont");
      2. turn the listed punctuation characters, newlines and double quotes
         into single spaces (runs of spaces are NOT collapsed);
      3. spell out '&' as 'and';
      4. lower-case the text;
      5. drop every remaining non-ASCII character.

    Parameters
    ----------
    string : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned, ASCII-only, lower-case text.
    """
    for symbol in "'’":
        string = string.replace(symbol, '')
    # The backslash is written as '\\' here; the old '\/' spelling was an
    # invalid escape sequence (a SyntaxWarning on recent Python versions).
    for symbol in "`@#()[]{}<>;_-=+~:,.?!\n^\\/“$":
        string = string.replace(symbol, ' ')
    string = string.replace('"', ' ')
    string = string.replace('&', 'and')
    # Lower-case once, after the replacements, instead of once per symbol.
    string = string.lower()
    # Encoding to ASCII with errors="ignore" removes every non-ASCII character,
    # emoji included, so no separate emoji-stripping pass is needed.
    return string.encode("ascii", "ignore").decode()

# Renumber the rows, then replace each comment with its cleaned form
# (values are passed through str() first, so non-string cells are cleaned too).
reset(df)
df['text'] = [clean_string(str(raw_comment)) for raw_comment in df['text']]

In [15]:
# Drop comments that contain a greeting and are shorter than the greeting plus
# ten characters. The test is a plain substring check, so 'hi' also matches
# inside words such as 'this'.
for greeting in ['hello', 'hi', 'first comment']:
    reset(df)
    length_cap = len(greeting) + 10  # too short to be anything but a greeting to the author
    greeting_rows = [
        position
        for position, comment in enumerate(df['text'])
        if type(comment) == str and greeting in comment and len(comment) < length_cap
    ]
    df.drop(index=greeting_rows, inplace=True)

In [16]:
# Tokenize every comment and gather all tokens into one flat list.
# Apostrophes, square brackets and commas are stripped from each token;
# tokens that end up empty or that start with '//' or 'http' are skipped.
reset(df)
one_big_list = []

for comment in df['text']:
    for token in word_tokenize(str(comment)):
        token = ''.join(char for char in token if char not in "'[],")
        if token and not token.startswith(('//', 'http')):
            one_big_list.append(token)

In [17]:
def count_vectorize(text):
    """
    Count how many times each distinct item occurs in `text`.

    Returns a plain dict mapping every unique item to its number of occurrences.
    """
    tally = dict.fromkeys(set(text), 0)

    for token in text:
        tally[token] = tally[token] + 1

    return tally

# Word -> occurrence count over every token collected from the comments.
vectorized = count_vectorize(text=one_big_list)

In [18]:
# Write one "word,count" row per entry. The `with` block closes the file even
# if writing fails, and newline="" is what the csv module requires so that it
# controls line endings itself (otherwise Windows output gets blank rows).
with open("data/vect_yts.csv", "w", encoding="utf-8", newline="") as a_file:
    writer = csv.writer(a_file)
    writer.writerows(vectorized.items())

In [19]:
# keep_default_na=False stops pandas from turning tokens such as 'nan', 'null'
# or 'na' into missing values; they are ordinary words here.
# The file has no header row, so the first word/count pair is read as the
# column names.
new_df = pd.read_csv('data/vect_yts.csv', keep_default_na=False)

In [20]:
# One-row frame holding the word/count pair that read_csv used as column names.
add_df = pd.DataFrame({
    'word': [new_df.columns[0]],
    'count': [new_df.columns[1]],
})

In [21]:
# Give the two columns their real names, then add back the row held in add_df.
new_df = new_df.rename(columns={new_df.columns[0]:'word', new_df.columns[1]:'count'})
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in 2.0; row labels are kept as-is, exactly as append did.
new_df = pd.concat([new_df, add_df])

In [22]:
# Cast the counts to integers and order the words from most to least frequent.
new_df = (
    new_df
    .astype({'count': int})
    .sort_values(by='count', ascending=False)
)

In [23]:
# Save the sorted word counts (the target path has no file extension).
new_df.to_csv(path_or_buf='data/count_yts')

In [24]:
# Keep the five columns used from here on and rename two of them.
# The old mapping also contained 'post_id' -> 'comment_id'; no 'post_id' column
# can exist after this selection, so that entry never did anything and is gone.
df = (
    df[['text', 'likes', 'channel_id', 'mentions', 'comment_id']]
    .rename(columns={'likes': 'favorite_count', 'channel_id': 'user_id'})
)

In [25]:
# Add a repost_count column, initialised to zero for every row.
df = df.assign(repost_count=0)

In [26]:
# Two-column frame: each distinct comment text and how many times it occurs.
# Naming the index and the value column explicitly gives ['text', 'count'] on
# every pandas version; the old rename relied on pandas 1.x naming the columns
# 'index' and 'text', and produced two 'count' columns on pandas 2.
text_counts = df['text'].value_counts().rename_axis('text').reset_index(name='count')

In [27]:
# Look up each comment's occurrence count with a single mapped join.
# The old loop scanned all of text_counts once per row and wrote a
# one-element Series into a scalar cell.
df['repost_count'] = df['text'].map(text_counts.set_index('text')['count'])

In [28]:
# Save the comments as they stand at this point in the notebook.
df.to_csv(path_or_buf='data/cleaned_comments.csv')

In [29]:
# Subtract one so the count excludes the comment itself.
df['repost_count'] -= 1

In [30]:
# Renumber the rows, then drop every comment of six characters or fewer.
reset(df)
too_short = [
    position
    for position, comment in enumerate(df['text'])
    if len(comment) <= 6
]
df.drop(index=too_short, inplace=True)

In [31]:
# df.drop_duplicates(subset='text').sort_values(by='repost_count', ascending=False).head(100)

Unnamed: 0,text,favorite_count,user_id,mentions,comment_id,repost_count
85320,what about wormholes,0.0,UCpsdzdIgmeZQS0DVyL3aUHw,,UgxJc68bfZ0m4JDkIbl4AaABAg,62
2996,thanks,0.0,UCZFipeZtQM5CKUjx6grh54g,UggssEFpGrmi2HgCoAEC,UggssEFpGrmi2HgCoAEC.8RDg_or5LiI8REero1P_bD,47
131524,davis elementary school georgia,0.0,UCqcNalyQL7AKe6SP6QcYS3Q,,Ugxjh7pwXZ0IHdikBLF4AaABAg,43
131450,st cyril school in calgary alberta canada,2.0,UCgraOxQ7iz1ljUHv2wO0a7A,,Ugz-J2XbYnFHRy0bBcF4AaABAg,40
1815,amazing,0.0,UCIABrwoSmrw80NHoNAH_yUw,,Ugxa9usRdYbS02SE7v94AaABAg,38
131507,kent denver school colorado,0.0,UCgSB8Gwz9c-CObDfV-Iq-mA,,Ugz4iIfTbqtThR6cjWB4AaABAg,34
4954,great video,1.0,UCl-VsIHatLvdvqA5kjdZYWA,,UgxvbFtMzwEI5dleB8t4AaABAg,34
130243,bismarck henning jr high in bismark il go ea...,0.0,UCfm_aiz3IdwYfelhbqkdE1g,,UgyR8Tng1u8rUXeexuZ4AaABAg,33
131322,buckeye jhs in medina ohio we love to watch cn...,0.0,UC5uPONCfGXjiOospychiNbQ,,UgzNx_PhtDILvnhWf6N4AaABAg,27
1195,thank you,0.0,UCGkAOWnvEAdATrNNSy00b3Q,UgxVggWjoRFuXByoD-t4AaABAg,UgxVggWjoRFuXByoD-t4AaABAg.9GiI1V5pAlF9HwlKuX1Z2R,25


In [32]:
#91732
#3609024337 l&i check reissuing phone num 