In [1]:
# Necessary imports
import nltk
import numpy as np
import pandas as pd
import tensorflow as ts
import matplotlib.pyplot as plt

In [2]:
# Reading in the positive voted comments and removing NaN rows
df_pos_full = pd.read_csv('comments_positive.csv')
print('Originial', df_pos_full.shape)
df_pos_full.dropna(axis=0, inplace=True)
print('NaN removed',df_pos_full.shape)

# Reading in the negative voted comments and removing NaN rows
df_neg_full = pd.read_csv('comments_negative.csv')
print('Originial', df_neg_full.shape)
df_neg_full.dropna(axis=0, inplace=True)
print('NaN removed',df_neg_full.shape)

Originial (2000000, 15)
NaN removed (1999977, 15)
Originial (2000000, 15)
NaN removed (1999951, 15)


In [3]:
df_pos_full.head()

Unnamed: 0,id,parent_id,subreddit_id,link_id,text,score,ups,author,controversiality,parent_link_id,parent_text,parent_score,parent_ups,parent_author,parent_controversiality
0,c092j8m,t1_c092gss,t5_2qh2p,t3_8eyy3,This isn't Twitter: try to comment on the arti...,9582,9582,nraustinii,0,t3_8eyy3,Fucking faggot.,-7526,-7526,Glorificus,0
1,c4imcva,t1_c4im948,t5_2qh1i,t3_t0ynr,"Well, it is exactly what it sounds like. It's ...",9531,9531,Lynfect,0,t3_t0ynr,"Elaborate on this cum box, please.",3841,3841,eeeeevil,0
2,c0s4nfi,t1_c0s4lje,t5_2qh1i,t3_cf1n2,"In soviet Russia, bomb disarms you!",8545,8545,CapnScumbone,0,t3_cf1n2,"I don't live in Russia anymore, and I will not...",621,621,shady8x,0
3,c4ini33,t1_c4incln,t5_2qh1i,t3_t0ynr,"""runin for senitur! #YOLO!""",7430,7430,[deleted],0,t3_t0ynr,This just made me realize that future presiden...,4651,4651,drspg99,0
4,c4imgel,t1_c4ima2e,t5_2qh1i,t3_t0ynr,You step motherfucker.,7173,7173,jbg89,0,t3_t0ynr,I have sex with my step mom when my dad isn't ...,4251,4251,audir8,0


In [4]:
df_neg_full.head()

Unnamed: 0,id,parent_id,subreddit_id,link_id,text,score,ups,author,controversiality,parent_link_id,parent_text,parent_score,parent_ups,parent_author,parent_controversiality
0,c0a2d2p,t1_c0a2cn1,t5_1a8ah,t3_8pr4w,"Na, not really. \n\nI just hate islam and ever...",-2946,-2946,b34nz,0,t3_8pr4w,What goes through the heads of you people? Is...,459,459,[deleted],0
1,c6okok8,t1_c6oaywb,t5_2s8e9,t3_11otij,lol you're some ugly ass white dude,-2724,-2724,letmetellyouhowitis,0,t3_11otij,This is worth noting.\n\nThankfully I don't.,72,72,flowen65,0
2,c3nlalf,t1_c3nijr7,t5_2qzb6,t3_p9a1v,"First of off, its not true, and second off, I ...",-2132,-2132,iamwoodyharrelson,0,t3_p9a1v,I swear this is *(allegedly)* a true story. I...,4028,4028,AndyRooney,0
3,c10nh8q,t1_c10nc34,t5_6,t3_djasj,Who made you reddit police? I will submit what...,-2117,-2117,JimmyJamesincorp,0,t3_djasj,I was on my way over here to bitch and moan ab...,1214,1214,SloaneRanger,0
4,c3nlufk,t1_c3nlcob,t5_2qzb6,t3_p9a1v,We gotta be...i consider my time valuable.,-1962,-1962,iamwoodyharrelson,0,t3_p9a1v,Should change this AMA to AMAAR (Ask Me Anythi...,1405,1405,bersh,0


In [5]:
# Our dataset consists of two times 2 million comments (rows) so we split it for easier experimenting. 
# 1% of 2 million = 20000
df_pos = df_pos_full.sample(frac=0.01,random_state=200)
df_neg = df_neg_full.sample(frac=0.01,random_state=200)

print(df_pos.shape)
print(df_neg.shape)

(20000, 15)
(20000, 15)


As we can see in our dataset, the most important columns for our language processing are `text` and `parent_text`

So let's get some statistics about these columns!

In [6]:
num_words_pos = df_pos['text'].apply(lambda x: len(x.split()))
num_words_neg = df_neg['text'].apply(lambda x: len(x.split()))

pos_words_mean, pos_words_std = np.mean(num_words_pos), np.std(num_words_pos)
neg_words_mean, neg_words_std = np.mean(num_words_neg), np.std(num_words_neg)

print("Positive stats:", pos_words_mean, pos_words_std)
print("Negative stats:", neg_words_mean, neg_words_std)

Positive stats: 27.37375 53.97812483717406
Negative stats: 32.42995 50.572425223608256


In [7]:
def clean(text, stemming=False, stop_words=False):
    import re
    from string import punctuation
    from nltk.stem import SnowballStemmer
    from nltk.corpus import stopwords
    from nltk import word_tokenize
    
    stops = stopwords.words('english')
    
    # Empty comment
    if type(text) != str or text=='':
        return ''
    
    # Commence the cleaning!
    text = re.sub("\'re", " are", text)
    text = re.sub("\'ve", " have", text)
    text = re.sub("\'d", " would", text)
    text = re.sub("cant", "can not", text)
    text = re.sub("can\'t", "can not", text)
    text = re.sub("isn\'t", "is not", text)
    text = re.sub("isnt", "is not", text)
    text = re.sub("whats", "what is", text)
    text = re.sub("what\'s", "what is", text)
    text = re.sub("shouldn't", "should not", text, flags=re.IGNORECASE)
    text = re.sub("I'm", "I am", text)
    # The comments contain \n for line breaks, we need to remove those too
    text = re.sub("\\n", "", text)
    
    # Special characters
    text = re.sub('\&', " and ", text)
    text = re.sub('\$', " dollar ", text)
    text = re.sub('\%', " percent ", text)
    
    # Remove punctuation
    text = ''.join([word for word in text if word not in punctuation]).lower()
    
    # If we want to do stemming...
    if stemming:
        sno = SnowballStemmer('english')
        text = ''.join([sno.stem[word] for word in text])
    
    # If we want to remove stop words...
    if stop_words:
        text = text.split()
        text = [word for word in text if word not in stopw]
        text = ' '.join(text)
    
    return text
    

In [8]:
df_pos['text_c'] = df_pos['text'].apply(clean)

In [9]:
df_pos['parent_text_c'] = df_pos['parent_text'].apply(clean)

In [10]:
df_neg['text_c'] = df_neg['text'].apply(clean)

In [11]:
df_neg['parent_text_c'] = df_neg['parent_text'].apply(clean)

In [12]:
df_pos.drop(['id','parent_id','subreddit_id','link_id','parent_link_id'], axis=1)
df_neg.drop(['id','parent_id','subreddit_id','link_id','parent_link_id'], axis=1)


Unnamed: 0,text,score,ups,author,controversiality,parent_text,parent_score,parent_ups,parent_author,parent_controversiality,text_c,parent_text_c
1188435,"no, i didn't know what they meant, actually, t...",-9,-9,nowatermelonnokfc,0,"No, it's a case of semantics vs pragmatics.\n\...",13,13,NixonsGhost,0,no i didnt know what they meant actually thats...,no its a case of semantics vs pragmaticssemant...
1803937,As a human being it saddens me that a group of...,-7,-7,mightyneonfraa,0,-.- As a Christian it saddens me that people v...,-13,-13,kubabubba,0,as a human being it saddens me that a group of...,as a christian it saddens me that people view...
1275549,Arguably: MineCraft.,-9,-9,TheRonMan,0,&gt;No save point RPG\n\nThey're called roguel...,111,111,Astral98,0,arguably minecraft,and gtno save point rpgthey are called roguel...
728533,"A couple more words: Become active, healthy, n...",-12,-12,[deleted],0,Three words : Roll One Up,12,12,tullypimp,0,a couple more words become active healthy norm...,three words roll one up
774262,"No, the POINT is that you're getting the actua...",-12,-12,realgenius,0,The point is that you're watching the performa...,23,23,victorria,0,no the point is that you are getting the actua...,the point is that you are watching the perform...
1193658,You took the quote out of context making it lo...,-9,-9,QueerCoup,0,Your distorting of the truth is sickening and ...,22,22,dbzer0,0,you took the quote out of context making it lo...,your distorting of the truth is sickening and ...
1211641,post replyare the murders of Amy Mihaljevic an...,-9,-9,redpillneo,0,Nice!,-2,-2,JamesRenner,0,post replyare the murders of amy mihaljevic an...,nice
1280424,"&gt;*The old ""red"" and ""terrorist apologist"" s...",-9,-9,umbama,0,"The old ""red"" and ""terrorist apologist"" smear....",8,8,erikbra81,0,and gtthe old red and terrorist apologist sme...,the old red and terrorist apologist smear he a...
1019261,"Sugar is at least as dangerous, metabolically,...",-10,-10,mantra,0,That person is an idiot. \n\nLet's try to get ...,20,20,mightycow,0,sugar is at least as dangerous metabolically a...,that person is an idiot lets try to get water ...
300594,....JUST ANOTHER MOMENT.....,-20,-20,[deleted],0,SPEEDYHANDEDOVERLORDS!,85,85,[deleted],0,just another moment,speedyhandedoverlords


In [13]:
df_pos.to_csv('clean_positive_train.csv')
df_neg.to_csv('clean_negative_train.csv')