In [1]:
# Necessary imports
import nltk
import numpy as np
import pandas as pd
import tensorflow as ts
import matplotlib.pyplot as plt

In [2]:
def _read_without_nan(csv_name):
    """Read `csv_name`, report its shape before and after dropping NaN rows.

    Returns the DataFrame with every row that contains at least one NaN removed.
    """
    frame = pd.read_csv(csv_name)
    print('Originial', frame.shape)
    frame = frame.dropna(axis=0)
    print('NaN removed', frame.shape)
    return frame


# Positive voted comments first, then the negative voted ones (NaN rows removed in both)
df_pos_full = _read_without_nan('comments_positive.csv')
df_neg_full = _read_without_nan('comments_negative.csv')

Originial (2000000, 15)
NaN removed (1999977, 15)
Originial (2000000, 15)
NaN removed (1999951, 15)


In [3]:
# df_pos_full.head()

In [4]:
# df_neg_full.head()

In [5]:
# Our dataset consists of two sets of 2 million comments (rows), so we sample a small fraction for easier experimenting.
# 1% of 2 million = 20000; note that frac=0.025 below samples 2.5%, i.e. 50000 rows per set.
# df_pos = df_pos_full.sample(frac=0.025,random_state=200)
# df_neg = df_neg_full.sample(frac=0.025,random_state=200)
# df_pos = df_pos.reset_index(drop=True)
# df_neg = df_neg.reset_index(drop=True)

# print(df_pos.shape)
# print(df_neg.shape)

We're splitting the data into 200 separate pieces of roughly 10,000 entries each

In [13]:
def _split_rows(frame, sections):
    """Split `frame` row-wise into `sections` consecutive, near-equal pieces.

    Produces the same partition as ``np.array_split(frame, sections)``: the
    first ``len(frame) % sections`` pieces get one extra row. Slicing with
    ``iloc`` directly avoids ``np.array_split``'s detour through
    ``DataFrame.swapaxes``, which is deprecated in recent pandas releases.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to split; it is not modified.
    sections : int
        Number of pieces to produce.

    Returns
    -------
    list of pandas.DataFrame
        `sections` row slices of `frame`, in original row order.
    """
    base, extra = divmod(len(frame), sections)
    sizes = [base + 1] * extra + [base] * (sections - extra)
    # Cumulative sizes give the start/stop row position of every piece
    bounds = np.cumsum([0] + sizes)
    return [frame.iloc[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]


df_pos_split = _split_rows(df_pos_full, 200)
df_neg_split = _split_rows(df_neg_full, 200)

As we can see in our dataset, the most important columns for our language processing are `text` and `parent_text`

So let's get some statistics about these columns!

In [7]:
# num_words_pos = df_pos['text'].apply(lambda x: len(x.split()))
# num_words_neg = df_neg['text'].apply(lambda x: len(x.split()))

# pos_words_mean, pos_words_std = np.mean(num_words_pos), np.std(num_words_pos)
# neg_words_mean, neg_words_std = np.mean(num_words_neg), np.std(num_words_neg)

# print("Positive stats:", pos_words_mean, pos_words_std)
# print("Negative stats:", neg_words_mean, neg_words_std)

In [None]:
def clean(text, stemming=False, stop_words=True):
    """Normalise a raw comment into lower-case, punctuation-free text.

    Steps, in order: expand a handful of English contractions, replace
    ':' and line breaks by spaces, spell out '&', '$' and '%', strip all
    ASCII punctuation, lower-case, then optionally stem and optionally
    remove stop words.

    Parameters
    ----------
    text : str
        Raw comment. Anything that is not a non-empty ``str`` (e.g. NaN or
        None) yields ``''``.
    stemming : bool, default False
        If True, replace every word by its Snowball (English) stem.
    stop_words : bool, default True
        If True, REMOVE NLTK's English stop words from the result.

    Returns
    -------
    str
        The cleaned text. When stemming or stop-word removal is active the
        words are re-joined with single spaces.
    """
    import re
    from string import punctuation

    # Empty comment
    if not isinstance(text, str) or text == '':
        return ''

    # Commence the cleaning!
    # Word boundaries keep the patterns from firing inside longer words
    # ("significant" contains "cant") or on opening quotes ("'dog'").
    # Matching ignores case because the text is lower-cased further down.
    contractions = (
        (r"(?<=\w)'re\b", " are"),
        (r"(?<=\w)'ve\b", " have"),
        (r"(?<=\w)'d\b", " would"),
        (r"\bcan'?t\b", "can not"),
        (r"\bisn'?t\b", "is not"),
        (r"\bwhat'?s\b", "what is"),
        (r"\bshouldn't\b", "should not"),
        (r"\bI'm\b", "I am"),
    )
    for pattern, replacement in contractions:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

    # Colons and line breaks become spaces; special characters are spelled out
    literal_swaps = (
        (":", " "),
        ("\n", " "),
        ("&", " and "),
        ("$", " dollar "),
        ("%", " percent "),
    )
    for symbol, substitute in literal_swaps:
        text = text.replace(symbol, substitute)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', punctuation)).lower()

    # If we want to do stemming...
    if stemming:
        # Imported lazily so NLTK is only required when stemming is requested
        from nltk.stem import SnowballStemmer
        sno = SnowballStemmer('english')
        text = ' '.join(sno.stem(word) for word in text.split())

    # If we want to remove stop words...
    if stop_words:
        # Load the stop-word list once and keep it on the function object:
        # reading the corpus on every call is wasteful when the function is
        # applied to millions of rows, and a frozenset gives O(1) lookups.
        stops = getattr(clean, '_stops', None)
        if stops is None:
            from nltk.corpus import stopwords
            stops = clean._stops = frozenset(stopwords.words('english'))
        text = ' '.join(word for word in text.split() if word not in stops)

    return text
    

That was the first half of a million

In [25]:
from pathlib import Path

# We don't need all columns
col_write = ['text', 'score', 'ups', 'controversiality', 'parent_text', 'parent_score', 'parent_ups', 'parent_controversiality']
# Save everything in the 'data' folder; create it first so to_csv does not
# fail on a fresh checkout where the folder does not exist yet
p = Path('data/')
p.mkdir(parents=True, exist_ok=True)


def _clean_chunk(chunk):
    """Return a new frame with 'text' and 'parent_text' cleaned and NaN rows dropped.

    `assign` builds a fresh DataFrame instead of writing into `chunk`, which
    may be a slice of a larger frame (writing into a slice triggers pandas'
    SettingWithCopyWarning).
    """
    cleaned = chunk.assign(
        text=chunk['text'].apply(clean),
        parent_text=chunk['parent_text'].apply(clean),
    )
    # NOTE(review): `clean` returns '' rather than NaN for empty comments, so
    # this dropna does not remove comments that became empty — confirm intent.
    return cleaned.dropna(axis=0)


for n, (pos_chunk, neg_chunk) in enumerate(zip(df_pos_split, df_neg_split)):
    # Clean the text and store the result back so later cells see the cleaned chunks
    df_pos_split[n] = _clean_chunk(pos_chunk)
    df_neg_split[n] = _clean_chunk(neg_chunk)

    number = str(n)
    df_pos_split[n].to_csv(Path(p, 'clean_positive_train_' + number + '.csv'), columns=col_write, index=False)
    df_neg_split[n].to_csv(Path(p, 'clean_negative_train_' + number + '.csv'), columns=col_write, index=False)

Now we have 200 clean files each for the positive and the negative comments (400 in total), which we can feed to our neural network piece by piece so that it keeps getting better