In [1]:
# Necessary imports
import nltk
import glob
import scipy
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz
from sklearn.preprocessing import normalize
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [102]:
pos_files = glob.glob("data/clean_positive_train_*.csv")
neg_files = glob.glob("data/clean_negative_train_*.csv")

df_pos_list = [pd.read_csv(open(fp, 'r'), encoding='utf-8', engine='c') for fp in pos_files[:10]]
df_neg_list = [pd.read_csv(open(fp, 'r'), encoding='utf-8', engine='c') for fp in neg_files[:10]]

Now we make features for each dataset that we have, calculating all these features takes a long long time...

In [103]:
df = [pd.concat([df_pos, df_neg]) for (df_pos, df_neg) in zip(df_pos_list, df_neg_list)]
print(len(df))

# Now we have scrambeled dataframes of 20000 entries, with features such as binary scores and profanity
df[0].describe()

10


Unnamed: 0,score,ups,controversiality,parent_score,parent_ups,parent_controversiality,sentiment,profanity,profanity_prob
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,0.5,32.5,0.001,0.94055,123.43795,0.0021,0.036857,0.14445,0.2213078
std,0.500013,41.501038,0.031608,0.236471,273.303875,0.045779,0.274485,0.351554,0.2556282
min,0.0,-9.0,0.0,0.0,-342.0,0.0,-1.0,0.0,1.092762e-20
25%,0.0,-9.0,0.0,1.0,10.0,0.0,-0.025,0.0,0.07562152
50%,0.5,32.5,0.0,1.0,42.0,0.0,0.0,0.0,0.1199541
75%,1.0,74.0,0.0,1.0,129.25,0.0,0.15,0.0,0.2229088
max,1.0,74.0,1.0,1.0,5295.0,1.0,1.0,1.0,1.0


In [104]:
df[0].head()

Unnamed: 0,text,score,ups,controversiality,parent_text,parent_score,parent_ups,parent_controversiality,sentiment,profanity,profanity_prob
0,work guys last year confirm brightest lights c...,1,74,0,bad enough lpt never even heard link shortener...,1,137,0,0.0,0,0.02107
1,seems kinda useless unless live bathroom brush...,1,74,0,random button power button tv remote yes prett...,1,441,0,-0.181818,0,0.059085
2,seriously least hitler killed hitler,1,74,0,please dont refer things hitler x bad taste ea...,1,71,0,-0.25,0,0.477267
3,dont worry legitimate hurricane,1,74,0,florida suffering one two shitstorms time,1,391,0,0.0,0,0.205006
4,sir ip adress 127001 hes good,1,74,0,start 127,1,21,0,0.7,0,0.192113


In [105]:
tokenizer = Tokenizer(num_words=10000, lower=True, split=' ', document_count=0)
# Create the word_index list based on all our data",
text_data = [np.array2string(df[single_df]['text'].values.astype(str)) for single_df in range(len(df))]
text_data = ' '.join(text_data)
tokenizer.fit_on_texts(text_data)

In [155]:
feature_data = [np.array([])]
score_data = [np.array([])]

X_train = []
X_test = []
y_train = []
y_test = []

# Iterate over each DataFrame
for n in range(len(df)):
    
    cur_df = df[n]
    print(n)
    df[n].dropna(axis=0, inplace=True)
    size = len(cur_df)
    # Iterate over each row in the DataFrame
    f_data = np.array([])
    for index, row in cur_df.iterrows():
        sentiment = row['sentiment']
        profanity = row['profanity_prob']
        features = np.hstack((sentiment, profanity))
        
        if index == 0:
            f_data = features
            s_data = row['score']
        else:
            f_data = np.vstack([f_data, features])
            s_data = np.vstack([s_data, row['score']])
    if n == 0:
        feature_data[0] = f_data
        score_data[0] = s_data
    else:
        feature_data.append(f_data)
        score_data.append(s_data)
            
    X_train_tmp, X_test_tmp, y_train_tmp, y_test_tmp = train_test_split(feature_data[n], score_data[n], test_size=0.20, random_state=42)
    X_train.append(X_train_tmp)
    X_test.append(X_test_tmp)
    y_train.append(y_train_tmp)
    y_test.append(y_test_tmp)

    X_train[n] = tf.keras.utils.normalize(X_train[n], axis=1)
    X_test[n] = tf.keras.utils.normalize(X_test[n], axis=1)



0
1
2
3
4
5
6
7
8
9


In [153]:
for n in range(0,4):
    print(feature_data[n].shape)
    print(score_data[n].shape)

(9851, 2)
(9851, 1)
(9852, 2)
(9852, 1)
(9827, 2)
(9827, 1)
(9821, 2)
(9821, 1)


## Finally we can do some neural networks!

In [91]:
model = tf.keras.models.Sequential()

model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(24, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(16, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid))

# model.summary()

In [92]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])

In [160]:
test_number = 4

train_data = X_train[test_number]
test_data = X_test[test_number]

history = model.fit(train_data,
                    y_train[test_number],
                    epochs=40,
                    batch_size=512,
                    validation_data=(test_data, y_test[test_number]),
                    verbose=1)

Train on 7880 samples, validate on 1970 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
