In [1]:
# Necessary imports
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
df_pos = pd.read_csv(open('clean_positive_train.csv','r'), encoding='utf-8', engine='c')
df_neg = pd.read_csv(open('clean_negative_train.csv','r'), encoding='utf-8', engine='c')

df_pos['text'] = df_pos['text'].astype(str)
df_pos['parent_text'] = df_pos['parent_text'].astype(str)

df_neg['text'] = df_neg['text'].astype(str)
df_neg['parent_text'] = df_neg['parent_text'].astype(str)

In [3]:
df_pos.describe()

Unnamed: 0,score,ups,controversiality,parent_score,parent_ups,parent_controversiality
count,49999.0,49999.0,49999.0,49999.0,49999.0,49999.0
mean,1.0,198.448509,2e-05,0.99024,369.154003,0.0007
std,0.0,256.4982,0.004472,0.098311,530.071252,0.026449
min,1.0,66.0,0.0,0.0,-8907.0,0.0
25%,1.0,83.0,0.0,1.0,84.0,0.0
50%,1.0,116.0,0.0,1.0,185.0,0.0
75%,1.0,201.0,0.0,1.0,419.0,0.0
max,1.0,4865.0,1.0,1.0,9531.0,1.0


In [4]:
df_neg.describe()

Unnamed: 0,score,ups,controversiality,parent_score,parent_ups,parent_controversiality
count,49999.0,49999.0,49999.0,49999.0,49999.0,49999.0
mean,0.0,-14.564351,0.00146,0.912598,66.805736,0.00274
std,0.0,15.274812,0.038183,0.282426,216.581912,0.052274
min,0.0,-634.0,0.0,0.0,-1622.0,0.0
25%,0.0,-15.0,0.0,1.0,6.0,0.0
50%,0.0,-10.0,0.0,1.0,15.0,0.0
75%,0.0,-8.0,0.0,1.0,44.0,0.0
max,0.0,-6.0,1.0,1.0,14776.0,1.0


We now have a a fraction of our dataset cleaned and loaded. The next step is to combine the two datasets, and shuffle them. After that we divide the datasets in a train and test set.

In [5]:
# First we concatenate both lists and shuffle it to scrabble positive and negative
df = pd.concat([df_pos, df_neg])
df = df.sample(frac=1).reset_index(drop=True)
df.dropna(axis=0, inplace=True)

# Second we split our data for training and testing
df['combined'] = df[['text', 'parent_text']].apply(lambda x: ' '.join(x), axis=1)

text_data = df['combined']
text_score = df['score']
# parent_text_data = df['parent_text']
# parent_text_score = df['parent_score']
X_train, X_test, y_train, y_test = train_test_split(text_data,text_score, test_size = 0.20, random_state = 42)

print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

X_train: (79998,)
X_test: (20000,)
y_train: (79998,)
y_test: (20000,)


Now we use a tokenizer which fits all the text in our dataset, it then assigns an integer to each learned word which allows us to convert each entry to a sequence of numbers. These sequences can then be easily passed to our neural network

In [6]:
# We create a tokenizer which will give a word_index integer value to each word
tokenizer = Tokenizer(num_words=10000, lower=True, split=' ', document_count=0)

# Create the word_index list based on all our data
tokenizer.fit_on_texts(text_data)

# Now we make a list of sequences of integers based on our texts
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

Because each sentence has a different length and we want to pass the same length vector to our neural network every time, we pad them adding zeros at the end of each sequence so each is 128 integers long.

In [7]:
train_data = keras.preprocessing.sequence.pad_sequences(X_train_seq,
                                                        value=0,
                                                        padding='post',
                                                        maxlen=128)

test_data = keras.preprocessing.sequence.pad_sequences(X_test_seq,
                                                        value=0,
                                                        padding='post',
                                                        maxlen=128)

In [8]:
print(len(train_data[0]), len(train_data[1]))

128 128


## Neural networks!

Now we can start to build our neural network with TensorFlow. First we create an embedding layer which turns positive integers (indexes) into dense vectors of fixed size. 
After that we use a GlobalAveragePooling layer which averages all input, this is needed because we feed vectors that contain a lot of zeros, otherwise a lot of neurons will never fire. 
Then have 2 deep learning layers and in the end one node which will return whether the text that is passed will get a positive or negative score. 

In [9]:
vocab_size = len(tokenizer.word_index) + 1

model = tf.keras.models.Sequential()
model.add(keras.layers.Embedding(vocab_size, 16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation=tf.nn.relu))
model.add(keras.layers.Dense(8, activation=tf.nn.relu))
model.add(keras.layers.Dense(4, activation=tf.nn.relu))
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          2237840   
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 36        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 5         
Total params: 2,238,289
Trainable params: 2,238,289
Non-trainable para

In [10]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])

In [11]:
history = model.fit(train_data,
                    y_train,
                    epochs=40,
                    batch_size=512,
                    validation_data=(test_data, y_test),
                    verbose=1)

Train on 79998 samples, validate on 20000 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


As we can see our neural network achieves an accuracy of about 85% on our training data and 64% on our test data. Also notice that the accuray of the validation set goes down while that of the training goes up, meaning that we're overfitting our network.