In [26]:
# Necessary imports
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [32]:
df_pos = pd.read_csv(open('clean_positive_train.csv','r'), encoding='utf-8', engine='c')
df_neg = pd.read_csv(open('clean_negative_train.csv','r'), encoding='utf-8', engine='c')

df_pos['text'] = df_pos['text'].astype(str)
df_pos['parent_text'] = df_pos['parent_text'].astype(str)

df_neg['text'] = df_neg['text'].astype(str)
df_neg['parent_text'] = df_neg['parent_text'].astype(str)

In [3]:
# df_pos.describe()

Unnamed: 0,score,ups,controversiality,parent_score,parent_ups,parent_controversiality
count,49999.0,49999.0,49999.0,49999.0,49999.0,49999.0
mean,198.448509,198.448509,2e-05,369.154003,369.154003,0.0007
std,256.4982,256.4982,0.004472,530.071252,530.071252,0.026449
min,66.0,66.0,0.0,-8907.0,-8907.0,0.0
25%,83.0,83.0,0.0,84.0,84.0,0.0
50%,116.0,116.0,0.0,185.0,185.0,0.0
75%,201.0,201.0,0.0,419.0,419.0,0.0
max,4865.0,4865.0,1.0,9531.0,9531.0,1.0


In [4]:
# df_neg.describe()

Unnamed: 0,score,ups,controversiality,parent_score,parent_ups,parent_controversiality
count,49999.0,49999.0,49999.0,49999.0,49999.0,49999.0
mean,-14.564351,-14.564351,0.00146,66.805736,66.805736,0.00274
std,15.274812,15.274812,0.038183,216.581912,216.581912,0.052274
min,-634.0,-634.0,0.0,-1622.0,-1622.0,0.0
25%,-15.0,-15.0,0.0,6.0,6.0,0.0
50%,-10.0,-10.0,0.0,15.0,15.0,0.0
75%,-8.0,-8.0,0.0,44.0,44.0,0.0
max,-6.0,-6.0,1.0,14776.0,14776.0,1.0


Now that we have a small part of clean text we can start to learn the texts to our machine

In [34]:
# First we concatenate both lists and shuffle it to scrabble positive and negative
df = pd.concat([df_pos, df_neg])
df = df.sample(frac=1).reset_index(drop=True)
df.dropna(axis=0, inplace=True)

# Second we split our data for training and testing
text_data = df['text']
text_score = df['score']
# parent_text_data = df['parent_text']
# parent_text_score = df['parent_score']
X_train, X_test, y_train, y_test = train_test_split(text_data,text_score, test_size = 0.20, random_state = 42)

print(min(y_train))
print(max(y_train))
print(y_train.shape)


def posneg(number):
    if number > 0:
        return 1
    else:
        return 0
    
y_train = y_train.apply(posneg)
y_test = y_test.apply(posneg)

-634
4865
(79998,)


In [35]:
# We create a tokenizer which will give a word_index integer value to each word
tokenizer = Tokenizer(num_words=10000, lower=True, split=' ', document_count=0)

# Create the word_index list based on all our data
tokenizer.fit_on_texts(text_data)

# Now we make a list of sequences of integers based on our texts
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [36]:
train_data = keras.preprocessing.sequence.pad_sequences(X_train_seq,
                                                        value=0,
                                                        padding='post',
                                                        maxlen=128)

test_data = keras.preprocessing.sequence.pad_sequences(X_test_seq,
                                                        value=0,
                                                        padding='post',
                                                        maxlen=128)

In [8]:
print(len(train_data[0]), len(train_data[1]))
print(max(train_data, key=len))
print(max(test_data, key=len))
print(len(tokenizer.word_index) + 1)

128 128
[7462  210  368 5123   21  287   44  176    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
[ 122   26  192   90  752 9054  154 1367   25  262  243 1437    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0  

## Finally we can do some neural networks!

In [9]:
vocab_size = len(tokenizer.word_index) + 1


model = tf.keras.models.Sequential()
model.add(keras.layers.Embedding(vocab_size, 16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation=tf.nn.relu))
model.add(keras.layers.Dense(24, activation=tf.nn.relu))
model.add(keras.layers.Dense(16, activation=tf.nn.relu))
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          1468352   
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 24)                408       
_________________________________________________________________
dense_2 (Dense)              (None, 16)                400       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 1,469,449
Trainable params: 1,469,449
Non-trainable para

In [10]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])

In [37]:
history = model.fit(train_data,
                    y_train,
                    epochs=40,
                    batch_size=512,
                    validation_data=(test_data, y_test),
                    verbose=1)

Train on 79998 samples, validate on 20000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
