In [78]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn as skl
import nltk
import itertools
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.models import model_from_json

# Data loading

In [59]:
# Label columns; 'safe' comes first so that categories[1:] is exactly the set
# of toxicity labels.
categories = [
    'safe',
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

We will not need the `id` column, so we delete it.

In [60]:
# Read the training set and discard the 'id' column in the same expression.
train = pd.read_csv('data/train.csv').drop('id', axis=1)

In [61]:
# Preview the first ten rows to check the load (the 'id' column should be gone).
train.head(10)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,"""\r\n\r\nCongratulations from me as well, use ...",0,0,0,0,0,0
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,alignment on this subject and which are contra...,0,0,0,0,0,0


In the data, safe comments are those for which all the category columns are 0. To make this more readable, we add a `safe` column that is 1 when all the other category columns are 0.

In [62]:
# A comment is "safe" when none of the toxicity labels is set. categories[1:]
# skips 'safe' itself, which does not exist as a column yet.
toxicityColumns = categories[1:]
noLabelSet = (train[toxicityColumns] == 0).all(axis=1)
train['safe'] = np.where(noLabelSet, 1, 0)

In [63]:
# Preview again: the new 'safe' column should now appear as the last column.
train.head(10)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,safe
0,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0,1
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,1
3,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0,1
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,1
5,"""\r\n\r\nCongratulations from me as well, use ...",0,0,0,0,0,0,1
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,0
7,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0,1
8,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0,1
9,alignment on this subject and which are contra...,0,0,0,0,0,0,1


# Preprocessing

Let's build our vocabulary

* Split each sentence into tokens where a token is a word in the sentence  
* Use nltk to get the frequency of each word in the corpus  
* Get the `vocabSize` most common words  
* Encode words as integers, where `i` is the index of the i-th word in our vocab

In [64]:
# Size of the integer vocabulary: the (vocabSize - 1) most frequent tokens are
# kept and index 0 is reserved for every out-of-vocabulary word.
# NOTE(review): the original note said "for now use all of the vocab" — confirm
# that 50000 really exceeds the number of distinct tokens in the corpus;
# otherwise the rarer tokens are mapped to the unknown index.
vocabSize = 50000

In [65]:
# Each entry of `sentences` is a one-element list wrapping the raw comment
# string (the tokenizing cell reads it as sent[0]); `labels` holds the matching
# binary 'safe' flag for every comment.
sentences = [[comment] for comment in train['comment_text'].tolist()]
labels = train['safe'].tolist()

In [66]:
# Maximum number of tokens kept per comment. It has to be defined before the
# tokenizing step below uses it; previously it was only assigned in the
# hyperparameters cell further down, so a fresh "Restart & Run All" failed here
# with a NameError. Keep this value identical to the one used for padding.
maxSentLength = 500

# Split every comment on single spaces and keep at most maxSentLength tokens.
tokenizedSentences = [sent[0].split(' ')[:maxSentLength] for sent in sentences]

# Count token occurrences over the whole corpus. chain.from_iterable walks the
# outer list lazily instead of unpacking every sentence into a call argument.
wordFrequencies = nltk.FreqDist(itertools.chain.from_iterable(tokenizedSentences))

# Keep the (vocabSize - 1) most frequent tokens; one slot stays free for the
# unknown-word entry that is prepended in the next cell.
vocab = wordFrequencies.most_common(vocabSize - 1)

Replace words that are not in our vocab with an "unknown" token; we'll use index 0 for unknown words.

In [67]:
# Position 0 is the placeholder for out-of-vocabulary words; the kept tokens
# follow in descending frequency order (vocab holds (token, count) pairs).
indexToWord = ['unkown']
indexToWord.extend(token for token, _count in vocab)

# Reverse lookup: token -> integer id.
wordToIndex = {word: idx for idx, word in enumerate(indexToWord)}

In [68]:
# Replace every token by its integer id, in place; tokens missing from the
# vocabulary fall back to 0, the unknown-word id.
tokenizedSentences[:] = [
    [wordToIndex.get(token, 0) for token in sent]
    for sent in tokenizedSentences
]

In [69]:
# tokenizedSentences[0], labels[0]
# len(wordFrequencies)
# vocab[:10]

# Baseline and Reduction to binary classification

To simplify the problem, we only predict whether a comment is safe or not safe, which reduces the task to binary classification.

### hyperparameters and data splitting

In [70]:
# --- hyperparameters ---
maxSentLength = 500  # every encoded comment is brought to this many token ids
embeddingSize = 32   # width of each learned word vector

# --- train / test split: the first 80% of the rows train, the rest test ---
N = len(tokenizedSentences)
split = int(.80 * N)

# Features: integer sequences brought to a common length of maxSentLength.
xTrain = sequence.pad_sequences(tokenizedSentences[:split], maxlen=maxSentLength)
xTest = sequence.pad_sequences(tokenizedSentences[split:], maxlen=maxSentLength)

# Targets: the binary 'safe' flag, split at the same row.
yTrain = np.array(labels[:split])
yTest = np.array(labels[split:])

## Model and training

In [71]:
# Embedding -> LSTM(100) -> one sigmoid unit, trained against the binary
# 'safe' label.
model = Sequential([
    Embedding(vocabSize, embeddingSize, input_length=maxSentLength),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 32)           1600000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 1,653,301
Trainable params: 1,653,301
Non-trainable params: 0
_________________________________________________________________


In [176]:
# Train for a single epoch; the held-out 20% is passed as validation data.
model.fit(
    xTrain,
    yTrain,
    validation_data=(xTest, yTest),
    epochs=1,
    batch_size=256,
)

Train on 127656 samples, validate on 31915 samples
Epoch 1/1


<keras.callbacks.History at 0x216b94b6a58>

### save the model

In [None]:
# serialize model to JSON
# Persist the network as two files: the architecture as JSON and the weights
# as HDF5.
# NOTE(review): the loading cell further down reads
# './models/vanilla-lstm-model.*', not the paths written here — confirm
# whether the saved files are renamed by hand.
model_json = model.to_json()
with open('./models/model.json', 'w') as architectureFile:
    architectureFile.write(model_json)

model.save_weights('./models/model.h5')

# Model Evaluation on new data

Load the saved model.

In [81]:
# Rebuild the network from its JSON description. The context manager closes
# the file even if reading fails, which the manual open()/close() pair did not.
# NOTE(review): the save cell writes './models/model.json' and
# './models/model.h5', while this cell reads 'vanilla-lstm-model.*' — confirm
# that these files exist and are the intended checkpoint.
with open('./models/vanilla-lstm-model.json', 'r') as modelFile:
    loaded_model_json = modelFile.read()
loaded_model = model_from_json(loaded_model_json)

# Restore the trained weights into the freshly built architecture.
loaded_model.load_weights('./models/vanilla-lstm-model.h5')

In [82]:
# Compile the reloaded model with the same loss, optimizer and metric that
# were used for training, so that it can be evaluated.
loaded_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [83]:
# Score the reloaded model on the held-out split; with the loss and metric
# configured in the compile cell this yields [loss, accuracy].
loaded_model.evaluate(xTest, yTest)



[0.33694515845328499, 0.89895033683221059]