In [None]:
import itertools
import os

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn as skl
from keras.layers import Dense, LSTM, Bidirectional
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.models import model_from_json
from keras.preprocessing import sequence
from sklearn.metrics import hamming_loss

# Data loading

In [None]:
# Label columns of the dataset; this order is also the order of the label vectors.
categories = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

We will not need the `id` column, so we drop it, and then extract the labels for each example.

In [None]:
# Read the training set and discard the `id` column, which is not used below.
train = pd.read_csv('data/train.csv')
train = train.drop('id', axis=1)
# One list of label values per row, ordered like `categories`.
labels = train.loc[:, categories].values.tolist()

In [None]:
# Preview only a handful of rows: a preview longer than pandas' `display.max_rows`
# is elided in the rendered output and only bloats the saved notebook.
train.head(10)

In the data, safe comments are those for which all the category columns are 0. To make this more readable, we will add another `safe` column that is 1 when all the other columns are 0. (TODO: this column is not yet added by the cells shown here.)

# Preprocessing

Let's build our vocabulary

* Split each sentence into tokens where a token is a word in the sentence  
* Use nltk to get the frequency of each word in the corpus  
* Get the `vocabSize` most common words  
* Encode each word as an integer `i`, where `i` is the index of that word in our vocab (index 0 is reserved for unknown words)

In [None]:
# Maximum vocabulary size: `preprocess` keeps the most frequent words and maps
# every other word to index 0 (the unknown-word slot).
vocabSize = 20_000

Wrap the preprocessing in a function so it can be reused for future testing.

In [None]:
def preprocess(df, maxSentLength=500, maxVocabSize=None):
    """Tokenize the comments in ``df`` and encode them as padded integer sequences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``comment_text`` column.
    maxSentLength : int
        Length every encoded sentence is padded or truncated to
        (passed to ``pad_sequences`` as ``maxlen``).
    maxVocabSize : int or None
        Vocabulary size, including the unknown-word slot at index 0.
        When ``None`` (the default) the notebook-level ``vocabSize`` is used.

    Returns
    -------
    dict
        ``'sentences'``: the raw comments, one single-element list per row;
        ``'tokenizedSentences'``: the padded integer array from ``pad_sequences``;
        ``'wordFrequencies'``: the ``nltk.FreqDist`` over all tokens;
        ``'vocab'``: the ``(word, count)`` pairs kept in the vocabulary;
        ``'indexToWord'``: list mapping index -> word;
        ``'wordToIndex'``: dict mapping word -> index.
    """
    if maxVocabSize is None:
        maxVocabSize = vocabSize

    sentences = df[['comment_text']].values.tolist()

    # split() with no argument splits on any whitespace run, so newlines/tabs
    # separate words and repeated spaces do not yield empty-string tokens.
    # Non-string cells (e.g. NaN for a missing comment) contribute no tokens.
    tokenizedSentences = [
        sent[0].split() if isinstance(sent[0], str) else []
        for sent in sentences
    ]

    wordFrequencies = nltk.FreqDist(itertools.chain.from_iterable(tokenizedSentences))
    # One slot is reserved for the unknown-word placeholder at index 0.
    vocab = wordFrequencies.most_common(maxVocabSize - 1)
    indexToWord = ['unkown'] + [word for word, _ in vocab]
    wordToIndex = {w: i for i, w in enumerate(indexToWord)}

    # Out-of-vocabulary words are encoded as 0.
    encodedSentences = [
        [wordToIndex.get(w, 0) for w in sent]
        for sent in tokenizedSentences
    ]

    # NOTE: pad_sequences pads with 0 by default, so padding and unknown words
    # share index 0.
    tokenizedSentences = sequence.pad_sequences(encodedSentences, maxlen=maxSentLength)

    preprocessedValues = {
        'sentences': sentences,
        'tokenizedSentences': tokenizedSentences,
        'wordFrequencies': wordFrequencies,
        'vocab': vocab,
        'indexToWord': indexToWord,
        'wordToIndex': wordToIndex
    }

    return preprocessedValues

# Modelling

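As a simplification, the problem could be reduced to binary classification (safe vs. not safe). Note that the model below actually predicts all six categories independently (multi-label classification with one sigmoid output per category).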

### hyperparameters and data splitting

You will also notice that, as another preprocessing step, every sentence is padded or truncated to `maxSentLength` tokens (200 here; `preprocess` defaults to 500).

In [None]:
# Hyperparameters.
maxSentLength = 200
embeddingSize = 128

# Encode the whole training frame as fixed-length integer sequences.
processedData = preprocess(train, maxSentLength=maxSentLength)
tokenizedSentences = processedData['tokenizedSentences']

# Positional split: the first 80% of the rows train the model, the rest evaluate it.
N = len(tokenizedSentences)
split = int(.80 * N)

xTrain, xTest = tokenizedSentences[:split], tokenizedSentences[split:]
yTrain, yTest = np.array(labels[:split]), np.array(labels[split:])

## model and training

In [None]:
# Embedding -> LSTM -> one sigmoid output per category.
C = len(categories)
lstmModel = Sequential([
    Embedding(vocabSize, embeddingSize, input_length=maxSentLength),
    LSTM(200),
    Dense(C, activation='sigmoid'),
])
lstmModel.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
lstmModel.summary()

In [None]:
# Train for a single epoch, validating on the held-out split.
lstmModel.fit(
    xTrain,
    yTrain,
    batch_size=256,
    epochs=1,
    validation_data=(xTest, yTest),
)

### save the model

In [None]:
# Make sure the output directory exists: open() does not create missing
# directories, so this cell would otherwise fail on a fresh checkout.
os.makedirs('./models', exist_ok=True)

# Serialize the model architecture to JSON.
model_json = lstmModel.to_json()
with open('./models/lstm_final.json', 'w') as json_file:
    json_file.write(model_json)
# Serialize the weights to HDF5.
lstmModel.save_weights('./models/lstm_final.h5')

Load the saved model

In [None]:
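# NOTE: `loaded_model` is not created by the cells shown here; rebuild it first
# (e.g. model_from_json(...) on the saved JSON, then load_weights(...)) before uncommenting.
# loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])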

In [None]:
# loaded_model.evaluate(xTest, yTest)