In [14]:
import itertools
import os

import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from keras.models import model_from_json
from keras.preprocessing import sequence

from text_cnn import TextCNN

# Data loading

In [15]:
# Label columns of the dataset, in the order they are fed to the model.
categories = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

We will not need the `id` column, so we delete it, and then extract the labels for each example.

In [16]:
# Read the training set and discard the `id` column, which is not used as a feature.
train = pd.read_csv('data/train.csv').drop('id', axis=1)

# One list of per-category values per example, in `categories` order.
labels = train[categories].values.tolist()

In [17]:
# Peek at the first few rows only; a short preview keeps the notebook output readable.
train.head(10)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,"""\r\n\r\nCongratulations from me as well, use ...",0,0,0,0,0,0
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,alignment on this subject and which are contra...,0,0,0,0,0,0


In the data, safe comments are those for which all the category columns are 0. To make this more readable, we will add another `safe` column that is 1 when all the other columns are 0.

# Preprocessing

Let's build our vocabulary

* Split each sentence into tokens where a token is a word in the sentence  
* Use nltk to get the frequency of each word in the corpus  
* Get the `vocabSize` most common words  
* Encode words as integers, where `i` is the index of the i-th word in our vocab

In [18]:
# Upper bound on the vocabulary size: `preprocess` keeps the (vocabSize - 1) most
# frequent tokens and reserves one extra slot for unknown words.
# NOTE(review): an earlier comment said this uses all of the vocab; that only holds
# if the corpus has fewer than 20000 distinct tokens -- confirm.
vocabSize = 20000

Put it in a function for future testing

In [19]:
def preprocess(df, maxSentLength=500, vocab_size=None):
    """Tokenize the comments in ``df`` and encode them as padded integer sequences.

    The vocabulary is built from ``df`` itself: index 0 is reserved for the
    out-of-vocabulary placeholder and the most frequent tokens follow, most
    frequent first, for a total of at most ``vocab_size`` entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``comment_text`` column holding the raw comment strings.
    maxSentLength : int, optional
        Passed to ``sequence.pad_sequences`` as ``maxlen``.
    vocab_size : int, optional
        Total vocabulary size, placeholder included. When omitted, the
        notebook-level ``vocabSize`` is used (the previous behaviour).

    Returns
    -------
    dict
        ``sentences`` (list of one-element lists with the raw text),
        ``tokenizedSentences`` (the result of ``pad_sequences``),
        ``wordFrequencies`` (``nltk.FreqDist`` over all tokens),
        ``vocab`` (``(word, count)`` pairs kept in the vocabulary),
        ``indexToWord`` (list, position = index) and
        ``wordToIndex`` (dict, inverse of ``indexToWord``).

    Notes
    -----
    NOTE(review): the vocabulary is rebuilt on every call, so calling this
    on a second frame yields indices that are not comparable with the first.
    """
    # Fall back to the notebook-level setting so existing calls keep working.
    if vocab_size is None:
        vocab_size = vocabSize

    sentences = df[['comment_text']].values.tolist()

    # NOTE(review): splitting on a single space keeps newlines inside tokens and
    # produces empty tokens for repeated spaces; left unchanged on purpose because
    # changing it alters every index the model is trained on.
    tokenizedSentences = [row[0].split(' ') for row in sentences]

    wordFrequencies = nltk.FreqDist(itertools.chain.from_iterable(tokenizedSentences))
    vocab = wordFrequencies.most_common(vocab_size - 1)

    # Index 0 is the out-of-vocabulary placeholder. The spelling 'unkown' is kept
    # as-is because it is part of the returned data.
    indexToWord = ['unkown'] + [word for word, _count in vocab]
    wordToIndex = {word: index for index, word in enumerate(indexToWord)}

    # Tokens outside the vocabulary map to index 0.
    encodedSentences = [
        [wordToIndex.get(token, 0) for token in sent]
        for sent in tokenizedSentences
    ]
    tokenizedSentences = sequence.pad_sequences(encodedSentences, maxlen=maxSentLength)

    return {
        'sentences': sentences,
        'tokenizedSentences': tokenizedSentences,
        'wordFrequencies': wordFrequencies,
        'vocab': vocab,
        'indexToWord': indexToWord,
        'wordToIndex': wordToIndex,
    }

# Modelling

 As a reduction of the problem, we could use only safe vs. not safe (i.e. reduce it to a binary classification problem). Note that the model built below has one output per category (`len(categories)`).

### hyperparameters and data splitting

You will also notice that, as another preprocessing step, every sentence is cut or padded to `maxSentLength` tokens (200 here; the default in `preprocess` is 500).

In [20]:
# Hyperparameters
maxSentLength = 200
embeddingSize = 128

# Encode the comments as fixed-length integer sequences.
processedData = preprocess(train, maxSentLength=maxSentLength)
tokenizedSentences = processedData['tokenizedSentences']

# Sequential split: the first 80% of the rows are used for training.
N = len(tokenizedSentences)
split = int(.80 * N)

xTrain, xTest = tokenizedSentences[:split], tokenizedSentences[split:]
yTrain, yTest = np.array(labels[:split]), np.array(labels[split:])

## model and training

In [23]:
# TextCNN takes its settings positionally; the first two meanings below are
# inferred from the printed layer summary -- TODO confirm against text_cnn.py.
model = TextCNN(
    128,              # presumably the number of filters per convolution
    [2, 3, 5],        # presumably the convolution kernel sizes
    embeddingSize,
    vocabSize,
    maxSentLength,
    len(categories),  # one value per label column
)
model.build_model()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 128)     2560000     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 199, 128)     32896       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 198, 128)     49280       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_3 (

In [25]:
# Single training pass; the held-out split is passed along with the training data.
model.fit(
    xTrain, yTrain,
    xTest, yTest,
    batch_size=256,
    epochs=1,
)

Train on 127656 samples, validate on 31915 samples
Epoch 1/1


### save the model

In [28]:
# Create the output directory first so a missing folder cannot make the
# save fail after training has already finished.
os.makedirs('./models', exist_ok=True)

# serialize model to JSON
model_json = model.model.to_json()
with open('./models/cnn_final.json', 'w') as json_file:
    json_file.write(model_json)

# serialize weights to HDF5
model.model.save_weights('./models/cnn_final.h5')