In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the train and test CSVs. Only the first 10,000 rows of each file are
# read (nrows), so everything downstream works on a subset of the data.
train, test = (
    pd.read_csv(csv_path, nrows=10000)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [3]:
# Display the first rows of the training frame.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Label columns to predict; each is treated as its own 0/1 target.
list_classes = [
    "toxic", "severe_toxic", "obscene",
    "threat", "insult", "identity_hate",
]

# Target matrix: one row per training comment, one column per label above.
y = train[list_classes].values

# Raw comment text for both splits (still pandas Series at this point).
list_sentences_train, list_sentences_test = train["comment_text"], test["comment_text"]

In [5]:
# Vocabulary size handed to the tokenizer (and reused as the embedding's
# input dimension when the model is built).
max_features = 20000

# The vocabulary is fitted on the training comments only; the test comments
# are encoded with that same tokenizer.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Turn each comment into a list of integer token ids.
list_tokenized_train, list_tokenized_test = map(
    tokenizer.texts_to_sequences,
    (list_sentences_train, list_sentences_test),
)

In [6]:
# Fixed sequence length fed to the network.
maxlen = 200

# Bring every tokenized comment to exactly `maxlen` entries so each split
# becomes a rectangular array (train -> X_t, test -> X_te).
X_t, X_te = [
    pad_sequences(token_ids, maxlen=maxlen)
    for token_ids in (list_tokenized_train, list_tokenized_test)
]

In [7]:
# Number of tokens in each training comment, measured on the un-padded sequences.
totalNumWords = list(map(len, list_tokenized_train))

In [8]:
# Network: token ids -> embedding -> LSTM -> max over time -> small dense head
# -> one sigmoid output per label.
inp = Input(shape=(maxlen, ))  # each sample is a row of `maxlen` token ids
embed_size = 128
x = Embedding(max_features, embed_size)(inp)
# return_sequences=True keeps the LSTM output for every timestep, which gives
# the pooling layer below a sequence to reduce.
x = LSTM(60, return_sequences=True, name='lstm_layer')(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
# The output width is taken from list_classes instead of a literal 6, so the
# model cannot drift out of sync with the columns used to build `y`.
x = Dense(len(list_classes), activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
# Sigmoid outputs paired with binary cross-entropy: every label is scored as
# its own yes/no problem.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [9]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 128)          2560000   
_________________________________________________________________
lstm_layer (LSTM)            (None, 200, 60)           45360     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 60)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 60)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                3050      
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
__________

In [10]:
# Training settings, then fit on the padded training set. validation_split=0.1
# holds back a tenth of X_t / y for the per-epoch validation figures.
batch_size, epochs = 32, 2
model.fit(
    x=X_t,
    y=y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

Train on 9000 samples, validate on 1000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f6823cd65f8>

In [11]:
# Returns [loss, accuracy], matching the loss and metric given to compile().
# NOTE: X_t / y are the same arrays that were passed to fit(), so this is a
# score on data the model has already seen, not a held-out estimate.
model.evaluate(X_t, y)



[0.09844064986109734, 0.9638000020980835]

In [12]:
# Sigmoid outputs for the training inputs: one row per comment, one column
# per label in list_classes. X_te is not scored in this cell.
model.predict(X_t)

array([[0.04906259, 0.00349596, 0.02074811, 0.00133955, 0.02470692,
        0.00362732],
       [0.03775631, 0.00221668, 0.01568422, 0.00086377, 0.01692799,
        0.00234967],
       [0.06624741, 0.00406932, 0.02643051, 0.00149949, 0.03010903,
        0.00428672],
       ...,
       [0.07356469, 0.00742989, 0.03679536, 0.00373201, 0.03714164,
        0.00753169],
       [0.0290519 , 0.00141408, 0.01160747, 0.00055574, 0.0125699 ,
        0.00147665],
       [0.19844837, 0.02652761, 0.10537517, 0.01393593, 0.11614423,
        0.02779877]], dtype=float32)

In [13]:
# NOTE: fit() is called again on the same `model` that was already trained
# above. Per Keras semantics this continues from the current weights rather
# than starting over, so every run of this cell adds `epochs` more epochs
# and the cell is not idempotent.
model.fit(
    x=X_t,
    y=y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

# Predictions on the training inputs after the additional epochs.
model.predict(x=X_t)

Train on 9000 samples, validate on 1000 samples
Epoch 1/2
Epoch 2/2


array([[1.3921679e-03, 1.0432388e-05, 2.4829400e-04, 4.6654241e-05,
        3.0967526e-04, 7.5174765e-05],
       [9.4815274e-04, 6.6363036e-06, 1.6204921e-04, 2.9031871e-05,
        1.9879456e-04, 4.9367674e-05],
       [4.0437658e-03, 2.7721126e-05, 6.5952848e-04, 1.1226043e-04,
        8.4918377e-04, 1.9640668e-04],
       ...,
       [2.2812658e-03, 1.8730694e-05, 4.0738631e-04, 7.3606709e-05,
        4.2956087e-04, 1.0862270e-04],
       [6.9622666e-04, 4.0051136e-06, 1.2023504e-04, 1.7306886e-05,
        1.2969760e-04, 2.7124343e-05],
       [2.5728364e-02, 5.0689676e-04, 5.2447715e-03, 1.4500211e-03,
        7.1872659e-03, 2.1220602e-03]], dtype=float32)