Starting out by replicating [this beginner-oriented notebook](https://www.kaggle.com/sbongo/for-beginners-tackling-toxic-using-keras) to get a feel for Keras.

In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


In [2]:
# Read the training and test sets from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Preview the first five training rows to check that the columns loaded as expected.
train.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Per-column flag: True if that training column contains at least one missing value.
pd.isnull(train).any(axis=0)

id               False
comment_text     False
toxic            False
severe_toxic     False
obscene          False
threat           False
insult           False
identity_hate    False
dtype: bool

In [5]:
# Same missing-value check for the test set.
pd.isnull(test).any(axis=0)

id              False
comment_text    False
dtype: bool

There are no nulls in either set, so we can be lazy with preprocessing.

In [6]:
# Target columns, in the order used for the label matrix.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
# Label matrix: one row per training comment, one column per entry in list_classes.
y = train.loc[:, list_classes].values
# Raw comment text (pandas Series) for the tokenizer.
list_sentences_train = train.loc[:, "comment_text"]
list_sentences_test = test.loc[:, "comment_text"]

Now let's use Keras's `Tokenizer` to speed this up.

In [7]:
# Vocabulary cap handed to the Tokenizer as num_words; the Embedding layer
# later uses the same value as its input dimension.
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
# The word index is learned from the training comments only; the test
# comments are then encoded with that same index.
tokenizer.fit_on_texts(list_sentences_train.tolist())
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(corpus)
    for corpus in (list_sentences_train, list_sentences_test)
)

In [8]:
# Fixed sequence length for the network: pad_sequences pads or truncates
# every tokenized comment to max_len entries.
max_len = 200
X_tr = pad_sequences(sequences=list_tokenized_train, maxlen=max_len)
X_te = pad_sequences(sequences=list_tokenized_test, maxlen=max_len)

Input Layer

In [9]:
# Each sample is a sequence of max_len token ids (the batch dimension is implicit).
inp = Input(shape=(max_len,))

Embedding Layer

In [10]:
# Size of the learned vector for each token id.
embed_size = 128
# input_dim matches the tokenizer's num_words cap so every emitted id has a row.
x = Embedding(input_dim=max_features, output_dim=embed_size)(inp)

LSTM layer

In [11]:
# return_sequences=True keeps the output at every time step so the pooling
# layer that follows can reduce over the whole sequence.
x = LSTM(units=60, name='lstm_layer', return_sequences=True)(x)

Pooling Layer

In [12]:
# Collapse the per-time-step LSTM outputs into a single vector per sample by
# taking the maximum over the time axis.
x = GlobalMaxPool1D()(x)

Dropout Layer (10%)

In [13]:
# Drop 10% of the pooled features during training.
x = Dropout(rate=0.1)(x)

Dense layer with 50 units and a ReLU activation function

In [14]:
# Fully connected hidden layer: 50 units, ReLU activation.
x = Dense(units=50, activation="relu")(x)

Another dropout layer (10%)

In [15]:
# Second 10% dropout, applied to the hidden dense layer's output.
x = Dropout(rate=0.1)(x)

Finally, a sigmoid output layer that produces one independent probability per label

In [16]:
# Output layer: one sigmoid unit per label in list_classes. Deriving the width
# from the label list (instead of a hard-coded 6) keeps the model's output
# shape in step with the columns of y if the label set ever changes.
x = Dense(len(list_classes), activation="sigmoid")(x)

Now we define and compile the model

In [17]:
# Wire the input tensor to the final sigmoid output, then compile.
model = Model(inputs=inp, outputs=x)
# binary_crossentropy is applied to each sigmoid output, matching the
# multi-label 0/1 targets in y.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Display the list of layer objects that make up the model, in order.
model.layers

[<keras.engine.topology.InputLayer at 0x11eb77c18>,
 <keras.layers.embeddings.Embedding at 0x10d75d7b8>,
 <keras.layers.recurrent.LSTM at 0x11eb59ba8>,
 <keras.layers.pooling.GlobalMaxPooling1D at 0x11eb90c88>,
 <keras.layers.core.Dropout at 0x11eb77128>,
 <keras.layers.core.Dense at 0x11bc0dd30>,
 <keras.layers.core.Dropout at 0x11eb77908>,
 <keras.layers.core.Dense at 0x124c0f320>]

Let's fit it

In [None]:
batch_size = 32
epochs = 2
# Keep the returned History object so per-epoch loss/metrics can be inspected
# later (history.history) instead of being discarded as the cell's output.
# NOTE: per the Keras docs, validation_split holds out the *last* 10% of the
# rows of X_tr / y (taken before shuffling), not a random sample.
history = model.fit(
    X_tr,
    y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
   928/143613 [..............................] - ETA: 46:11 - loss: 0.4991 - acc: 0.8949