# Imports

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPool1D, concatenate
from keras.layers import Dropout, Flatten, Input
from keras.models import Model
from keras.layers.embeddings import Embedding

# Seed NumPy's global random generator so NumPy-driven randomness repeats on a
# fresh "Restart & Run All". Only NumPy is seeded here; any other library's
# random generator is left at its default.
seed = 7
np.random.seed(seed)

# Data loading

In [5]:
# Target labels of the multi-label problem; the number of entries determines
# the width of the network's output layer further down.
# NOTE(review): presumably the order matches the columns of Y — confirm
# against the preprocessing step.
categories = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

### Load the already preprocessed data

In [6]:
# Read the preprocessed feature array (X) and label array (Y) from disk.
# NOTE(review): X is assumed to hold fixed-length integer sequences and Y one
# column per category — confirm against the preprocessing notebook.
X, Y = np.load('data/X.npy'), np.load('data/Y.npy')

# Model


Set the model and training parameters.

In [10]:
# Embedding layer: vocabulary size, embedding vector width, and the fixed
# length of every input sequence.
vocab_size, embedding_size, max_seq_length = 10000, 128, 300

# Convolution branches: one Conv1D branch is built per kernel width in
# filter_sizes, each with n_filters filters.
n_filters, filter_sizes = 128, [2, 3, 5]

# One output unit per label category.
output_size = len(categories)

# NOTE(review): `seed` repeats the value already assigned in the imports cell,
# and `epoch` is not read by the training cell in this notebook (that call
# passes epochs=20 directly). Both are kept so existing references still work.
seed = 7
epoch = 10

In [11]:
# Build a multi-branch 1-D CNN: a shared embedding feeds several parallel
# Conv1D + MaxPool1D branches (one per kernel width in filter_sizes); their
# outputs are joined, flattened, regularised with dropout and mapped to one
# sigmoid unit per category.
#
# Intermediate tensors get their own names so that `model` only ever refers to
# the compiled keras Model (re-running part of the cell can no longer leave
# `model` bound to a bare tensor).

# Fixed-length sequences of int32 indices into the embedding table.
seq_input = Input(shape=(max_seq_length, ), dtype='int32')
embedded = Embedding(vocab_size, embedding_size,
                     input_length=max_seq_length)(seq_input)

# Every convolution branch reads the same embedded sequence.
conv_outputs = []

for size in filter_sizes:
    sub_model = Conv1D(n_filters, size, activation='relu')(embedded)
    # The pooling window equals the kernel width of the branch.
    sub_model = MaxPool1D(pool_size=size)(sub_model)
    conv_outputs.append(sub_model)

# The branches differ in length along axis 1 but share the same number of
# filters, so they are stacked along the sequence axis before flattening.
merged = concatenate(conv_outputs, axis=1)
flattened = Flatten()(merged)
regularized = Dropout(0.5)(flattened)
predictions = Dense(output_size, activation='sigmoid')(regularized)

model = Model(seq_input, predictions)
# Independent sigmoid outputs + binary cross-entropy: each category is treated
# as its own yes/no decision.
model.compile(loss='binary_crossentropy', optimizer='rmsprop',
              metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# would append a stray "None" line to the cell output.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 300, 128)     1280000     input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 299, 128)     32896       embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 298, 128)     49280       embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_6 (

# Training

In [13]:
# Hold out 33% of the samples for validation.
# random_state is pinned to the notebook seed so that re-running this cell on
# its own reproduces the same partition; without it, each re-run draws a new
# split from NumPy's global RNG and rows the current model was trained on can
# end up in the validation set.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=.33, random_state=seed)

In [None]:
# Train the network and keep the returned history object in `hist`; the
# held-out split is evaluated after every epoch via validation_data.
hist = model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=20,
    validation_data=(x_test, y_test),
)

Train on 106912 samples, validate on 52659 samples
Epoch 1/20
Epoch 2/20
 21888/106912 [=====>........................] - ETA: 15:06 - loss: 0.0619 - acc: 0.9798

### Save the model

In [None]:
# NOTE: persistence is currently disabled — every statement in this cell is
# commented out, so running it writes nothing. Uncomment the statements to
# store the architecture as JSON ('./cnn_final.json') and the weights as HDF5
# ('./cnn_final.h5') in two separate files.
#
# # serialize model to JSON
# model_json = model.to_json()
# with open('./cnn_final.json', 'w') as json_file:
#     json_file.write(model_json)
# # serialize weights to HDF5
# model.save_weights('./cnn_final.h5')