# Imports

In [0]:
import pandas as pd
import numpy as np
import nltk
import itertools
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from keras.wrappers.scikit_learn import KerasClassifier
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.models import Model


# Fixed seed: applied to NumPy's global RNG here and reused as the
# `random_state` of the KFold splitter in the training cell.
seed = 7
np.random.seed(seed)

# Data loading

Set the parameters shared by the preprocessing and training cells.

In [0]:
# Label columns of the training CSV; the model emits one sigmoid output per entry.
categories = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Vocabulary entries kept by preprocess() and rows of the embedding matrix.
vocab_size = 20000
# Width of each learned word vector.
embedding_size = 128
# Every comment is padded/truncated to this many tokens.
max_seq_length = 500
# Training epochs per fit.
epochs = 3

Load the data into a pandas DataFrame.

In [0]:
# Read the training set from the notebook's working directory.
data = pd.read_csv('./train.csv')


In [0]:
# Label matrix: one row per comment, one column per entry in `categories`.
Y = data[categories].values

# Preprocessing

The preprocessing steps are:


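1.   Split each sentence into tokens
2.   Encode the words into numbers (index 0 stands for any word outside the vocabulary)
3.   Pad or truncate every encoded sentence to a fixed length (the result is kept in memory; this notebook does not write an `.npy` file)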



In [0]:
def preprocess(df, max_length=500, num_words=None):
    """Turn the `comment_text` column of `df` into padded integer sequences.

    Each comment is split on whitespace, the most frequent tokens form the
    vocabulary, every token is replaced by its vocabulary index (0 for any
    token outside the vocabulary) and the resulting sequences are padded or
    truncated to `max_length`.

    Args:
        df: DataFrame containing a `comment_text` column.
        max_length: Length every encoded comment is padded/truncated to.
        num_words: Vocabulary size, including the reserved index 0. When
            None (the default) the notebook-level `vocab_size` is used.

    Returns:
        The output of `sequence.pad_sequences` for the encoded comments:
        one row per comment, `max_length` columns.
    """
    limit = vocab_size if num_words is None else num_words

    # Missing or non-string comments become (possibly empty) strings instead
    # of raising AttributeError on `.split`.
    comments = df['comment_text'].fillna('').astype(str)

    # split() without an argument splits on any run of whitespace, so repeated
    # spaces do not yield '' tokens and newlines do not glue two words together.
    tokenized_sentences = [text.split() for text in comments]

    word_frequencies = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))

    # Index 0 is reserved for out-of-vocabulary words, hence `limit - 1`.
    # NOTE(review): pad_sequences also pads with 0 by default, so padding and
    # unknown words share the same index.
    index_to_word = ['unknown'] + [word for word, _ in word_frequencies.most_common(limit - 1)]
    word_to_index = {word: idx for idx, word in enumerate(index_to_word)}

    encoded_sentences = [
        [word_to_index.get(word, 0) for word in sentence]
        for sentence in tokenized_sentences
    ]

    return sequence.pad_sequences(encoded_sentences, maxlen=max_length)

In [0]:
# Encode every comment as a sequence of `max_seq_length` vocabulary indices.
X = preprocess(data, max_length=max_seq_length)

# Training

In [0]:
def create_model():
    """Build and compile the LSTM classifier (used as `build_fn` for KerasClassifier).

    Layers: an Embedding of `vocab_size` x `embedding_size` over inputs of
    `max_seq_length` tokens, one 200-unit LSTM, and a Dense layer with one
    sigmoid output per entry in `categories`. Compiled with binary
    cross-entropy, the Adam optimizer and the accuracy metric.

    Returns:
        The compiled `Sequential` model.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_size, input_length=max_seq_length))
    model.add(LSTM(200))
    # Independent sigmoid per label: a comment may belong to several categories.
    model.add(Dense(len(categories), activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None; wrapping it in
    # print() added a stray "None" line to the cell output.
    model.summary()
    return model

In [33]:
# Mean cross-validated accuracy of each repeated evaluation run.
scores = []

# Wrapper that the save cell serializes; fitted on the full data below.
final_model = None

for eval_steps in range(1):
    model = KerasClassifier(build_fn=create_model, epochs=epochs, batch_size=512, verbose=1)
    kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
    # cross_val_score trains a fresh clone of `model` for every fold; `model`
    # itself is left unfitted by this call.
    results = cross_val_score(model, X, Y, cv=kfold)
    final_model = model
    _mean = results.mean()
    scores.append(_mean)
    print(_mean)

# Mean and spread across the repeated runs (not across folds); with a single
# run the standard deviation is 0.
mean_acc = np.mean(scores)
mean_std = np.std(scores)

# Because cross-validation only fitted clones, train the retained wrapper on
# all of the data so that it actually holds a trained network to save.
if final_model is not None:
    final_model.fit(X, Y)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 500, 128)          2560000   
_________________________________________________________________
lstm_5 (LSTM)                (None, 200)               263200    
_________________________________________________________________
dense_5 (Dense)              (None, 6)                 1206      
Total params: 2,824,406
Trainable params: 2,824,406
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10

KeyboardInterrupt: ignored

### Save the model

In [0]:
# KerasClassifier is a scikit-learn wrapper; the trained Keras network lives
# in its `.model` attribute, which only exists once fit() has been called.
keras_model = getattr(final_model, 'model', None)
if keras_model is None:
    # cross_val_score fits clones only, so the wrapper may still be unfitted:
    # train it on the full data before saving.
    final_model.fit(X, Y)
    keras_model = final_model.model

# Serialize the architecture to JSON. `final_model` is deliberately not
# rebound, so it stays usable after this cell.
model_json = keras_model.to_json()
with open('./lstm_final.json', 'w') as json_file:
    json_file.write(model_json)

# Serialize the weights to HDF5.
keras_model.save_weights('./lstm_final.h5')

In [0]:
# Colab-only helper: triggers a browser download of the saved artifacts.
from google.colab import files

for artifact in ('lstm_final.json', 'lstm_final.h5'):
    files.download(artifact)