In [1]:
import math
import random
import re

import numpy as np
import pandas as pd
from keras import backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from tqdm import tqdm

# Vocabulary size: passed to the Tokenizer as `num_words` and to the Embedding
# layer as its input dimension.
max_features = 20000
# Fixed sequence length: passed to `pad_sequences` as `maxlen` and to the
# Embedding layer as `input_length`.
maxlen = 100

Using TensorFlow backend.


In [2]:
from keras.models import Sequential
def get_model():
    """Build and compile a fresh binary text classifier.

    Architecture: Embedding(max_features, 128) -> LSTM(50) -> Dropout(0.1)
    -> Dense(50, relu, he_normal) -> Dropout(0.1) -> Dense(1, sigmoid).

    Reads the module-level ``max_features`` (embedding vocabulary size) and
    ``maxlen`` (input sequence length).

    Returns:
        A compiled ``Sequential`` model using binary cross-entropy loss, the
        Adam optimizer and the accuracy metric.
    """
    embed_size = 128
    model = Sequential()
    model.add(Embedding(max_features, embed_size, input_length=maxlen))
    # No `input_shape` here: it is only honoured on the first layer of a
    # Sequential model, and the LSTM infers its input from the Embedding
    # output. The previous value, (embed_size, 1), did not match that output
    # anyway.
    model.add(LSTM(50))
    model.add(Dropout(0.1))
    model.add(Dense(50, activation="relu", kernel_initializer='he_normal'))
    model.add(Dropout(0.1))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [3]:
# Train one LSTM per (sample, class) pair and write one submission file per
# sample; the per-sample submissions are meant to be averaged afterwards.
batch_size = 32
# NOTE: with patience=20 and at most 5 epochs per fit (see epochs_by_class
# below) this callback can never stop a run early. It is kept so the fit()
# arguments stay unchanged.
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [early]

classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
n_samples = 5
# Classes not listed here train for `default_epochs`.
epochs_by_class = {"threat": 5, "identity_hate": 4, "severe_toxic": 4}
default_epochs = 3

# The test set is the same for every sample, so read and clean it only once.
# `errors="ignore"` keeps this working if the CSV was saved without the
# stray index column.
test_clean = pd.read_csv('clean_test_wo_capital.csv').fillna("")
test_clean = test_clean.drop("Unnamed: 0", axis=1, errors="ignore")

# Iterate over the 5 resampled training sets
for i in tqdm(range(n_samples)):
    # Fresh copy per sample: prediction columns are added to it below.
    df_test = test_clean.copy()

    # Iterate over the 6 target classes
    for cls in classes:
        # Load the sampled training set for this class / sample index
        train_csv = cls + str(i) + '.csv'
        print(train_csv)
        df_train = pd.read_csv(train_csv).fillna("")
        n_train = df_train.shape[0]

        # Build the feature matrix over train + test text so both share one
        # vocabulary; train rows come first, test rows after.
        all_text = pd.concat([df_train["comment_text"], df_test["comment_text"]])
        tokenizer = text.Tokenizer(num_words=max_features)
        tokenizer.fit_on_texts(list(all_text))
        features = tokenizer.texts_to_sequences(all_text)
        features = sequence.pad_sequences(features, maxlen=maxlen)

        # Drop the backend state left by the previous model; otherwise it
        # accumulates across the 30 models built by this cell.
        K.clear_session()
        model = get_model()

        # Train on the training rows, then predict the test rows
        epochs = epochs_by_class.get(cls, default_epochs)
        model.fit(features[:n_train], df_train[cls], batch_size=batch_size, epochs=epochs, validation_split=0.3, callbacks=callbacks_list)
        prob = model.predict(features[n_train:])
        # predict() returns shape (n_test, 1); flatten it to a plain column.
        df_test[cls] = prob.ravel()

    # Write the submission file for this sample (averaged with the others later)
    df_test = df_test.drop("comment_text", axis=1).set_index('id')
    df_test.to_csv("submission" + "_lstm_2_20000" + str(i) + ".csv")

  0%|          | 0/5 [00:00<?, ?it/s]

toxic0.csv
Train on 21928 samples, validate on 9398 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
severe_toxic0.csv
Train on 2228 samples, validate on 955 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
obscene0.csv
Train on 12143 samples, validate on 5205 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
threat0.csv
Train on 669 samples, validate on 288 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
insult0.csv
Train on 11120 samples, validate on 4766 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
identity_hate0.csv
Train on 1972 samples, validate on 846 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


 20%|██        | 1/5 [47:17<3:09:09, 2837.49s/it]

toxic1.csv
Train on 21872 samples, validate on 9375 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
severe_toxic1.csv
Train on 2235 samples, validate on 958 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
obscene1.csv
Train on 12102 samples, validate on 5187 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
threat1.csv
Train on 669 samples, validate on 287 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
insult1.csv
Train on 11102 samples, validate on 4759 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
identity_hate1.csv
Train on 1973 samples, validate on 846 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


 40%|████      | 2/5 [1:32:53<2:19:20, 2786.97s/it]

toxic2.csv
Train on 21867 samples, validate on 9372 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
severe_toxic2.csv
Train on 2233 samples, validate on 958 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
obscene2.csv
Train on 12124 samples, validate on 5197 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
threat2.csv
Train on 668 samples, validate on 287 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
insult2.csv
Train on 11132 samples, validate on 4771 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
identity_hate2.csv
Train on 1972 samples, validate on 846 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


 60%|██████    | 3/5 [2:05:30<1:23:40, 2510.11s/it]

toxic3.csv
Train on 21869 samples, validate on 9373 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
severe_toxic3.csv
Train on 2235 samples, validate on 959 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
obscene3.csv
Train on 12122 samples, validate on 5196 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
threat3.csv
Train on 668 samples, validate on 287 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
insult3.csv
Train on 11093 samples, validate on 4755 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
identity_hate3.csv
Train on 1969 samples, validate on 845 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


 80%|████████  | 4/5 [2:38:34<39:38, 2378.57s/it]  

toxic4.csv
Train on 21898 samples, validate on 9386 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
severe_toxic4.csv
Train on 2231 samples, validate on 957 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
obscene4.csv
Train on 12130 samples, validate on 5199 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
threat4.csv
Train on 668 samples, validate on 287 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
insult4.csv
Train on 11071 samples, validate on 4746 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
identity_hate4.csv
Train on 1975 samples, validate on 847 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


100%|██████████| 5/5 [3:10:50<00:00, 2290.05s/it]
