In [1]:
import numpy as np 
import pandas as pd
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
import re
import math
import random
from tqdm import tqdm

# Vocabulary size: passed as `num_words` to the Tokenizer and as the input
# dimension of the Embedding layer.
max_features = 20000
# Sequence length: comments are padded/truncated to this many tokens by
# `pad_sequences`, and it is the Embedding layer's `input_length`.
maxlen = 100

Using TensorFlow backend.


In [2]:
from keras.models import Sequential
def get_model():
    """Build and compile the LSTM multi-label classifier.

    Architecture: Embedding -> LSTM(50) -> Dense(50, relu) -> Dense(6, sigmoid).
    The six sigmoid outputs are independent probabilities, one per label
    column, which is why the loss is binary cross-entropy.

    Reads the module-level ``max_features`` (embedding vocabulary size) and
    ``maxlen`` (input sequence length).

    Returns:
        A compiled ``Sequential`` model ready for ``fit`` / ``predict``.
    """
    embed_size = 128
    model = Sequential()
    model.add(Embedding(max_features, embed_size, input_length=maxlen))
    # No input_shape on the LSTM: only the first layer of a Sequential model
    # declares the input shape; later layers take theirs from the previous
    # layer's output. The old input_shape=(embed_size, 1) did not describe
    # the (maxlen, embed_size) sequences the Embedding produces.
    model.add(LSTM(50))
    model.add(Dense(50, activation="relu", kernel_initializer='he_normal'))
    model.add(Dense(6, activation="sigmoid"))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [3]:
# Training settings.
batch_size = 32
epochs = 2
# Number of sampled training sets to process: multiple_0.csv, multiple_1.csv, ...
n_samples = 5

# NOTE(review): patience=20 is larger than `epochs`, so early stopping cannot
# trigger with the current settings; it only matters if `epochs` is raised.
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [early]

classes = ["toxic", "severe_toxic", "obscene" ,"threat", "insult" ,"identity_hate"]

# The test set is the same for every sample, so read and prepare it once
# instead of re-reading the CSV on every iteration.
df_test_template = pd.read_csv('clean_test_wo_capital.csv').fillna("")
del df_test_template["Unnamed: 0"]
# Pre-create the label columns so the predictions can later be assigned to
# existing columns in a fixed order.
for label in classes:
    df_test_template[label] = 0

# Train one model per sampled training set and write one submission per sample.
for i in tqdm(range(n_samples)):
    # Fresh copy per iteration: the frame is mutated below (columns dropped,
    # index replaced) before being written out.
    df_test = df_test_template.copy()

    # Load this iteration's sampled training set.
    s = "multiple_" + str(i) + '.csv'
    print(s)
    df_train = pd.read_csv(s).fillna("")
    n_train = df_train.shape[0]

    # Combine train and test so the tokenizer vocabulary covers both.
    df_all = pd.concat([df_train, df_test])

    # Build padded integer sequences; rows [0, n_train) are the training
    # comments, the remaining rows are the test comments.
    tokenizer = text.Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(df_all["comment_text"]))
    features = tokenizer.texts_to_sequences(df_all["comment_text"])
    features = sequence.pad_sequences(features, maxlen=maxlen)

    # Train a fresh model on this sample and predict on the test rows.
    model = get_model()
    model.fit(features[:n_train], df_train[classes].values,
              batch_size=batch_size, epochs=epochs,
              validation_split=0.1, callbacks=callbacks_list)
    prob = model.predict(features[n_train:])
    df_test[classes] = prob

    # Write one submission file per sample; they are averaged later.
    del df_test["comment_text"]
    df_test.set_index('id',inplace=True)
    df_test.to_csv("submission" + "_lstm_multi" + str(i) +  ".csv")

  0%|          | 0/5 [00:00<?, ?it/s]

multiple_0.csv
Train on 29205 samples, validate on 3245 samples
Epoch 1/2
Epoch 2/2


 20%|██        | 1/5 [08:44<34:57, 524.42s/it]

multiple_1.csv
Train on 29205 samples, validate on 3245 samples
Epoch 1/2
Epoch 2/2


 40%|████      | 2/5 [17:40<26:30, 530.14s/it]

multiple_2.csv
Train on 29205 samples, validate on 3245 samples
Epoch 1/2
Epoch 2/2


 60%|██████    | 3/5 [26:32<17:41, 530.99s/it]

multiple_3.csv
Train on 29205 samples, validate on 3245 samples
Epoch 1/2
Epoch 2/2


 80%|████████  | 4/5 [35:29<08:52, 532.31s/it]

multiple_4.csv
Train on 29205 samples, validate on 3245 samples
Epoch 1/2
Epoch 2/2


100%|██████████| 5/5 [44:15<00:00, 531.18s/it]


In [4]:
# Preview the first rows of the last sample's predictions instead of
# rendering the whole frame.
df_test.head(10)

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00001cee341fdb12,0.986980,0.210856,0.919931,0.087881,0.865560,0.348874
0000247867823ef7,0.018215,0.000519,0.004399,0.000868,0.003677,0.000731
00013b17ad220c46,0.089516,0.001258,0.015143,0.002058,0.013872,0.001994
00017563c3f7919a,0.011621,0.000415,0.004216,0.000508,0.002565,0.000285
00017695ad8997eb,0.046341,0.000764,0.007145,0.001459,0.007420,0.001438
0001ea8717f6de06,0.059086,0.000998,0.009907,0.001741,0.009549,0.001857
00024115d4cbde0f,0.009364,0.000389,0.003876,0.000381,0.002123,0.000317
000247e83dcc1211,0.884497,0.004485,0.089487,0.004878,0.244783,0.015074
00025358d4737918,0.851002,0.006008,0.122287,0.025973,0.348972,0.089292
00026d1092fe71cc,0.011192,0.000293,0.002875,0.000380,0.002112,0.000309
