In [1]:
import numpy as np 
import pandas as pd
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
import re
import math
import random
from tqdm import tqdm

# Vocabulary cap: passed to the Tokenizer as num_words and used as the Embedding layer's input dimension.
max_features = 20000
# Fixed sequence length: used as pad_sequences' maxlen and as the model's Input shape.
maxlen = 100

Using TensorFlow backend.


In [2]:
from keras.models import Model
def get_model(embed_size, embedding_weights=None):
    """Build and compile the bidirectional-LSTM multi-label classifier.

    Architecture: Embedding (initialised from pretrained vectors) ->
    BiLSTM(50, return_sequences=True) -> global max pooling -> Dropout(0.1) ->
    Dense(50, relu) -> Dropout(0.1) -> Dense(6, sigmoid).

    Parameters
    ----------
    embed_size : int
        Width of each word vector; must equal the second dimension of the
        embedding weights.
    embedding_weights : 2-D array-like, optional
        Initial weights for the Embedding layer, shape (vocab_rows, embed_size).
        When omitted, the module-level ``embedding_matrix`` is used, which keeps
        existing ``get_model(embed_size)`` calls working.

    Returns
    -------
    A compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer,
    accuracy metric) taking integer sequences of length ``maxlen``.

    Raises
    ------
    ValueError
        If the embedding weights are not 2-D or their width differs from
        ``embed_size``.
    """
    if embedding_weights is None:
        # Backward-compatible fallback to the notebook-global matrix.
        embedding_weights = embedding_matrix
    embedding_weights = np.asarray(embedding_weights)
    if embedding_weights.ndim != 2 or embedding_weights.shape[1] != embed_size:
        raise ValueError(
            "embedding weights must have shape (vocab_rows, %d), got %r"
            % (embed_size, embedding_weights.shape)
        )
    # Size the layer from the matrix itself so the two can never disagree.
    vocab_rows = embedding_weights.shape[0]

    inp = Input(shape=(maxlen, ))
    x = Embedding(vocab_rows, embed_size, weights=[embedding_weights])(inp)
    x = Bidirectional(LSTM(50, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(0.1)(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.1)(x)
    # One sigmoid unit per label: the labels are predicted independently.
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [3]:
# Mini-batch size passed to model.fit in the training loop.
batch_size = 32
# Early-stopping callback watching validation loss (lower is better).
# NOTE(review): the training loop sets epochs = 2, so patience=20 can never trigger — confirm whether this is intended.
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [early]
# Width of each pretrained word vector; used for the embedding matrix and the Embedding layer.
embed_size = 300
def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    Parameters
    ----------
    word : the token (first field of a parsed embedding line).
    *arr : the remaining fields; each must be convertible to float32.

    Returns
    -------
    tuple
        ``(word, vector)`` where ``vector`` is a 1-D float32 numpy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Path to the pretrained word-vector file: one "token v1 v2 ... vN" record per line.
EMBEDDING_FILE = 'crawl-300d-2M.vec'

# Build the token -> float32 vector lookup.
# The file is opened in a context manager so the handle is closed, and decoded
# explicitly as UTF-8 rather than with the platform default encoding.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    for line in embedding_file:
        fields = line.strip().split()
        # Keep only well-formed records (token + embed_size numbers). This skips
        # the "<count> <dim>" header line, blank lines, and records whose token
        # contains whitespace; a wrong-length vector would otherwise be
        # broadcast across a whole embedding-matrix row later on.
        if len(fields) != embed_size + 1:
            continue
        word, vector = get_coefs(*fields)
        embeddings_index[word] = vector

# Target label columns; this order is used for both the training targets and the predictions.
classes = ["toxic", "severe_toxic", "obscene" ,"threat", "insult" ,"identity_hate"]

# Train one model per resampled training set and write one submission file per
# sample (the per-sample submissions are averaged later, outside this cell).

# The test CSV is identical for every sample, so parse it once and copy it per iteration.
df_test_raw = pd.read_csv('clean_test_wo_capital.csv')
df_test_raw = df_test_raw.fillna("")
del df_test_raw["Unnamed: 0"]  # presumably a saved index column — TODO confirm

#Iterate for 5 samples
for i in tqdm(range(0, 5)):
    # Fresh copy: the frame is mutated below (label columns added, text dropped, index set).
    df_test = df_test_raw.copy()
    for label in classes:
        df_test[label] = 0

    # Load the i-th sampled training set.
    s = "2_multiple_" + str(i) + '.csv'
    print(s)
    df_train = pd.read_csv(s)
    df_train = df_train.fillna("")
    n_train = df_train.shape[0]

    # Tokenize train and test together so both share one vocabulary.
    # Train rows come first, so features[:n_train] is train and the rest is test.
    df_all = pd.concat([df_train, df_test])
    tokenizer = text.Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(df_all["comment_text"]))
    features = tokenizer.texts_to_sequences(df_all["comment_text"])
    features = sequence.pad_sequences(features, maxlen=maxlen)

    # Row j of the embedding matrix holds the pretrained vector for the word
    # with tokenizer index j. The matrix always has max_features rows so it
    # matches the Embedding layer's input dimension. word_index is 1-based, so
    # sizing the matrix by len(word_index) would overflow whenever the
    # vocabulary is smaller than max_features. Rows without a pretrained
    # vector stay all-zero.
    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((max_features, embed_size))
    for word, j in word_index.items():
        if j >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        # Guard the shape: a wrong-length vector (e.g. one parsed from the
        # embedding file's header line) would silently broadcast across the row.
        if embedding_vector is not None and embedding_vector.shape == (embed_size,):
            embedding_matrix[j] = embedding_vector

    # get_model picks up the embedding_matrix built above.
    model = get_model(embed_size)

    epochs = 2
    model.fit(features[:n_train], df_train[classes].values, batch_size=batch_size, epochs=epochs, validation_split=0.1, callbacks=callbacks_list)
    prob = model.predict(features[n_train:])
    df_test[classes] = prob

    #Generate submission file for each sample
    #Average them later
    del df_test["comment_text"]
    df_test.set_index('id', inplace=True)
    df_test.to_csv("submission" + "fast_bi_lstm__multi" + str(i) + ".csv")

  0%|          | 0/5 [00:00<?, ?it/s]

2_multiple_0.csv
Train on 29205 samples, validate on 3245 samples
Epoch 1/2
Epoch 2/2


 20%|██        | 1/5 [19:33<1:18:12, 1173.05s/it]

2_multiple_1.csv
Train on 29205 samples, validate on 3245 samples
Epoch 1/2
Epoch 2/2


 40%|████      | 2/5 [38:07<57:11, 1143.87s/it]  

2_multiple_2.csv
Train on 29205 samples, validate on 3245 samples
Epoch 1/2
Epoch 2/2


 60%|██████    | 3/5 [56:23<37:35, 1127.94s/it]

2_multiple_3.csv
Train on 29205 samples, validate on 3245 samples
Epoch 1/2
Epoch 2/2


 80%|████████  | 4/5 [1:14:59<18:44, 1124.94s/it]

2_multiple_4.csv
Train on 29205 samples, validate on 3245 samples
Epoch 1/2
Epoch 2/2


100%|██████████| 5/5 [1:33:35<00:00, 1123.17s/it]


In [4]:
# Preview the predictions left over from the final loop iteration.
# Only the first rows are shown, to avoid dumping the whole frame.
df_test.head(10)

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00001cee341fdb12,0.990957,0.380567,0.927656,0.128223,0.836956,0.399166
0000247867823ef7,0.002173,0.000033,0.000713,0.000022,0.000519,0.000084
00013b17ad220c46,0.113225,0.001833,0.032307,0.001801,0.023242,0.003763
00017563c3f7919a,0.002425,0.000077,0.001511,0.000054,0.000873,0.000087
00017695ad8997eb,0.021298,0.000386,0.003875,0.000642,0.003073,0.000377
0001ea8717f6de06,0.006016,0.000144,0.001708,0.000168,0.001967,0.000356
00024115d4cbde0f,0.017662,0.000226,0.005901,0.000214,0.003792,0.000393
000247e83dcc1211,0.961618,0.002638,0.060667,0.003213,0.227057,0.005104
00025358d4737918,0.247615,0.000373,0.023756,0.000584,0.057313,0.002524
00026d1092fe71cc,0.007057,0.000116,0.002734,0.000138,0.002511,0.000321
