In [1]:
import numpy as np 
import pandas as pd
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
import re
import math
import random
from tqdm import tqdm

# Vocabulary cap: passed to the Tokenizer as num_words and used to bound the
# number of rows in the embedding matrix.
max_features = 20000
# Sequence length: passed to pad_sequences as maxlen and used as the length of
# the model's Input.
maxlen = 100

Using TensorFlow backend.


In [2]:
from keras.models import Model
def get_model(embed_size, embedding_weights=None):
    """Build and compile the bidirectional-LSTM multi-label classifier.

    Layers, in order: Embedding -> Bidirectional LSTM (50 units, returning
    sequences) -> global max pooling -> Dropout(0.1) -> Dense(50, relu) ->
    Dropout(0.1) -> Dense(6, sigmoid).

    Parameters
    ----------
    embed_size : int
        Width of each embedding vector (number of columns of the weights).
    embedding_weights : 2-D array-like, optional
        Initial weights for the Embedding layer, one row per word index.
        When omitted, the notebook-level global ``embedding_matrix`` is used,
        so the original call ``get_model(embed_size)`` keeps working.

    Returns
    -------
    keras.models.Model
        Model compiled with binary cross-entropy loss, the Adam optimizer
        and the accuracy metric.
    """
    if embedding_weights is None:
        # Backward-compatible fallback to the global built by the training loop.
        embedding_weights = embedding_matrix

    # Size the layer from the matrix actually supplied. Hard-coding
    # max_features here fails with a weight-shape mismatch whenever the
    # fitted vocabulary is smaller than max_features.
    vocab_rows = len(embedding_weights)

    inp = Input(shape=(maxlen, ))
    x = Embedding(vocab_rows, embed_size, weights=[embedding_weights])(inp)
    x = Bidirectional(LSTM(50, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(0.1)(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.1)(x)
    # One independent sigmoid per label.
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [3]:
# Mini-batch size passed to model.fit for every sampled training set.
batch_size = 32
# Early stopping monitors validation loss (lower is better) with a patience of
# 20 epochs.
# NOTE(review): the fit call in this cell trains for only 2 epochs, so with
# patience=20 this callback cannot trigger as configured.
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [early]
def get_coefs(word, *arr):
    """Turn one split embedding record into a ``(word, vector)`` pair.

    ``word`` is returned unchanged; the remaining positional arguments are
    converted to a 1-D float32 NumPy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

classes = ["toxic", "severe_toxic", "obscene" ,"threat", "insult" ,"identity_hate"]

# ---------------------------------------------------------------------------
# Loop-invariant setup: done once instead of once per sampled training set.
# ---------------------------------------------------------------------------

# Pre-trained embeddings: each line is split on whitespace into a token
# followed by its coefficients (see get_coefs).
EMBEDDING_FILE = 'glove.6B.300d.txt'
embed_size = 300
# `with` closes the file handle; the explicit encoding avoids depending on the
# platform's default codec.
with open(EMBEDDING_FILE, encoding='utf8') as embedding_file:
    embeddings_index = dict(get_coefs(*line.strip().split()) for line in embedding_file)

# Global mean/std of all pre-trained coefficients, used to randomly initialise
# rows for words that have no pre-trained vector.
# np.stack needs a real sequence, hence list(...) around the dict view.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

# The test set is identical for every sample, so read and clean it once.
df_test_raw = pd.read_csv('clean_test_wo_capital.csv')
df_test_raw = df_test_raw.fillna("")
del df_test_raw["Unnamed: 0"]

#Iterate for 5 samples
for i in tqdm(range(0,5)):
    # Fresh copy per iteration: the frame is mutated below (columns dropped,
    # index replaced) before being written out as a submission.
    df_test = df_test_raw.copy()
    for label in classes:
        df_test[label] = 0

    #load each sampled training datasets
    s = "2_multiple_" + str(i) + '.csv'
    print(s)
    df_train = pd.read_csv(s)
    df_train = df_train.fillna("")
    #Combine test and train datasets so both share one vocabulary
    df_all = pd.concat([df_train,df_test])

    #Build feature vector
    tokenizer = text.Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(df_all["comment_text"]))
    features = tokenizer.texts_to_sequences(df_all["comment_text"])
    features = sequence.pad_sequences(features, maxlen=maxlen)

    word_index = tokenizer.word_index
    # Keras word indices start at 1 (0 is reserved), so holding the highest
    # index needs len(word_index) + 1 rows. Without the +1, a vocabulary
    # smaller than max_features indexes one past the end of the matrix.
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, idx in word_index.items():
        if idx >= nb_words: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[idx] = embedding_vector

    # Drop the reference to the previous iteration's model before building
    # the next one. get_model reads the global embedding_matrix set above.
    model = None
    model = get_model(embed_size)

    epochs = 2
    # Rows [0, n_train) of `features` are the training comments; the rest are
    # the test comments, matching the concat order above.
    n_train = df_train.shape[0]
    model.fit(features[:n_train], df_train[classes].values, batch_size=batch_size, epochs=epochs, validation_split=0.1, callbacks=callbacks_list)
    prob = model.predict(features[n_train:])
    df_test[classes] = prob

    #Generate submission file for each sample
    #Average them later
    del df_test["comment_text"]
    df_test.set_index('id',inplace=True)
    df_test.to_csv("submission" + "glove_bi_lstm__multi" + str(i) +  ".csv")

  0%|          | 0/5 [00:00<?, ?it/s]

2_multiple_0.csv
Train on 29205 samples, validate on 3245 samples
Epoch 1/2
Epoch 2/2


 20%|██        | 1/5 [21:06<1:24:27, 1266.90s/it]

2_multiple_1.csv
Train on 29205 samples, validate on 3245 samples
Epoch 1/2
Epoch 2/2


 40%|████      | 2/5 [41:55<1:02:53, 1257.70s/it]

2_multiple_2.csv
Train on 29205 samples, validate on 3245 samples
Epoch 1/2
Epoch 2/2


 60%|██████    | 3/5 [1:02:17<41:31, 1245.99s/it]

2_multiple_3.csv
Train on 29205 samples, validate on 3245 samples
Epoch 1/2
Epoch 2/2


 80%|████████  | 4/5 [1:22:52<20:43, 1243.07s/it]

2_multiple_4.csv
Train on 29205 samples, validate on 3245 samples
Epoch 1/2
Epoch 2/2


100%|██████████| 5/5 [1:43:28<00:00, 1241.79s/it]


In [4]:
# Preview the predictions left over from the final loop iteration; head()
# avoids rendering the whole frame into the notebook output.
df_test.head(10)

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00001cee341fdb12,0.984758,0.294513,0.905906,0.081816,0.747830,0.095753
0000247867823ef7,0.008172,0.000080,0.004079,0.000154,0.003225,0.000303
00013b17ad220c46,0.044409,0.000393,0.014445,0.000656,0.012550,0.001035
00017563c3f7919a,0.003217,0.000009,0.000922,0.000020,0.000957,0.000050
00017695ad8997eb,0.029296,0.000180,0.004288,0.000472,0.005237,0.000397
0001ea8717f6de06,0.004554,0.000031,0.001469,0.000069,0.001633,0.000137
00024115d4cbde0f,0.025455,0.000024,0.001789,0.000055,0.002880,0.000120
000247e83dcc1211,0.909812,0.005389,0.070395,0.008514,0.197582,0.007474
00025358d4737918,0.657957,0.000330,0.034763,0.000487,0.085131,0.001087
00026d1092fe71cc,0.008462,0.000026,0.001970,0.000057,0.002115,0.000171
