In [3]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers

In [4]:
# Input files, given relative to the notebook's working directory.
# Plain string literals: there is nothing to interpolate, so no f-prefix is needed.
embedding_path = 'glove.6B.50d.txt'  # word-vector text file read when building the embedding index
train_data_path = 'train.csv'        # labelled comments
test_data_path = 'test.csv'          # comments to score

In [5]:
# Hyper-parameters shared by the tokenizer, the embedding matrix and the model.
embed_size = 50         # length of each word vector
max_features = 20_000   # number of unique words to use
maxlen = 100            # maximum number of words kept per comment

In [6]:
# Read both CSV splits, then print the first rows of each as a quick sanity check.
train, test = map(pd.read_csv, (train_data_path, test_data_path))
print(train.head(), test.head(), sep='\n')

                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  
                 id                                       comment_text
0  00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll...
1  0000247867823ef7  ==

In [7]:
# List the test comments that begin with a run of at least 30 exclamation marks
# (`str.match` only matches at the start of each string).
test.loc[test['comment_text'].str.match('!{30,}'), 'comment_text'].tolist()

['!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! \n\n HAPPY NOW GOT SOURCES SO SUCK A DICK BITCH!!!!!!!!!!!!!!!!!!!!!!',
 '!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! IDIOT FOUND  DELETING IDIOT... DONE!']

In [10]:
# Target columns: one binary label per toxicity category.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

# Raw comment text of each split as an array; missing comments become the
# placeholder token "_na_" so the tokenizer never sees NaN.
list_sentences_train, list_sentences_test = (
    split["comment_text"].fillna("_na_").values for split in (train, test)
)

In [11]:
# `Tokenizer` lives in keras.preprocessing.text and must be imported in the
# notebook's import cell; without that import this cell raises NameError on a
# fresh kernel.
#
# Fit the vocabulary on the training comments only; `num_words` caps how many
# distinct words are kept when texts are converted to index sequences.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Convert every comment into a list of integer word indices.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

# Pad / truncate each sequence to exactly `maxlen` entries so both splits become
# rectangular integer matrices the model can consume.
X_train = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_test = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [12]:
# Display the padded training matrix produced by pad_sequences
# (NumPy abbreviates large arrays, so only the corners are shown).
X_train

array([[    0,     0,     0, ...,  4583,  2273,   985],
       [    0,     0,     0, ...,   589,  8377,   182],
       [    0,     0,     0, ...,     1,   737,   468],
       ...,
       [    0,     0,     0, ...,  3509, 13675,  4528],
       [    0,     0,     0, ...,   151,    34,    11],
       [    0,     0,     0, ...,  1627,  2056,    88]], dtype=int32)

In [13]:
def get_coefs(word, *arr):
    """Turn the tokens of one embedding-file line into a ``(word, vector)`` pair.

    Args:
        word: First token of the line, returned unchanged.
        *arr: Remaining tokens (the vector components, typically still strings).

    Returns:
        A tuple ``(word, vector)`` where ``vector`` is a float32 NumPy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

# Parse the embedding text file: each line is split on whitespace into a word
# followed by its vector components, and stored as word -> float32 vector.
with open(embedding_path, 'r', encoding='utf-8') as f:
    embeddings_index = {
        token: vector
        for token, vector in (get_coefs(*line.strip().split()) for line in f)
    }

In [14]:
# Show the first two (word, vector) entries without copying the whole index.
[entry for _pos, entry in zip(range(2), embeddings_index.items())]

[('the',
  array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
         -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
          2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
          1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
         -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
         -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
          4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
          7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
         -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
          1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
        dtype=float32)),
 (',', array([ 0.013441,  0.23682 , -0.16899 ,  0.40951 ,  0.63812 ,  0.47709 ,
         -0.42852 , -0.55641 , -0.364   , -0.23938 ,  0.13001 , -0.063734,
         -0.39575 , -0.48162 ,  0.23291 ,  0.090201, -0.13324 ,  0.078

In [15]:
# Stack every pre-trained vector into one 2-D array to measure the overall
# mean / standard deviation of the embedding values. These statistics are used
# to draw random vectors for words that have no pre-trained embedding.
# np.stack needs a real sequence: passing the dict view directly is deprecated
# in NumPy and rejected by newer releases, hence the explicit list().
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
emb_mean, emb_std

(0.020940498, 0.6441043)

In [16]:
word_index = tokenizer.word_index

# Word indices start at 1 (index 0 is the padding value), so the matrix needs
# `len(word_index) + 1` rows when the vocabulary is smaller than the cap.
# The previous `min(max_features, len(word_index))` left the highest index
# without a row in that case, raising IndexError below and undersizing the
# Embedding layer. With a vocabulary of at least `max_features` words the
# result is unchanged (`nb_words == max_features`).
nb_words = min(max_features, len(word_index) + 1)

# Start from random vectors drawn with the same mean / std as the pre-trained
# embeddings; words missing from the embedding file keep this initialisation.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

for word, i in word_index.items():
    # Skip (rather than stop at) out-of-range indices so correctness does not
    # depend on the iteration order of `word_index`.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [17]:
# Inspect the first two rows of the embedding matrix.
embedding_matrix[:2]

array([[ 8.01680741e-02,  5.91824793e-01, -6.79126240e-01,
        -4.42072324e-01,  1.26020857e+00,  1.06595895e-01,
         7.87426905e-01, -2.90026880e-01, -8.65713030e-01,
        -3.28408097e-01,  7.78407805e-01, -4.28119494e-01,
         2.01601673e-01, -1.47451229e+00, -7.30224845e-01,
         8.09387699e-01, -7.69374261e-01, -2.69213554e-01,
         1.46790529e-01,  4.75291641e-01, -6.07028302e-01,
        -8.77028475e-01,  9.14019476e-02, -2.54890856e-01,
         7.18932097e-02, -1.57073547e+00, -5.28505600e-01,
         5.43274271e-01,  3.93683655e-02,  9.94280273e-01,
        -1.61606144e-01, -8.37602799e-01,  8.57207336e-01,
         1.54765813e-01, -6.12322005e-01,  3.68125386e-01,
        -9.96883544e-01, -3.13112456e-01, -1.89330683e+00,
        -6.20428237e-02, -2.65378913e-01, -2.44561596e+00,
        -2.40054897e-01, -3.71249938e-01,  4.23471990e-01,
        -1.98272462e-01, -8.23704367e-02,  1.08898483e+00,
        -8.27122287e-01, -3.80705907e-01],
       [ 4.18

In [18]:
# Input: one padded sequence of `maxlen` word indices per comment.
inp = Input(shape=(maxlen, ))
# The embedding's input size is `nb_words` (the row count of `embedding_matrix`),
# not `max_features`, so the layer still matches the matrix when the vocabulary
# is smaller than the cap. The weights start from the matrix built above.
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
# Bidirectional LSTM that returns the full output sequence, which is then
# reduced over the time axis by global max pooling.
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
# Six independent sigmoid outputs, one per entry of `list_classes`.
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
# Use the documented `Adam` class: the lowercase `optimizers.adam` is an
# undocumented alias that later Keras releases no longer provide.
optimizer_adam = optimizers.Adam(lr=0.01)
model.compile(loss='binary_crossentropy', optimizer=optimizer_adam, metrics=['accuracy'])

In [19]:
# Train for two epochs, holding out 10% of the training rows for validation.
# The trailing semicolon suppresses the returned History object's repr.
model.fit(
    x=X_train,
    y=y,
    batch_size=32,
    epochs=2,
    validation_split=0.1,
);

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2


In [21]:
# Persist the trained model to an HDF5 file in the working directory.
model.save(filepath='keras_001.h5')

In [22]:
# Score the padded test comments in large batches; `verbose=1` shows progress.
y_test = model.predict(
    x=[X_test],
    batch_size=1024,
    verbose=1,
)



In [23]:
# File name of the submission written by the next cell.
csv_name = "keras_001_lr.csv"

In [26]:
# Use test_labels.csv as the submission template, overwrite its label columns
# with the predictions, and write the result without the DataFrame index.
# NOTE(review): assumes the template rows are in the same order as `X_test` - confirm.
sample_submission = pd.read_csv('test_labels.csv')
sample_submission[list_classes] = y_test
sample_submission.to_csv(path_or_buf=csv_name, index=False)

In [27]:
# Preview the first five rows of the submission frame.
sample_submission.head(5)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.990604,0.3382859,0.93011,0.01121102,0.755626,0.04344
1,0000247867823ef7,0.001634,1.047053e-09,5.2e-05,7.379389e-09,5e-06,4e-06
2,00013b17ad220c46,0.003418,2.361733e-08,0.000516,4.202543e-07,6.1e-05,8e-06
3,00017563c3f7919a,0.002699,1.361869e-09,0.000103,4.268088e-08,1.5e-05,5e-06
4,00017695ad8997eb,0.027865,1.8364e-07,0.000695,3.630687e-06,0.00063,0.000132
