In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


In [2]:
# Every input file is resolved relative to this directory (note the trailing slash).
path = 'input/'

# Pretrained GloVe vectors plus the train / test CSV files.
EMBEDDING_FILE = path + 'glove6b50d/glove.6B.50d.txt'
TRAIN_DATA_FILE = path + 'train.csv'
TEST_DATA_FILE = path + 'test.csv'

In [3]:
# Dimensionality of each word vector.
embed_size = 50

# Number of unique words to use (i.e. number of rows in the embedding matrix).
max_features = 20_000

# Maximum number of words of a comment to use.
maxlen = 100

In [4]:
# Label columns read from the training CSV; one binary target per column.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Load both splits from disk.
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

# Label matrix with one column per entry of `list_classes`.
y = train[list_classes].values

# Raw comment text; missing comments are replaced by the "_na_" placeholder.
list_sentences_train = train["comment_text"].fillna("_na_").values
list_sentences_test = test["comment_text"].fillna("_na_").values


In [5]:
# Build the vocabulary from the training comments only; `num_words` limits
# how many words the tokenizer keeps when converting text to index sequences.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Training split: text -> word-index sequences -> arrays of length `maxlen`.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)

# Test split goes through the same fitted tokenizer and the same padding.
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [8]:
def get_coefs(word, *arr):
    """Pair a word with its coefficients converted to a float32 array.

    Parameters
    ----------
    word : the token itself (first field of an embedding-file line).
    *arr : the remaining fields, each convertible to a float.

    Returns
    -------
    tuple of ``(word, numpy.ndarray of dtype float32)``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Map each word of the embedding file to its float32 vector.
# The file is opened in a `with` block so the handle is closed as soon as the
# dictionary has been built (the original left it open for the life of the kernel).
with open(EMBEDDING_FILE, encoding="utf8") as embedding_file:
    embeddings_index = dict(
        get_coefs(*line.strip().split())
        for line in embedding_file
        if line.strip()  # a blank line would call get_coefs() with no word
    )

In [9]:
# Stack every pretrained vector into one 2-D array. `np.stack` requires a real
# sequence (list/tuple); passing the dict view directly is rejected by newer
# NumPy releases, so materialise it as a list first.
all_embs = np.stack(list(embeddings_index.values()))

# Mean and standard deviation over all entries of the stacked array; these are
# used to draw random vectors for words missing from the pretrained file.
emb_mean, emb_std = all_embs.mean(), all_embs.std()
emb_mean, emb_std

(0.020940498, 0.6441043)

In [10]:
word_index = tokenizer.word_index

# Keras word indices start at 1 (index 0 is reserved), so a vocabulary of V
# words needs V + 1 rows. The row count is still capped at `max_features`.
nb_words = min(max_features, len(word_index) + 1)

# Start from random vectors drawn with the same mean / std as the pretrained
# embeddings, so words without a pretrained vector still get a plausible row.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

for word, i in word_index.items():
    # Guard against the actual matrix height, not `max_features`: the two
    # differ when the vocabulary is smaller than `max_features`, and the old
    # check then allowed an out-of-bounds row index.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [11]:
# Network: pretrained-embedding lookup -> bidirectional LSTM -> max-pool over
# the sequence axis -> small dense head with one sigmoid output per label.
inp = Input(shape=(maxlen,))

# Take the vocabulary size from the weight matrix itself so that `input_dim`
# always matches the pretrained weights, even when the matrix has fewer than
# `max_features` rows.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(inp)

# return_sequences=True keeps the per-timestep outputs for the pooling layer.
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)

# Six independent sigmoid outputs, trained with binary cross-entropy.
x = Dense(6, activation="sigmoid")(x)

model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [12]:
# Train on the full padded training set for two epochs. No hold-out split is
# used here; the validation_split=0.1 option is intentionally left disabled.
model.fit(
    X_t,
    y,
    batch_size=32,
    epochs=2,
)

Epoch 1/2


 6528/95851 [=>............................] - ETA: 2:12:09 - loss: 0.6929 - acc: 0.56 - ETA: 1:10:12 - loss: 0.6736 - acc: 0.59 - ETA: 49:37 - loss: 0.6599 - acc: 0.6233 - ETA: 39:05 - loss: 0.6477 - acc: 0.64 - ETA: 32:53 - loss: 0.6326 - acc: 0.67 - ETA: 28:51 - loss: 0.6221 - acc: 0.69 - ETA: 26:00 - loss: 0.6098 - acc: 0.70 - ETA: 23:53 - loss: 0.5960 - acc: 0.72 - ETA: 22:08 - loss: 0.5819 - acc: 0.73 - ETA: 20:44 - loss: 0.5686 - acc: 0.75 - ETA: 19:35 - loss: 0.5582 - acc: 0.76 - ETA: 18:33 - loss: 0.5462 - acc: 0.77 - ETA: 17:49 - loss: 0.5328 - acc: 0.78 - ETA: 17:07 - loss: 0.5197 - acc: 0.79 - ETA: 16:28 - loss: 0.5072 - acc: 0.80 - ETA: 15:57 - loss: 0.4955 - acc: 0.81 - ETA: 15:31 - loss: 0.4824 - acc: 0.82 - ETA: 15:05 - loss: 0.4700 - acc: 0.82 - ETA: 14:45 - loss: 0.4604 - acc: 0.83 - ETA: 14:27 - loss: 0.4473 - acc: 0.84 - ETA: 14:10 - loss: 0.4388 - acc: 0.84 - ETA: 13:55 - loss: 0.4286 - acc: 0.85 - ETA: 13:39 - loss: 0.4179 - acc: 0.86 - ETA: 13:25 - loss: 0.4113 -

13056/95851 [===>..........................] - ETA: 8:32 - loss: 0.1573 - acc: 0.953 - ETA: 8:31 - loss: 0.1568 - acc: 0.953 - ETA: 8:31 - loss: 0.1564 - acc: 0.953 - ETA: 8:31 - loss: 0.1561 - acc: 0.953 - ETA: 8:30 - loss: 0.1557 - acc: 0.953 - ETA: 8:30 - loss: 0.1553 - acc: 0.953 - ETA: 8:30 - loss: 0.1550 - acc: 0.953 - ETA: 8:29 - loss: 0.1546 - acc: 0.954 - ETA: 8:29 - loss: 0.1543 - acc: 0.954 - ETA: 8:29 - loss: 0.1539 - acc: 0.954 - ETA: 8:28 - loss: 0.1535 - acc: 0.954 - ETA: 8:28 - loss: 0.1533 - acc: 0.954 - ETA: 8:27 - loss: 0.1530 - acc: 0.954 - ETA: 8:27 - loss: 0.1526 - acc: 0.954 - ETA: 8:27 - loss: 0.1522 - acc: 0.954 - ETA: 8:26 - loss: 0.1521 - acc: 0.954 - ETA: 8:26 - loss: 0.1520 - acc: 0.954 - ETA: 8:26 - loss: 0.1516 - acc: 0.954 - ETA: 8:25 - loss: 0.1517 - acc: 0.954 - ETA: 8:25 - loss: 0.1515 - acc: 0.954 - ETA: 8:25 - loss: 0.1515 - acc: 0.954 - ETA: 8:25 - loss: 0.1513 - acc: 0.954 - ETA: 8:24 - loss: 0.1510 - acc: 0.954 - ETA: 8:24 - loss: 0.1507 - acc: 0

19584/95851 [=====>........................] - ETA: 7:36 - loss: 0.1163 - acc: 0.964 - ETA: 7:36 - loss: 0.1161 - acc: 0.964 - ETA: 7:35 - loss: 0.1159 - acc: 0.964 - ETA: 7:35 - loss: 0.1158 - acc: 0.964 - ETA: 7:35 - loss: 0.1157 - acc: 0.964 - ETA: 7:34 - loss: 0.1155 - acc: 0.964 - ETA: 7:34 - loss: 0.1158 - acc: 0.964 - ETA: 7:34 - loss: 0.1157 - acc: 0.964 - ETA: 7:34 - loss: 0.1156 - acc: 0.964 - ETA: 7:33 - loss: 0.1154 - acc: 0.964 - ETA: 7:33 - loss: 0.1153 - acc: 0.964 - ETA: 7:33 - loss: 0.1152 - acc: 0.964 - ETA: 7:32 - loss: 0.1150 - acc: 0.964 - ETA: 7:32 - loss: 0.1148 - acc: 0.964 - ETA: 7:32 - loss: 0.1146 - acc: 0.964 - ETA: 7:32 - loss: 0.1144 - acc: 0.964 - ETA: 7:31 - loss: 0.1143 - acc: 0.964 - ETA: 7:31 - loss: 0.1142 - acc: 0.964 - ETA: 7:31 - loss: 0.1141 - acc: 0.964 - ETA: 7:30 - loss: 0.1139 - acc: 0.964 - ETA: 7:30 - loss: 0.1137 - acc: 0.965 - ETA: 7:30 - loss: 0.1135 - acc: 0.965 - ETA: 7:30 - loss: 0.1133 - acc: 0.965 - ETA: 7:29 - loss: 0.1132 - acc: 0























Epoch 2/2


 6528/95851 [=>............................] - ETA: 7:55 - loss: 0.0509 - acc: 0.974 - ETA: 8:01 - loss: 0.0369 - acc: 0.984 - ETA: 7:59 - loss: 0.0281 - acc: 0.989 - ETA: 7:58 - loss: 0.0308 - acc: 0.989 - ETA: 7:58 - loss: 0.0332 - acc: 0.987 - ETA: 7:57 - loss: 0.0413 - acc: 0.985 - ETA: 7:55 - loss: 0.0390 - acc: 0.985 - ETA: 7:54 - loss: 0.0452 - acc: 0.983 - ETA: 7:54 - loss: 0.0429 - acc: 0.984 - ETA: 7:55 - loss: 0.0432 - acc: 0.983 - ETA: 7:56 - loss: 0.0520 - acc: 0.980 - ETA: 7:57 - loss: 0.0479 - acc: 0.981 - ETA: 7:56 - loss: 0.0453 - acc: 0.982 - ETA: 7:56 - loss: 0.0460 - acc: 0.982 - ETA: 7:57 - loss: 0.0493 - acc: 0.981 - ETA: 7:59 - loss: 0.0474 - acc: 0.982 - ETA: 7:58 - loss: 0.0513 - acc: 0.981 - ETA: 7:57 - loss: 0.0504 - acc: 0.981 - ETA: 7:57 - loss: 0.0496 - acc: 0.982 - ETA: 7:56 - loss: 0.0513 - acc: 0.981 - ETA: 7:56 - loss: 0.0506 - acc: 0.981 - ETA: 7:55 - loss: 0.0496 - acc: 0.981 - ETA: 7:55 - loss: 0.0506 - acc: 0.981 - ETA: 7:54 - loss: 0.0523 - acc: 0

13056/95851 [===>..........................] - ETA: 7:36 - loss: 0.0489 - acc: 0.981 - ETA: 7:36 - loss: 0.0488 - acc: 0.981 - ETA: 7:36 - loss: 0.0488 - acc: 0.981 - ETA: 7:35 - loss: 0.0486 - acc: 0.981 - ETA: 7:35 - loss: 0.0486 - acc: 0.981 - ETA: 7:35 - loss: 0.0486 - acc: 0.981 - ETA: 7:34 - loss: 0.0486 - acc: 0.981 - ETA: 7:34 - loss: 0.0485 - acc: 0.981 - ETA: 7:34 - loss: 0.0487 - acc: 0.981 - ETA: 7:34 - loss: 0.0487 - acc: 0.981 - ETA: 7:34 - loss: 0.0490 - acc: 0.981 - ETA: 7:33 - loss: 0.0491 - acc: 0.981 - ETA: 7:33 - loss: 0.0489 - acc: 0.981 - ETA: 7:33 - loss: 0.0493 - acc: 0.981 - ETA: 7:32 - loss: 0.0493 - acc: 0.981 - ETA: 7:32 - loss: 0.0492 - acc: 0.981 - ETA: 7:31 - loss: 0.0493 - acc: 0.981 - ETA: 7:31 - loss: 0.0493 - acc: 0.981 - ETA: 7:31 - loss: 0.0494 - acc: 0.981 - ETA: 7:31 - loss: 0.0494 - acc: 0.981 - ETA: 7:30 - loss: 0.0494 - acc: 0.981 - ETA: 7:30 - loss: 0.0493 - acc: 0.981 - ETA: 7:30 - loss: 0.0491 - acc: 0.981 - ETA: 7:29 - loss: 0.0491 - acc: 0

19584/95851 [=====>........................] - ETA: 6:47 - loss: 0.0478 - acc: 0.982 - ETA: 6:47 - loss: 0.0478 - acc: 0.982 - ETA: 6:47 - loss: 0.0478 - acc: 0.982 - ETA: 6:47 - loss: 0.0478 - acc: 0.982 - ETA: 6:47 - loss: 0.0477 - acc: 0.982 - ETA: 6:46 - loss: 0.0478 - acc: 0.982 - ETA: 6:46 - loss: 0.0477 - acc: 0.982 - ETA: 6:46 - loss: 0.0478 - acc: 0.982 - ETA: 6:46 - loss: 0.0479 - acc: 0.982 - ETA: 6:46 - loss: 0.0478 - acc: 0.982 - ETA: 6:46 - loss: 0.0479 - acc: 0.982 - ETA: 6:46 - loss: 0.0480 - acc: 0.982 - ETA: 6:45 - loss: 0.0479 - acc: 0.982 - ETA: 6:45 - loss: 0.0479 - acc: 0.982 - ETA: 6:45 - loss: 0.0481 - acc: 0.982 - ETA: 6:45 - loss: 0.0481 - acc: 0.982 - ETA: 6:45 - loss: 0.0481 - acc: 0.982 - ETA: 6:44 - loss: 0.0480 - acc: 0.982 - ETA: 6:44 - loss: 0.0479 - acc: 0.982 - ETA: 6:44 - loss: 0.0478 - acc: 0.982 - ETA: 6:44 - loss: 0.0478 - acc: 0.982 - ETA: 6:44 - loss: 0.0477 - acc: 0.982 - ETA: 6:43 - loss: 0.0477 - acc: 0.982 - ETA: 6:43 - loss: 0.0476 - acc: 0

























<keras.callbacks.History at 0x7b16a49c50>

In [13]:
# Predict one value per label column for every padded test comment.
y_test = model.predict([X_te], batch_size=1024, verbose=1)

# Use the sample submission as a template: overwrite its label columns with
# the predictions and write the result to disk without the index column.
sample_submission = pd.read_csv(path + 'sample_submission.csv')
sample_submission[list_classes] = y_test
sample_submission.to_csv('submission.csv', index=False)

