In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model

In [2]:
EMBEDDING_FILE='./data/multilingual_embeddings.ru'
TRAIN_DATA_FILE='./data/train'
TEST_DATA_FILE='./data/test'
DATA = './data/data'

In [3]:
embed_size = 300 
max_features = 20000 
maxlen = 100

In [4]:
train = pd.read_table(
    TRAIN_DATA_FILE, 
    delimiter='\t', 
    error_bad_lines=False, 
    header = None, 
    names = ['id', 'label', 'comment']
)


In [5]:
train.head()

Unnamed: 0,id,label,comment
0,41127,__label__THREAT,–¥–≤–æ—Ä–Ω–∏–∫–∞ –Ω–∞–¥–æ —Ç–æ–∂–µ —É–Ω–∏—á—Ç–æ–∂–∏—Ç—å!
1,6812,__label__NORMAL,"–º–æ—è —Å—Ç–∞—Ä—à–∞—è –Ω–µ–¥–µ–ª—é —à–∏–ø–µ–ª–∞, –Ω–µ –ø—Ä–∏–Ω–∏–º–∞–ª–∞ –ø–æ–¥–∫–∏–¥..."
2,6256,__label__NORMAL,–ø–æ–ª–Ω–æ—Å—Ç—å—é —Å –≤–∞–º–∏ —Å–æ–≥–ª–∞—Å–Ω–∞!
3,189636,__label__NORMAL,"—Ö–æ—Ç—å –Ω–æ–≥—É –≤–≤–µ—Ä—Ö, –Ω–∏—á–µ–≥–æ –Ω–µ –∏–∑–º–µ–Ω–∏—Ç—Å—è"
4,99053,__label__NORMAL,–∞ —á—Ç–æ –∑–Ω–∞—á–∏—Ç - –ª–µ–≤–æ–≥–æ —Ä–µ–±–µ–Ω–∫–∞?


In [6]:
# getting rid of '__label__'
def clear_label(label):
    return label.split('__label__')[1]
    
train['label'] = train['label'].apply(clear_label)
train = pd.concat([train.drop('label', axis=1), pd.get_dummies(train['label'])], axis=1)
train.columns= train.columns.str.strip().str.lower()
train  = train[['id', 'comment', 'normal', 'insult','obscenity', 'threat']]

In [7]:
train.head()

Unnamed: 0,id,comment,normal,insult,obscenity,threat
0,41127,–¥–≤–æ—Ä–Ω–∏–∫–∞ –Ω–∞–¥–æ —Ç–æ–∂–µ —É–Ω–∏—á—Ç–æ–∂–∏—Ç—å!,0,0,0,1
1,6812,"–º–æ—è —Å—Ç–∞—Ä—à–∞—è –Ω–µ–¥–µ–ª—é —à–∏–ø–µ–ª–∞, –Ω–µ –ø—Ä–∏–Ω–∏–º–∞–ª–∞ –ø–æ–¥–∫–∏–¥...",1,0,0,0
2,6256,–ø–æ–ª–Ω–æ—Å—Ç—å—é —Å –≤–∞–º–∏ —Å–æ–≥–ª–∞—Å–Ω–∞!,1,0,0,0
3,189636,"—Ö–æ—Ç—å –Ω–æ–≥—É –≤–≤–µ—Ä—Ö, –Ω–∏—á–µ–≥–æ –Ω–µ –∏–∑–º–µ–Ω–∏—Ç—Å—è",1,0,0,0
4,99053,–∞ —á—Ç–æ –∑–Ω–∞—á–∏—Ç - –ª–µ–≤–æ–≥–æ —Ä–µ–±–µ–Ω–∫–∞?,1,0,0,0


In [8]:
test = pd.read_table(
    TEST_DATA_FILE, 
    header = None, 
    names = ['id']
)
test

Unnamed: 0,id
0,167315
1,224546
2,241309
3,31170
4,173358
...,...
99510,192320
99511,6646
99512,215218
99513,139806


In [9]:
data = pd.read_table(
    DATA, 
    delimiter='\t', 
    error_bad_lines=False, 
    header = None, 
    names = ['id', 'comment']
)
data


Unnamed: 0,id,comment
0,41127,–¥–≤–æ—Ä–Ω–∏–∫–∞ –Ω–∞–¥–æ —Ç–æ–∂–µ —É–Ω–∏—á—Ç–æ–∂–∏—Ç—å!
1,6812,"–º–æ—è —Å—Ç–∞—Ä—à–∞—è –Ω–µ–¥–µ–ª—é —à–∏–ø–µ–ª–∞, –Ω–µ –ø—Ä–∏–Ω–∏–º–∞–ª–∞ –ø–æ–¥–∫–∏–¥..."
2,6256,–ø–æ–ª–Ω–æ—Å—Ç—å—é —Å –≤–∞–º–∏ —Å–æ–≥–ª–∞—Å–Ω–∞!
3,189636,"—Ö–æ—Ç—å –Ω–æ–≥—É –≤–≤–µ—Ä—Ö, –Ω–∏—á–µ–≥–æ –Ω–µ –∏–∑–º–µ–Ω–∏—Ç—Å—è"
4,99053,–∞ —á—Ç–æ –∑–Ω–∞—á–∏—Ç - –ª–µ–≤–æ–≥–æ —Ä–µ–±–µ–Ω–∫–∞?
...,...,...
248285,192320,"–Ω–µ —Ç—Ä–æ–≥–∞–π –ø–æ–ø—É,—ç—Ç–æ –Ω–µ —Ç–≤–æ—ë!"
248286,6646,–ø–æ–ø–æ–ª–Ω–µ–Ω–∏–µ –æ—á–µ—Ä–µ–¥–Ω—ã—Ö –ø–∏–¥–æ—Ä–∞—Å–æ–≤
248287,215218,"–µ—Å–ª–∏ –≥—Ä—ë–±–∞–Ω—ã–µ –¥–µ–ø—É—Ç–∞—Ç—ã —Å—è–¥—É—Ç –Ω–∞ –Ω–∞—à–∏ –ø–µ–Ω—Å–∏–∏, –æ..."
248288,139806,–≤–æ–∂–¥—å –∞–ø–∞—á–∏.


In [10]:
test = pd.merge(test, data, how='inner', on=['id'])
test.isna().all().all()

False

In [11]:
list_sentences_train = train.comment.values
list_sentences_test = test.comment.values
list_classes = ['normal', 'insult', 'obscenity', 'threat']
y = train[list_classes].values


In [12]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_train = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_test = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [13]:
def get_coefs(word,*arr): 
    return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))


In [14]:
all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()

  if (await self.run_code(code, result,  async_=asy)):


In [15]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size)) # 50

for word, i in word_index.items():
    if i >= max_features: 
        continue
    embedding_vector = embeddings_index.get(word) 
    if embedding_vector is not None: 
        embedding_matrix[i] = embedding_vector

In [16]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(4, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [17]:
model.fit(X_train, y, batch_size=32, epochs=2, validation_split=0.1)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x14d2ac048>

In [18]:
model.save(os.getcwd())

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: /Users/alexeynikitin/Desktop/Python_Projects/OK_ML_Cup_Toxic_Comments/assets


In [19]:
y_test = model.predict(X_test, batch_size=1024, verbose=1)



In [31]:
result = pd.concat([test, pd.DataFrame(data=y_test, columns=list_classes)], axis=1)
result.drop(columns=['comment'])

Unnamed: 0,id,normal,insult,obscenity,threat
0,167315,0.999741,0.000121,0.000003,0.000004
1,224546,0.954923,0.033137,0.000666,0.002782
2,241309,0.000059,0.999799,0.000006,0.000011
3,31170,0.000498,0.996552,0.000225,0.000570
4,173358,0.989832,0.007829,0.000168,0.000175
...,...,...,...,...,...
99510,192320,0.401002,0.357163,0.162442,0.003799
99511,6646,0.000069,0.999820,0.000027,0.000005
99512,215218,0.985269,0.012497,0.000067,0.000254
99513,139806,0.598991,0.439473,0.001447,0.008462


In [32]:
result = result.to_numpy()
result

array([[167315, '–∫–∞–∫–∞—è –ø—Ä–µ–ª–µ—Å—Ç—å!!!üòç', 0.99974125623703,
        0.00012110308307455853, 3.02076045954891e-06,
        3.993211066699587e-06],
       [224546, '–∫–∞–∞–ª –∫–∞–∫–æ–π –Ω–µ —Å –∫—Ä–æ–≤—å—é?', 0.9549233913421631,
        0.03313741087913513, 0.0006659328937530518, 0.002781778573989868],
       [241309, '–≥–Ω–æ–π–Ω—ã–µ –ø–∏–¥–æ—Ä—ã –∞–ª–ª—ã –æ–Ω–∏', 5.875798160559498e-05,
        0.9997990131378174, 5.837040134792915e-06,
        1.1158575944136828e-05],
       ...,
       [215218,
        '–µ—Å–ª–∏ –≥—Ä—ë–±–∞–Ω—ã–µ –¥–µ–ø—É—Ç–∞—Ç—ã —Å—è–¥—É—Ç –Ω–∞ –Ω–∞—à–∏ –ø–µ–Ω—Å–∏–∏, –æ–Ω–∏ —Å—Ä–∞–∑—É –∂–µ —Å–≤–æ–∏ —à—Ç–∞–Ω—ã –ø–æ—Ç–µ—Ä—è—é—Ç, –±—Ä—ã–ª–∞ –ø–æ –æ–±–≤–∏—Å–Ω—É—Ç, –Ω–∞—á–Ω—É—Ç –±–æ–ª–µ—Ç—å –æ—Ç –Ω–µ–¥–æ—Å—ã–ø–∞ . –Ω–∏–∫–æ–º—É –µ—â—ë –Ω–∏ —á–µ–≥–æ –Ω–µ –ø–æ–¥–Ω—è–ª–∏, –∞ —É–∂ –º–µ—Å—è—Ü —Ç—Ä—è—Å—É—Ç, —á—Ç–æ –ø–æ–¥–Ω–∏–º—É—Ç –∞–∂ –Ω–∞ 236—Ä.–ø–æ–∑–æ—Ä–Ω–∏–∫–∏ –Ω–∞ –≤–µ—Å—å –º–∏—Ä.',
        0.9852688312530518, 0.012497305870056152, 6.726165884174407e-05,
        