In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import keras
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
import tensorflow as tf
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
import pickle
import string
from num2words import num2words

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Read the `text` column of both phrase files as NumPy arrays; missing entries
# are replaced by the literal placeholder string "fillna".
positive = pd.read_csv('../toxicity/pos_phrases.csv', encoding='utf8')['text'].fillna("fillna").values
negative = pd.read_csv('../toxicity/neg_phrases.csv', encoding='utf8')['text'].fillna("fillna").values

# Column-vector labels: 0 for every positive phrase, 1 for every negative one.
y_pos = np.zeros((len(positive), 1))
y_neg = np.ones((len(negative), 1))

# Features and labels are stacked in the same order (positive first, then negative).
X = np.concatenate([positive, negative])
y = np.concatenate([y_pos, y_neg])

In [3]:
# Load the second batch of phrases, one phrase per line. Trailing newlines are
# kept here, exactly as readlines() returns them.
# The encoding is spelled out so Cyrillic text decodes identically on every
# platform; utf8 mirrors the CSV reads in the previous cell -- TODO confirm that
# X_f__1.txt really is UTF-8.
with open('../toxicity/X_f__1.txt', 'r', encoding='utf8') as f:
    X1 = np.array(f.readlines())

# The labels for this batch are obtained by tiling y, which is only meaningful
# when the file holds exactly one line per already-loaded label (in the same
# order). Failing fast here also stops an accidental second run of this cell
# from silently misaligning X and y.
if len(X1) != len(y):
    raise ValueError(
        'X_f__1.txt has {} lines but {} labels are loaded; '
        'labels cannot be reused by tiling'.format(len(X1), len(y)))

X = np.concatenate((X, X1))
y = np.tile(y, (2, 1))

In [4]:
import re
# Normalise every phrase: lower-case it, replace each character that is not a
# Russian letter or a space with a space, collapse runs of spaces and trim.
alph = set(' ёйцукенгшщзхъфывапролджэячсмитьбю')

phrases = []
# The loop variable is deliberately not called `string`: that name is the
# standard-library module imported at the top of the notebook and must not be
# rebound to a phrase.
for phrase in X:
    spaced = ''.join(ch if ch in alph else ' ' for ch in phrase.lower())
    phrases.append(re.sub(' +', ' ', spaced).strip())
X = np.array(phrases)


In [5]:
import string
from num2words import num2words

def clear_format(text):
    """Lower-case ``text``, strip punctuation and keep only word/number tokens.

    Punctuation is removed first, so a token such as ``"hello,"`` or
    ``"twenty-one"`` survives as ``"hello"`` / ``"twentyone"``. After that,
    only whitespace-separated tokens that are purely alphabetic or purely
    digits are kept; everything else (mixed alphanumerics, tokens containing
    characters outside the punctuation list, ...) is dropped.

    Parameters
    ----------
    text : str
        Raw phrase.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (``""`` if none survive).
    """
    # The backslash is escaped explicitly (a bare "\]" is an invalid escape
    # sequence); the set of characters removed is unchanged.
    punctuation = """!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~¹³²⓶"""
    stripped = text.lower().translate(str.maketrans('', '', punctuation))
    # isdigit/isalpha must be *called*: a bare `tok.isdigit` is a bound method
    # and therefore always truthy, which would turn this filter into a no-op.
    return ' '.join(tok for tok in stripped.split()
                    if tok.isdigit() or tok.isalpha())

    
def num_to_words(text):
    """Spell out the short all-digit tokens of ``text``.

    The text is split on whitespace. A token made only of digits is replaced
    by ``str(num2words(token))`` when it has at most five characters and by an
    empty string when it is longer; every other token is kept unchanged. The
    tokens are then joined with single spaces, so a dropped long number leaves
    two consecutive spaces behind.
    """
    def convert(token):
        if not token.isdigit():
            return token
        return '' if len(token) > 5 else str(num2words(token))

    return ' '.join(convert(token) for token in text.split())

In [6]:
import re

def preprocess_data(X, y, max_features=30000, maxlen=100, train_frac=0.9):
    """Split, clean and tokenize the corpus.

    Parameters
    ----------
    X : numpy.ndarray
        Raw phrases. Indexed with a boolean mask, so a plain list will not work.
    y : numpy.ndarray
        Labels aligned row-by-row with ``X``.
    max_features : int, optional
        ``num_words`` handed to the Keras tokenizer.
    maxlen : int, optional
        ``maxlen`` handed to ``pad_sequences``.
    train_frac : float, optional
        Probability with which each row is assigned to the training split.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test, tokenizer)``: the padded index
        sequences and labels of both splits plus the fitted tokenizer.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length.
    """
    if len(X) != len(y):
        raise ValueError(
            'X has {} rows but y has {}'.format(len(X), len(y)))

    # Every row is sent to the training split independently with probability
    # train_frac, so the split sizes are only approximately proportional.
    # The draw comes from NumPy's global RNG.
    msk = np.random.rand(len(y)) < train_frac
    y_train, y_test = y[msk], y[~msk]

    def _clean(phrase):
        # clear_format runs both before and after num_to_words, so the text
        # produced by the number conversion is normalised as well.
        return clear_format(num_to_words(clear_format(phrase)))

    X_train = [_clean(phrase) for phrase in X[msk]]
    X_test = [_clean(phrase) for phrase in X[~msk]]

    tokenizer = text.Tokenizer(num_words=max_features)
    # NOTE(review): the vocabulary is fitted on the training AND the held-out
    # texts. Fit on X_train alone if the held-out split must stay unseen.
    tokenizer.fit_on_texts(X_train + X_test)

    x_train = sequence.pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=maxlen)
    x_test = sequence.pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=maxlen)

    return x_train, y_train, x_test, y_test, tokenizer

In [7]:
%%time
# Split, clean and tokenize the whole corpus; the fitted tokenizer is kept
# because later cells read its word_index.
x_train, y_train, x_test, y_test, tokenizer = preprocess_data(X, y)

CPU times: user 39.2 s, sys: 699 ms, total: 39.9 s
Wall time: 39.5 s


In [8]:
# Word -> index mapping held by the fitted tokenizer; its size is printed so the
# vocabulary can be compared with the max_features cap used for the embeddings.
word_index = tokenizer.word_index
print(len(word_index))

203095


In [9]:
%%time
EMBEDDING_FILE = 'cc.ru.300.vec'

max_features = 30000
maxlen = 100
embed_size = 300


def get_coefs(word, *arr):
    """Return ``(word, vector)`` with the remaining fields as a float32 array."""
    return word, np.asarray(arr, dtype='float32')


# Read the pre-trained vectors inside a `with` block so the file handle is
# closed once the dictionary is built.
# Splitting from the right at most embed_size times keeps the word in one piece
# even if it contains a space. Lines that do not carry exactly embed_size values
# are skipped (e.g. a leading "<count> <dim>" header, if the file has one):
# a vector of the wrong width must never reach the matrix below.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as embedding_file:
    for line in embedding_file:
        fields = line.rstrip().rsplit(' ', embed_size)
        if len(fields) != embed_size + 1:
            continue
        word, vector = get_coefs(*fields)
        embeddings_index[word] = vector

# Keras Tokenizer indices start at 1, so a vocabulary of N words needs N + 1
# rows. Without the +1 (and with the guard comparing against max_features
# rather than the real row count) a vocabulary smaller than max_features
# would index one row past the end of the matrix.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_size))
print(embedding_matrix.shape)
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # Words without a pre-trained vector keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

(30000, 300)
CPU times: user 2min 32s, sys: 4.59 s, total: 2min 37s
Wall time: 2min 35s


In [10]:
import numpy as np 
import pandas as pd
from keras.models import Model
from keras.layers import Dense, Embedding, Input, Bidirectional, LSTM, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPool1D, Dropout, concatenate, GlobalAveragePooling1D
from keras.preprocessing import text as keras_text, sequence as keras_seq
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam

def build_model(embed_size = 300):
    """Build and compile the bidirectional-LSTM + Conv1D binary classifier.

    The embedding layer is sized by the module-level ``max_features`` and
    initialised from the module-level ``embedding_matrix``. Its output goes
    through dropout, a bidirectional LSTM and a width-2 convolution; global
    average and max pooling are concatenated and fed to a 64 -> 32 -> 1 dense
    head with a sigmoid output. The model is compiled with binary
    cross-entropy, Adam (learning rate 1e-4) and the accuracy metric.

    A second, smaller dropout/BiLSTM/Conv1D stage existed as commented-out
    code after the convolution; it is not part of the model.
    """
    tokens_in = Input(shape=(None, ))
    embedded = Embedding(input_dim = max_features,
                         output_dim = embed_size, weights=[embedding_matrix])(tokens_in)
    regularised = Dropout(0.5)(embedded)
    recurrent = Bidirectional(
        LSTM(128, return_sequences=True, dropout=0.15, recurrent_dropout=0.15)
    )(regularised)
    features = Conv1D(64, kernel_size=2, padding='valid',
                      kernel_initializer='glorot_uniform')(recurrent)

    # Average pooling is created before max pooling, then both are joined.
    pooled = concatenate([GlobalAveragePooling1D()(features),
                          GlobalMaxPooling1D()(features)])

    # Dense head: each hidden layer is followed by its own dropout.
    hidden = pooled
    for units in (64, 32):
        hidden = Dropout(0.15)(Dense(units, activation='relu')(hidden))
    prediction = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=tokens_in, outputs=prediction)
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr=1e-4),
                  metrics=['accuracy'])
    return model

# Instantiate the bidirectional-LSTM + Conv1D classifier and print its layers.
model = build_model()
model.summary()

# Training hyper-parameters read by the model.fit calls in the cells below.
batch_size = 128 
epochs = 10

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 300)    9000000     input_1[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, None, 300)    0           embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, None, 256)    439296      dropout_1[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (

In [22]:
file_path = "rus_best_weights.h5"

# Only the epoch with the lowest validation loss is written to file_path;
# training stops once val_loss has gone 5 epochs without improving.
checkpoint = ModelCheckpoint(
    file_path,
    monitor='val_loss', mode='min',
    save_best_only=True,
    verbose=1,
)
early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
callbacks_list = [checkpoint, early]

# NOTE(review): the held-out split is used as validation data, so checkpoint
# selection and early stopping both see it.
model.fit(
    x_train, y_train,
    validation_data=(x_test, y_test),
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    callbacks=callbacks_list,
)


Train on 408482 samples, validate on 45186 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.55268, saving model to rus_best_weights.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.55268 to 0.53226, saving model to rus_best_weights.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.53226 to 0.51921, saving model to rus_best_weights.h5
Epoch 4/10

Epoch 00004: val_loss improved from 0.51921 to 0.51248, saving model to rus_best_weights.h5
Epoch 5/10

Epoch 00005: val_loss improved from 0.51248 to 0.50915, saving model to rus_best_weights.h5
Epoch 6/10

Epoch 00006: val_loss improved from 0.50915 to 0.50483, saving model to rus_best_weights.h5
Epoch 7/10

Epoch 00007: val_loss improved from 0.50483 to 0.50200, saving model to rus_best_weights.h5
Epoch 8/10

Epoch 00008: val_loss improved from 0.50200 to 0.49999, saving model to rus_best_weights.h5
Epoch 9/10

Epoch 00009: val_loss did not improve from 0.49999
Epoch 10/10

Epoch 00010: val_loss improved from 0.49999 t

<keras.callbacks.History at 0x7fc5c24aadd8>

In [None]:
# Start from the checkpoint file 'rus_best_weights.h5' and keep training, this
# time selecting the checkpoint by validation accuracy instead of loss.
model.load_weights('rus_best_weights.h5')

file_path = "test-rus_best_weights.h5"
checkpoint = ModelCheckpoint(
    file_path,
    monitor='val_acc', mode='max',
    save_best_only=True,
    verbose=1,
)
early = EarlyStopping(monitor="val_acc", mode="max", patience=5)
callbacks_list = [checkpoint, early]

model.fit(
    x_train, y_train,
    validation_data=(x_test, y_test),
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    callbacks=callbacks_list,
)
# Disabled: final loss/accuracy report on the held-out split.
# score, acc = model.evaluate(x_test, y_test)
# print('Test score:', score)
# print('Test accuracy:', acc)

Train on 408482 samples, validate on 45186 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.76533, saving model to test-rus_best_weights.h5
Epoch 2/10

Epoch 00002: val_acc improved from 0.76533 to 0.76856, saving model to test-rus_best_weights.h5
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.76856
Epoch 4/10

In [19]:
def build_model2(conv_layers = 2,
                 dilation_rates = (0, 2, 4, 6, 8),
                 embed_size = 300):
    """Build and compile a multi-branch dilated-convolution binary classifier.

    One convolutional branch is created per entry of ``dilation_rates``; all
    branches read the same dropped-out embedding output. Each branch is
    max-pooled over time, passed through dropout and concatenated with the
    others before a 64 -> 32 -> 1 dense head with a sigmoid output.

    Parameters
    ----------
    conv_layers : int
        Number of stacked Conv1D layers per branch; layer ``i`` has
        ``16 * 2**i`` filters.
    dilation_rates : iterable of int
        A rate > 0 gives a branch of width-3 convolutions with that dilation;
        a rate of 0 gives a branch of width-1 convolutions. Layer names are
        derived from the rate, so rates must not repeat.
    embed_size : int
        Output dimension of the (randomly initialised) embedding layer.

    Returns
    -------
    The model, compiled with binary cross-entropy, Adam (learning rate 1e-4)
    and the accuracy metric.
    """
    inp = Input(shape=(None, ))
    # NOTE(review): the embedding covers the tokenizer's full word_index even
    # though preprocess_data creates the tokenizer with a num_words cap; if the
    # cap limits the indices that are emitted, most rows are never looked up.
    # Confirm before shrinking -- it would change the saved weight shapes.
    x = Embedding(input_dim = len(tokenizer.word_index)+1,
                  output_dim = embed_size)(inp)
    prefilt_x = Dropout(0.5)(x)
    out_conv = []
    # dilation rate lets us use ngrams and skip grams to process
    for dilation_rate in dilation_rates:
        x = prefilt_x
        # Honour conv_layers; the depth used to be hard-coded to 2.
        for i in range(conv_layers):
            filters = 16 * 2**i
            if dilation_rate > 0:
                x = Conv1D(filters,
                           kernel_size = 3,
                           dilation_rate = dilation_rate,
                           activation = 'relu',
                           name = 'ngram_{}_cnn_{}'.format(dilation_rate, i)
                           )(x)
            else:
                x = Conv1D(filters,
                           kernel_size = 1,
                           activation = 'relu',
                           name = 'word_fcl_{}'.format(i))(x)
        out_conv += [Dropout(0.5)(GlobalMaxPool1D()(x))]
    x = concatenate(out_conv, axis = -1)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.15)(x)
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.15)(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr=1e-4),
                  metrics=['accuracy'])
    return model

# Instantiate the dilated-convolution classifier with its default settings and
# print its layers.
model2 = build_model2()
model2.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, None, 300)    60928800    input_7[0][0]                    
__________________________________________________________________________________________________
dropout_19 (Dropout)            (None, None, 300)    0           embedding_7[0][0]                
__________________________________________________________________________________________________
word_fcl_0 (Conv1D)             (None, None, 16)     4816        dropout_19[0][0]                 
__________________________________________________________________________________________________
ngram_2_cn

In [20]:
# Hyper-parameters for this run (these rebind the module-level values).
batch_size, epochs = 128, 30

file_path = "rus_second_try_best_weights.h5"

# Only the epoch with the highest validation accuracy is written to file_path;
# training stops once val_acc has gone 5 epochs without improving.
checkpoint = ModelCheckpoint(
    file_path,
    monitor='val_acc', mode='max',
    save_best_only=True,
    verbose=1,
)
early = EarlyStopping(monitor="val_acc", mode="max", patience=5)
callbacks_list = [checkpoint, early]

model2.fit(
    x_train, y_train,
    validation_data=(x_test, y_test),
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    callbacks=callbacks_list,
)

Train on 408482 samples, validate on 45186 samples
Epoch 1/30

Epoch 00001: val_acc improved from -inf to 0.72478, saving model to rus_second_try_best_weights.h5
Epoch 2/30

Epoch 00002: val_acc improved from 0.72478 to 0.73682, saving model to rus_second_try_best_weights.h5
Epoch 3/30

Epoch 00003: val_acc improved from 0.73682 to 0.74180, saving model to rus_second_try_best_weights.h5
Epoch 4/30

Epoch 00004: val_acc improved from 0.74180 to 0.74326, saving model to rus_second_try_best_weights.h5
Epoch 5/30

Epoch 00005: val_acc improved from 0.74326 to 0.74680, saving model to rus_second_try_best_weights.h5
Epoch 6/30

Epoch 00006: val_acc improved from 0.74680 to 0.74709, saving model to rus_second_try_best_weights.h5
Epoch 7/30

Epoch 00007: val_acc did not improve from 0.74709
Epoch 8/30

Epoch 00008: val_acc did not improve from 0.74709
Epoch 9/30

KeyboardInterrupt: 