In [1]:
import gc
import os

import keras
import keras_models
import numpy as np
import pandas as pd
import utils
from gensim.models import KeyedVectors
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.optimizers import SGD, Adadelta, Adam, Nadam, RMSprop
from keras.preprocessing import sequence, text
from nltk.corpus import stopwords
from tqdm import tqdm

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

Using TensorFlow backend.


In [2]:
n_folds = 5
n_bags = 1
split_size = 0.1
max_features = 200000
sequence_length = 196
embedding_dim = 300
create_embedding = False
bidirectional = True

run_prefix = 'FastText_'
src = '/home/w/Projects/Toxic/data/'
model_name = 'Conv1DLSTMbranchedV2'
optimizer = 'Nadam'
data_type = 'SpacyClean'
kfold_run = 0
batch_size = 256
importance = 0
stratify = 0
save_models = 0
load_models = 0
save_oof = 0
prepare_submission = 1


if bidirectional and 'LSTM' in model_name or bidirectional and 'GRU' in model_name:
    run_prefix = 'Bidirectional_{}'.format(run_prefix)
if kfold_run:
    general_run_name = '{}{}fold_BS{}_{}'.format(
        run_prefix, n_folds, batch_size, optimizer)
else:
    general_run_name = '{}{}bag_BS{}_{}'.format(
        run_prefix, n_bags, batch_size, optimizer)


if len(data_type) > 0:
    general_run_name += '_{}'.format(data_type)
if importance:
    general_run_name += '_ImportanceTrain'
if stratify and kfold_run:
    general_run_name += '_Stratified'

run_name = '{}{}'.format(model_name, general_run_name)
print('Run name: {}'.format(run_name))


model_callbacks = [EarlyStopping(monitor='val_loss', patience=15, verbose=1),
                   ReduceLROnPlateau(monitor='val_loss', factor=0.5, verbose=1,
                                     patience=7, min_lr=1e-5)]

if optimizer == 'Adam':
    optimizer = Adam(lr=1e-3, decay=1e-3)
    # optimizer = 'adam'
if optimizer == 'Nadam':
    optimizer = Nadam(lr=1e-3, schedule_decay=1e-3)
    # optimizer = 'nadam'
if optimizer == 'SGD':
    optimizer = SGD(lr=1e-2, momentum=0.9,
                    decay=1e-4, nesterov=True)

Run name: Conv1DLSTMbranchedV2Bidirectional_FastText_1bag_BS256_Nadam_SpacyClean


In [3]:
src_data = '/home/w/Projects/Toxic/data/features/'


train = pd.read_pickle("../data/train_spacy_clean.pkl")
test = pd.read_pickle("../data/test_spacy_clean.pkl")
target_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

data_tokenized = pd.read_pickle(src_data + 'data_TokenizedSentences196len.pkl')
data_badwords300 = pd.read_pickle(src_data + 'data_Binary300Badwords.pkl')
data_badwordsCount = pd.read_pickle(src_data + 'data_BadwordsCount.pkl')

X = pd.concat([data_tokenized, data_badwords300], axis=1)
X['badwordsCount'] = data_badwordsCount

X_train_mlp = X.iloc[:train.shape[0], :]
X_test_mlp = X.iloc[train.shape[0]:, :]

features = np.setdiff1d(X_train_mlp.columns, target_columns)

del X, test
del data_tokenized, data_badwords300, data_badwordsCount
gc.collect()

24

In [4]:
train, test = utils.load_data(src, mode=data_type)
print(train.shape, test.shape)
list_classes = ["toxic", "severe_toxic",
                "obscene", "threat", "insult", "identity_hate"]
list_sentences_train = train["comment_text"].fillna("CVxTz").values
list_sentences_test = test["comment_text"].fillna("CVxTz").values

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index)) + 1

X_train = sequence.pad_sequences(list_tokenized_train, maxlen=sequence_length)  # [:1000]
y_train = train[list_classes].values  # [:1000]
X_test = sequence.pad_sequences(list_tokenized_test, maxlen=sequence_length)  # [:1000]
print(X_train.shape, y_train.shape, X_test.shape)

del train, test, list_tokenized_train, list_tokenized_test
gc.collect()

if create_embedding:
    embedding_file = '/home/w/Projects/Toxic/data/embeddings/GoogleNews-vectors-negative300.bin.gz'
    word2vec = KeyedVectors.load_word2vec_format(embedding_file, binary=True)
    print('Found %s word vectors of word2vec' % len(word2vec.vocab))

    embedding_matrix = np.zeros((nb_words, embedding_dim))
    for word, i in tqdm(word_index.items()):
        if word in word2vec.vocab:
            embedding_matrix[i] = word2vec.word_vec(word)
    print('Null word embeddings: %d' %
          np.sum(np.sum(embedding_matrix, axis=1) == 0))
else:
    embedding_matrix = pd.read_pickle(
        '../data/embeddings/FastText_300dim_embedding.pkl')

Load data cleaned with Spacy.
(95851, 8) (226998, 2)
(95851, 196) (95851, 6) (226998, 196)


In [5]:
from sklearn.model_selection import train_test_split

X_tr, X_val, X_tr_mlp, X_val_mlp, y_tr, y_val =  train_test_split(X_train, X_train_mlp, y_train, test_size=0.1,
                                                                  random_state=1337)

In [6]:
model_parameters = {
    'lstm_units': 128,
    'bidirectional': False,
    'nb_words': nb_words,
    'embedding_dim': embedding_dim,
    'embedding_matrix': embedding_matrix,
    'sequence_length': sequence_length,
    'optimizer': optimizer,
    'num_columns': X_train_mlp.shape[1],
}

pipeline_parameters = {
    'model_name': getattr(keras_models, model_name),
    'predict_test': True,
    'number_epochs': 1000,
    'batch_size': batch_size,
    'seed': 1337,
    'shuffle': True,
    'verbose': True,
    'run_save_name': run_name,
    'load_keras_model': load_models,
    'save_model': save_models,
    'save_history': True,
    'save_statistics': True,
    'output_statistics': True,
    'src_dir': os.getcwd(),
}

if kfold_run:
    oof_train, oof_test = utils.run_parametrized_kfold([X_train, X_train_mlp], y_train, 
                                                       [X_test, X_test_mlp],
                                                       pipeline_parameters,
                                                       model_parameters,
                                                       model_callbacks=model_callbacks,
                                                       n_folds=n_folds,
                                                       importance_training=importance,
                                                       save_oof=save_oof)
    print(oof_train.shape, oof_test.shape)
else:
    oof_valid, oof_test = utils.run_parametrized_bagging([X_tr, X_tr_mlp], y_tr,
                                                         [X_val, X_val_mlp], y_val,
                                                         [X_test, X_test_mlp],
                                                         pipeline_parameters,
                                                         model_parameters,
                                                         model_callbacks=model_callbacks,
                                                         n_bags=n_bags,
                                                         user_split=True,
                                                         split_size=split_size,
                                                         importance_training=importance)
    print(oof_valid.shape, oof_test.shape)


if prepare_submission:
    submission = utils.output_submission(oof_test.mean(axis=0), run_name, save=True)

Running parametrized bagging
Running: Conv1DLSTMbranchedV2Bidirectional_FastText_1bag_BS256_Nadam_SpacyClean
Training on bag: 1 

Saving CSV logs for model from current bag/fold: Conv1DLSTMbranchedV2Bidirectional_FastText_1bag_BS256_Nadam_SpacyClean, bag number 1 

Validating on subset of data specified by user.
Train on 86265 samples, validate on 9586 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 00014: reducing learning rate to 0.0005000000237487257.
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 00021: reducing learning rate to 0.0002500000118743628.
Epoch 00021: early stopping
Predicting on validation data.
Validation split - standard deviation for original target values: 0.18482944169037246 
                  for predicted target values: 0.1589135229587555 
 

Predicting o

In [7]:
submission

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,1.941725e-06,2.192029e-05,2.730490e-04,1.631916e-07,1.523067e-03,2.635694e-05
1,6102620,1.055743e-05,8.463983e-07,1.287056e-07,1.474623e-06,4.477732e-07,1.855221e-05
2,14563293,4.001760e-05,4.941148e-07,7.140839e-06,8.602462e-07,3.586517e-07,5.510609e-07
3,21086297,1.191878e-05,7.214878e-07,1.247002e-07,1.483950e-07,4.787833e-07,3.034560e-07
4,22982444,3.288344e-03,1.337887e-05,6.323788e-05,2.322342e-06,1.427402e-05,7.602604e-06
5,24388733,4.210123e-08,6.264135e-07,1.428857e-06,8.536973e-09,5.107952e-11,5.788672e-12
6,26195914,1.967274e-06,1.217756e-08,3.428612e-10,8.709697e-10,3.539047e-10,5.938107e-09
7,31769073,1.944222e-04,4.768367e-06,8.643319e-06,3.000999e-06,1.769753e-05,2.270721e-06
8,35289443,2.393291e-05,9.425373e-06,4.662940e-06,4.042770e-06,1.218085e-05,1.577528e-05
9,38393350,1.618364e-07,9.461331e-08,1.615171e-07,8.101845e-09,1.280530e-08,1.087200e-09
