# Packages

In [1]:
import pandas as pd
import numpy as np
import os, re, csv, codecs, operator, sys, gc
from collections import defaultdict, OrderedDict
import importlib
from tqdm import tqdm
from os import listdir
from os.path import isfile, join
import lightgbm as lgb
from itertools import repeat
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold, KFold, train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.utils import shuffle

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from textblob import TextBlob    # For pos-tagging
from scipy.sparse import csr_matrix, hstack
from scipy.special import logit, expit

from keras import optimizers, initializers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Lambda, Embedding, Dropout, Activation, SpatialDropout1D, Reshape, \
GlobalAveragePooling1D, merge, Flatten, Bidirectional, CuDNNGRU, add, Conv1D, GlobalMaxPooling1D
from keras.layers.merge import concatenate
from keras.models import Model, Sequential
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.engine import InputSpec, Layer
from keras import backend as K

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Variable

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Parameters

In [2]:
PATH = 'datasets/'
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
FAST_TEXT_EMBEDDING = 'pretrain_embedding/crawl-300d-2M.vec'
GLOVE_EMBEDDING = 'pretrain_embedding/glove.840B.300d.txt'
CLEAN_WORD_PATH = None
TRAIN_DATA_FILE = 'train.csv'
TEST_DATA_FILE = 'test.csv'
MAX_SEQUENCE_LENGTH = 350
MAX_NB_WORDS = 100000
EMBEDDING_DIM = 300
FOLD_COUNT = 10
BATCH_SIZE = 200

# Load Pretrain Models

In [3]:
def load_pretrain_embedding(file):
    """
    Return a dictionary whose key is word, value is pretrained word embedding.
    """
    print('Indexing word vectors')
    embeddings_index = {}
    f = open(file, 'r', encoding='utf-8')
    for line in f:
        values = line.split()
        try:
            word = values[0]
            coefs = np.array(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        except:
            print("Error on: ", values[:3])
    f.close()
    print("Total %s word vectors" % len(embeddings_index))
    return embeddings_index

In [4]:
def load_clean_words(file):
    """
    Return a dictionary whose key is typo, value is correct word.
    """
    clean_word_dict = {}
    with open(file, 'r', encoding='utf-8'):
        for line in f:
            line = line.strip('\n')
            typo, correct = line.split(',')
            clean_word_dict[typo] = correct
    return clean_word_dict

In [5]:
embeddings_index = load_pretrain_embedding(FAST_TEXT_EMBEDDING)

Indexing word vectors
Total 2000000 word vectors


In [None]:
glove_embeddings_index = load_pretrain_embedding(GLOVE_EMBEDDING)

# Data Overview

In [6]:
train_df = pd.read_csv('datasets/train.csv')
test_df = pd.read_csv('datasets/test.csv')

In [None]:
train_df.info()

In [None]:
train_df.describe()

In [None]:
train_df.corr()

In [None]:
train_df.head()

In [7]:
train_comments = train_df['comment_text'].values
test_comments = test_df['comment_text'].values
train_comments_lengths = [len(s) for s in train_comments]
test_comments_lengths = [len(s) for s in test_comments]

In [None]:
def explore_comments(arr):
    print("MAX LENGTH:\t\t", np.max(arr))
    print("AVG LENGTH:\t\t", np.average(arr))
    print("MIN LENGTH:\t\t", np.min(arr))
    print("STANDARD DIVISION:\t", np.std(arr))
    print("RANGE:\t\t\t", np.min(arr), " to ", np.average(arr) + 2 * np.std(arr))
    
print("------Train------")
explore_comments(train_comments_lengths)

print("------Test------")
explore_comments(test_comments_lengths)

In [None]:
for class_name in list_classes:
    print('{}\n{}\n'.format(class_name, train_df[class_name].value_counts()))

# Data Cleaning

## Load Cleaned Words

In [8]:
if CLEAN_WORD_PATH == None:
    ignored_words = set(stopwords.words('english'))
else:
    ignored_words = load_clean_words(CLEAN_WORD_PATH)

## Ensemble Features

In [None]:
feature_files = sorted([f for f in listdir('datasets/feature_files/') if isfile(join('datasets/feature_files/', f))])
for file in feature_files:
    fixed_df = pd.read_csv('datasets/feature_files/' + file)
    train_df = train_df.merge(fixed_df, on='id', how='left')
    test_df = test_df.merge(fixed_df, on='id',how='left')

In [None]:
def ensemble_data(path):
    files = sorted([f for f in listdir(path) if isfile(join(path, f))])
    for i, file in enumerate(files):
        temp_df = pd.read_csv(path + file)
        print('Datasets before ensemble null number:', temp_df.isnull().sum().sum())
        if i == 0:
            ensembled = temp_df
        else:
            ensembled = ensembled.merge(temp_df, on='id', how='left')
            print('Datasets after ensemble null number:', ensembled.isnull().sum().sum())
    
    column_numbers = ensembled.shape[1]
    toxic = ensembled.iloc[:, [i for i in range(2, column_numbers, len(list_classes))]]
    severe_toxic = ensembled.iloc[:, [i for i in range(3, column_numbers, len(list_classes))]]
    obscene = ensembled.iloc[:, [i for i in range(4, column_numbers, len(list_classes))]]
    threat = ensembled.iloc[:, [i for i in range(5, column_numbers, len(list_classes))]]
    insult = ensembled.iloc[:, [i for i in range(6, column_numbers, len(list_classes))]]
    identity_hate = ensembled.iloc[:, [i for i in range(7, column_numbers, len(list_classes))]]
    
    return toxic, severe_toxic, obscene, threat, insult, identity_hate

In [None]:
train_toxic, train_severe_toxic, train_obscene, train_threat, train_insult, train_identity_hate = ensemble_data('datasets/ensemble_files/train/')
test_toxic, test_severe_toxic, test_obscene, test_threat, test_insult, test_identity_hate = ensemble_data('datasets/ensemble_files/test/')

## Regex Regularization

In [9]:
# Regex to remove all Non-Alpha Numeric and space
special_character_removal = re.compile(r'[^?!.,:a-z\d ]', re.IGNORECASE)
# Regex to remove all numerics
replace_numbers = re.compile(r'\d+', re.IGNORECASE)

word_count_dict = defaultdict(int)

def clean_datasets(text, remove_stopwords=False, stem_words=False, count_null_words=True, clean_wiki_tokens=True):
    
    text = text.lower()
    text = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", text)
    text = re.sub(r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}", "", text)
    
    if clean_wiki_tokens:
        text = re.sub(r"\.jpg", " ", text)
        text = re.sub(r"\.png", " ", text)
        text = re.sub(r"\.gif", " ", text)
        text = re.sub(r"\.bmp", " ", text)
        text = re.sub(r"\.pdf", " ", text)
        text = re.sub(r"image:", " ", text)
        text = re.sub(r"#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3})", " ",text)
        
        text = re.sub(r"\{\|[^\}]*\|\}", " ", text)
        text = re.sub(r"\[?\[user:.*\]", " ", text)
        text = re.sub(r"\[?\[user:.*\|", " ", text)
        text = re.sub(r"\[?\[wikipedia:.*\]", " ", text)
        text = re.sub(r"\[?\[wikipedia:.*\|", " ", text)
        text = re.sub(r"\[?\[special:.*\]", " ", text)
        text = re.sub(r"\[?\[special:.*\|", " ", text)
        text = re.sub(r"\[?\[category:.*\]", " ", text)
        text = re.sub(r"\[?\[category:.*\|", " ", text)
        
        text = re.sub(r"wp:", " ", text)
        text = re.sub(r"file:", " ", text)
    
    # For Punctuation
    text = re.sub(r"”", "\"", text)
    text = re.sub(r"“", "\"", text)
    text = re.sub(r"´", "'", text)
    text = re.sub(r"—", " ", text)
    text = re.sub(r"’", "'", text)
    text = re.sub(r"‘", "'", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r"−", " ", text)
    text = re.sub(r"\?", " ? ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"#", " # ", text)
    
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"i’m", "i am", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"<3", " love ", text)
        
    text = replace_numbers.sub('', text)
    
    if remove_stopwords:
        for typo, correct in ignored_words.items():
            text = re.sub(typo, correct, text)
    
    if count_null_words:
        text = text.split()
        for t in text:
            word_count_dict[t] += 1
        text = " ".join(text)
        
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
        
    return text

In [10]:
list_sentences_train = train_df['comment_text'].fillna('no comment').values
list_sentences_test = test_df['comment_text'].fillna('no comment').values
train_labels = train_df[list_classes].values

In [11]:
cleaned_train_comments, cleaned_test_comments = [], []
print('Processing data cleaning...')

for text in list_sentences_train:
    cleaned_train_comments.append(clean_datasets(text))
for text in list_sentences_test:
    cleaned_test_comments.append(clean_datasets(text))
print('Done!')

Processing data cleaning...
Done!


In [12]:
train_df['comment_text_cleaned'] = cleaned_train_comments
test_df['comment_text_cleaned'] = cleaned_test_comments

## Create Meta (Extend) Features

In [None]:
def count_unknown_embedding(t, idx):
    t = t.split()
    res = 0
    for word in t:
        if word not in idx.keys():
            res += 1
    return res

In [None]:
def count_regexp_occ(regexp='', text=None):
    """
    Simple way to calculate the number of occurence of a regex
    """
    if len(text) == 0:
        return 0
    else:
        return len(re.findall(regexp, text)) / len(text)

In [None]:
def create_meta_feature(df):
    df['total_length'] = df['comment_text'].apply(len)
    df['capital'] = df['comment_text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df['caps_vs_length'] = df.apply(lambda row: float(row['capital']) / float(row['total_length']), axis=1)
    df["unknown_glove"] = df['comment_text_cleaned'].apply(lambda x: count_unknown_embedding(x, glove_embeddings_index))
    df["unknown_fasttext"] = df['comment_text_cleaned'].apply(lambda x: count_unknown_embedding(x, embeddings_index))
    df["unknown_glove_fasttext"] = df["unknown_glove"] + df["unknown_fasttext"]
    # Special Expressions Collection
    df['num_exclamation_marks'] = df['comment_text'].apply(lambda comment: comment.count('!'))
    df['num_question_marks'] = df['comment_text'].apply(lambda comment: comment.count('?'))
    df['num_punctuation'] = df['comment_text'].apply(lambda comment: sum(comment.count(w) for w in '.,;:'))
    df['num_symbols'] = df['comment_text'].apply(lambda comment: sum(comment.count(w) for w in '*&$%'))
    df['num_words'] = df['comment_text'].apply(lambda comment: len(comment.split()))
    df['num_unique_words'] = df['comment_text'].apply(lambda comment: len(set(w for w in comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / df['num_words']
    df['num_smilies'] = df['comment_text'].apply(lambda comment: sum(comment.count(w) for w in (':-)', ':)', ';-)', ';)')))
    df["ant_slash_n"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\n", x))
    df["start_with_columns"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"^\:+", x))
    df["has_emphasize_equal"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\={2}.+\={2}", x))
    df["has_emphasize_quotes"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\"{4}\S+\"{4}", x))
    df["has_star"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\*", x))
    ## Check for dates
    df["has_timestamp"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\d{2}|:\d{2}", x))
    df["has_date_long"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\D\d{2}:\d{2}, \d{1,2} \w+ \d{4}", x))    # example: 18:44, 8 December 2010
    df["has_date_short"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\D\d{1,2} \w+ \d{4}", x))
    ## Check for http links
    df["has_http"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"http[s]{0,1}://\S+", x))
    df["has_ip"] = df["comment_text"].apply(lambda x: count_regexp_occ("(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}", x))
    ## Check for mail
    df["has_mail"] = df["comment_text"].apply(lambda x: count_regexp_occ(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', x))
    ## Check for image
    df["has_image"] = df["comment_text"].apply(lambda x: count_regexp_occ(r'image\:', x))
    # Dirty Languages Collection
    df["nb_fk"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"[Ff]\S{2}[Kk]", x))
    df["nb_sk"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"[Ss]\S{2}[Kk]", x))
    df["nb_dk"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"[dD]ick", x))
    df["nb_you"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\W[Yy]ou\W", x))
    df["nb_mother"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\Wmother\W", x))
    df["nb_ng"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\Wnigger\W", x))

In [None]:
print("Creating meta features...")
create_meta_feature(train_df)
create_meta_feature(test_df)
print('Done!')

In [None]:
train_cols = train_df.columns[9:]
print(train_cols)
column_numbers = len(train_df.columns)
print(column_numbers, len(train_cols))

In [None]:
for col in train_cols:
    train_toxic[col] = train_df[col]
    train_severe_toxic[col] = train_df[col]
    train_obscene[col] = train_df[col]
    train_threat[col] = train_df[col]
    train_insult[col] = train_df[col]
    train_identity_hate[col] = train_df[col]
for col in train_cols:
    test_toxic[col] = test_df[col]
    test_severe_toxic[col] = test_df[col]
    test_obscene[col] = test_df[col]
    test_threat[col] = test_df[col]
    test_insult[col] = test_df[col]
    test_identity_hate[col] = test_df[col]
train_ensembled_data = [train_toxic, train_severe_toxic, train_obscene, train_threat, train_insult, train_identity_hate]
test_ensembled_data = [test_toxic, test_severe_toxic, test_obscene, test_threat, test_insult, test_identity_hate]

In [None]:
del list_sentences_train, list_sentences_test
gc.collect()

# Data Preprocessing

In [13]:
all_comment_text = pd.concat([train_df['comment_text_cleaned'], test_df['comment_text_cleaned']], axis=0).fillna("unknown")
nrow_train = train_df.shape[0]
all_comment_text.shape[0]

312735

In [17]:
word_vect = TfidfVectorizer(max_features=5000, stop_words=ignored_words)
word_vect.fit(all_comment_text)
data = word_vect.fit_transform(all_comment_text)

In [18]:
norm_data = MaxAbsScaler().fit_transform(data)
print(norm_data.shape)

(312735, 5000)


## Tf-idf Feature Extraction

## Word-Level

In [None]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=20000
)
word_vectorizer.fit(all_comment_text)
train_word_features = word_vectorizer.transform(cleaned_train_comments)
test_word_features = word_vectorizer.transform(cleaned_test_comments)
print('Word vectorization process Done!')

## Char-Level

In [None]:
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 6),
    max_features=30000
)
char_vectorizer.fit(all_comment_text)
train_char_features = char_vectorizer.transform(cleaned_train_comments)
test_char_features = char_vectorizer.transform(cleaned_test_comments)
print('Char vectorization process Done!')

## Features Aggregation

In [None]:
del all_comment_text
gc.collect()

In [None]:
train_tfidf_features = hstack([train_char_features, train_word_features]).tocsr()
test_tfidf_features = hstack([test_char_features, test_word_features]).tocsr()

In [None]:
del train_char_features, train_word_features, test_char_features, test_word_features
gc.collect()

## Build Vocabulary & Tokenizer

## Word Level

In [19]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='"#$%&()+,-./:;<=>@[\\]^_`{|}~\t\n')

In [20]:
print('Automatically train vocab & tokenizer...')
tokenizer.fit_on_texts(cleaned_train_comments + cleaned_test_comments)

train_sequences = tokenizer.texts_to_sequences(cleaned_train_comments)
test_sequences = tokenizer.texts_to_sequences(cleaned_test_comments)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of train_data tensor: ', train_data.shape)
print('Shape of train_label tensor: ', train_labels.shape)

test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of test_data tensor: ', test_data.shape)

Automatically train vocab & tokenizer...
Found 330549 unique tokens
Shape of train_data tensor:  (159571, 350)
Shape of train_label tensor:  (159571, 6)
Shape of test_data tensor:  (153164, 350)


## Char Level

In [None]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, char_level=True, lower=False)

print('Automatically train vocab & tokenizer...')
tokenizer.fit_on_texts(cleaned_train_comments + cleaned_test_comments)

train_sequences = tokenizer.texts_to_sequences(cleaned_train_comments)
test_sequences = tokenizer.texts_to_sequences(cleaned_test_comments)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of train_data tensor: ', train_data.shape)
print('Shape of train_label tensor: ', train_labels.shape)

test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of test_data tensor: ', test_data.shape)

## Pos Feature Extraction

In [None]:
def sent2pos(sentence):
    try:
        tag = TextBlob(sentence).tags
    except:
        print(sentence)
        
    updated_sentence = ' '.join([i[0] for i in tag])
    tagged = ' '.join([i[1] for i in tag])
    return updated_sentence, tagged

In [None]:
inverse_word_index = {v: k for k, v in word_index.items()}

In [None]:
Pos_comments = []
Pos_updated_sentence = []
for text in tqdm(train_sequences):
    text_ = ' '.join([inverse_word_index[word] for word in text])    # convert to word format
    if not isinstance(text_, str):
        print(text, '\n', text_)
    updated_sentence, tags = sent2pos(text_)
    Pos_updated_sentence.append(updated_sentence)
    Pos_comments.append(tags)
    assert len(updated_sentence.split(' ')) == len(tags.split(' ')), "T1 {} T2 {}".format(len(text), len(tags.split()))
    
Pos_test_comments = []
Pos_test_updated_sentence = []
for text in tqdm(test_sequences):
    text_ = ' '.join([inverse_word_index[word] for word in text])
    updated_test_sentence, test_tags = sent2pos(text_)
    Pos_test_updated_sentence.append(updated_test_sentence)
    Pos_test_comments.append(test_tags)
    assert len(updated_test_sentence.split(' ')) == len(test_tags.split(' ')), "T1 {} T2 {}".format(len(text), len(test_tags.split()))

In [None]:
pos_tokenizer = Tokenizer(num_words=50, filters='"#$%&()+,-./:;<=>@[\\]^_`{|}~\t\n')

In [None]:
print('Automatically train pos tokenizer...')
pos_tokenizer.fit_on_texts(Pos_comments + Pos_test_comments)

train_pos_sequences = pos_tokenizer.texts_to_sequences(Pos_comments)
test_pos_sequences = pos_tokenizer.texts_to_sequences(Pos_test_comments)

pos_word_index = pos_tokenizer.word_index
print('Found %s unique tokens' % len(pos_word_index))

pos_train_data = pad_sequences(train_pos_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', pos_train_data.shape)

pos_test_data = pad_sequences(test_pos_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of test_data tensor:', pos_test_data.shape)

## Second time valid for tokenzier

In [None]:
print('Automatically train pos tokenizer secondly...')
cleaned_train_comments, cleaned_test_comments = [], []
for text in Pos_updated_sentence:
    cleaned_train_comments.append(clean_datasets(text))
for text in Pos_test_updated_sentence:
    cleaned_test_comments.append(clean_datasets(text))
    
train_sequences = tokenizer.texts_to_sequences(cleaned_train_comments)
test_sequences = tokenizer.texts_to_sequences(cleaned_test_comments)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of train_data tensor:', train_data.shape)
print('Shape of train_label tensor:', train_labels.shape)

test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of test_data tensor:', test_data.shape)

In [None]:
train_df['cleaned_comment_text'] = cleaned_train_comments
test_df['cleaned_comment_text'] = cleaned_test_comments
train_df.to_csv(PATH + 'cleaned_train.csv', index=False)
test_df.to_csv(PATH + 'cleaned_test.csv', index=False)

In [None]:
del cleaned_train_comments, cleaned_test_comments
gc.collect()

## Tokenizer for Pytorch Variable (Batch)

In [21]:
use_cuda = True

In [22]:
def mini_batches_generator(inputs, targets, batch_size, row_shuffle=False):
    inputs_data_size = len(inputs)
    targets_data_size = len(targets)
    assert inputs_data_size == targets_data_size, "The length of inputs({}) and targets({}) must be consistent.".format(inputs_data_size, targets_data_size)
    
    if row_shuffle:
        for input_seqs in inputs:
            np.random.shuffle(input_seqs)
            
    shuffled_input, shuffled_target = shuffle(inputs, targets)
    mini_batches = [
        (shuffled_input[k: k + batch_size], shuffled_target[k: k + batch_size])
        for k in range(0, inputs_data_size, batch_size)
    ]
    dp = EmbeddingDropout(p=0.2)
    
    for batch_xs, batch_ys in mini_batches:
        lengths = [len(s) for s in batch_xs]
        max_length = min(MAX_SEQUENCE_LENGTH, max(lengths))
        batch_tensors = pad_sequences(batch_xs, maxlen=max_length, padding='post', truncating='pre')
        
        lengths_var = Variable(torch.Tensor(lengths), requires_grad=False)
        inputs_tensor = torch.from_numpy(batch_tensors).long()
        inputs_dropped_tensor = dp.forward(inputs_tensor)
        inputs_var = Variable(inputs_dropped_tensor)
        targets_var = Variable(torch.from_numpy(batch_ys).long())
        
        if use_cuda:
            inputs_var = inputs_var.cuda()
            targets_var = targets_var.cuda()
            lengths_var = lengths_var.cuda()
        yield (inputs_var, lengths_var), targets_var

In [23]:
def test_batches_generator(inputs, batch_size):
    inputs_data_size = len(inputs)
    mini_batches = [
        inputs[k: k + batch_size]
        for k in range(0, inputs_data_size, batch_size)
    ]
    
    for batch_xs in mini_batches:
        lengths = [len(s) for s in batch_xs]
        max_length = min(MAX_SEQUENCE_LENGTH, max(lengths))
        batch_tensors = pad_sequences(batch_xs, maxlen=max_length, padding='post', truncating='pre')
        
        lengths_var = Variable(torch.Tensor(lengths))
        inputs_var = Variable(torch.from_numpy(batch_tensors).long())
        
        if use_cuda:
            inputs_var = inputs_var.cuda()
            lengths_var = lengths_var.cuda()
            
        yield (inputs_var, lengths_var)

## Sentence Embedding (Build a matrix)

In [24]:
print('Preparing embedding matrix...')
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))

null_words = open(PATH + 'null_words.txt', 'w', encoding='utf-8')

for word, idx in word_index.items():
    if idx >= MAX_NB_WORDS:
        null_words.write(word + ', ' + str(word_count_dict[word]) + '\n')
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[idx] = embedding_vector
    else:
        null_words.write(word + ', ' + str(word_count_dict[word]) + '\n')
print('Null_word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing embedding matrix...
Null_word embeddings: 24244


In [25]:
print('Sorting null_words...')
null_dict = {}
with open(PATH + 'null_words.txt', 'r', encoding='utf-8') as nullword:
    for line in nullword:
        word, count = line.strip('\n').split(', ')
        null_dict[word] = int(count)

null_dict = sorted(null_dict.items(), key=operator.itemgetter(1), reverse=True)

with open(PATH + 'null_words.txt', 'w', encoding='utf-8') as output:
    for word, count in null_dict:
        output.write(word + ', ' + str(count) + '\n')
print('Sorting operation Done!')

Sorting null_words...
Sorting operation Done!


# Model Training

## Baseline Model with Using Tfidf Features

In [None]:
kfold = KFold(n_splits=FOLD_COUNT, shuffle=False)
tfidf_models = []
for i, (train_idx, test_idx) in enumerate(kfold.split(train_tfidf_features)):
    print('## In fold {} ##'.format(i + 1))
    classifier = LogisticRegression(solver='sag', C=12.0)
    
    for class_name in list_classes:
        print('Processing {} ...'.format(class_name))
        train_target = train_df[class_name][train_idx]
        
        classifier.fit(train_tfidf_features[train_idx], train_target)
        y_pred = classifier.predict(train_tfidf_features[train_idx])
        
        print('Training accuracy is {}'.format(accuracy_score(y_pred, train_target)))
        
        val_target = train_df[class_name][test_idx]
        val_pred = classifier.predict(train_tfidf_features[test_idx])
        
        print('Validation accuracy is {}'.format(accuracy_score(val_pred, val_target)))
        
    tfidf_models.append(classifier)
print('K-fold cross validation Done!')

## Logistic Regression

In [None]:
for i, model in enumerate(tfidf_models):
    print('## In Model {} ##'.format(i + 1))
    # predictions = {'id': test_df['id']}
    predictions = OrderedDict()
    predictions['id'] = test_df['id']
    
    for class_name in list_classes:
        predictions[class_name] = model.predict_proba(test_tfidf_features)[:, 1]
        print('Predict the proba for {} Done!'.format(class_name))
        print(predictions.keys())
    
    print(predictions.keys())
    submission = pd.DataFrame.from_dict(predictions)
    submission.to_csv('results/TFIDF_Based/TFIDF_Logistic_Regression_Submission_{}.csv'.format(i), index=False)

In [None]:
result_list = ['results/TFIDF_Based/TFIDF_Logistic_Regression_Submission_{}.csv'.format(i) for i in range(0, FOLD_COUNT)]
bagging(result_list, 'results/TFIDF_Based/TFIDF_bagging.csv')
print('Bagging operation Done!')

## ExtraTreeClassifier

In [None]:
accs = []
# et_predictions = {'id': test_df['id']}
et_predictions = OrderedDict()
et_predictions['id'] = test_df['id']

for class_name in list_classes:
    train_target = train_df[class_name]
    classifier = ExtraTreesClassifier(n_estimators=30)
    
    cv_score = np.mean(cross_val_score(classifier, train_tfidf_features, train_target, cv=10, scoring='roc_auc'))
    accs.append(cv_score)
    print('CV Score for class {} is {}'.format(class_name, cv_score))
    
    classifier.fit(train_tfidf_features, train_target)
    print(classifier.feature_importances_)
    et_predictions[class_name] = classifier.predict_proba(test_tfidf_features)[:, 1]
    
submission = pd.DataFrame.from_dict(et_predictions)
submission.to_csv('result/TFIDF_Based/TFIDF_ExtraTreesClassifier_Submission.csv', index=False)

## Light-GBM For Ensembled_Datasets

In [None]:
def fit_every_feature_model(feature_data, label, feature_name, feature_test_data, fold_count, predict=False):
    predictions = np.zeros(shape=(len(feature_test_data)))
    fold_size = len(feature_data) // fold_count
    
    print('Feature name is {}'.format(feature_name))
    auc_score = 0
    for fold_id in range(0, fold_count):
        print('## Fold In {} ##'.format(fold_id + 1))
        fold_start = fold_size * fold_id
        fold_end = fold_start + fold_size
        if fold_id == fold_size - 1:
            fold_end = len(feature_data)
            
        train_x = np.concatenate((feature_data[:fold_start], feature_data[fold_end:]))
        train_y = np.concatenate((label[:fold_start], label[fold_end:]))
        
        val_x = feature_data[fold_start:fold_end]
        val_y = label[fold_start:fold_end]
        
        lgb_train = lgb.Dataset(train_x, train_y)
        lgb_val = lgb.Dataset(val_x, val_y)
        
        lgbm_model = lgb.LGBMClassifier(max_depth=5, metric='auc', n_estimators=10000, num_leaves=32, boosting_type='gbdt', \
                                       learning_rate=0.01, feature_fraction=0.3, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0)
        lgbm_model.fit(X=train_x, y=train_y, eval_metric=['auc', 'binary_logloss'], eval_set=(val_x, val_y), early_stopping_rounds=1000, verbose=500)
        auc_score += lgbm_model.best_score_['valid_0']['auc']
        lgb.plot_importance(lgbm_model, max_num_features=30)
        plt.show()
        if predict:
            prediction = lgbm_model.predict_proba(feature_test_data)[:, 1]
            predictions += prediction
            del lgbm_model
    predictions /= fold_count
    print('Training LightGBM Done!')
    return predictions, auc_score / fold_count

In [None]:
predictions, auc_scores = [], []
for i, feature_name in enumerate(list_classes):
    prediction, auc = fit_every_feature_model(train_ensembled_data[i], train_df[feature_name].values, feature_name, test_ensembled_data[i], 10, predict=True)
    auc_scores.append(auc)
    predictions.append(prediction)
print('Overall AUC Score is {}'.format(sum(auc_scores) / 6))
print('For each:'.format(auc_scores))

In [None]:
submission = pd.read_csv('datasets/sample_submission.csv')
for i, feature_name in enumerate(list_classes):
    submission[feature_name] = predictions[i]
submission.to_csv('results/LightGBM/LightGBM_with_Meta_data.csv', index=False)

## Training Styles

## Keras Based

In [None]:
STAMP = 'model_pool/Keras/av_pos_cnn/pavel_cnn_%.2f_%.2f'%(0.5, 0.5)

In [None]:
def _train_model_by_auc(model, batch_size, train_x, train_y, val_x, val_y):
    best_auc = -1
    best_weight = None
    best_epoch = 0
    current_epoch = 1
    
    while True:
        model.fit(train_x, train_y, batch_size=batch_size, epoch=1, validation_data=[val_x, val_y])
        y_pred = model.predict(val_x, batch_size=batch_size)
        current_auc = roc_auc_score(val_y, y_pred)
        print('Epoch {} auc {:.6f} best_auc {:.6f}'.format(current_epoch, current_auc, best_auc))
        current_epoch += 1
        if best_auc < current_auc or best_auc == -1:
            best_auc = current_auc
            best_weight = model.get_weights()
            best_epoch = current_epoch
        else:
            # early stop
            if current_epoch - best_epoch == 5:
                break
                
    model.set_weights(best_weights)
    return model, best_auc

def _train_model_by_logloss(model, batch_size, train_x, pos_train_x, train_y, val_x, pos_val_x, val_y, fold_id):
    early_stopping = EarlyStopping(monitor='val_loss', patience=7)
    best_model_path = STAMP + str(fold_id) + '.h5'
    model_checkpoint = ModelCheckpoint(best_model_path, save_best_only=True, save_weights_only=True)
    train_data = {'Onehot': train_x, 'POS': pos_train_x}
    val_data = {'Onehot': val_x, 'POS': pos_val_x}
    
    hist = model.fit(train_data, train_y, validation_data=(val_data, val_y), epochs=50, batch_size=batch_size, shuffle=True, callbacks=[early_stopping, model_checkpoint])
    best_val_score = min(hist.history['val_loss'])
    predictions = model.predict(val_data)
    auc = roc_auc_score(val_y, predictions)
    print('AUC Score', auc)
    return model, best_val_score, auc, predictions

def train_folds(x, pos_x, y, fold_count, batch_size, get_model_func):
    fold_size = len(x) // fold_count
    models = []
    fold_predictions = []
    score, total_auc = 0, 0
    for fold_id in range(0, fold_count):
        fold_start = fold_size * fold_id
        fold_end = fold_start + fold_size
        
        if fold_id == fold_count - 1:
            fold_end = len(x)
            
        train_x = np.concatenate((x[:fold_start], x[fold_end:]))
        train_y = np.concatenate((y[:fold_start], y[fold_end:]))
        
        val_x = x[fold_start: fold_end]
        val_y = y[fold_start: fold_end]
        
        pos_train_x = np.concatenate((pos_x[:fold_start], pos_x[fold_end:]))
        pos_val_x = pos_x[fold_start: fold_end]
        print('## In fold {} ## : '.format(fold_id + 1))
        model, best_val_score, auc, fold_prediction = _train_model_by_logloss(get_model_func, batch_size, train_x, pos_train_x, train_y, val_x, pos_val_x, val_y, fold_id)
        
        score += best_val_score
        total_auc += auc
        fold_predictions.append(fold_prediction)
        models.append(model)
    return models, score / fold_count, total_auc / fold_count , fold_predictions

## Pytorch Based

In [26]:
def self_log_loss(y, y_pred):
    total_loss = 0
    for j in range(6):
        loss = log_loss(y[:, j], y_pred[:, j])
        total_loss += loss
    total_loss /= 6.
    
    return total_loss

In [47]:
criterion = torch.nn.BCELoss(size_average=True)
epoch_num = 100
early_stop_round = 12
MODELSTAMP = 'model_pool/Pytorch/rhn/pavel_rhn'

In [28]:
def learning_rate_decay(optimizer):
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * 0.93
    return optimizer

def _train_batch(model, inputs_var, targets_var, optimizer):
    optimizer.zero_grad()
    model.train()
    preds_var = model.forward(inputs_var)
    
    # Pytorch version requires torch.cuda.FloatTensor rather than torch.cuda.LongTensor
    preds_var = preds_var.type(torch.cuda.FloatTensor)
    targets_var = targets_var.type(torch.cuda.FloatTensor)
    
    # training
    loss = F.binary_cross_entropy_with_logits(preds_var, targets_var)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 3)
    optimizer.step()
    return optimizer, loss.data.cpu().numpy()

def _pytorch_train_model_by_logloss(model, batch_size, train_x, train_y, val_x, val_y, fold_id):
    '''
    Train for 6-labels at the same time.
    '''
    print("## Training on fold {} ##".format(fold_id))
    
    if use_cuda:
        model = model.cuda()
    best_auc, best_logloss, best_epoch, current_epoch = -1, -1, 0, 1
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3)
    
    for epoch in range(epoch_num):
        epoch_logloss = 0.0
        for batch_id, (inputs_var, targets_var) in enumerate(mini_batches_generator(train_x, train_y, batch_size, row_shuffle=True)):
            optimizer, loss = _train_batch(model=model, inputs_var=inputs_var, targets_var=targets_var, optimizer=optimizer)
            
            epoch_logloss += loss
            if batch_id % 40 == 0:
                print("Epoch: {} Batch: {} Log-loss: {}".format(epoch + 1, batch_id, loss))
                
        print("Epoch average log-loss: {}".format(epoch_logloss / batch_id))
        val_pred = model.predict(val_x)
        
        current_logloss = self_log_loss(val_y, val_pred)
        current_epoch += 1
        if best_logloss > current_logloss or best_logloss == -1:
            best_logloss = current_logloss
            model.save(MODELSTAMP + '-TEMP.pt')
            best_auc = roc_auc_score(val_y, val_pred)
            best_epoch = current_epoch
        else:
            if current_epoch - best_epoch == early_stop_round:
                break
        print("In Epoch: {}, val_loss: {}, best_val_loss: {}, best_auc: {}".format(epoch + 1, current_logloss, best_logloss, best_auc))
        optimizer = learning_rate_decay(optimizer)
        
    model.load(MODELSTAMP + '-TEMP.pt')
    best_val_pred = model.predict(val_x)
    model.save(MODELSTAMP + str(fold_id) + '.pt')
    return model, best_logloss, best_auc, best_val_pred

def pytorch_train_folds(x, y, fold_count, batch_size, get_model_func, skip_fold=0):
    fold_size = len(x) // fold_count
    models = []
    fold_predictions = []
    score, total_auc = 0, 0
    for fold_id in range(0, fold_count):
        fold_start = fold_size * fold_id
        fold_end = fold_start + fold_size
        
        if fold_id == fold_count - 1:
            fold_end = len(x)
            
        train_x = np.concatenate((x[:fold_start], x[fold_end:]))
        train_y = np.concatenate((y[:fold_start], y[fold_end:]))
        
        val_x = x[fold_start: fold_end]
        val_y = y[fold_start: fold_end]
        
        if fold_id < skip_fold:
            model = get_model_func()
            model.load(MODELSTAMP + str(fold_id) + '.pt')
            model = model.eval()
            model = model.cuda()
            fold_prediction = model.predict(val_x)
            auc = roc_auc_score(val_y, fold_prediction)
            bst_val_score = self_log_loss(y=val_y, y_pred=fold_prediction)
        
        else:
            model, best_val_score, auc, fold_prediction = _pytorch_train_model_by_logloss(get_model_func(), batch_size, train_x, train_y, val_x, val_y, fold_id)
        
        score += best_val_score
        total_auc += auc
        fold_predictions.append(fold_prediction)
        models.append(model)
    return models, score / fold_count, total_auc / fold_count , fold_predictions

## Model Zoo

## Keras Based

In [None]:
class AttentionWeightedAverage(Layer):
    """
    Model for computing a weighted average of the different channels across timesteps.
    Uses 1 parameter per channel to compute the attention value for a single timestep.
    """
    def __init__(self, return_attention=False, **kwargs):
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention
        super(AttentionWeightedAverage, self).__init__(**kwargs)
        
    def build(self, input_shape):
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3
        
        self.W = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_W'.format(self.name),
                                 initializer=self.init)
        
        self.trainable_weights = [self.W]
        super(AttentionWeightedAverage, self).build(input_shape)
        
    def call(self, x, mask=None):
        logits = K.dot(x, self.W)
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))
        
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            ai = ai * mask
        attn_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        weighted_input = x * K.expand_dims(attn_weights)
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, attn_weights]
        return result
    
    def get_output_shape_for(self, input_shape):
        return self.compute_output_shape(input_shape)
    
    def compute_output_shape(self, input_shape):
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)
    
    def compute_mask(self, input, input_mask=None):
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None

In [None]:
class KMaxPooling(Layer):
    """
    K-max pooling layer that extracts the k-highest activations from a sequence (2nd dimension).
    With TensorFlow Backend.
    """
    def __init__(self, k=1, **kwargs):
        super().__init__(**kwargs)
        self.input_spec = InputSpec(ndim=3)
        self.k = k
    
    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[2] * self.k)
    
    def call(self, inputs):
        # top_k function can only be applied along the last dimension
        shifted_input = tf.transpose(inputs, [0, 2, 1])
        top_k = tf.nn.top_k(shifted_input, self.k, True, None)[0]
        return Flatten()(top_k)

In [None]:
def get_av_cnn(nb_words, embedding_dim, embedding_matrix, max_sequence_length, out_size):
    embedding_layer = Embedding(nb_words,
                                embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_sequence_length,
                                trainable=False)
    
    filter_nums = 300
    input_layer = Input(shape=(max_sequence_length,), dtype='int32', name='Onehot')
    embedding_sequences = embedding_layer(input_layer)
    final_embedding_sequences = SpatialDropout1D(0.25)(embedding_sequences)
    
    conv_0 = Conv1D(filter_nums, 1, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)
    conv_1 = Conv1D(filter_nums, 2, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)
    conv_2 = Conv1D(filter_nums, 3, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)
    conv_3 = Conv1D(filter_nums, 4, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)
    
    attn_0 = AttentionWeightedAverage()(conv_0)
    avg_0 = GlobalAveragePooling1D()(conv_0)
    maxpool_0 = GlobalMaxPooling1D()(conv_0)
    
    attn_1 = AttentionWeightedAverage()(conv_1)
    avg_1 = GlobalAveragePooling1D()(conv_1)
    maxpool_1 = GlobalMaxPooling1D()(conv_1)
    
    attn_2 = AttentionWeightedAverage()(conv_2)
    avg_2 = GlobalAveragePooling1D()(conv_2)
    maxpool_2 = GlobalMaxPooling1D()(conv_2)
    
    attn_3 = AttentionWeightedAverage()(conv_3)
    avg_3 = GlobalAveragePooling1D()(conv_3)
    maxpool_3 = GlobalMaxPooling1D()(conv_3)
    
    merged_maxpool = merge([maxpool_0, maxpool_1, maxpool_2, maxpool_3], mode='concat', concat_axis=1)
    merged_attn = merge([attn_0, attn_1, attn_2, attn_3], mode='concat', concat_axis=1)
    merged_avg = merge([avg_0, avg_1, avg_2, avg_3], mode='concat', concat_axis=1)
    merged_tensor = merge([merged_maxpool, merged_attn, merged_avg], mode='concat', concat_axis=1)
    
    output = Dropout(0.7)(merged_tensor)
    output = Dense(units=144, activation='relu')(output)
    output = Dense(units=out_size, activation='sigmoid')(output)
    
    model = Model(inputs=input_layer, outputs=output)
    adam_optimizer = optimizers.Adam(lr=1e-3, decay=1e-6)
    model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
    return model

In [None]:
def get_av_pos_cnn(nb_words, embedding_dim, embedding_matrix, max_sequence_length, out_size):
    embedding_layer = Embedding(nb_words, 
                                embedding_dim, 
                                weights=[embedding_matrix], 
                                input_length=max_sequence_length, 
                                trainable=False)
    
    pos_embedding_layer = Embedding(50,
                                    30,
                                    input_length=max_sequence_length,
                                    trainable=True)
    
    filter_nums = 325
    input_layer = Input(shape=(max_sequence_length,), dtype='int32', name='Onehot')
    pos_input_layer = Input(shape=(max_sequence_length,), dtype='int32', name='POS')
    
    embedding_sequences = embedding_layer(input_layer)
    pos_sequences = pos_embedding_layer(pos_input_layer)
    merged_embedding_layer = concatenate([embedding_sequences, pos_sequences])
    final_embedding_sequences = SpatialDropout1D(0.25)(merged_embedding_layer)
    
    conv_0 = Conv1D(filter_nums, 1, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)
    conv_1 = Conv1D(filter_nums, 2, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)
    conv_2 = Conv1D(filter_nums, 3, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)
    conv_3 = Conv1D(filter_nums, 4, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)

    attn_0 = AttentionWeightedAverage()(conv_0)
    avg_0 = GlobalAveragePooling1D()(conv_0)
    maxpool_0 = GlobalMaxPooling1D()(conv_0)
    
    attn_1 = AttentionWeightedAverage()(conv_1)
    avg_1 = GlobalAveragePooling1D()(conv_1)
    maxpool_1 = GlobalMaxPooling1D()(conv_1)
    
    attn_2 = AttentionWeightedAverage()(conv_2)
    avg_2 = GlobalAveragePooling1D()(conv_2)
    maxpool_2 = GlobalMaxPooling1D()(conv_2)
    
    attn_3 = AttentionWeightedAverage()(conv_3)
    avg_3 = GlobalAveragePooling1D()(conv_3)
    maxpool_3 = GlobalMaxPooling1D()(conv_3)
    
    merged_tensor_maxpool = merge([maxpool_0, maxpool_1, maxpool_2, maxpool_3], mode='concat', concat_axis=1)
    merged_tensor_attn = merge([attn_0, attn_1, attn_2, attn_3], mode='concat', concat_axis=1)
    merged_tensor_avg = merge([avg_0, avg_1, avg_2, avg_3], mode='concat', concat_axis=1)
    merged_tensor = merge([merged_tensor_maxpool, merged_tensor_attn, merged_tensor_avg], mode='concat', concat_axis=1)
    
    output = Dropout(0.7)(merged_tensor)
    output = Dense(units=144, activation='relu')(output)
    output = Dense(units=out_size, activation='sigmoid')(output)
    
    model = Model(inputs=[input_layer, pos_input_layer], outputs=output)
    adam_optimizer = optimizers.Adam(lr=1e-3, decay=1e-6, clipvalue=5)
    model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
    return model

In [None]:
def get_kmax_text_cnn(nb_words, embedding_dim, embedding_matrix, max_sequence_length, out_size):
    embedding_layer = Embedding(nb_words,
                                embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_sequence_length,
                                trainable=False)
    
    filter_nums = 180
    
    input_layer = Input(shape=(max_sequence_length,), dtype='int32', name='Onehot')
    embedding_sequences = embedding_layer(input_layer)
    final_embedding_sequences = SpatialDropout1D(embedding_sequences)
    
    conv_0 = Conv1D(filter_nums, 1, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)
    conv_1 = Conv1D(filter_nums, 2, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)
    conv_2 = Conv1D(filter_nums, 3, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)
    conv_3 = Conv1D(filter_nums, 4, kernel_initializer='normal', padding='valid', activation='relu')(final_embedding_sequences)
    
    maxpool_0 = KMaxPooling(k=3)(conv_0)
    maxpool_1 = KMaxPooling(k=3)(conv_1)
    maxpool_2 = KMaxPooling(k=3)(conv_2)
    maxpool_3 = KMaxPooling(k=3)(conv_3)
    
    merged_tensor = merge([maxpool_0, maxpool_1, maxpool_2, maxpool_3], mode='concat', concat_axis=1)
    output = Dropout(0.6)(merged_tensor)
    output = Dense(units=144, activation='relu')(output)
    output = Dense(units=out_size, activation='sigmoid')(output)
    
    model = Model(inputs=input_layer, outputs=output)
    adam_optimizer = optimizers.Adam(lr=1e-3, decay=1e-6)
    model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
    return model

In [None]:
def get_rcnn(nb_words, embedding_dim, embedding_matrix, max_sequence_length, out_size):
    embedding_layer = Embedding(nb_words,
                                embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_sequence_length,
                                trainable=False)
    
    recurrent_units = 64
    filter_nums = 128
    
    input_layer = Input(shape=(max_sequence_length,), dtype='int32', name='Onehot')
    embedding_sequences = embedding_layer(input_layer)
    final_embedding_sequences = SpatialDropout1D(0.2)(embedding_sequences)
    
    rnn_layer = Bidirectional(CuDNNGRU(recurrent_units, return_sequences=True))(final_embedding_sequences)
    conv_layer = Conv1D(filter_nums, 2, kernel_initializer='normal', padding='valid', activation='relu', strides=1)(rnn_layer)
    
    maxpool = GlobalMaxPooling1D()(conv_layer)
    attn = AttentionWeightedAverage()(conv_layer)
    avg = GlobalAveragePooling1D()(conv_layer)
    
    merged_tensor = merge([maxpool, attn, avg], mode='concat', concat_axis=1)
    output = Dropout(0.5)(merged_tensor)
    output = Dense(units=120, activation='relu')(output)
    output = Dense(units=out_size, activation='sigmoid')(output)
    
    model = Model(inputs=input_layer, outputs=output)
    adam_optimizer = optimizers.Adam(lr=1e-3, decay=1e-6, clipvalue=5)
    model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
    return model

In [None]:
def get_av_rnn(nb_words, embedding_dim, embedding_matrix, max_sequence_length, out_size):
    embedding_layer = Embedding(nb_words,
                                embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_sequence_length,
                                trainable=False)
    
    recurrent_units = 64
    
    input_layer = Input(shape=(max_sequence_length,), dtype='int32', name='Onehot')
    embedding_sequences = embedding_layer(input_layer)
    final_embedding_sequences = SpatialDropout1D(0.25)(embedding_sequences)
    
    rnn_layer_0 = Bidirectional(CuDNNGRU(recurrent_units, return_sequences=True))(final_embedding_sequences)
    rnn_layer_1 = Bidirectional(CuDNNGRU(recurrent_units, return_sequences=True))(rnn_layer_0)
    merged_rnn_layer = merge([rnn_layer_0, rnn_layer_1], mode='concat', concat_axis=2)
    
    last_layer = Lambda(lambda t: t[:, -1], name='last_layer')(merged_rnn_layer)
    maxpool = GlobalMaxPooling1D()(merged_rnn_layer)
    attn = AttentionWeightedAverage()(merged_rnn_layer)
    avg = GlobalAveragePooling1D()(merged_rnn_layer)
    
    merged_tensor = merge([last_layer, maxpool, attn, avg], mode='concat', concat_axis=1)
    output = Dropout(0.5)(merged_tensor)
    output = Dense(units=144, activation='relu')(output)
    output = Dense(units=out_size, activation='sigmoid')(output)
    
    model = Model(inputs=input_layer, outputs=output)
    adam_optimizer = optimizers.Adam(lr=1e-3, decay=1e-6, clipvalue=5)
    model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
    return model

In [None]:
def get_av_pos_rnn(nb_words, embedding_dim, embedding_matrix, max_sequence_length, out_size):
    embedding_layer = Embedding(nb_words,
                                embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_sequence_length,
                                trainable=False)
    
    pos_embedding_layer = Embedding(50,
                                    35,
                                    input_length=max_sequence_length,
                                    trainable=True)
    
    recurrent_units = 64
    
    input_layer = Input(shape=(max_sequence_length,), dtype='int32', name='Onehot')
    pos_input_layer = Input(shape=(max_sequence_length,), dtype='int32', name='POS')
    embedding_sequences = embedding_layer(input_layer)
    pos_sequences = pos_embedding_layer(pos_input_layer)
    merged_embedding_layer = concatenate([embedding_sequences, pos_sequences], axis=2)
    final_embedding_sequences = SpatialDropout1D(0.2)(merged_embedding_layer)
    
    rnn_layer_0 = Bidirectional(CuDNNGRU(recurrent_units, return_sequences=True))(final_embedding_sequences)
    rnn_layer_0 = SpatialDropout1D(0.3)(rnn_layer_0)
    rnn_layer_1 = Bidirectional(CuDNNGRU(recurrent_units, return_sequences=True))(rnn_layer_0)
    
    last_layer = Lambda(lambda t: t[:, -1], name='last_layer')(rnn_layer_1)
    maxpool = GlobalMaxPooling1D()(rnn_layer_1)
    attn = AttentionWeightedAverage()(rnn_layer_1)
    avg = GlobalAveragePooling1D()(rnn_layer_1)
    
    merged_tensor = merge([last_layer, maxpool, attn, avg], mode='concat', concat_axis=1)
    output = Dropout(0.5)(merged_tensor)
    output = Dense(units=144, activation='relu')(output)
    output = Dense(units=out_size, activation='sigmoid')(output)
    
    model = Model(inputs=[input_layer, pos_input_layer], outputs=output)
    adam_optimizer = optimizers.Adam(lr=1e-3, decay=1e-6, clipvalue=5)
    model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
    return model

In [None]:
def get_dropout_bigru(nb_words, embedding_dims, embedding_matrix, max_sequence_length, out_size):
    embedding_layer = Emebdding(nb_words,
                                embedding_dims,
                                weights=[embedding_matrix],
                                input_length=max_sequence_length,
                                trainable=False)
    
    recurrent_units = 64
    
    input_layer = Input(shape=(max_sequence_length,), dtype='int32', name='Onehot')
    embedding_sequences = embedding_layer(input_layer)
    final_embedding_sequences = SpatialDropout1D(0.2)(embedding_sequences)
    
    rnn_layer = Bidirectional(CuDNNGRU(recurrent_units, return_sequences=True))(final_embedding_sequences)
    rnn_layer = Dropout(0.35)(rnn_layer)
    rnn_layer = Bidirectional(CuDNNGRU(recurrent_units, return_sequences=True))(rnn_layer)
    
    last_layer = Lambda(lambda t: t[:, -1])(rnn_layer)
    maxpool = GlobalMaxPooling1D()(rnn_layer)
    avg = GlobalAveragePooling1D()(rnn_layer)
    
    merged_tensor = merge([last_layer, maxpool, avg], mode='concat', concat_axis=1)
    output = Dropout(0.5)(merged_tensor)
    output = Dense(units=72, activation='relu')(output)
    output = Dense(units=out_size, activation='sigmoid')(output)
    
    model = Model(inputs=input_layer, outputs=output)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

## Pytorch Based

In [29]:
class DotAttention(nn.Module):
    
    def __init__(self, hidden_size):
        super(DotAttention, self).__init__()
        self.hidden_size = hidden_size
        self.attn_vector = nn.Parameter(torch.Tensor(1, hidden_size), requires_grad=True)
        
        init.xavier_uniform(self.attn_vector.data)
        
    def forward(self, inputs, lengths=None):
        """
        Return a scalar (All hiddens to only one weight for one output)
        (batch_size, max_len, hidden_size) * (batch_size, hidden_size, 1) --> (batch_size, max_len, 1)
        """
        batch_size, max_len = inputs.size()[:2]
        
        weights = torch.bmm(inputs,
                            self.attn_vector              # (1, hidden)
                            .unsqueeze(0)                 # (1, 1, hidden)
                            .transpose(2, 1)              # (1, hidden, 1)
                            .repeat(batch_size, 1, 1))    # (batch_size, hidden, 1)
        
        attn_energies = F.softmax(F.relu(weights.squeeze()))
        
        # create mask based on the sentence length
        mask = Variable(torch.ones(attn_energies.size())).cuda()
        for i, l in enumerate(lengths):
            if l < max_len:
                mask[:, l:] = 0
                
        # apply mask and renormalize attention scores (weights)
        masked = attn_energies * mask
        _sums = masked.sum(-1).expand_as(attn_energies)
        attention_weights = masked.div(_sum)
        
        # apply attention weights
        weighted = torch.mul(inputs, attention_weights.unsqueeze(-1).expand_as(inputs))
        
        # get the final fixed vector representations of the sentences
        representations = weighted.sum(1).squeeze()
        
        return representations, attention_weights

In [30]:
class EmbeddingDropout():
    """
    Implement of word embedding dropout.
    """
    def __init__(self, p=0.5):
        super(EmbeddingDropout, self).__init__()
        if p < 0 or p > 1:
            raise ValueError("dropout probability has to be between 0 and 1, but got {}".format(p))
            
        self.p = p
        self.trainable = True
    
    def forward(self, inputs):
        if self.p > 0 and self.trainable:
            dim = inputs.dim()
            if dim == 1:
                inputs = inputs.view(1, -1)
            batch_size = inputs.size(0)
            for i in range(batch_size):
                x = np.unique(inputs[i].numpy())
                x = np.nonzero(x)[0]
                if len(x) == 0:
                    return inputs
                x = torch.from_numpy(x)
                noise = x.new().resize_as_(x)
                noise.bernoulli_(self.p)
                x = x.mul(noise)
                for value in x:
                    if value > 0:
                        mask = inputs[i].eq(value)
                        inputs[i].masked_fill_(mask, 0)
            if dim == 1:
                inputs = inputs.view(-1)
        return inputs

In [31]:
class SequentialDropout(nn.Module):
    
    def __init__(self, p=0.5):
        super(SequentialDropout, self).__init__()
        if p < 0 or p > 1:
            raise ValueError("dropout probability has to be between 0 and 1, but got {}".format(p))
            
        self.p = p
        self.restart = True
        self.trainable = True
        
    def _make_noise(self, inputs):
        return Variable(inputs.data.new().resize_as_(inputs.data))
        
    def forward(self, inputs):
        if self.p > 0 and self.trainable:
            if self.restart:
                self.noise = self._make_noise(inputs)
                self.noise.data.bernoulli_(1 - self.p).div_(1 - self.p)
                if self.p == 1:
                    self.noise.data.fill_(0)
                self.noise = self.noise.expand_as(inputs)
                self.restart = False
            return inputs.mul(self.noise)
        
        return inputs
    
    def end_of_sequence(self):
        self.restart = True
        
    def backward(self, grad_output):
        self.end_of_sequence()
        if self.p > 0 and self.trainable:
            return grad_output.mul(self.noise)
        else:
            return grad_output
        
    def __repr__(self):
        return type(self).__name__ + '({:.4f})'.format(self.p)

In [32]:
# Test Embedding & Sequential Dropout
seq_drop_model = SequentialDropout(p=0.5)
input_data= Variable(torch.ones(1, 10), volatile=True)

dist_total = torch.zeros(1)
output_last = seq_drop_model(input_data)
for i in range(50):
    output_new = seq_drop_model(input_data)
    dist_total += torch.dist(output_new, output_last).data
    output_last = output_new
    
if not torch.equal(dist_total, torch.zeros(1)):
    print('Error')
    print(dist_total)
    
seq_drop_model.end_of_sequence()

dist_total = torch.zeros(1)
for i in range(50):
    dist_total += torch.dist(output_last, seq_drop_model(input_data)).data
    seq_drop_model.end_of_sequence()
    
if torch.equal(dist_total, torch.zeros(1)):
    print('Error')
    
emb_drop_model = EmbeddingDropout(p=0.15)
input_data = torch.Tensor([[1,2,3,0,0], [5,3,2,2,0]]).long()
print(input_data)
print(emb_drop_model.forward(input_data))

tensor([[ 1,  2,  3,  0,  0],
        [ 5,  3,  2,  2,  0]])
tensor([[ 1,  2,  3,  0,  0],
        [ 5,  3,  2,  2,  0]])


  This is separate from the ipykernel package so we can avoid doing imports until


In [33]:
class AbstractGRUCell(nn.Module):
    
    def __init__(self, input_size, hidden_size, bias_ih=True, bias_hh=False):
        super(AbstractGRUCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias_ih = bias_ih
        self.bias_hh = bias_hh
        
        self.weight_wr = nn.Linear(input_size, hidden_size, bias=bias_ih)
        self.weight_wz = nn.Linear(input_size, hidden_size, bias=bias_ih)
        self.weight_wh = nn.Linear(input_size, hidden_size, bias=bias_ih)
        self.weight_ur = nn.Linear(hidden_size, hidden_size, bias=bias_hh)
        self.weight_uz = nn.Linear(hidden_size, hidden_size, bias=bias_hh)
        self.weight_uh = nn.Linear(hidden_size, hidden_size, bias=bias_hh)
        
    def forward(self, x, hx=None):
        # Interface
        raise NotImplementedError

In [34]:
class GRUCell(AbstractGRUCell):
    
    def __init__(self, input_size, hidden_size, bias_ih=True, bias_hh=False):
        super(GRUCell, self).__init__(input_size, hidden_size, bias_ih, bias_hh)
        
    def forward(self, x, hx=None):
        if hx is None:
            hx = Variable(x.data.new().resize_((x.size(0), self.hidden_size)).fill_(0))
        r = F.sigmoid(self.weight_wr(x) + self.weight_ur(hx))
        z = F.sigmoid(self.weight_wz(x) + self.weight_uz(hx))
        ht = F.tanh(self.weight_wh(x) + self.weight_uh(r * hx))
        hx = (1 - i) * hx  + z * ht
        return hx

In [35]:
class BayesianGRUCell(AbstractGRUCell):
    
    def __init__(self, input_size, hidden_size, bias_ih=True, bias_hh=False, dropout=0.25):
        super(BayesianGRUCell, self).__init__(input_size, hidden_size, bias_ih, bias_hh)
        self.dropout = dropout
        self.set_dropout(self.dropout)
        
    def set_dropout(self, dropout):
        self.drop_wr = SequentialDropout(p=dropout)
        self.drop_wz = SequentialDropout(p=dropout)
        self.drop_wh = SequentialDropout(p=dropout)
        self.drop_ur = SequentialDropout(p=dropout)
        self.drop_uz = SequentialDropout(p=dropout)
        self.drop_uh = SequentialDropout(p=dropout)
    
    def end_of_sequence(self):
        self.drop_wr.end_of_sequence()
        self.drop_wz.end_of_sequence()
        self.drop_wh.end_of_sequence()
        self.drop_ur.end_of_sequence()
        self.drop_uz.end_of_sequence()
        self.drop_uh.end_of_sequence()
        
    def forward(self, x, hx=None):
        if hx is None:
            hx = Variable(x.data.new().resize_((x.size(0), self.hidden_size)).fill_(0))
        x_wr = self.drop_wr(x)
        x_wz = self.drop_wz(x)
        x_wh = self.drop_wh(x)
        x_ur = self.drop_ur(hx)
        x_uz = self.drop_uz(hx)
        x_uh = self.drop_uh(hx)
        r = F.sigmoid(self.weight_wr(x_wr) + self.weight_ur(x_ur))
        z = F.sigmoid(self.weight_wz(x_wz) + self.weight_uz(x_uz))
        ht = F.tanh(self.weight_wh(x_wh) + self.weight_uh(r * x_uh))
        hx = (1 - z) * hx + z * ht
        return hx

In [53]:
class RHNCell(nn.Module):
    
    def __init__(self, input_size, hidden_size, is_first_layer, recurrent_dropout):
        super(RHNCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.is_first_layer = is_first_layer
        
        self.set_dropout(recurrent_dropout)
        
        if self.is_first_layer:
            self.W_H = nn.Linear(input_size, hidden_size)
            self.W_C = nn.Linear(input_size, hidden_size)
        
        self.R_H = nn.Linear(hidden_size, hidden_size, bias=True)
        self.R_C = nn.Linear(hidden_size, hidden_size, bias=True)
        
    def set_dropout(self, dropout):
        self.dropout = dropout
        self.drop_ir = SequentialDropout(p=self.dropout)
        self.drop_ii = SequentialDropout(p=self.dropout)
        self.drop_hr = SequentialDropout(p=self.dropout)
        self.drop_hi = SequentialDropout(p=self.dropout)
        
    def end_of_sequence(self):
        self.drop_ir.end_of_sequence()
        self.drop_ii.end_of_sequence()
        self.drop_hr.end_of_sequence()
        self.drop_hi.end_of_sequence()
        
    def forward(self, _input, prev_hidden):
        c_i = self.drop_hr(prev_hidden)
        h_i = self.drop_hi(prev_hidden)
        
        if self.is_first_layer:
            x_i = self.drop_ii(_input)
            x_r = self.drop_ir(_input)
            h1 = nn.Tanh()(self.W_H(x_i) + self.R_H(h_i))
            t1 = nn.Sigmoid()(self.W_C(x_r) + self.R_C(c_i))
        else:
            h1 = nn.Tanh()(self.R_H(h_i))
            t1 = nn.Sigmoid()(self.R_C(c_i))
            
        h = (h1 * t1) + (prev_hidden * (1 - t1))
        
        return h

In [37]:
class AbstractGRU(nn.Module):
    
    def __init__(self, input_size, hidden_size, bias_ih=True, bias_hh=False):
        super(AbstractGRU, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias_ih = bias_ih
        self.bias_hh = bias_hh
        self.gru_cell = None
        self._load_gru_cell()
        
    def _load_gru_cell(self):
        # Interface
        raise NotImplementError
        
    def forward(self, x, hx=None, max_length=None):
        batch_size = x.size(0)
        seq_length = x.size(1)
        if max_length is None:
            max_length = seq_length
        output = []
        for i in range(max_length):
            # hidden output of every time-step
            hx = self.gru_cell(x[:, i, :], hx=hx)
            output.append(hx.view(batch_size, 1, self.hidden_size))
        result = torch.cat(output, 1)
        return result, hx

In [38]:
class GRU(AbstractGRU):
    
    def __init__(self, input_size, hidden_size, bias_ih=True, bias_hh=False):
        super(GRU, self).__init__(input_size, hidden_size, bias_ih, bias_hh)
        
    def _load_gru_cell(self):
        self.gru_cell = GRUCell(self.input_size, self.hidden_size, self.bias_ih, self.hh)

In [39]:
class BiBayesianGRU(AbstractGRU):
    
    def __init__(self, input_size, hidden_size, bias_ih=True, bias_hh=False, dropout=0.25):
        self.dropout = dropout
        super(BiBayesianGRU, self).__init__(input_size, hidden_size, bias_ih, bias_hh)
        
    def _load_gru_cell(self):
        self.gru_cell = BayesianGRUCell(self.input_size, self.hidden_size, self.bias_ih, self.bias_hh, dropout=self.dropout)
        
    def set_dropout(self, dropout):
        self.dropout = dropout
        self.gru_cell.set_dropout(dropout)
        
    def init_hidden(self, batch_size):
        return Variable(torch.zeros(1, batch_size, self.hidden_size)).cuda()
    
    def forward(self, x, hx=None, max_length=None, lengths=None):
        batch_size = x.size(0)
        seq_length = x.size(1)
        if max_length is None:
            max_length = seq_length
        lefts, rights = [], []
        
        # left part
        lhx = self.init_hidden(batch_size)
        for i in range(max_length):
            new_hx = self.gru_cell(x[:, i, :], hx=hx)
            mask = (i < lengths).float().unsqueeze(1).expand_as(new_hx)
            lhx = new_hx * mask + lhx * (1 - mask)
            lefts.append(lhx.view(batch_size, 1, self.hidden_size))
        self.gru_cell.end_of_sequence()
        lefts = torch.cat(lefts, 1)
        
        # right part
        rhx = self.init_hidden(batch_size)
        for i in range(max_length - 1, -1, -1):
            new_hx = self.gru_cell(x[:, i, :], hx=hx)
            mask = (i < lengths).float().unsqueeze(1).expand_as(new_hx)
            rhx = new_hx * mask + rhx * (1 - mask)
            rights.append(rhx.view(batch_size, 1, self.hidden_size))
        self.gru_cell.end_of_sequence()
        rights = torch.cat(rights, 1)
        
        output = torch.cat((lefts, rights), dim=2)
        return output, lhx

In [40]:
class ModelManager(object):
    
    def __init__(self, path=None):
        self.path = path
        
    def save_model(self, model, path=None):
        path = self.path if path is None else path
        torch.save(model.state_dict(), path)
        print("Model has been saved as %s.\n" % path)
        
    def load_model(self, model, path=None):
        path = self.path if path is None else path
        model.load_state_dict(torch.load(path))
        model.eval()
        print("A pre-trained model at %s has been loaded." % path)

In [41]:
class BaseModel(nn.Module):
    
    def __init__(self):
        super(BaseModel, self).__init__()
        print("Choose the torch base model.")
        self.manager = ModelManager()
        
    def save(self, path):
        self.manager.save_model(self, path)
        
    def load(self, path):
        self.manager.load_model(self, path)
        
    def forward(self, x):
        raise NotImplementedError
        
    def predict(self, x, batch_size=256, verbose=0):
        self.eval_model()
        predictions = []
        for batch_xs in test_batches_generator(x, batch_size):
            preds_var = self.forward(batch_xs)
            preds_logits = nn.Sigmoid()(preds_var)
            predictions.append(preds_logits.data.cpu().numpy())
        predictions = np.concatenate(predictions, axis=0)
        return predictions
        
    def set_dropout(self, p):
        pass
    
    def train_model(self, p):
        self.set_dropout(p)
        self.train()
        
    def eval_model(self):
        self.eval()

In [42]:
class GRUClassifier(BaseModel):
    
    def __init__(self, input_size, hidden_size, embedding):
        super(GRUClassifier, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedding = embedding
        
        self.gru = nn.GRU(input_size=self.input_size, hidden_size=self.hidden_size, batch_first=True, num_layers=2, dropout=0.35, bidirectional=True)
        
        self.attn = DotAttention(hidden_size=2 * hidden_size)
        
        self.classifier = nn.Sequential(
            OrderDict([
                ('gru_dropout', nn.Dropout(0.5)),
                ('h1', nn.Linear(self.hidden_size * 6, 108)),
                ('relu1', nn.ReLU()),
                ('out', nn.Linear(108, 6)),
            ])
        )
        
    def set_dropout(self, p):
        pass
    
    def forward(self, _input, hidden=None, lengths=None):
        _input, lengths = _input
        embedded = self.embedding(_input)
        
        out, _ = self.gru(embedded)
        last = out[:, -1, :]
        attn, _ = self.attn.forward(out)
        max_num, _ = torch.max(out, dim=1)
        concatenated = torch.cat([last, max_num, attn], dim=1)
        result = self.classifier(concatenated)
        
        return result

In [43]:
class BayesianGRUClassifier(BaseModel):
    
    def __init__(self, input_size, hidden_size, embedding):
        super(BayesianGRUClassifier, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedding = embedding
        
        self.gru1 = BiBayesianGRU(input_size=self.input_size, hidden_size=self.hidden_size, dropout=0.3)
        self.gru2 = BiBayesianGRU(input_size=2 * self.hidden_size, hidden_size=hidden_size, dropout=0.3)
        
        self.classifier = nn.Sequential(
            OrderedDict([
                ('gru_dropout', nn.Dropout(0.5)),
                ('h1', nn.Linear(self.hidden_size * 4, 72)),
                ('relu1', nn.ReLU()),
                ('out', nn.Linear(72, 6)),
            ])
        )
        
    def set_dropout(self, p):
        self.gru1.set_dropout(p)
        self.gru2.set_dropout(p)
        
    def forward(self, _input, hidden=None, lengths=None):
        _input, lengths = _input
        embedded = self.embedding(_input)
        
        out1, _ = self.gru1.forward(embedded, lengths=lengths)
        out2, _ = self.gru2.forward(out1, lengths=lengths)
        
        last = out2[:, -1, :]
        max_num, _ = torch.max(out2, dim=1)
                
        concatenated = torch.cat([last, max_num], dim=1)
        result = self.classifier(concatenated)
                
        return result

In [54]:
class RecurrentHighwayClassifier(BaseModel):
    
    def __init__(self, input_size, hidden_size, recurrent_length, embedding, recurrent_dropout=0.3):
        super(RecurrentHighwayClassifier, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.L = recurrent_length
        self.recurrent_dropout = recurrent_dropout
        self.highways = nn.ModuleList()
        self.highways.append(RHNCell(self.input_size, self.hidden_size, is_first_layer=True, recurrent_dropout=self.recurrent_dropout))
        
        for _ in range(self.L - 1):
            self.highways.append(RHNCell(self.input_size, self.hidden_size, is_first_layer=False, recurrent_dropout=self.recurrent_dropout))
            
        self.embedding = embedding
        
        self.classifier = nn.Sequential(
            OrderedDict([
                ('h1_dropout', nn.Dropout(0.5)),
                ('h1', nn.Linear(self.hidden_size * 4, 74)),
                ('relu1', nn.ReLU()),
                ('out', nn.Linear(74, 6)),
            ])
        )
        
    def init_state(self, batch_size):
        hidden = Variable(torch.zeros(batch_size, self.hidden_size).cuda())
        return hidden
    
    def set_dropout(self, p):
        for rhn_cell in self.highways:
            rhn_cell.set_dropout(p)
            
    def forward(self, _input, hidden=None, lengths=None):
        '''
        Input including input_sequences and sequence_length.
        '''
        _input, lengths = _input
        batch_size = _input.size(0)
        seq_length = _input.size(1)
        
        if hidden is None:
            hidden = self.init_state(batch_size)
        embed_batch = self.embedding(_input)
        
        lefts, rights = [], []
        
        for time in range(seq_length):
            for tick in range(self.L):
                next_hidden = self.highways[tick](embed_batch[:, time, :], hidden)
                mask = (time < lengths).float().unsqueeze(1).expand_as(next_hidden)
                hidden = next_hidden * mask + hidden * (1 - mask)
            lefts.append(hidden.unsqueeze(1))
        lefts = torch.cat(lefts, dim=1)
        
        for rhn_cell in self.highways:
            rhn_cell.end_of_sequence()
            
        for time in range(seq_length - 1, -1, -1):
            for tick in range(self.L):
                next_hidden = self.highways[tick](embed_batch[:, time, :], hidden)
                mask = (time < lengths).float().unsqueeze(1).expand_as(next_hidden)
                hidden = next_hidden * mask + hidden * (1 - mask)
            rights.append(hidden.unsqueeze(1))
        rights = torch.cat(rights, dim=1)
        
        for rhn_cell in self.highways:
            rhn_cell.end_of_sequence()
            
        outputs = torch.cat([lefts, rights], dim=2)
        
        last = outputs[:, -1, :]
        max_num, _ = torch.max(outputs, dim=1)
        
        concatenated = torch.cat([last, max_num], dim=1)
        result = self.classifier(concatenated)
        
        return result

# Start Training

## Keras Based

In [None]:
model_name = 'fasttext-avcnn-pos-' + str(nb_words) + 'vocabulary-' + str(MAX_SEQUENCE_LENGTH) + 'length'
model = get_av_pos_cnn(nb_words, EMBEDDING_DIM, embedding_matrix, MAX_SEQUENCE_LENGTH, 6)
model.summary()

In [None]:
models, val_loss, total_auc, fold_predictions = train_folds(train_data, pos_train_data, train_labels, FOLD_COUNT, BATCH_SIZE, model)

In [None]:
print('Overall val-loss: {}, AUC {}'.format(val_loss, total_auc))

## Pytorch Based

In [None]:
def get_bgru_network():
    embedding = nn.Embedding(MAX_NB_WORDS, EMBEDDING_DIM)
    embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
    embedding.weight.requires_grad=False
    return BayesianGRUClassifier(input_size=EMBEDDING_DIM, hidden_size=60, embedding=embedding)

In [55]:
def get_recurrent_higtway_classifier():
    embedding = nn.Embedding(MAX_NB_WORDS, EMBEDDING_DIM)
    embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
    embedding.weight.requires_grad=False
    return RecurrentHighwayClassifier(input_size=EMBEDDING_DIM, hidden_size=60, embedding=embedding, recurrent_length=2, recurrent_dropout=0.25)

In [56]:
models, val_loss, total_auc, fold_predictions = pytorch_train_folds(x=train_sequences, y=train_labels, fold_count=10, batch_size=256, get_model_func=get_recurrent_highway_classifier, skip_fold=0)

Choose the torch base model.
## Training on fold 0 ##
Epoch: 1 Batch: 0 Log-loss: 0.6909138560295105
Epoch: 1 Batch: 40 Log-loss: 0.09789622575044632
Epoch: 1 Batch: 80 Log-loss: 0.1381751447916031
Epoch: 1 Batch: 120 Log-loss: 0.08953512459993362
Epoch: 1 Batch: 160 Log-loss: 0.0665510818362236
Epoch: 1 Batch: 200 Log-loss: 0.08710252493619919
Epoch: 1 Batch: 240 Log-loss: 0.05753612890839577
Epoch: 1 Batch: 280 Log-loss: 0.07375114411115646
Epoch: 1 Batch: 320 Log-loss: 0.07871861010789871
Epoch: 1 Batch: 360 Log-loss: 0.07296248525381088
Epoch: 1 Batch: 400 Log-loss: 0.06677073985338211
Epoch: 1 Batch: 440 Log-loss: 0.04778880253434181
Epoch: 1 Batch: 480 Log-loss: 0.034908317029476166
Epoch: 1 Batch: 520 Log-loss: 0.05689697340130806
Epoch: 1 Batch: 560 Log-loss: 0.06855592131614685
Epoch average log-loss: 0.09195422819216868
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 1, val_loss: 0.053436464985758686, best_val_loss: 0.053436464985758686, best_auc:

Epoch: 9 Batch: 320 Log-loss: 0.029772600159049034
Epoch: 9 Batch: 360 Log-loss: 0.03644464537501335
Epoch: 9 Batch: 400 Log-loss: 0.04935148358345032
Epoch: 9 Batch: 440 Log-loss: 0.04174357280135155
Epoch: 9 Batch: 480 Log-loss: 0.03916306048631668
Epoch: 9 Batch: 520 Log-loss: 0.03705568239092827
Epoch: 9 Batch: 560 Log-loss: 0.051291849464178085
Epoch average log-loss: 0.0445412934624723
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 9, val_loss: 0.041479047999132455, best_val_loss: 0.041479047999132455, best_auc: 0.9860580803359587
Epoch: 10 Batch: 0 Log-loss: 0.05114366486668587
Epoch: 10 Batch: 40 Log-loss: 0.04128624126315117
Epoch: 10 Batch: 80 Log-loss: 0.043263595551252365
Epoch: 10 Batch: 120 Log-loss: 0.03587421774864197
Epoch: 10 Batch: 160 Log-loss: 0.04254782199859619
Epoch: 10 Batch: 200 Log-loss: 0.047971997410058975
Epoch: 10 Batch: 240 Log-loss: 0.052927348762750626
Epoch: 10 Batch: 280 Log-loss: 0.036784302443265915
Epoch: 10 Batch: 32

Epoch: 18 Batch: 0 Log-loss: 0.032104719430208206
Epoch: 18 Batch: 40 Log-loss: 0.04511968418955803
Epoch: 18 Batch: 80 Log-loss: 0.03888026997447014
Epoch: 18 Batch: 120 Log-loss: 0.04201468452811241
Epoch: 18 Batch: 160 Log-loss: 0.030432039871811867
Epoch: 18 Batch: 200 Log-loss: 0.029706992208957672
Epoch: 18 Batch: 240 Log-loss: 0.05270456150174141
Epoch: 18 Batch: 280 Log-loss: 0.04667198285460472
Epoch: 18 Batch: 320 Log-loss: 0.03768365830183029
Epoch: 18 Batch: 360 Log-loss: 0.038321856409311295
Epoch: 18 Batch: 400 Log-loss: 0.04278748854994774
Epoch: 18 Batch: 440 Log-loss: 0.035271767526865005
Epoch: 18 Batch: 480 Log-loss: 0.05709590017795563
Epoch: 18 Batch: 520 Log-loss: 0.047594550997018814
Epoch: 18 Batch: 560 Log-loss: 0.046552181243896484
Epoch average log-loss: 0.04169747363443353
In Epoch: 18, val_loss: 0.04024400187725147, best_val_loss: 0.03986488318055153, best_auc: 0.9889411308654562
Epoch: 19 Batch: 0 Log-loss: 0.03365618735551834
Epoch: 19 Batch: 40 Log-loss:

Epoch: 26 Batch: 520 Log-loss: 0.0369560644030571
Epoch: 26 Batch: 560 Log-loss: 0.0576544888317585
Epoch average log-loss: 0.04041304625570774
In Epoch: 26, val_loss: 0.03928247877733998, best_val_loss: 0.03927607626804285, best_auc: 0.9886309826532315
Epoch: 27 Batch: 0 Log-loss: 0.044079214334487915
Epoch: 27 Batch: 40 Log-loss: 0.039344530552625656
Epoch: 27 Batch: 80 Log-loss: 0.04814763739705086
Epoch: 27 Batch: 120 Log-loss: 0.04104357957839966
Epoch: 27 Batch: 160 Log-loss: 0.057558655738830566
Epoch: 27 Batch: 200 Log-loss: 0.05325515568256378
Epoch: 27 Batch: 240 Log-loss: 0.04926185682415962
Epoch: 27 Batch: 280 Log-loss: 0.04049883782863617
Epoch: 27 Batch: 320 Log-loss: 0.052252158522605896
Epoch: 27 Batch: 360 Log-loss: 0.031765151768922806
Epoch: 27 Batch: 400 Log-loss: 0.029723526909947395
Epoch: 27 Batch: 440 Log-loss: 0.033663827925920486
Epoch: 27 Batch: 480 Log-loss: 0.03940055891871452
Epoch: 27 Batch: 520 Log-loss: 0.05548809468746185
Epoch: 27 Batch: 560 Log-loss

Epoch: 35 Batch: 440 Log-loss: 0.03520004823803902
Epoch: 35 Batch: 480 Log-loss: 0.058036744594573975
Epoch: 35 Batch: 520 Log-loss: 0.04026423394680023
Epoch: 35 Batch: 560 Log-loss: 0.031175360083580017
Epoch average log-loss: 0.03995569550897926
In Epoch: 35, val_loss: 0.039889466217785934, best_val_loss: 0.03905088162707284, best_auc: 0.9891555659193183
Epoch: 36 Batch: 0 Log-loss: 0.03982589766383171
Epoch: 36 Batch: 40 Log-loss: 0.03836211934685707
Epoch: 36 Batch: 80 Log-loss: 0.0388035774230957
Epoch: 36 Batch: 120 Log-loss: 0.03995800390839577
Epoch: 36 Batch: 160 Log-loss: 0.04431477561593056
Epoch: 36 Batch: 200 Log-loss: 0.025040989741683006
Epoch: 36 Batch: 240 Log-loss: 0.04572805389761925
Epoch: 36 Batch: 280 Log-loss: 0.03557261452078819
Epoch: 36 Batch: 320 Log-loss: 0.033415939658880234
Epoch: 36 Batch: 360 Log-loss: 0.04244757071137428
Epoch: 36 Batch: 400 Log-loss: 0.038047727197408676
Epoch: 36 Batch: 440 Log-loss: 0.05842546746134758
Epoch: 36 Batch: 480 Log-loss

Epoch: 1 Batch: 320 Log-loss: 0.06599095463752747
Epoch: 1 Batch: 360 Log-loss: 0.05448780581355095
Epoch: 1 Batch: 400 Log-loss: 0.07055876404047012
Epoch: 1 Batch: 440 Log-loss: 0.053293779492378235
Epoch: 1 Batch: 480 Log-loss: 0.05364666506648064
Epoch: 1 Batch: 520 Log-loss: 0.04730821028351784
Epoch: 1 Batch: 560 Log-loss: 0.04034842923283577
Epoch average log-loss: 0.08703740330945169
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 1, val_loss: 0.052948672087820954, best_val_loss: 0.052948672087820954, best_auc: 0.9712086616844072
Epoch: 2 Batch: 0 Log-loss: 0.05144227668642998
Epoch: 2 Batch: 40 Log-loss: 0.055775925517082214
Epoch: 2 Batch: 80 Log-loss: 0.04900525137782097
Epoch: 2 Batch: 120 Log-loss: 0.05491988733410835
Epoch: 2 Batch: 160 Log-loss: 0.07973628491163254
Epoch: 2 Batch: 200 Log-loss: 0.07194353640079498
Epoch: 2 Batch: 240 Log-loss: 0.03282599523663521
Epoch: 2 Batch: 280 Log-loss: 0.03812393918633461
Epoch: 2 Batch: 320 Log-loss: 

Epoch: 10 Batch: 0 Log-loss: 0.049696892499923706
Epoch: 10 Batch: 40 Log-loss: 0.045723553746938705
Epoch: 10 Batch: 80 Log-loss: 0.03784541040658951
Epoch: 10 Batch: 120 Log-loss: 0.05052245035767555
Epoch: 10 Batch: 160 Log-loss: 0.0345381535589695
Epoch: 10 Batch: 200 Log-loss: 0.035462457686662674
Epoch: 10 Batch: 240 Log-loss: 0.0559101365506649
Epoch: 10 Batch: 280 Log-loss: 0.03880618140101433
Epoch: 10 Batch: 320 Log-loss: 0.057563215494155884
Epoch: 10 Batch: 360 Log-loss: 0.05677991732954979
Epoch: 10 Batch: 400 Log-loss: 0.05409931018948555
Epoch: 10 Batch: 440 Log-loss: 0.05427367612719536
Epoch: 10 Batch: 480 Log-loss: 0.05000513419508934
Epoch: 10 Batch: 520 Log-loss: 0.041558366268873215
Epoch: 10 Batch: 560 Log-loss: 0.04385824874043465
Epoch average log-loss: 0.04262197836568313
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 10, val_loss: 0.04353801126065093, best_val_loss: 0.04353801126065093, best_auc: 0.9876119636687406
Epoch: 11 Batch

Epoch: 18 Batch: 480 Log-loss: 0.029355796054005623
Epoch: 18 Batch: 520 Log-loss: 0.029866425320506096
Epoch: 18 Batch: 560 Log-loss: 0.03735651820898056
Epoch average log-loss: 0.04063368532806635
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 18, val_loss: 0.041949441635225536, best_val_loss: 0.041949441635225536, best_auc: 0.9882905549486795
Epoch: 19 Batch: 0 Log-loss: 0.03200962394475937
Epoch: 19 Batch: 40 Log-loss: 0.04702135920524597
Epoch: 19 Batch: 80 Log-loss: 0.044384900480508804
Epoch: 19 Batch: 120 Log-loss: 0.044657547026872635
Epoch: 19 Batch: 160 Log-loss: 0.040187519043684006
Epoch: 19 Batch: 200 Log-loss: 0.03203034773468971
Epoch: 19 Batch: 240 Log-loss: 0.04374602809548378
Epoch: 19 Batch: 280 Log-loss: 0.03994901850819588
Epoch: 19 Batch: 320 Log-loss: 0.03671443089842796
Epoch: 19 Batch: 360 Log-loss: 0.031438905745744705
Epoch: 19 Batch: 400 Log-loss: 0.04317675903439522
Epoch: 19 Batch: 440 Log-loss: 0.023264160379767418
Epoch: 19

Epoch: 27 Batch: 240 Log-loss: 0.03636728972196579
Epoch: 27 Batch: 280 Log-loss: 0.03952385112643242
Epoch: 27 Batch: 320 Log-loss: 0.04357675835490227
Epoch: 27 Batch: 360 Log-loss: 0.03546091541647911
Epoch: 27 Batch: 400 Log-loss: 0.02720884419977665
Epoch: 27 Batch: 440 Log-loss: 0.026531359180808067
Epoch: 27 Batch: 480 Log-loss: 0.03320727497339249
Epoch: 27 Batch: 520 Log-loss: 0.027285272255539894
Epoch: 27 Batch: 560 Log-loss: 0.045950476080179214
Epoch average log-loss: 0.03935169238996293
In Epoch: 27, val_loss: 0.04190073196046846, best_val_loss: 0.041566256228733355, best_auc: 0.9885440142963783
Epoch: 28 Batch: 0 Log-loss: 0.053408920764923096
Epoch: 28 Batch: 40 Log-loss: 0.03792562708258629
Epoch: 28 Batch: 80 Log-loss: 0.039173971861600876
Epoch: 28 Batch: 120 Log-loss: 0.03182966262102127
Epoch: 28 Batch: 160 Log-loss: 0.03742187097668648
Epoch: 28 Batch: 200 Log-loss: 0.04261190816760063
Epoch: 28 Batch: 240 Log-loss: 0.03193444386124611
Epoch: 28 Batch: 280 Log-los

Epoch: 36 Batch: 40 Log-loss: 0.029747089371085167
Epoch: 36 Batch: 80 Log-loss: 0.038398709148168564
Epoch: 36 Batch: 120 Log-loss: 0.04459230229258537
Epoch: 36 Batch: 160 Log-loss: 0.04199961945414543
Epoch: 36 Batch: 200 Log-loss: 0.04024933651089668
Epoch: 36 Batch: 240 Log-loss: 0.04414758086204529
Epoch: 36 Batch: 280 Log-loss: 0.047311991453170776
Epoch: 36 Batch: 320 Log-loss: 0.03965149447321892
Epoch: 36 Batch: 360 Log-loss: 0.054626647382974625
Epoch: 36 Batch: 400 Log-loss: 0.02454730123281479
Epoch: 36 Batch: 440 Log-loss: 0.04280513897538185
Epoch: 36 Batch: 480 Log-loss: 0.03159680590033531
Epoch: 36 Batch: 520 Log-loss: 0.040469687432050705
Epoch: 36 Batch: 560 Log-loss: 0.02757883444428444
Epoch average log-loss: 0.039072232098052544
In Epoch: 36, val_loss: 0.04171421951889098, best_val_loss: 0.04140714119245084, best_auc: 0.9890981337434974
Epoch: 37 Batch: 0 Log-loss: 0.04768766835331917
Epoch: 37 Batch: 40 Log-loss: 0.053777992725372314
Epoch: 37 Batch: 80 Log-loss

Epoch: 45 Batch: 0 Log-loss: 0.03174015134572983
Epoch: 45 Batch: 40 Log-loss: 0.07204743474721909
Epoch: 45 Batch: 80 Log-loss: 0.04195719584822655
Epoch: 45 Batch: 120 Log-loss: 0.03763991966843605
Epoch: 45 Batch: 160 Log-loss: 0.03925725445151329
Epoch: 45 Batch: 200 Log-loss: 0.035029087215662
Epoch: 45 Batch: 240 Log-loss: 0.03618140146136284
Epoch: 45 Batch: 280 Log-loss: 0.04382982477545738
Epoch: 45 Batch: 320 Log-loss: 0.046890854835510254
Epoch: 45 Batch: 360 Log-loss: 0.04026982560753822
Epoch: 45 Batch: 400 Log-loss: 0.025770707055926323
Epoch: 45 Batch: 440 Log-loss: 0.02677927352488041
Epoch: 45 Batch: 480 Log-loss: 0.046928659081459045
Epoch: 45 Batch: 520 Log-loss: 0.044834885746240616
Epoch: 45 Batch: 560 Log-loss: 0.03872470185160637
Epoch average log-loss: 0.03878934763238898
A pre-trained model at model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt has been loaded.
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn1.pt.

Choose the torch base model.
## Training on fold 

Epoch: 8 Batch: 360 Log-loss: 0.04403582215309143
Epoch: 8 Batch: 400 Log-loss: 0.041644614189863205
Epoch: 8 Batch: 440 Log-loss: 0.0430566668510437
Epoch: 8 Batch: 480 Log-loss: 0.06036766991019249
Epoch: 8 Batch: 520 Log-loss: 0.041684720665216446
Epoch: 8 Batch: 560 Log-loss: 0.030884424224495888
Epoch average log-loss: 0.04345810001915587
In Epoch: 8, val_loss: 0.04188464199946818, best_val_loss: 0.041542296041397586, best_auc: 0.9891236366433569
Epoch: 9 Batch: 0 Log-loss: 0.06072479486465454
Epoch: 9 Batch: 40 Log-loss: 0.04781827703118324
Epoch: 9 Batch: 80 Log-loss: 0.02761070430278778
Epoch: 9 Batch: 120 Log-loss: 0.05236664041876793
Epoch: 9 Batch: 160 Log-loss: 0.05475163832306862
Epoch: 9 Batch: 200 Log-loss: 0.0396842435002327
Epoch: 9 Batch: 240 Log-loss: 0.03905150294303894
Epoch: 9 Batch: 280 Log-loss: 0.04884999617934227
Epoch: 9 Batch: 320 Log-loss: 0.04391162097454071
Epoch: 9 Batch: 360 Log-loss: 0.03227635845541954
Epoch: 9 Batch: 400 Log-loss: 0.04545612633228302

Epoch: 17 Batch: 40 Log-loss: 0.03628319874405861
Epoch: 17 Batch: 80 Log-loss: 0.04652724042534828
Epoch: 17 Batch: 120 Log-loss: 0.038793306797742844
Epoch: 17 Batch: 160 Log-loss: 0.049183014780282974
Epoch: 17 Batch: 200 Log-loss: 0.05843858793377876
Epoch: 17 Batch: 240 Log-loss: 0.04956461116671562
Epoch: 17 Batch: 280 Log-loss: 0.04241880774497986
Epoch: 17 Batch: 320 Log-loss: 0.043617114424705505
Epoch: 17 Batch: 360 Log-loss: 0.03677980229258537
Epoch: 17 Batch: 400 Log-loss: 0.02205084078013897
Epoch: 17 Batch: 440 Log-loss: 0.03506985679268837
Epoch: 17 Batch: 480 Log-loss: 0.03242848441004753
Epoch: 17 Batch: 520 Log-loss: 0.038468364626169205
Epoch: 17 Batch: 560 Log-loss: 0.03726230561733246
Epoch average log-loss: 0.041010015328148644
In Epoch: 17, val_loss: 0.0405329259181784, best_val_loss: 0.04019114703902692, best_auc: 0.9895064230716057
Epoch: 18 Batch: 0 Log-loss: 0.044072508811950684
Epoch: 18 Batch: 40 Log-loss: 0.03758889436721802
Epoch: 18 Batch: 80 Log-loss: 

Epoch: 25 Batch: 440 Log-loss: 0.04173193499445915
Epoch: 25 Batch: 480 Log-loss: 0.022600390017032623
Epoch: 25 Batch: 520 Log-loss: 0.04010700806975365
Epoch: 25 Batch: 560 Log-loss: 0.034383464604616165
Epoch average log-loss: 0.039839775690675844
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 25, val_loss: 0.039670934016694186, best_val_loss: 0.039670934016694186, best_auc: 0.9905380881008642
Epoch: 26 Batch: 0 Log-loss: 0.03860387206077576
Epoch: 26 Batch: 40 Log-loss: 0.03736722096800804
Epoch: 26 Batch: 80 Log-loss: 0.041188325732946396
Epoch: 26 Batch: 120 Log-loss: 0.02929905615746975
Epoch: 26 Batch: 160 Log-loss: 0.03927440196275711
Epoch: 26 Batch: 200 Log-loss: 0.043954864144325256
Epoch: 26 Batch: 240 Log-loss: 0.04968782141804695
Epoch: 26 Batch: 280 Log-loss: 0.04779147729277611
Epoch: 26 Batch: 320 Log-loss: 0.04404182359576225
Epoch: 26 Batch: 360 Log-loss: 0.03052383102476597
Epoch: 26 Batch: 400 Log-loss: 0.03867020830512047
Epoch: 26 B

Epoch: 34 Batch: 160 Log-loss: 0.03244984522461891
Epoch: 34 Batch: 200 Log-loss: 0.05142946541309357
Epoch: 34 Batch: 240 Log-loss: 0.035950917750597
Epoch: 34 Batch: 280 Log-loss: 0.025924669578671455
Epoch: 34 Batch: 320 Log-loss: 0.04563179612159729
Epoch: 34 Batch: 360 Log-loss: 0.040022678673267365
Epoch: 34 Batch: 400 Log-loss: 0.04930157959461212
Epoch: 34 Batch: 440 Log-loss: 0.040117260068655014
Epoch: 34 Batch: 480 Log-loss: 0.025376876816153526
Epoch: 34 Batch: 520 Log-loss: 0.03224528953433037
Epoch: 34 Batch: 560 Log-loss: 0.04670834541320801
Epoch average log-loss: 0.0393310227630926
In Epoch: 34, val_loss: 0.039422999766547666, best_val_loss: 0.03920818030979369, best_auc: 0.9903700016365362
Epoch: 35 Batch: 0 Log-loss: 0.040138646960258484
Epoch: 35 Batch: 40 Log-loss: 0.03879069164395332
Epoch: 35 Batch: 80 Log-loss: 0.03312084451317787
Epoch: 35 Batch: 120 Log-loss: 0.027538934722542763
Epoch: 35 Batch: 160 Log-loss: 0.04145190492272377
Epoch: 35 Batch: 200 Log-loss:

Epoch: 43 Batch: 120 Log-loss: 0.04365473613142967
Epoch: 43 Batch: 160 Log-loss: 0.04615509882569313
Epoch: 43 Batch: 200 Log-loss: 0.04120107367634773
Epoch: 43 Batch: 240 Log-loss: 0.030953818932175636
Epoch: 43 Batch: 280 Log-loss: 0.06346815079450607
Epoch: 43 Batch: 320 Log-loss: 0.048835840076208115
Epoch: 43 Batch: 360 Log-loss: 0.03193243220448494
Epoch: 43 Batch: 400 Log-loss: 0.04063098505139351
Epoch: 43 Batch: 440 Log-loss: 0.029473720118403435
Epoch: 43 Batch: 480 Log-loss: 0.049528565257787704
Epoch: 43 Batch: 520 Log-loss: 0.0453464575111866
Epoch: 43 Batch: 560 Log-loss: 0.032125066965818405
Epoch average log-loss: 0.039208267106940704
In Epoch: 43, val_loss: 0.03941661095882591, best_val_loss: 0.03920818030979369, best_auc: 0.9903700016365362
Epoch: 44 Batch: 0 Log-loss: 0.03996959701180458
Epoch: 44 Batch: 40 Log-loss: 0.04313095286488533
Epoch: 44 Batch: 80 Log-loss: 0.03195556253194809
Epoch: 44 Batch: 120 Log-loss: 0.03052748180925846
Epoch: 44 Batch: 160 Log-loss

Epoch: 7 Batch: 480 Log-loss: 0.04782577231526375
Epoch: 7 Batch: 520 Log-loss: 0.04095962643623352
Epoch: 7 Batch: 560 Log-loss: 0.04207990691065788
Epoch average log-loss: 0.043992608263423404
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 7, val_loss: 0.04415017307607541, best_val_loss: 0.04415017307607541, best_auc: 0.984707366748618
Epoch: 8 Batch: 0 Log-loss: 0.06348206847906113
Epoch: 8 Batch: 40 Log-loss: 0.044476237148046494
Epoch: 8 Batch: 80 Log-loss: 0.03903960809111595
Epoch: 8 Batch: 120 Log-loss: 0.02411440946161747
Epoch: 8 Batch: 160 Log-loss: 0.04496742784976959
Epoch: 8 Batch: 200 Log-loss: 0.04574362561106682
Epoch: 8 Batch: 240 Log-loss: 0.03364004194736481
Epoch: 8 Batch: 280 Log-loss: 0.051981013268232346
Epoch: 8 Batch: 320 Log-loss: 0.04269278049468994
Epoch: 8 Batch: 360 Log-loss: 0.041359927505254745
Epoch: 8 Batch: 400 Log-loss: 0.06058087572455406
Epoch: 8 Batch: 440 Log-loss: 0.03174297884106636
Epoch: 8 Batch: 480 Log-loss: 0

Epoch: 16 Batch: 160 Log-loss: 0.055156409740448
Epoch: 16 Batch: 200 Log-loss: 0.05040520802140236
Epoch: 16 Batch: 240 Log-loss: 0.03421206399798393
Epoch: 16 Batch: 280 Log-loss: 0.0357184037566185
Epoch: 16 Batch: 320 Log-loss: 0.03231619670987129
Epoch: 16 Batch: 360 Log-loss: 0.050858233124017715
Epoch: 16 Batch: 400 Log-loss: 0.0367361418902874
Epoch: 16 Batch: 440 Log-loss: 0.037022776901721954
Epoch: 16 Batch: 480 Log-loss: 0.039808083325624466
Epoch: 16 Batch: 520 Log-loss: 0.023804642260074615
Epoch: 16 Batch: 560 Log-loss: 0.05193246901035309
Epoch average log-loss: 0.04073957859405449
In Epoch: 16, val_loss: 0.04253875446571498, best_val_loss: 0.0422868816796255, best_auc: 0.9868429722666323
Epoch: 17 Batch: 0 Log-loss: 0.03952150046825409
Epoch: 17 Batch: 40 Log-loss: 0.044496387243270874
Epoch: 17 Batch: 80 Log-loss: 0.043847810477018356
Epoch: 17 Batch: 120 Log-loss: 0.04595677927136421
Epoch: 17 Batch: 160 Log-loss: 0.04383542761206627
Epoch: 17 Batch: 200 Log-loss: 0.

Epoch: 25 Batch: 0 Log-loss: 0.03611734136939049
Epoch: 25 Batch: 40 Log-loss: 0.05352422967553139
Epoch: 25 Batch: 80 Log-loss: 0.0320788212120533
Epoch: 25 Batch: 120 Log-loss: 0.04427117481827736
Epoch: 25 Batch: 160 Log-loss: 0.034133996814489365
Epoch: 25 Batch: 200 Log-loss: 0.04796934500336647
Epoch: 25 Batch: 240 Log-loss: 0.03925077244639397
Epoch: 25 Batch: 280 Log-loss: 0.047123413532972336
Epoch: 25 Batch: 320 Log-loss: 0.046512797474861145
Epoch: 25 Batch: 360 Log-loss: 0.041517406702041626
Epoch: 25 Batch: 400 Log-loss: 0.05257832631468773
Epoch: 25 Batch: 440 Log-loss: 0.03543580695986748
Epoch: 25 Batch: 480 Log-loss: 0.034210748970508575
Epoch: 25 Batch: 520 Log-loss: 0.03406984731554985
Epoch: 25 Batch: 560 Log-loss: 0.0286001767963171
Epoch average log-loss: 0.03961672610270658
In Epoch: 25, val_loss: 0.04182684428150176, best_val_loss: 0.04181238094054135, best_auc: 0.9873598219084686
Epoch: 26 Batch: 0 Log-loss: 0.03211689367890358
Epoch: 26 Batch: 40 Log-loss: 0.0

Epoch: 33 Batch: 480 Log-loss: 0.04294782876968384
Epoch: 33 Batch: 520 Log-loss: 0.04075121879577637
Epoch: 33 Batch: 560 Log-loss: 0.030854465439915657
Epoch average log-loss: 0.03878066487211202
In Epoch: 33, val_loss: 0.04177879731612528, best_val_loss: 0.0416213014229791, best_auc: 0.9866807491133921
Epoch: 34 Batch: 0 Log-loss: 0.02281244285404682
Epoch: 34 Batch: 40 Log-loss: 0.05169738829135895
Epoch: 34 Batch: 80 Log-loss: 0.035210635513067245
Epoch: 34 Batch: 120 Log-loss: 0.029563473537564278
Epoch: 34 Batch: 160 Log-loss: 0.03337475284934044
Epoch: 34 Batch: 200 Log-loss: 0.047140974551439285
Epoch: 34 Batch: 240 Log-loss: 0.033271726220846176
Epoch: 34 Batch: 280 Log-loss: 0.04380737617611885
Epoch: 34 Batch: 320 Log-loss: 0.03285035490989685
Epoch: 34 Batch: 360 Log-loss: 0.038936879485845566
Epoch: 34 Batch: 400 Log-loss: 0.030008971691131592
Epoch: 34 Batch: 440 Log-loss: 0.0343145877122879
Epoch: 34 Batch: 480 Log-loss: 0.03468986228108406
Epoch: 34 Batch: 520 Log-loss

Epoch: 42 Batch: 400 Log-loss: 0.05925353989005089
Epoch: 42 Batch: 440 Log-loss: 0.03504970297217369
Epoch: 42 Batch: 480 Log-loss: 0.03468737751245499
Epoch: 42 Batch: 520 Log-loss: 0.05605269968509674
Epoch: 42 Batch: 560 Log-loss: 0.03757371008396149
Epoch average log-loss: 0.03849809550613697
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 42, val_loss: 0.04146947996036714, best_val_loss: 0.04146947996036714, best_auc: 0.987378816496406
Epoch: 43 Batch: 0 Log-loss: 0.050220787525177
Epoch: 43 Batch: 40 Log-loss: 0.03169426694512367
Epoch: 43 Batch: 80 Log-loss: 0.029376229271292686
Epoch: 43 Batch: 120 Log-loss: 0.05160730704665184
Epoch: 43 Batch: 160 Log-loss: 0.034931741654872894
Epoch: 43 Batch: 200 Log-loss: 0.05077892914414406
Epoch: 43 Batch: 240 Log-loss: 0.05829188600182533
Epoch: 43 Batch: 280 Log-loss: 0.03727436810731888
Epoch: 43 Batch: 320 Log-loss: 0.03658908233046532
Epoch: 43 Batch: 360 Log-loss: 0.0444166325032711
Epoch: 43 Batch: 400

Epoch: 51 Batch: 240 Log-loss: 0.026826458051800728
Epoch: 51 Batch: 280 Log-loss: 0.03982740640640259
Epoch: 51 Batch: 320 Log-loss: 0.05488337576389313
Epoch: 51 Batch: 360 Log-loss: 0.03144208714365959
Epoch: 51 Batch: 400 Log-loss: 0.048389192670583725
Epoch: 51 Batch: 440 Log-loss: 0.045446936041116714
Epoch: 51 Batch: 480 Log-loss: 0.03606998175382614
Epoch: 51 Batch: 520 Log-loss: 0.04017603024840355
Epoch: 51 Batch: 560 Log-loss: 0.03340761363506317
Epoch average log-loss: 0.038412581909714
In Epoch: 51, val_loss: 0.04181287365074984, best_val_loss: 0.04133642792965824, best_auc: 0.9875212725198687
Epoch: 52 Batch: 0 Log-loss: 0.022777898237109184
Epoch: 52 Batch: 40 Log-loss: 0.03185644373297691
Epoch: 52 Batch: 80 Log-loss: 0.046848878264427185
Epoch: 52 Batch: 120 Log-loss: 0.029825745150446892
Epoch: 52 Batch: 160 Log-loss: 0.028851039707660675
Epoch: 52 Batch: 200 Log-loss: 0.026060840114951134
Epoch: 52 Batch: 240 Log-loss: 0.054420020431280136
Epoch: 52 Batch: 280 Log-lo

Epoch: 5 Batch: 0 Log-loss: 0.02862723357975483
Epoch: 5 Batch: 40 Log-loss: 0.05800408124923706
Epoch: 5 Batch: 80 Log-loss: 0.04098288714885712
Epoch: 5 Batch: 120 Log-loss: 0.043399497866630554
Epoch: 5 Batch: 160 Log-loss: 0.040821027010679245
Epoch: 5 Batch: 200 Log-loss: 0.047052279114723206
Epoch: 5 Batch: 240 Log-loss: 0.03904081508517265
Epoch: 5 Batch: 280 Log-loss: 0.029493553563952446
Epoch: 5 Batch: 320 Log-loss: 0.045554619282484055
Epoch: 5 Batch: 360 Log-loss: 0.05004723370075226
Epoch: 5 Batch: 400 Log-loss: 0.04886845126748085
Epoch: 5 Batch: 440 Log-loss: 0.030934540554881096
Epoch: 5 Batch: 480 Log-loss: 0.030163967981934547
Epoch: 5 Batch: 520 Log-loss: 0.04704664275050163
Epoch: 5 Batch: 560 Log-loss: 0.06035754829645157
Epoch average log-loss: 0.045731993741355836
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 5, val_loss: 0.04354338793246459, best_val_loss: 0.04354338793246459, best_auc: 0.9859634410360395
Epoch: 6 Batch: 0 Log-loss

Epoch: 13 Batch: 320 Log-loss: 0.03228360414505005
Epoch: 13 Batch: 360 Log-loss: 0.04339493811130524
Epoch: 13 Batch: 400 Log-loss: 0.051560044288635254
Epoch: 13 Batch: 440 Log-loss: 0.04266371950507164
Epoch: 13 Batch: 480 Log-loss: 0.06025100126862526
Epoch: 13 Batch: 520 Log-loss: 0.03210156410932541
Epoch: 13 Batch: 560 Log-loss: 0.04835810512304306
Epoch average log-loss: 0.041417966250862394
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 13, val_loss: 0.04147807552197166, best_val_loss: 0.04147807552197166, best_auc: 0.9884452844855526
Epoch: 14 Batch: 0 Log-loss: 0.043237220495939255
Epoch: 14 Batch: 40 Log-loss: 0.04092416167259216
Epoch: 14 Batch: 80 Log-loss: 0.04390789195895195
Epoch: 14 Batch: 120 Log-loss: 0.08547388762235641
Epoch: 14 Batch: 160 Log-loss: 0.03801707178354263
Epoch: 14 Batch: 200 Log-loss: 0.03999768942594528
Epoch: 14 Batch: 240 Log-loss: 0.04601162672042847
Epoch: 14 Batch: 280 Log-loss: 0.05602709576487541
Epoch: 14 Batch

Epoch: 22 Batch: 80 Log-loss: 0.023718489333987236
Epoch: 22 Batch: 120 Log-loss: 0.050586849451065063
Epoch: 22 Batch: 160 Log-loss: 0.04478336498141289
Epoch: 22 Batch: 200 Log-loss: 0.05544924736022949
Epoch: 22 Batch: 240 Log-loss: 0.043481845408678055
Epoch: 22 Batch: 280 Log-loss: 0.0417424738407135
Epoch: 22 Batch: 320 Log-loss: 0.04553303122520447
Epoch: 22 Batch: 360 Log-loss: 0.03806564211845398
Epoch: 22 Batch: 400 Log-loss: 0.0421479307115078
Epoch: 22 Batch: 440 Log-loss: 0.04149748012423515
Epoch: 22 Batch: 480 Log-loss: 0.03341905400156975
Epoch: 22 Batch: 520 Log-loss: 0.03355581313371658
Epoch: 22 Batch: 560 Log-loss: 0.04663657769560814
Epoch average log-loss: 0.03992026270965913
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 22, val_loss: 0.040286046444048436, best_val_loss: 0.040286046444048436, best_auc: 0.9889374024185579
Epoch: 23 Batch: 0 Log-loss: 0.034281231462955475
Epoch: 23 Batch: 40 Log-loss: 0.029296962544322014
Epoch: 23 Bat

Epoch: 31 Batch: 0 Log-loss: 0.039852727204561234
Epoch: 31 Batch: 40 Log-loss: 0.06481961160898209
Epoch: 31 Batch: 80 Log-loss: 0.03420821577310562
Epoch: 31 Batch: 120 Log-loss: 0.03263763710856438
Epoch: 31 Batch: 160 Log-loss: 0.0378069244325161
Epoch: 31 Batch: 200 Log-loss: 0.053424134850502014
Epoch: 31 Batch: 240 Log-loss: 0.04135115444660187
Epoch: 31 Batch: 280 Log-loss: 0.030789831653237343
Epoch: 31 Batch: 320 Log-loss: 0.0650985836982727
Epoch: 31 Batch: 360 Log-loss: 0.05280139669775963
Epoch: 31 Batch: 400 Log-loss: 0.029761195182800293
Epoch: 31 Batch: 440 Log-loss: 0.04201870039105415
Epoch: 31 Batch: 480 Log-loss: 0.056899216026067734
Epoch: 31 Batch: 520 Log-loss: 0.033996984362602234
Epoch: 31 Batch: 560 Log-loss: 0.021790795028209686
Epoch average log-loss: 0.03939591767266393
In Epoch: 31, val_loss: 0.04055739081373827, best_val_loss: 0.040286046444048436, best_auc: 0.9889374024185579
Epoch: 32 Batch: 0 Log-loss: 0.04677179455757141
Epoch: 32 Batch: 40 Log-loss: 

Epoch: 5 Batch: 440 Log-loss: 0.0558314323425293
Epoch: 5 Batch: 480 Log-loss: 0.03545612469315529
Epoch: 5 Batch: 520 Log-loss: 0.061670735478401184
Epoch: 5 Batch: 560 Log-loss: 0.053573161363601685
Epoch average log-loss: 0.045734232381385354
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 5, val_loss: 0.04312738564701038, best_val_loss: 0.04312738564701038, best_auc: 0.9841317802807378
Epoch: 6 Batch: 0 Log-loss: 0.04084121808409691
Epoch: 6 Batch: 40 Log-loss: 0.06315923482179642
Epoch: 6 Batch: 80 Log-loss: 0.05243361368775368
Epoch: 6 Batch: 120 Log-loss: 0.05588553845882416
Epoch: 6 Batch: 160 Log-loss: 0.036649931222200394
Epoch: 6 Batch: 200 Log-loss: 0.044969309121370316
Epoch: 6 Batch: 240 Log-loss: 0.03859952464699745
Epoch: 6 Batch: 280 Log-loss: 0.04363716021180153
Epoch: 6 Batch: 320 Log-loss: 0.06361734122037888
Epoch: 6 Batch: 360 Log-loss: 0.044245537370443344
Epoch: 6 Batch: 400 Log-loss: 0.051547180861234665
Epoch: 6 Batch: 440 Log-loss

Epoch: 14 Batch: 200 Log-loss: 0.0419657863676548
Epoch: 14 Batch: 240 Log-loss: 0.029108747839927673
Epoch: 14 Batch: 280 Log-loss: 0.03704996034502983
Epoch: 14 Batch: 320 Log-loss: 0.037760671228170395
Epoch: 14 Batch: 360 Log-loss: 0.03789113089442253
Epoch: 14 Batch: 400 Log-loss: 0.05496269837021828
Epoch: 14 Batch: 440 Log-loss: 0.031972724944353104
Epoch: 14 Batch: 480 Log-loss: 0.028412558138370514
Epoch: 14 Batch: 520 Log-loss: 0.03947991877794266
Epoch: 14 Batch: 560 Log-loss: 0.03353649377822876
Epoch average log-loss: 0.04139187710492739
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 14, val_loss: 0.04054266284817117, best_val_loss: 0.04054266284817117, best_auc: 0.987541433340792
Epoch: 15 Batch: 0 Log-loss: 0.03405486419796944
Epoch: 15 Batch: 40 Log-loss: 0.05207743123173714
Epoch: 15 Batch: 80 Log-loss: 0.050435956567525864
Epoch: 15 Batch: 120 Log-loss: 0.03532165661454201
Epoch: 15 Batch: 160 Log-loss: 0.05245046690106392
Epoch: 15 Batch

Epoch: 23 Batch: 0 Log-loss: 0.041823867708444595
Epoch: 23 Batch: 40 Log-loss: 0.041080739349126816
Epoch: 23 Batch: 80 Log-loss: 0.03779733553528786
Epoch: 23 Batch: 120 Log-loss: 0.05411575734615326
Epoch: 23 Batch: 160 Log-loss: 0.033252034336328506
Epoch: 23 Batch: 200 Log-loss: 0.03434687480330467
Epoch: 23 Batch: 240 Log-loss: 0.02286229096353054
Epoch: 23 Batch: 280 Log-loss: 0.041796233505010605
Epoch: 23 Batch: 320 Log-loss: 0.06717391312122345
Epoch: 23 Batch: 360 Log-loss: 0.03653847053647041
Epoch: 23 Batch: 400 Log-loss: 0.039779748767614365
Epoch: 23 Batch: 440 Log-loss: 0.035373982042074203
Epoch: 23 Batch: 480 Log-loss: 0.03532235696911812
Epoch: 23 Batch: 520 Log-loss: 0.03239138051867485
Epoch: 23 Batch: 560 Log-loss: 0.06409061700105667
Epoch average log-loss: 0.040055926376953724
In Epoch: 23, val_loss: 0.04033086101568331, best_val_loss: 0.03998695262659272, best_auc: 0.9876625643682232
Epoch: 24 Batch: 0 Log-loss: 0.029798155650496483
Epoch: 24 Batch: 40 Log-loss

Epoch: 31 Batch: 560 Log-loss: 0.04841785132884979
Epoch average log-loss: 0.039468022864977165
In Epoch: 31, val_loss: 0.03995899585801625, best_val_loss: 0.03983140285767282, best_auc: 0.9879188229994146
Epoch: 32 Batch: 0 Log-loss: 0.042432818561792374
Epoch: 32 Batch: 40 Log-loss: 0.04259462654590607
Epoch: 32 Batch: 80 Log-loss: 0.030873598530888557
Epoch: 32 Batch: 120 Log-loss: 0.046750690788030624
Epoch: 32 Batch: 160 Log-loss: 0.04897401109337807
Epoch: 32 Batch: 200 Log-loss: 0.03417744114995003
Epoch: 32 Batch: 240 Log-loss: 0.03756428882479668
Epoch: 32 Batch: 280 Log-loss: 0.03847377002239227
Epoch: 32 Batch: 320 Log-loss: 0.027525009587407112
Epoch: 32 Batch: 360 Log-loss: 0.044916342943906784
Epoch: 32 Batch: 400 Log-loss: 0.03706689551472664
Epoch: 32 Batch: 440 Log-loss: 0.025280645117163658
Epoch: 32 Batch: 480 Log-loss: 0.03511154279112816
Epoch: 32 Batch: 520 Log-loss: 0.05112038180232048
Epoch: 32 Batch: 560 Log-loss: 0.037292178720235825
Epoch average log-loss: 0.

Epoch: 40 Batch: 480 Log-loss: 0.043814241886138916
Epoch: 40 Batch: 520 Log-loss: 0.02988274395465851
Epoch: 40 Batch: 560 Log-loss: 0.0433126837015152
Epoch average log-loss: 0.038878118130378427
In Epoch: 40, val_loss: 0.039897821663461126, best_val_loss: 0.03979638476401761, best_auc: 0.9877723851563719
Epoch: 41 Batch: 0 Log-loss: 0.036502476781606674
Epoch: 41 Batch: 40 Log-loss: 0.03334697708487511
Epoch: 41 Batch: 80 Log-loss: 0.05660061910748482
Epoch: 41 Batch: 120 Log-loss: 0.041099730879068375
Epoch: 41 Batch: 160 Log-loss: 0.04040705785155296
Epoch: 41 Batch: 200 Log-loss: 0.040701113641262054
Epoch: 41 Batch: 240 Log-loss: 0.0401398129761219
Epoch: 41 Batch: 280 Log-loss: 0.03750528395175934
Epoch: 41 Batch: 320 Log-loss: 0.04438940808176994
Epoch: 41 Batch: 360 Log-loss: 0.030881604179739952
Epoch: 41 Batch: 400 Log-loss: 0.048665035516023636
Epoch: 41 Batch: 440 Log-loss: 0.03181444853544235
Epoch: 41 Batch: 480 Log-loss: 0.030766338109970093
Epoch: 41 Batch: 520 Log-lo

Epoch: 49 Batch: 440 Log-loss: 0.028102077543735504
Epoch: 49 Batch: 480 Log-loss: 0.0458376444876194
Epoch: 49 Batch: 520 Log-loss: 0.03974338248372078
Epoch: 49 Batch: 560 Log-loss: 0.05366545170545578
Epoch average log-loss: 0.038693537614640915
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 49, val_loss: 0.039745029597783386, best_val_loss: 0.039745029597783386, best_auc: 0.9876435400755758
Epoch: 50 Batch: 0 Log-loss: 0.03581385686993599
Epoch: 50 Batch: 40 Log-loss: 0.03389548882842064
Epoch: 50 Batch: 80 Log-loss: 0.03419680520892143
Epoch: 50 Batch: 120 Log-loss: 0.05455152690410614
Epoch: 50 Batch: 160 Log-loss: 0.03931952640414238
Epoch: 50 Batch: 200 Log-loss: 0.030357418581843376
Epoch: 50 Batch: 240 Log-loss: 0.03431927040219307
Epoch: 50 Batch: 280 Log-loss: 0.05310246720910072
Epoch: 50 Batch: 320 Log-loss: 0.03371511027216911
Epoch: 50 Batch: 360 Log-loss: 0.03383579105138779
Epoch: 50 Batch: 400 Log-loss: 0.047167111188173294
Epoch: 50 Bat

Epoch: 58 Batch: 280 Log-loss: 0.03231087699532509
Epoch: 58 Batch: 320 Log-loss: 0.03578801080584526
Epoch: 58 Batch: 360 Log-loss: 0.044385697692632675
Epoch: 58 Batch: 400 Log-loss: 0.03543980047106743
Epoch: 58 Batch: 440 Log-loss: 0.03648184612393379
Epoch: 58 Batch: 480 Log-loss: 0.033902473747730255
Epoch: 58 Batch: 520 Log-loss: 0.03670363873243332
Epoch: 58 Batch: 560 Log-loss: 0.04079356789588928
Epoch average log-loss: 0.038637869785140666
In Epoch: 58, val_loss: 0.039939581177519605, best_val_loss: 0.03946678895966707, best_auc: 0.9878831375442512
Epoch: 59 Batch: 0 Log-loss: 0.03637649863958359
Epoch: 59 Batch: 40 Log-loss: 0.044730618596076965
Epoch: 59 Batch: 80 Log-loss: 0.04293164238333702
Epoch: 59 Batch: 120 Log-loss: 0.02865287847816944
Epoch: 59 Batch: 160 Log-loss: 0.031877268105745316
Epoch: 59 Batch: 200 Log-loss: 0.04718269780278206
Epoch: 59 Batch: 240 Log-loss: 0.03435098007321358
Epoch: 59 Batch: 280 Log-loss: 0.025717003270983696
Epoch: 59 Batch: 320 Log-lo

Epoch: 1 Batch: 160 Log-loss: 0.059100180864334106
Epoch: 1 Batch: 200 Log-loss: 0.07215171307325363
Epoch: 1 Batch: 240 Log-loss: 0.09960321336984634
Epoch: 1 Batch: 280 Log-loss: 0.048518598079681396
Epoch: 1 Batch: 320 Log-loss: 0.06745100021362305
Epoch: 1 Batch: 360 Log-loss: 0.05052800476551056
Epoch: 1 Batch: 400 Log-loss: 0.05163412168622017
Epoch: 1 Batch: 440 Log-loss: 0.055034056305885315
Epoch: 1 Batch: 480 Log-loss: 0.058810070157051086
Epoch: 1 Batch: 520 Log-loss: 0.04238016530871391
Epoch: 1 Batch: 560 Log-loss: 0.05133184418082237
Epoch average log-loss: 0.0912608400519405
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 1, val_loss: 0.049592238115194476, best_val_loss: 0.049592238115194476, best_auc: 0.9707809426413018
Epoch: 2 Batch: 0 Log-loss: 0.04388587549328804
Epoch: 2 Batch: 40 Log-loss: 0.06613855808973312
Epoch: 2 Batch: 80 Log-loss: 0.04671518877148628
Epoch: 2 Batch: 120 Log-loss: 0.05018557235598564
Epoch: 2 Batch: 160 Log-loss:

Epoch: 9 Batch: 520 Log-loss: 0.05491280555725098
Epoch: 9 Batch: 560 Log-loss: 0.03436688706278801
Epoch average log-loss: 0.04281850759206074
In Epoch: 9, val_loss: 0.04158534748820714, best_val_loss: 0.04134581443254828, best_auc: 0.9865092542048345
Epoch: 10 Batch: 0 Log-loss: 0.04637132212519646
Epoch: 10 Batch: 40 Log-loss: 0.049199048429727554
Epoch: 10 Batch: 80 Log-loss: 0.03722526878118515
Epoch: 10 Batch: 120 Log-loss: 0.037554871290922165
Epoch: 10 Batch: 160 Log-loss: 0.037786856293678284
Epoch: 10 Batch: 200 Log-loss: 0.05026208981871605
Epoch: 10 Batch: 240 Log-loss: 0.04039931297302246
Epoch: 10 Batch: 280 Log-loss: 0.03960171714425087
Epoch: 10 Batch: 320 Log-loss: 0.04141657426953316
Epoch: 10 Batch: 360 Log-loss: 0.03949221223592758
Epoch: 10 Batch: 400 Log-loss: 0.037827763706445694
Epoch: 10 Batch: 440 Log-loss: 0.037705231457948685
Epoch: 10 Batch: 480 Log-loss: 0.0501534640789032
Epoch: 10 Batch: 520 Log-loss: 0.04624855890870094
Epoch: 10 Batch: 560 Log-loss: 0.

Epoch: 18 Batch: 360 Log-loss: 0.030525239184498787
Epoch: 18 Batch: 400 Log-loss: 0.036135587841272354
Epoch: 18 Batch: 440 Log-loss: 0.030867604538798332
Epoch: 18 Batch: 480 Log-loss: 0.03671945631504059
Epoch: 18 Batch: 520 Log-loss: 0.05561250075697899
Epoch: 18 Batch: 560 Log-loss: 0.02169940061867237
Epoch average log-loss: 0.04029120976837086
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 18, val_loss: 0.04055607176224434, best_val_loss: 0.04055607176224434, best_auc: 0.9883170474248234
Epoch: 19 Batch: 0 Log-loss: 0.046518176794052124
Epoch: 19 Batch: 40 Log-loss: 0.054171815514564514
Epoch: 19 Batch: 80 Log-loss: 0.03534221649169922
Epoch: 19 Batch: 120 Log-loss: 0.03687874600291252
Epoch: 19 Batch: 160 Log-loss: 0.04797367751598358
Epoch: 19 Batch: 200 Log-loss: 0.041512154042720795
Epoch: 19 Batch: 240 Log-loss: 0.04101502150297165
Epoch: 19 Batch: 280 Log-loss: 0.019217751920223236
Epoch: 19 Batch: 320 Log-loss: 0.038907039910554886
Epoch: 19 

Epoch: 27 Batch: 160 Log-loss: 0.04445083439350128
Epoch: 27 Batch: 200 Log-loss: 0.04302610084414482
Epoch: 27 Batch: 240 Log-loss: 0.04082886874675751
Epoch: 27 Batch: 280 Log-loss: 0.04320888593792915
Epoch: 27 Batch: 320 Log-loss: 0.050552982836961746
Epoch: 27 Batch: 360 Log-loss: 0.04534360393881798
Epoch: 27 Batch: 400 Log-loss: 0.040201324969530106
Epoch: 27 Batch: 440 Log-loss: 0.04261687770485878
Epoch: 27 Batch: 480 Log-loss: 0.05068185552954674
Epoch: 27 Batch: 520 Log-loss: 0.027292611077427864
Epoch: 27 Batch: 560 Log-loss: 0.03557008132338524
Epoch average log-loss: 0.03939937364775688
In Epoch: 27, val_loss: 0.04050555761631245, best_val_loss: 0.04026700785373786, best_auc: 0.9885021811710502
Epoch: 28 Batch: 0 Log-loss: 0.037180572748184204
Epoch: 28 Batch: 40 Log-loss: 0.04345827177166939
Epoch: 28 Batch: 80 Log-loss: 0.031039198860526085
Epoch: 28 Batch: 120 Log-loss: 0.04836682975292206
Epoch: 28 Batch: 160 Log-loss: 0.04740059748291969
Epoch: 28 Batch: 200 Log-loss

Epoch: 36 Batch: 80 Log-loss: 0.05629301443696022
Epoch: 36 Batch: 120 Log-loss: 0.03914165124297142
Epoch: 36 Batch: 160 Log-loss: 0.04202214255928993
Epoch: 36 Batch: 200 Log-loss: 0.03558553382754326
Epoch: 36 Batch: 240 Log-loss: 0.031003592535853386
Epoch: 36 Batch: 280 Log-loss: 0.03468476980924606
Epoch: 36 Batch: 320 Log-loss: 0.03438184782862663
Epoch: 36 Batch: 360 Log-loss: 0.03372533246874809
Epoch: 36 Batch: 400 Log-loss: 0.030062591657042503
Epoch: 36 Batch: 440 Log-loss: 0.03302799165248871
Epoch: 36 Batch: 480 Log-loss: 0.03464243933558464
Epoch: 36 Batch: 520 Log-loss: 0.031690459698438644
Epoch: 36 Batch: 560 Log-loss: 0.037094637751579285
Epoch average log-loss: 0.03904217526516212
In Epoch: 36, val_loss: 0.04060404463356092, best_val_loss: 0.040186706425053946, best_auc: 0.9883710208829597
Epoch: 37 Batch: 0 Log-loss: 0.03825089707970619
Epoch: 37 Batch: 40 Log-loss: 0.04768577218055725
Epoch: 37 Batch: 80 Log-loss: 0.03868989273905754
Epoch: 37 Batch: 120 Log-loss:

Epoch: 45 Batch: 40 Log-loss: 0.04276290163397789
Epoch: 45 Batch: 80 Log-loss: 0.046612005680799484
Epoch: 45 Batch: 120 Log-loss: 0.0325748585164547
Epoch: 45 Batch: 160 Log-loss: 0.026742802932858467
Epoch: 45 Batch: 200 Log-loss: 0.05154842138290405
Epoch: 45 Batch: 240 Log-loss: 0.03629693761467934
Epoch: 45 Batch: 280 Log-loss: 0.06662656366825104
Epoch: 45 Batch: 320 Log-loss: 0.041639167815446854
Epoch: 45 Batch: 360 Log-loss: 0.048293378204107285
Epoch: 45 Batch: 400 Log-loss: 0.04073016718029976
Epoch: 45 Batch: 440 Log-loss: 0.029411233961582184
Epoch: 45 Batch: 480 Log-loss: 0.04373687878251076
Epoch: 45 Batch: 520 Log-loss: 0.024737780913710594
Epoch: 45 Batch: 560 Log-loss: 0.03696656599640846
Epoch average log-loss: 0.03863581693066018
In Epoch: 45, val_loss: 0.040446733205950984, best_val_loss: 0.040186706425053946, best_auc: 0.9883710208829597
Epoch: 46 Batch: 0 Log-loss: 0.038462307304143906
Epoch: 46 Batch: 40 Log-loss: 0.032674603164196014
Epoch: 46 Batch: 80 Log-lo

Epoch: 54 Batch: 0 Log-loss: 0.030182503163814545
Epoch: 54 Batch: 40 Log-loss: 0.05473877117037773
Epoch: 54 Batch: 80 Log-loss: 0.05677187815308571
Epoch: 54 Batch: 120 Log-loss: 0.034550223499536514
Epoch: 54 Batch: 160 Log-loss: 0.035177625715732574
Epoch: 54 Batch: 200 Log-loss: 0.03984302654862404
Epoch: 54 Batch: 240 Log-loss: 0.033442337065935135
Epoch: 54 Batch: 280 Log-loss: 0.038646306842565536
Epoch: 54 Batch: 320 Log-loss: 0.039395157247781754
Epoch: 54 Batch: 360 Log-loss: 0.03454142063856125
Epoch: 54 Batch: 400 Log-loss: 0.03911645710468292
Epoch: 54 Batch: 440 Log-loss: 0.03354009985923767
Epoch: 54 Batch: 480 Log-loss: 0.046332817524671555
Epoch: 54 Batch: 520 Log-loss: 0.04332655295729637
Epoch: 54 Batch: 560 Log-loss: 0.03787366300821304
Epoch average log-loss: 0.038455198196295115
In Epoch: 54, val_loss: 0.04076072098576161, best_val_loss: 0.04018010180569651, best_auc: 0.9885332830454797
Epoch: 55 Batch: 0 Log-loss: 0.04144451394677162
Epoch: 55 Batch: 40 Log-loss

In Epoch: 62, val_loss: 0.04035477878829759, best_val_loss: 0.04017193529996758, best_auc: 0.9884878069519111
Epoch: 63 Batch: 0 Log-loss: 0.03785282373428345
Epoch: 63 Batch: 40 Log-loss: 0.04228709265589714
Epoch: 63 Batch: 80 Log-loss: 0.032104965299367905
Epoch: 63 Batch: 120 Log-loss: 0.04533010721206665
Epoch: 63 Batch: 160 Log-loss: 0.0485551618039608
Epoch: 63 Batch: 200 Log-loss: 0.0406598262488842
Epoch: 63 Batch: 240 Log-loss: 0.05346819758415222
Epoch: 63 Batch: 280 Log-loss: 0.03446628898382187
Epoch: 63 Batch: 320 Log-loss: 0.03861179202795029
Epoch: 63 Batch: 360 Log-loss: 0.041025567799806595
Epoch: 63 Batch: 400 Log-loss: 0.04565812274813652
Epoch: 63 Batch: 440 Log-loss: 0.03800908103585243
Epoch: 63 Batch: 480 Log-loss: 0.029123537242412567
Epoch: 63 Batch: 520 Log-loss: 0.03334012255072594
Epoch: 63 Batch: 560 Log-loss: 0.03838293254375458
Epoch average log-loss: 0.038482333726382684
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 63, va

Epoch: 71 Batch: 520 Log-loss: 0.027956413105130196
Epoch: 71 Batch: 560 Log-loss: 0.039534032344818115
Epoch average log-loss: 0.03840566971838208
In Epoch: 71, val_loss: 0.04042383342511236, best_val_loss: 0.040002458294543956, best_auc: 0.9885077247716612
Epoch: 72 Batch: 0 Log-loss: 0.043388232588768005
Epoch: 72 Batch: 40 Log-loss: 0.04257170855998993
Epoch: 72 Batch: 80 Log-loss: 0.03889037296175957
Epoch: 72 Batch: 120 Log-loss: 0.05236242711544037
Epoch: 72 Batch: 160 Log-loss: 0.03927769139409065
Epoch: 72 Batch: 200 Log-loss: 0.04462650045752525
Epoch: 72 Batch: 240 Log-loss: 0.03997233137488365
Epoch: 72 Batch: 280 Log-loss: 0.023599199950695038
Epoch: 72 Batch: 320 Log-loss: 0.05646069347858429
Epoch: 72 Batch: 360 Log-loss: 0.03739509359002113
Epoch: 72 Batch: 400 Log-loss: 0.03624780476093292
Epoch: 72 Batch: 440 Log-loss: 0.03445405140519142
Epoch: 72 Batch: 480 Log-loss: 0.02888009138405323
Epoch: 72 Batch: 520 Log-loss: 0.036881472915410995
Epoch: 72 Batch: 560 Log-los

Epoch: 80 Batch: 440 Log-loss: 0.035078734159469604
Epoch: 80 Batch: 480 Log-loss: 0.03405631706118584
Epoch: 80 Batch: 520 Log-loss: 0.025001270696520805
Epoch: 80 Batch: 560 Log-loss: 0.05431564897298813
Epoch average log-loss: 0.038385085548673356
In Epoch: 80, val_loss: 0.04040217150289287, best_val_loss: 0.03997769121831931, best_auc: 0.988435859517228
Epoch: 81 Batch: 0 Log-loss: 0.03548869118094444
Epoch: 81 Batch: 40 Log-loss: 0.03781747445464134
Epoch: 81 Batch: 80 Log-loss: 0.049095768481492996
Epoch: 81 Batch: 120 Log-loss: 0.059041690081357956
Epoch: 81 Batch: 160 Log-loss: 0.05756339803338051
Epoch: 81 Batch: 200 Log-loss: 0.04720650985836983
Epoch: 81 Batch: 240 Log-loss: 0.0412043072283268
Epoch: 81 Batch: 280 Log-loss: 0.041975826025009155
Epoch: 81 Batch: 320 Log-loss: 0.030223965644836426
Epoch: 81 Batch: 360 Log-loss: 0.0456855408847332
Epoch: 81 Batch: 400 Log-loss: 0.044119179248809814
Epoch: 81 Batch: 440 Log-loss: 0.03884464129805565
Epoch: 81 Batch: 480 Log-loss

Epoch: 3 Batch: 240 Log-loss: 0.050572603940963745
Epoch: 3 Batch: 280 Log-loss: 0.055014465004205704
Epoch: 3 Batch: 320 Log-loss: 0.06919942796230316
Epoch: 3 Batch: 360 Log-loss: 0.054632220417261124
Epoch: 3 Batch: 400 Log-loss: 0.04156980291008949
Epoch: 3 Batch: 440 Log-loss: 0.03298483416438103
Epoch: 3 Batch: 480 Log-loss: 0.06467358022928238
Epoch: 3 Batch: 520 Log-loss: 0.04641348496079445
Epoch: 3 Batch: 560 Log-loss: 0.029278375208377838
Epoch average log-loss: 0.04843877702618816
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 3, val_loss: 0.04614800242663497, best_val_loss: 0.04614800242663497, best_auc: 0.9797320860707441
Epoch: 4 Batch: 0 Log-loss: 0.031952887773513794
Epoch: 4 Batch: 40 Log-loss: 0.040962833911180496
Epoch: 4 Batch: 80 Log-loss: 0.04062600061297417
Epoch: 4 Batch: 120 Log-loss: 0.044870760291814804
Epoch: 4 Batch: 160 Log-loss: 0.043198361992836
Epoch: 4 Batch: 200 Log-loss: 0.04755878075957298
Epoch: 4 Batch: 240 Log-loss:

Epoch: 12 Batch: 0 Log-loss: 0.022738443687558174
Epoch: 12 Batch: 40 Log-loss: 0.0653163343667984
Epoch: 12 Batch: 80 Log-loss: 0.04289734363555908
Epoch: 12 Batch: 120 Log-loss: 0.04228740930557251
Epoch: 12 Batch: 160 Log-loss: 0.042702335864305496
Epoch: 12 Batch: 200 Log-loss: 0.03556590899825096
Epoch: 12 Batch: 240 Log-loss: 0.04150617495179176
Epoch: 12 Batch: 280 Log-loss: 0.037033479660749435
Epoch: 12 Batch: 320 Log-loss: 0.02882070280611515
Epoch: 12 Batch: 360 Log-loss: 0.0343131385743618
Epoch: 12 Batch: 400 Log-loss: 0.03597128763794899
Epoch: 12 Batch: 440 Log-loss: 0.03851523995399475
Epoch: 12 Batch: 480 Log-loss: 0.021456904709339142
Epoch: 12 Batch: 520 Log-loss: 0.06273675709962845
Epoch: 12 Batch: 560 Log-loss: 0.046096064150333405
Epoch average log-loss: 0.04211072962997215
In Epoch: 12, val_loss: 0.04309883768748272, best_val_loss: 0.04224544904938834, best_auc: 0.9863885012355368
Epoch: 13 Batch: 0 Log-loss: 0.06609100848436356
Epoch: 13 Batch: 40 Log-loss: 0.0

Epoch: 20 Batch: 400 Log-loss: 0.040024496614933014
Epoch: 20 Batch: 440 Log-loss: 0.03727440536022186
Epoch: 20 Batch: 480 Log-loss: 0.038656119257211685
Epoch: 20 Batch: 520 Log-loss: 0.028499729931354523
Epoch: 20 Batch: 560 Log-loss: 0.04096108675003052
Epoch average log-loss: 0.04034778165764042
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 20, val_loss: 0.04152159129044746, best_val_loss: 0.04152159129044746, best_auc: 0.9875796939669147
Epoch: 21 Batch: 0 Log-loss: 0.030038869008421898
Epoch: 21 Batch: 40 Log-loss: 0.04503236711025238
Epoch: 21 Batch: 80 Log-loss: 0.0461677610874176
Epoch: 21 Batch: 120 Log-loss: 0.025918984785676003
Epoch: 21 Batch: 160 Log-loss: 0.051742956042289734
Epoch: 21 Batch: 200 Log-loss: 0.04208174720406532
Epoch: 21 Batch: 240 Log-loss: 0.034745845943689346
Epoch: 21 Batch: 280 Log-loss: 0.03198826685547829
Epoch: 21 Batch: 320 Log-loss: 0.03341188654303551
Epoch: 21 Batch: 360 Log-loss: 0.0373055525124073
Epoch: 21 Bat

Epoch: 29 Batch: 320 Log-loss: 0.03630012273788452
Epoch: 29 Batch: 360 Log-loss: 0.031387750059366226
Epoch: 29 Batch: 400 Log-loss: 0.058537017554044724
Epoch: 29 Batch: 440 Log-loss: 0.034526847302913666
Epoch: 29 Batch: 480 Log-loss: 0.03541191294789314
Epoch: 29 Batch: 520 Log-loss: 0.02692347764968872
Epoch: 29 Batch: 560 Log-loss: 0.03639565780758858
Epoch average log-loss: 0.03928831862618348
In Epoch: 29, val_loss: 0.041781023667074785, best_val_loss: 0.04152159129044746, best_auc: 0.9875796939669147
Epoch: 30 Batch: 0 Log-loss: 0.04013305529952049
Epoch: 30 Batch: 40 Log-loss: 0.04646116495132446
Epoch: 30 Batch: 80 Log-loss: 0.04831978306174278
Epoch: 30 Batch: 120 Log-loss: 0.02642126940190792
Epoch: 30 Batch: 160 Log-loss: 0.030945828184485435
Epoch: 30 Batch: 200 Log-loss: 0.04967764392495155
Epoch: 30 Batch: 240 Log-loss: 0.04990263655781746
Epoch: 30 Batch: 280 Log-loss: 0.03926141932606697
Epoch: 30 Batch: 320 Log-loss: 0.04467589035630226
Epoch: 30 Batch: 360 Log-loss

Epoch: 38 Batch: 240 Log-loss: 0.030341118574142456
Epoch: 38 Batch: 280 Log-loss: 0.05550031736493111
Epoch: 38 Batch: 320 Log-loss: 0.03902146592736244
Epoch: 38 Batch: 360 Log-loss: 0.039430782198905945
Epoch: 38 Batch: 400 Log-loss: 0.040364932268857956
Epoch: 38 Batch: 440 Log-loss: 0.02870199829339981
Epoch: 38 Batch: 480 Log-loss: 0.0351371243596077
Epoch: 38 Batch: 520 Log-loss: 0.0452023446559906
Epoch: 38 Batch: 560 Log-loss: 0.03655380755662918
Epoch average log-loss: 0.03915527346543968
In Epoch: 38, val_loss: 0.04190331217903309, best_val_loss: 0.04124207907387364, best_auc: 0.9874693744633388
Epoch: 39 Batch: 0 Log-loss: 0.036014676094055176
Epoch: 39 Batch: 40 Log-loss: 0.05332806333899498
Epoch: 39 Batch: 80 Log-loss: 0.05382205918431282
Epoch: 39 Batch: 120 Log-loss: 0.03557989373803139
Epoch: 39 Batch: 160 Log-loss: 0.02984657883644104
Epoch: 39 Batch: 200 Log-loss: 0.02779170125722885
Epoch: 39 Batch: 240 Log-loss: 0.03147390857338905
Epoch: 39 Batch: 280 Log-loss: 0

Epoch: 47 Batch: 160 Log-loss: 0.03379477187991142
Epoch: 47 Batch: 200 Log-loss: 0.03051295131444931
Epoch: 47 Batch: 240 Log-loss: 0.03222961351275444
Epoch: 47 Batch: 280 Log-loss: 0.0376417450606823
Epoch: 47 Batch: 320 Log-loss: 0.0408441387116909
Epoch: 47 Batch: 360 Log-loss: 0.035791344940662384
Epoch: 47 Batch: 400 Log-loss: 0.028622793033719063
Epoch: 47 Batch: 440 Log-loss: 0.05428829416632652
Epoch: 47 Batch: 480 Log-loss: 0.04175570234656334
Epoch: 47 Batch: 520 Log-loss: 0.039322543889284134
Epoch: 47 Batch: 560 Log-loss: 0.04553065821528435
Epoch average log-loss: 0.03871911936106959
In Epoch: 47, val_loss: 0.04193246710667498, best_val_loss: 0.04121824086557969, best_auc: 0.987068847244755
Epoch: 48 Batch: 0 Log-loss: 0.034154266119003296
Epoch: 48 Batch: 40 Log-loss: 0.03722543269395828
Epoch: 48 Batch: 80 Log-loss: 0.0395892933011055
Epoch: 48 Batch: 120 Log-loss: 0.028329206630587578
Epoch: 48 Batch: 160 Log-loss: 0.028646821156144142
Epoch: 48 Batch: 200 Log-loss: 0

Epoch: 2 Batch: 0 Log-loss: 0.07638002932071686
Epoch: 2 Batch: 40 Log-loss: 0.05059297755360603
Epoch: 2 Batch: 80 Log-loss: 0.039266034960746765
Epoch: 2 Batch: 120 Log-loss: 0.05587725713849068
Epoch: 2 Batch: 160 Log-loss: 0.06839199364185333
Epoch: 2 Batch: 200 Log-loss: 0.05092044547200203
Epoch: 2 Batch: 240 Log-loss: 0.04532298073172569
Epoch: 2 Batch: 280 Log-loss: 0.02517469972372055
Epoch: 2 Batch: 320 Log-loss: 0.04865586385130882
Epoch: 2 Batch: 360 Log-loss: 0.05257861316204071
Epoch: 2 Batch: 400 Log-loss: 0.05393114313483238
Epoch: 2 Batch: 440 Log-loss: 0.03961683437228203
Epoch: 2 Batch: 480 Log-loss: 0.052094969898462296
Epoch: 2 Batch: 520 Log-loss: 0.047183990478515625
Epoch: 2 Batch: 560 Log-loss: 0.06203000992536545
Epoch average log-loss: 0.05185389698017388
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 2, val_loss: 0.04834407754120119, best_val_loss: 0.04834407754120119, best_auc: 0.9778770611110832
Epoch: 3 Batch: 0 Log-loss: 0.0

Epoch: 10 Batch: 440 Log-loss: 0.042495325207710266
Epoch: 10 Batch: 480 Log-loss: 0.04940229654312134
Epoch: 10 Batch: 520 Log-loss: 0.0287406574934721
Epoch: 10 Batch: 560 Log-loss: 0.043612707406282425
Epoch average log-loss: 0.0425050803859319
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 10, val_loss: 0.0421839625307055, best_val_loss: 0.0421839625307055, best_auc: 0.9863014890860726
Epoch: 11 Batch: 0 Log-loss: 0.03817583620548248
Epoch: 11 Batch: 40 Log-loss: 0.043250229209661484
Epoch: 11 Batch: 80 Log-loss: 0.03536013141274452
Epoch: 11 Batch: 120 Log-loss: 0.05493398383259773
Epoch: 11 Batch: 160 Log-loss: 0.041810568422079086
Epoch: 11 Batch: 200 Log-loss: 0.02901049517095089
Epoch: 11 Batch: 240 Log-loss: 0.0365234799683094
Epoch: 11 Batch: 280 Log-loss: 0.038357626646757126
Epoch: 11 Batch: 320 Log-loss: 0.04665851220488548
Epoch: 11 Batch: 360 Log-loss: 0.039293184876441956
Epoch: 11 Batch: 400 Log-loss: 0.032901111990213394
Epoch: 11 Batch:

Epoch: 19 Batch: 160 Log-loss: 0.03569536656141281
Epoch: 19 Batch: 200 Log-loss: 0.036494556814432144
Epoch: 19 Batch: 240 Log-loss: 0.029886886477470398
Epoch: 19 Batch: 280 Log-loss: 0.049888353794813156
Epoch: 19 Batch: 320 Log-loss: 0.03292352706193924
Epoch: 19 Batch: 360 Log-loss: 0.04140729084610939
Epoch: 19 Batch: 400 Log-loss: 0.05428403615951538
Epoch: 19 Batch: 440 Log-loss: 0.050015199929475784
Epoch: 19 Batch: 480 Log-loss: 0.030075551941990852
Epoch: 19 Batch: 520 Log-loss: 0.05124678090214729
Epoch: 19 Batch: 560 Log-loss: 0.041234418749809265
Epoch average log-loss: 0.04058452345364328
In Epoch: 19, val_loss: 0.041257631911711536, best_val_loss: 0.04052805196964017, best_auc: 0.9885207688225965
Epoch: 20 Batch: 0 Log-loss: 0.06254526227712631
Epoch: 20 Batch: 40 Log-loss: 0.04560096934437752
Epoch: 20 Batch: 80 Log-loss: 0.027760453522205353
Epoch: 20 Batch: 120 Log-loss: 0.04748222604393959
Epoch: 20 Batch: 160 Log-loss: 0.029363468289375305
Epoch: 20 Batch: 200 Log-

Epoch: 28 Batch: 0 Log-loss: 0.04569888114929199
Epoch: 28 Batch: 40 Log-loss: 0.03954225033521652
Epoch: 28 Batch: 80 Log-loss: 0.03710247576236725
Epoch: 28 Batch: 120 Log-loss: 0.0352601520717144
Epoch: 28 Batch: 160 Log-loss: 0.048035044223070145
Epoch: 28 Batch: 200 Log-loss: 0.026302969083189964
Epoch: 28 Batch: 240 Log-loss: 0.048927996307611465
Epoch: 28 Batch: 280 Log-loss: 0.027175001800060272
Epoch: 28 Batch: 320 Log-loss: 0.04799836501479149
Epoch: 28 Batch: 360 Log-loss: 0.039019979536533356
Epoch: 28 Batch: 400 Log-loss: 0.04688639938831329
Epoch: 28 Batch: 440 Log-loss: 0.046519070863723755
Epoch: 28 Batch: 480 Log-loss: 0.03836689889431
Epoch: 28 Batch: 520 Log-loss: 0.058969441801309586
Epoch: 28 Batch: 560 Log-loss: 0.03842558711767197
Epoch average log-loss: 0.03957701981001135
In Epoch: 28, val_loss: 0.040125366491837226, best_val_loss: 0.040072344823061996, best_auc: 0.9889285339485134
Epoch: 29 Batch: 0 Log-loss: 0.04343480244278908
Epoch: 29 Batch: 40 Log-loss: 0

Epoch: 37 Batch: 0 Log-loss: 0.025943731889128685
Epoch: 37 Batch: 40 Log-loss: 0.03225911781191826
Epoch: 37 Batch: 80 Log-loss: 0.04560138285160065
Epoch: 37 Batch: 120 Log-loss: 0.041854213923215866
Epoch: 37 Batch: 160 Log-loss: 0.04141495004296303
Epoch: 37 Batch: 200 Log-loss: 0.05910206213593483
Epoch: 37 Batch: 240 Log-loss: 0.02977190911769867
Epoch: 37 Batch: 280 Log-loss: 0.03059864602982998
Epoch: 37 Batch: 320 Log-loss: 0.03570556640625
Epoch: 37 Batch: 360 Log-loss: 0.04414568841457367
Epoch: 37 Batch: 400 Log-loss: 0.059603411704301834
Epoch: 37 Batch: 440 Log-loss: 0.04640400782227516
Epoch: 37 Batch: 480 Log-loss: 0.0462687723338604
Epoch: 37 Batch: 520 Log-loss: 0.03475667163729668
Epoch: 37 Batch: 560 Log-loss: 0.026855910196900368
Epoch average log-loss: 0.039234114644516795
A pre-trained model at model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt has been loaded.
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn8.pt.

Choose the torch base model.
## Training on fold 9

Epoch: 8 Batch: 400 Log-loss: 0.03382719308137894
Epoch: 8 Batch: 440 Log-loss: 0.048083167523145676
Epoch: 8 Batch: 480 Log-loss: 0.05418485030531883
Epoch: 8 Batch: 520 Log-loss: 0.04740704968571663
Epoch: 8 Batch: 560 Log-loss: 0.0408833883702755
Epoch average log-loss: 0.04323709427512118
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 8, val_loss: 0.043749109520896945, best_val_loss: 0.043749109520896945, best_auc: 0.9857527683808179
Epoch: 9 Batch: 0 Log-loss: 0.04207947850227356
Epoch: 9 Batch: 40 Log-loss: 0.03316400572657585
Epoch: 9 Batch: 80 Log-loss: 0.04960491880774498
Epoch: 9 Batch: 120 Log-loss: 0.04337206482887268
Epoch: 9 Batch: 160 Log-loss: 0.04126513749361038
Epoch: 9 Batch: 200 Log-loss: 0.04335455223917961
Epoch: 9 Batch: 240 Log-loss: 0.03849118947982788
Epoch: 9 Batch: 280 Log-loss: 0.04522884264588356
Epoch: 9 Batch: 320 Log-loss: 0.033341165632009506
Epoch: 9 Batch: 360 Log-loss: 0.05232580378651619
Epoch: 9 Batch: 400 Log-loss: 0

Epoch: 17 Batch: 120 Log-loss: 0.05397900938987732
Epoch: 17 Batch: 160 Log-loss: 0.046282440423965454
Epoch: 17 Batch: 200 Log-loss: 0.04273634031414986
Epoch: 17 Batch: 240 Log-loss: 0.038886964321136475
Epoch: 17 Batch: 280 Log-loss: 0.04354344680905342
Epoch: 17 Batch: 320 Log-loss: 0.02584017626941204
Epoch: 17 Batch: 360 Log-loss: 0.02790120057761669
Epoch: 17 Batch: 400 Log-loss: 0.0509486086666584
Epoch: 17 Batch: 440 Log-loss: 0.030340520665049553
Epoch: 17 Batch: 480 Log-loss: 0.03737952560186386
Epoch: 17 Batch: 520 Log-loss: 0.044431690126657486
Epoch: 17 Batch: 560 Log-loss: 0.03767431527376175
Epoch average log-loss: 0.040437045746615954
Model has been saved as model_pool/Pytorch/rhn/pavel_rhn-TEMP.pt.

In Epoch: 17, val_loss: 0.042334016915048435, best_val_loss: 0.042334016915048435, best_auc: 0.9878003029687994
Epoch: 18 Batch: 0 Log-loss: 0.04020917788147926
Epoch: 18 Batch: 40 Log-loss: 0.046628281474113464
Epoch: 18 Batch: 80 Log-loss: 0.039797861129045486
Epoch: 18 

Epoch: 26 Batch: 0 Log-loss: 0.03883567452430725
Epoch: 26 Batch: 40 Log-loss: 0.045384105294942856
Epoch: 26 Batch: 80 Log-loss: 0.024999646469950676
Epoch: 26 Batch: 120 Log-loss: 0.03816399723291397
Epoch: 26 Batch: 160 Log-loss: 0.04577105864882469
Epoch: 26 Batch: 200 Log-loss: 0.05936066433787346
Epoch: 26 Batch: 240 Log-loss: 0.042776674032211304
Epoch: 26 Batch: 280 Log-loss: 0.034909214824438095
Epoch: 26 Batch: 320 Log-loss: 0.04442470893263817
Epoch: 26 Batch: 360 Log-loss: 0.028683951124548912
Epoch: 26 Batch: 400 Log-loss: 0.049075350165367126
Epoch: 26 Batch: 440 Log-loss: 0.039184246212244034
Epoch: 26 Batch: 480 Log-loss: 0.036265257745981216
Epoch: 26 Batch: 520 Log-loss: 0.05032311752438545
Epoch: 26 Batch: 560 Log-loss: 0.032920077443122864
Epoch average log-loss: 0.03944106190798006
In Epoch: 26, val_loss: 0.04262586031051133, best_val_loss: 0.04207448748068642, best_auc: 0.9878801992444403
Epoch: 27 Batch: 0 Log-loss: 0.022795936092734337
Epoch: 27 Batch: 40 Log-lo

In Epoch: 34, val_loss: 0.042443047534341936, best_val_loss: 0.04195208273176713, best_auc: 0.9874712577696828
Epoch: 35 Batch: 0 Log-loss: 0.040370915085077286
Epoch: 35 Batch: 40 Log-loss: 0.03420058265328407
Epoch: 35 Batch: 80 Log-loss: 0.043225616216659546
Epoch: 35 Batch: 120 Log-loss: 0.029145510867238045
Epoch: 35 Batch: 160 Log-loss: 0.0430169440805912
Epoch: 35 Batch: 200 Log-loss: 0.041503969579935074
Epoch: 35 Batch: 240 Log-loss: 0.030428245663642883
Epoch: 35 Batch: 280 Log-loss: 0.03583500534296036
Epoch: 35 Batch: 320 Log-loss: 0.03483636677265167
Epoch: 35 Batch: 360 Log-loss: 0.023957595229148865
Epoch: 35 Batch: 400 Log-loss: 0.0330234058201313
Epoch: 35 Batch: 440 Log-loss: 0.031945422291755676
Epoch: 35 Batch: 480 Log-loss: 0.043881550431251526
Epoch: 35 Batch: 520 Log-loss: 0.024249091744422913
Epoch: 35 Batch: 560 Log-loss: 0.03303859010338783
Epoch average log-loss: 0.03883973265100005
In Epoch: 35, val_loss: 0.04225873007797046, best_val_loss: 0.041952082731767

# Prediction

## Pos-Version

In [None]:
submit_path_prefix = 'results/CNN_Based/' + model_name

print('Predicting testing results...')
test_predicts_list = []
for fold_id, model in enumerate(models):
    test_data_dict = {'Onehot': test_data, 'POS': pos_test_data}
    test_predict = model.predict(test_data_dict, batch_size=BATCH_SIZE, verbose=1)
    test_predicts_list.append(test_predict)
    np.save('parameters_pool/AVPOSCNN/{}-AV-POS-CNN.npy'.format(fold_id), test_predict)
    
test_predicts = np.zeros(test_predicts_list[0].shape)
for fold_predict in test_predicts_list:
    test_predicts += fold_predict
test_predicts /= len(test_predicts_list)

test_ids = test_df['id'].values
test_ids = test_ids.reshape((len(test_ids), 1))

test_predicts = pd.DataFrame(data=test_predicts, columns=list_classes)
test_predicts['id'] = test_ids
test_predicts = test_predicts[['id'] + list_classes]
submit_path = submit_path_prefix + '-L{:4f}-A{:4f}.csv'.format(val_loss, total_auc)
test_predicts.to_csv(submit_path, index=False)

## General-Version

In [None]:
submit_path_prefix = 'results/rhn/Fasttext-rhn-' + str(MAX_SEQUENCE_LENGTH)

print('Predicting testing results...')
test_predicts_list = []
for fold_id, model in enumerate(models):
    test_predict = model.predict(test_sequences, batch_size=BATCH_SIZE, verbose=1)
    test_predicts_list.append(test_predict)
    
test_predicts = np.zeros(test_predicts_list[0].shape)
for fold_predict in test_predicts_list:
    test_predicts += fold_predict
test_predicts /= len(test_predicts_list)

test_ids = test_df['id'].values
test_ids = test_ids.reshape((len(test_ids), 1))

test_predicts = pd.DataFrame(data=test_predicts, columns=list_classes)
test_predicts['id'] = test_ids
test_predicts = test_predicts[['id'] + list_classes]
submit_path = submit_path_prefix + '-L{:4f}-A{:4f}.csv'.format(val_loss, total_auc)
test_predicts.to_csv(submit_path, index=False)

## OOB (Out-of-Bag) Evaluation

In [None]:
train_fold_predictions = np.concatenate((fold_predictions), axis=0)
train_auc = roc_auc_score(train_labels, train_fold_predictions)
print('Training AUC', train_auc)

In [None]:
print('Predicting training results...')
train_ids = train_df['id'].values
train_ids = train_ids.reshape((len(train_ids), 1))

train_predicts = pd.DataFrame(data=train_fold_predictions, columns=list_classes)
train_predicts['id'] = train_ids
train_predicts = train_predicts[['id'] + list_classes]
submit_path = submit_path_prefix + '-(Train)-L{:4f}-A{:4f}.csv'.format(val_loss, train_auc)
train_predicts.to_csv(submit_path, index=False)
print('Done!')

# Result Ensemble (For Test Format)

## Bagging

In [None]:
def bagging(arrs, path):
    print("Doing ensemble on")
    subs = []
    for arr in arrs:
        print(arr)
        subs.append(pd.read_csv(arr))
    
    for sub in subs[1:]:
        for c in list_classes:
            subs[0][c] += sub[c]
    
    for c in list_classes:
        subs[0][c] /= len(subs)
        
    subs[0].to_csv(path, index=False)

# Check Correlation

In [None]:
def check_corr(arr1, arr2):
    res = 0
    for col in arr1.columns.values[1:]:
        cur = arr1[col].corr(arr2[col])
        corr = (arr1[col].rank() / len(arr1)).corr(arr2[col].rank() / len(arr2))
        print(col, corr)
        res += corr
    print("Avg Rank: ", res / len(arr1.columns.values[1:]))

# Result Estimation (For Train Format)

In [None]:
ground_truth = train_df.iloc[:, :8]
ground_truth.head()

In [None]:
check_prediction = pd.read_csv('results/RNN_Based/fasttext-avrnn-100000vocabulary-350length-(Train)-L0.013377-A0.998050.csv')

In [None]:
check_prediction.head()

In [None]:
def get_error_term_pos(train, pred, check_column):
    sub_train = train[check_column]
    sub_pred = pred[check_column]
    sub_pred = sub_pred.round()
    diff = (sub_pred != sub_train) & (sub_train == 1)
    print('Wrong pos-predictions number is {}\tWrong rate: {}'.format(diff.sum(), diff.sum() / len(sub_pred)))
    pos = pd.DataFrame()
    pos['id'] = train[diff]['id']
    pos['text'] = train[diff]['comment_text']
    pos['pred_val'] = pred[diff][check_column]
    pos['label'] = train[diff][check_column]
    return pos

def get_error_term_neg(train, pred, check_column):
    sub_train = train[check_column]
    sub_pred = pred[check_column]
    sub_pred = sub_pred.round()
    diff = (sub_pred != sub_train) & (sub_train == 0)
    print('Wrong neg-predictions number is {}\tWrong rate: {}'.format(diff.sum(), diff.sum() / len(sub_pred)))
    neg = pd.DataFrame()
    neg['id'] = train[diff]['id']
    neg['text'] = train[diff]['comment_text']
    neg['pred_val'] = pred[diff][check_column]
    neg['label'] = train[diff][check_column]
    return neg

In [None]:
for term in list_classes:
    print('In term: ', term)
    err_neg = get_error_term_neg(train_df, check_prediction, term)
    err_pos = get_error_term_pos(train_df, check_prediction, term)