# Homework 2 - TF-IDF Classifier

Ваша цель — обучить классификатор, который будет находить "токсичные" комментарии, и опубликовать решение на Kaggle: [Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)

В процессе обучения нужно ответить на ***[вопросы](https://docs.google.com/forms/d/e/1FAIpQLSd9mQx8EFpSH6FhCy1M_FmISzy3lhgyyqV3TN0pmtop7slmTA/viewform?usp=sf_link)***

Данные можно скачать тут - https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data



In [1]:
import numpy as np
import pandas as pd

from scipy import sparse

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_union

In [2]:
# Target columns; their order defines the column order of every
# prediction matrix written out later in the notebook.
class_names = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]

# Missing cells in either CSV are replaced by the literal string 'Unknown'.
train = pd.read_csv('./input/train.csv').fillna('Unknown')
test = pd.read_csv('./input/test.csv').fillna('Unknown')

# One-column ("id") frames that later receive one probability column per class.
train_submission = pd.DataFrame.from_dict({'id': train['id']})
submission = pd.DataFrame.from_dict({'id': test['id']})

In [3]:
import re, string

#re_tok = re.compile('([%s“”¨«»®´·º½¾¿¡§£₤‘’])' % string.punctuation)
def clean_text(text):
    """Normalize a raw comment into lowercase, space-separated word tokens.

    Common English contractions are expanded, every non-word character is
    replaced by a space, and runs of whitespace are collapsed to one space.

    :param text: raw comment string.
    :return: cleaned string without leading or trailing spaces.
    """
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    # "'scuse" must be rewritten before the generic "'s" rule below;
    # otherwise "'s" eats its prefix and this rule can never match.
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    # "can't" is handled before the generic "n't" rule so it becomes
    # "cannot" rather than "ca not".
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    # Raw strings keep "\W" and "\s" as regex classes instead of relying on
    # Python passing unknown string escapes through unchanged.
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip(' ')

In [4]:
from keras.models import Model, Layer
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras.optimizers import Adam, RMSprop
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, CuDNNLSTM, Add, Concatenate
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras import initializers, regularizers, constraints
import keras.backend as K
from keras.layers import Conv1D, GaussianNoise, MaxPooling1D, GlobalMaxPooling1D, SpatialDropout1D
from keras.regularizers import l2

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
# Path of the pretrained word-vector text file that is parsed into
# `embeddings_index` further down.
EMBEDDING_FILE = './input/crawl-300d-2M.vec'

In [6]:
# When True, test rows that an earlier submission predicted with very high
# confidence are appended (with rounded labels) to every training fold.
use_pseudo_labeling = True

In [7]:
# Vocabulary cap: passed to the Tokenizer as `num_words` and used as the
# input dimension of the Embedding layers.
max_features = 300000
# Every comment is padded/truncated to this many token ids by pad_sequences.
maxlen = 200
# Width of one embedding vector (second dimension of `embedding_matrix`).
embed_size = 300

In [8]:
# Pull the label matrix out first: `train` and `test` are rebound to plain
# arrays of cleaned comment text right below.
targets_train = train[class_names].values
train = train["comment_text"].fillna("fillna").map(clean_text).values
test = test["comment_text"].fillna("fillna").map(clean_text).values

# The vocabulary is fitted on the train and test text together.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train) + list(test))
X_train = tokenizer.texts_to_sequences(train)
X_test = tokenizer.texts_to_sequences(test)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

# Empty defaults used when pseudo-labeling is disabled. They share the dtype
# of the padded sequences so stacking them onto a fold does not silently
# upcast the token ids to float64.
pseudo_labeling_data = np.zeros(shape=(0, maxlen), dtype=x_test.dtype)
pseudo_labeling_targets = np.zeros(shape=(0, 6), dtype=np.int32)
if use_pseudo_labeling:
    # NOTE(review): assumes the rows of this submission are in the same
    # order as test.csv, since the mask is applied to x_test - confirm.
    pseudo_labeling_df = pd.read_csv('./submission_ensemble_005.csv')
    pred = np.array(pseudo_labeling_df[class_names])
    # Keep only rows where every class probability is nearly 0 or nearly 1.
    indexes_to_pick = np.all(((pred > 0.999) | (pred < 0.001)), axis=1)

    pseudo_labeling_data = x_test[indexes_to_pick, :]
    # Reuse `pred` so the targets are a plain ndarray like the default above
    # (previously a DataFrame slice).
    pseudo_labeling_targets = np.round(pred[indexes_to_pick]).astype(np.int32)

In [9]:
pseudo_labeling_data.shape

(68964, 200)

In [10]:
x_test.shape

(153164, 200)

In [11]:
x_train.shape

(159571, 200)

In [12]:
def get_coefs(word, *arr):
    """Pair an embedding token with its coefficients as a float32 array.

    :param word: the token (first field of a vector-file line).
    :param arr: the remaining fields, convertible to float.
    :return: ``(word, vector)`` where ``vector`` is a 1-D float32 ndarray.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the pretrained vector file into a {token: float32 vector} lookup.
# The `with` block closes the file handle as soon as the dictionary is built
# instead of leaving it open until garbage collection.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index = dict(get_coefs(*line.rstrip().rsplit(' ')) for line in embedding_file)

In [13]:
from sklearn.metrics import roc_auc_score
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
# The Embedding layers in this notebook are declared with `max_features`
# rows, and Keras word indices start at 1 (row 0 stays all-zero for padding).
# Allocating `max_features` rows keeps the matrix aligned with the layer.
# Sizing it by `nb_words` overflows on the last word and mismatches the
# layer shape whenever the vocabulary is smaller than `max_features`.
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in word_index.items():
    # Indices at or above the cap are never emitted by the tokenizer.
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    # Words without a pretrained vector keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


class RocAucEvaluation(Callback):
    """Keras callback that prints ROC-AUC on a held-out set during training.

    :param validation_data: ``(X_val, y_val)`` pair to score; the empty
        default cannot be unpacked, so a pair must be supplied in practice.
    :param interval: score whenever the 0-based epoch index is a multiple
        of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # Previously `super(Callback, self)`, which starts the MRO lookup
        # *after* Callback and therefore never ran Callback.__init__.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the stored validation set and print its ROC-AUC.

        :param epoch: 0-based epoch index supplied by Keras.
        :param logs: metric dictionary supplied by Keras; unused here.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, batch_size=512, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))
            
            
class Attention(Layer):
    """Attention pooling over the time axis of a 3D tensor.

    Turns an input of shape ``(samples, steps, features)`` into a weighted
    sum over ``steps`` of shape ``(samples, features)``. The weights come
    from one learned vector ``W`` (length ``features``) and an optional
    learned bias ``b`` (length ``steps``).
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param step_dim: number of time steps; `call` reshapes the scores to
            `(-1, step_dim)`, so it is expected to equal the `steps`
            dimension of the input.
        :param W_regularizer: regularizer for the weight vector.
        :param b_regularizer: regularizer for the bias vector.
        :param W_constraint: constraint for the weight vector.
        :param b_constraint: constraint for the bias vector.
        :param bias: whether to add a learned per-step bias to the scores.
        :param kwargs: forwarded to the base `Layer` constructor.
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The feature dimension is inferred in `build` from the input shape.
        Example:
            model.add(LSTM(64, return_sequences=True))
            model.add(Attention(step_dim))
        """
        self.supports_masking = True
        # Older Keras spelling: self.init = initializations.get('glorot_uniform')
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Placeholder; the real value is set in build() from the input shape.
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the weight vector (and optional bias) for a 3D input shape."""
        assert len(input_shape) == 3

        # One scoring weight per input feature.
        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            # One bias term per time step.
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return None so that no mask is propagated to following layers."""
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over the time axis."""
        # eij = K.dot(x, self.W) TF backend doesn't support it

        # features_dim = self.W.shape[0]
        # step_dim = x._keras_shape[1]

        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten to (samples * steps, features), project every step onto W,
        # then fold the scores back to (samples, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        # Unnormalized attention weights.
        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # (epsilon keeps the division finite in that case)
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Add a trailing axis so the per-step weights broadcast over features.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Return `(input_shape[0], features_dim)`."""
        #return input_shape[0], input_shape[-1]
        return input_shape[0],  self.features_dim

# CV

## GRU

In [20]:
def _bn_elu():
    """Return a callable that applies BatchNormalization followed by ELU.

    Fresh layer instances are created on every call of the returned
    function, so no weights are shared between call sites.
    """
    def _apply(tensor):
        normalized = BatchNormalization()(tensor)
        return Activation('elu')(normalized)

    return _apply

def get_gru_v2(dropout=0., dropout_dense=0.):
    """Build and compile the two-stage bidirectional GRU classifier.

    Pipeline: frozen pretrained embeddings -> Gaussian noise -> BiGRU ->
    BN/ELU -> spatial dropout -> BiGRU -> BN/ELU -> attention pooling ->
    dense(128) -> BN/ELU -> dropout -> 6 sigmoid outputs.

    :param dropout: accepted for signature compatibility; not used here.
    :param dropout_dense: dropout rate applied after the dense layer.
    :return: compiled Keras ``Model`` (binary cross-entropy, Adam/AMSGrad).
    """
    tokens = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(tokens)
    hidden = GaussianNoise(stddev=0.15)(embedded)

    # First recurrent stage.
    hidden = Bidirectional(CuDNNGRU(128, return_sequences=True))(hidden)
    hidden = _bn_elu()(hidden)
    hidden = SpatialDropout1D(0.4)(hidden)

    # Second recurrent stage, pooled over time by attention.
    hidden = Bidirectional(CuDNNGRU(128, return_sequences=True))(hidden)
    hidden = _bn_elu()(hidden)
    pooled = Attention(maxlen)(hidden)

    # Classification head.
    head = Dense(128)(pooled)
    head = _bn_elu()(head)
    head = Dropout(dropout_dense)(head)
    probabilities = Dense(6, use_bias=True, activation="sigmoid")(head)

    classifier = Model(inputs=tokens, outputs=probabilities)
    optimizer = Adam(lr=0.001, amsgrad=True)
    classifier.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return classifier

In [21]:
from sklearn.model_selection import KFold
def prepare_data_cv():
    """Split the training data into 5 shuffled folds for cross-validation.

    The module-level ``targets_train`` and ``x_train`` are rebound to
    ndarray copies. The pseudo-labeled rows are stacked onto the training
    part of every fold, never onto the validation part.

    :return: ``(kfold_data, X_test)`` where each ``kfold_data`` item is
        ``(X_train_cv, y_train_cv, X_val, y_val, val_indices)`` and
        ``X_test`` is the module-level ``x_test``.
    """
    global targets_train, x_train, x_test

    targets_train = np.array(targets_train)
    x_train = np.array(x_train)

    splitter = KFold(n_splits=5, shuffle=True, random_state=0xCAFFE)

    folds = []
    for fit_idx, holdout_idx in splitter.split(targets_train):
        fold_features = np.vstack((x_train[fit_idx], pseudo_labeling_data))
        fold_targets = np.vstack((targets_train[fit_idx], pseudo_labeling_targets))
        folds.append((
            fold_features,
            fold_targets,
            x_train[holdout_idx],
            targets_train[holdout_idx],
            holdout_idx,
        ))

    return (folds, x_test)

In [22]:
def get_model_callbacks(save_dir):
    """Create the training callbacks for one fold.

    Ensures ``<save_dir>/board`` and ``<save_dir>/model`` exist, then builds
    early stopping, learning-rate reduction and best-checkpoint callbacks,
    all monitoring ``val_loss``.

    :param save_dir: directory that receives this fold's artifacts.
    :return: list ``[EarlyStopping, ReduceLROnPlateau, ModelCheckpoint]``.
    """
    # exist_ok replaces the check-then-create pair, which could fail if the
    # directory appeared between the check and the creation.
    board_path = os.path.join(save_dir, 'board')
    os.makedirs(board_path, exist_ok=True)

    model_path = os.path.join(save_dir, 'model/model_weights.hdf5')
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    stopping = EarlyStopping(monitor='val_loss',
                             min_delta=1e-3,
                             patience=5,
                             verbose=False,
                             mode='min')

    lr_scheduler = ReduceLROnPlateau(monitor='val_loss',
                                     factor=0.1,
                                     patience=2,
                                     verbose=True,
                                     mode='min',
                                     epsilon=2e-3,
                                     min_lr=1e-5)

    # Only the best epoch (lowest val_loss) is kept on disk.
    model_checkpoint = ModelCheckpoint(model_path,
                                       monitor='val_loss',
                                       verbose=False,
                                       save_best_only=True,
                                       save_weights_only=False,
                                       mode='min',
                                       period=1)

    return [stopping, lr_scheduler, model_checkpoint]

In [23]:
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
import os

In [24]:
# Build one GRU model only to print its layer-by-layer summary; the
# cross-validation loop below creates a fresh model per fold.
model = get_gru_v2(dropout_dense=0.)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 200, 300)          90000000  
_________________________________________________________________
gaussian_noise_7 (GaussianNo (None, 200, 300)          0         
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 200, 256)          330240    
_________________________________________________________________
batch_normalization_19 (Batc (None, 200, 256)          1024      
_________________________________________________________________
activation_19 (Activation)   (None, 200, 256)          0         
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 200, 256)          0         
__________

In [25]:
#tf.reset_default_graph()

# Experiment settings for the GRU run.
STAMP = 'gru_109'
experiment_path = './experiments/%s' % STAMP
epochs = 15
batch_size = 256

(kfold_data, X_test) = prepare_data_cv()
# Take the fold count from the data instead of hard-coding 5, so the test
# average stays correct if n_splits in prepare_data_cv() ever changes.
n_folds = len(kfold_data)

# Out-of-fold predictions for the training rows and fold-averaged
# predictions for the test rows.
train_probas = np.zeros(shape=(x_train.shape[0], 6))
test_probas = np.zeros(shape=(x_test.shape[0], 6))

models_roc = []
models_train_roc = []


for idx, data in enumerate(tqdm(kfold_data)):
    X_train, y_train, X_valid, y_valid, val_indices = data

    model = get_gru_v2()
    callbacks = get_model_callbacks(save_dir=os.path.join(experiment_path, 'fold_%02d' % idx))

    model.fit(X_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(X_valid, y_valid),
              shuffle=True,
              callbacks=callbacks, verbose=1)

    # Restore the best checkpoint written by ModelCheckpoint for this fold.
    model.load_weights(filepath=os.path.join(experiment_path, ('fold_%02d/model/model_weights.hdf5' % idx)))

    proba = model.predict(X_train, batch_size=batch_size*2)
    proba_val = model.predict(X_valid, batch_size=batch_size*2)
    proba_test = model.predict(x_test, batch_size=batch_size*2)

    models_roc.append(roc_auc_score(y_valid, proba_val))
    models_train_roc.append(roc_auc_score(y_train, proba))

    # Each training row is in exactly one validation fold.
    train_probas[val_indices] += proba_val
    test_probas += proba_test / n_folds

    # Running statistics over the folds finished so far.
    print('Train ROC AUC:\nMean: %f\nStd: %f\nMin: %f\nMax: %f\n\n' % (np.mean(models_train_roc),
                                                                 np.std(models_train_roc),
                                                                 np.min(models_train_roc),
                                                                 np.max(models_train_roc)))

    print('Val ROC AUC:\nMean: %f\nStd: %f\nMin: %f\nMax: %f\n\n' % (np.mean(models_roc),
                                                                 np.std(models_roc),
                                                                 np.min(models_roc),
                                                                 np.max(models_roc)))


# Persist out-of-fold train predictions and the averaged test predictions.
for i, cls_name in enumerate(class_names):
    train_submission[cls_name] = train_probas[:, i]
train_submission.to_csv('train_%s.csv' % STAMP, index=False)

for i, cls_name in enumerate(class_names):
    submission[cls_name] = test_probas[:, i]
submission.to_csv('submission_%s.csv' % STAMP, index=False)

  0%|          | 0/5 [00:00<?, ?it/s]

Train on 196620 samples, validate on 31915 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 10/15
Epoch 11/15

Epoch 00011: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.


 20%|██        | 1/5 [15:31<1:02:07, 931.99s/it]

Train ROC AUC:
Mean: 0.995879
Std: 0.000000
Min: 0.995879
Max: 0.995879


Val ROC AUC:
Mean: 0.989693
Std: 0.000000
Min: 0.989693
Max: 0.989693


Train on 196621 samples, validate on 31914 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15

Epoch 00012: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.


 40%|████      | 2/5 [32:24<48:36, 972.00s/it]  

Train ROC AUC:
Mean: 0.995778
Std: 0.000101
Min: 0.995677
Max: 0.995879


Val ROC AUC:
Mean: 0.989276
Std: 0.000417
Min: 0.988859
Max: 0.989693


Train on 196621 samples, validate on 31914 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15

Epoch 00013: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 14/15


 60%|██████    | 3/5 [51:59<34:39, 1039.99s/it]

Train ROC AUC:
Mean: 0.995962
Std: 0.000272
Min: 0.995677
Max: 0.996328


Val ROC AUC:
Mean: 0.989133
Std: 0.000396
Min: 0.988847
Max: 0.989693


Train on 196621 samples, validate on 31914 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15

Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.


 80%|████████  | 4/5 [1:06:20<16:35, 995.13s/it]

Train ROC AUC:
Mean: 0.995874
Std: 0.000280
Min: 0.995613
Max: 0.996328


Val ROC AUC:
Mean: 0.989270
Std: 0.000417
Min: 0.988847
Max: 0.989693


Train on 196621 samples, validate on 31914 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15

Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15

Epoch 00010: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 11/15
Epoch 12/15

Epoch 00012: ReduceLROnPlateau reducing learning rate to 1e-05.


100%|██████████| 5/5 [1:23:27<00:00, 1001.52s/it]

Train ROC AUC:
Mean: 0.995870
Std: 0.000250
Min: 0.995613
Max: 0.996328


Val ROC AUC:
Mean: 0.989606
Std: 0.000768
Min: 0.988847
Max: 0.990948







## LSTM

In [None]:
def _bn_elu():
    """Return a callable that applies BatchNormalization followed by ELU.

    Fresh layer instances are created on every call of the returned
    function, so no weights are shared between call sites.
    """
    def _apply(tensor):
        normalized = BatchNormalization()(tensor)
        return Activation('elu')(normalized)

    return _apply


def get_lstm_v2(dropout=0., dropout_dense=0.):
    """Build and compile the two-stage bidirectional LSTM classifier.

    Pipeline: frozen pretrained embeddings -> Gaussian noise -> BiLSTM ->
    BN/ELU -> spatial dropout -> BiLSTM -> BN/ELU -> attention pooling ->
    dense(128) -> BN/ELU -> dropout -> 6 sigmoid outputs.

    :param dropout: accepted for signature compatibility; not used here.
    :param dropout_dense: dropout rate applied after the dense layer.
    :return: compiled Keras ``Model`` (binary cross-entropy, Adam/AMSGrad).
    """
    tokens = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(tokens)
    hidden = GaussianNoise(stddev=0.15)(embedded)

    # First recurrent stage.
    hidden = Bidirectional(CuDNNLSTM(128, return_sequences=True))(hidden)
    hidden = _bn_elu()(hidden)
    hidden = SpatialDropout1D(0.4)(hidden)

    # Second recurrent stage, pooled over time by attention.
    hidden = Bidirectional(CuDNNLSTM(128, return_sequences=True))(hidden)
    hidden = _bn_elu()(hidden)
    pooled = Attention(maxlen)(hidden)

    # Classification head.
    head = Dense(128)(pooled)
    head = _bn_elu()(head)
    head = Dropout(dropout_dense)(head)
    probabilities = Dense(6, use_bias=True, activation="sigmoid")(head)

    classifier = Model(inputs=tokens, outputs=probabilities)
    optimizer = Adam(lr=0.001, amsgrad=True)
    classifier.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return classifier

In [None]:
# Build one LSTM model only to print its layer-by-layer summary; the
# cross-validation loop below creates a fresh model per fold.
model = get_lstm_v2(dropout_dense=0.)
model.summary()

In [None]:
#tf.reset_default_graph()

# Experiment settings for the LSTM run.
STAMP = 'lstm_101'
experiment_path = './experiments/%s' % STAMP
epochs = 15
batch_size = 256

(kfold_data, X_test) = prepare_data_cv()
# Take the fold count from the data instead of hard-coding 5, so the test
# average stays correct if n_splits in prepare_data_cv() ever changes.
n_folds = len(kfold_data)

# Out-of-fold predictions for the training rows and fold-averaged
# predictions for the test rows.
train_probas = np.zeros(shape=(x_train.shape[0], 6))
test_probas = np.zeros(shape=(x_test.shape[0], 6))

models_roc = []
models_train_roc = []


for idx, data in enumerate(tqdm(kfold_data)):
    X_train, y_train, X_valid, y_valid, val_indices = data

    model = get_lstm_v2()
    callbacks = get_model_callbacks(save_dir=os.path.join(experiment_path, 'fold_%02d' % idx))

    model.fit(X_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(X_valid, y_valid),
              shuffle=True,
              callbacks=callbacks, verbose=1)

    # Restore the best checkpoint written by ModelCheckpoint for this fold.
    model.load_weights(filepath=os.path.join(experiment_path, ('fold_%02d/model/model_weights.hdf5' % idx)))

    proba = model.predict(X_train, batch_size=batch_size*2)
    proba_val = model.predict(X_valid, batch_size=batch_size*2)
    proba_test = model.predict(x_test, batch_size=batch_size*2)

    models_roc.append(roc_auc_score(y_valid, proba_val))
    models_train_roc.append(roc_auc_score(y_train, proba))

    # Each training row is in exactly one validation fold.
    train_probas[val_indices] += proba_val
    test_probas += proba_test / n_folds

    # Running statistics over the folds finished so far.
    print('Train ROC AUC:\nMean: %f\nStd: %f\nMin: %f\nMax: %f\n\n' % (np.mean(models_train_roc),
                                                                 np.std(models_train_roc),
                                                                 np.min(models_train_roc),
                                                                 np.max(models_train_roc)))

    print('Val ROC AUC:\nMean: %f\nStd: %f\nMin: %f\nMax: %f\n\n' % (np.mean(models_roc),
                                                                 np.std(models_roc),
                                                                 np.min(models_roc),
                                                                 np.max(models_roc)))


# Persist out-of-fold train predictions and the averaged test predictions.
for i, cls_name in enumerate(class_names):
    train_submission[cls_name] = train_probas[:, i]
train_submission.to_csv('train_%s.csv' % STAMP, index=False)

for i, cls_name in enumerate(class_names):
    submission[cls_name] = test_probas[:, i]
submission.to_csv('submission_%s.csv' % STAMP, index=False)

## TextCNN

In [None]:
def _bn_elu():
    """Return a callable that applies BatchNormalization followed by ELU.

    Fresh layer instances are created on every call of the returned
    function, so no weights are shared between call sites.
    """
    def _apply(tensor):
        normalized = BatchNormalization()(tensor)
        return Activation('elu')(normalized)

    return _apply


def get_text_cnn(dropout=0., dropout_dense=0., weight_decay=0.):
    """Build and compile the two-layer 1D-convolutional text classifier.

    Pipeline: frozen pretrained embeddings -> Conv1D(256, 7) -> BN/ELU ->
    max-pool(2) -> Conv1D(256, 7) -> BN/ELU -> attention pooling ->
    dense(128, L2) -> BN/ELU -> dropout -> 6 sigmoid outputs.

    :param dropout: accepted for signature compatibility; not used here.
    :param dropout_dense: dropout rate applied after the dense layer.
    :param weight_decay: L2 factor for the dense layer's kernel.
    :return: compiled Keras ``Model`` (binary cross-entropy, Adam/AMSGrad).
    """
    tokens = Input(shape=(maxlen, ))
    features = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(tokens)
    # Gaussian noise is disabled for this variant:
    # features = GaussianNoise(stddev=0.1)(features)

    # First convolution stage; pooling halves the time axis.
    features = Conv1D(filters=256, kernel_size=7, padding='same')(features)
    features = _bn_elu()(features)
    features = MaxPooling1D(2)(features)

    # Second convolution stage; attention runs over the halved time axis.
    features = Conv1D(filters=256, kernel_size=7, padding='same')(features)
    features = _bn_elu()(features)
    pooled = Attention(maxlen // 2)(features)

    # Classification head.
    head = Dense(128, kernel_regularizer=l2(weight_decay))(pooled)
    head = _bn_elu()(head)
    head = Dropout(dropout_dense)(head)
    probabilities = Dense(6, use_bias=True, activation="sigmoid")(head)

    classifier = Model(inputs=tokens, outputs=probabilities)
    optimizer = Adam(lr=0.001, amsgrad=True)
    classifier.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return classifier

In [None]:
# Build one TextCNN model only to print its layer-by-layer summary; the
# cross-validation loop below creates a fresh model per fold.
model = get_text_cnn(dropout_dense=0., weight_decay=0.)
model.summary()

In [None]:
# Experiment settings for the TextCNN run.
STAMP = 'textcnn_100'
experiment_path = './experiments/%s' % STAMP
epochs = 15
batch_size = 256

(kfold_data, X_test) = prepare_data_cv()
# Take the fold count from the data instead of hard-coding 5, so the test
# average stays correct if n_splits in prepare_data_cv() ever changes.
n_folds = len(kfold_data)

# Out-of-fold predictions for the training rows and fold-averaged
# predictions for the test rows.
train_probas = np.zeros(shape=(x_train.shape[0], 6))
test_probas = np.zeros(shape=(x_test.shape[0], 6))

models_roc = []
models_train_roc = []


for idx, data in enumerate(tqdm(kfold_data)):
    X_train, y_train, X_valid, y_valid, val_indices = data

    model = get_text_cnn(dropout_dense=0.3, weight_decay=1e-4)
    callbacks = get_model_callbacks(save_dir=os.path.join(experiment_path, 'fold_%02d' % idx))

    model.fit(X_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(X_valid, y_valid),
              shuffle=True,
              callbacks=callbacks, verbose=1)

    # Restore the best checkpoint written by ModelCheckpoint for this fold.
    model.load_weights(filepath=os.path.join(experiment_path, ('fold_%02d/model/model_weights.hdf5' % idx)))

    proba = model.predict(X_train, batch_size=batch_size*2)
    proba_val = model.predict(X_valid, batch_size=batch_size*2)
    proba_test = model.predict(x_test, batch_size=batch_size*2)

    models_roc.append(roc_auc_score(y_valid, proba_val))
    models_train_roc.append(roc_auc_score(y_train, proba))

    # Each training row is in exactly one validation fold.
    train_probas[val_indices] += proba_val
    test_probas += proba_test / n_folds

    # Running statistics over the folds finished so far.
    print('Train ROC AUC:\nMean: %f\nStd: %f\nMin: %f\nMax: %f\n\n' % (np.mean(models_train_roc),
                                                                 np.std(models_train_roc),
                                                                 np.min(models_train_roc),
                                                                 np.max(models_train_roc)))

    print('Val ROC AUC:\nMean: %f\nStd: %f\nMin: %f\nMax: %f\n\n' % (np.mean(models_roc),
                                                                 np.std(models_roc),
                                                                 np.min(models_roc),
                                                                 np.max(models_roc)))


# Persist out-of-fold train predictions and the averaged test predictions.
for i, cls_name in enumerate(class_names):
    train_submission[cls_name] = train_probas[:, i]
train_submission.to_csv('train_%s.csv' % STAMP, index=False)

for i, cls_name in enumerate(class_names):
    submission[cls_name] = test_probas[:, i]
submission.to_csv('submission_%s.csv' % STAMP, index=False)