# Train using attention model

Reference: https://www.kaggle.com/truocpham/oob-cuda-gru-attention


In [46]:
import pandas as pd
from itertools import chain
import keras

#### BEGIN Attention Model ####
from keras.models import Sequential,Model
from keras.layers import *
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints
#### END Attention Model ####


import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold

import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

# Read Dataset

In [3]:
filename_train = './datasets/train.json'
filename_test = './datasets/test.json'

train = pd.read_json(filename_train)
test = pd.read_json(filename_test)

In [3]:
train.shape

(1195, 5)

In [4]:
train.head()

Unnamed: 0,audio_embedding,end_time_seconds_youtube_clip,is_turkey,start_time_seconds_youtube_clip,vid_id
0,"[[172, 34, 216, 110, 208, 46, 95, 66, 161, 125...",70,0,60,kDCk3hLIVXo
1,"[[169, 20, 165, 102, 205, 62, 110, 103, 211, 1...",40,1,30,DPcGzqHoo7Y
2,"[[148, 8, 138, 60, 237, 48, 121, 108, 145, 177...",240,1,230,7yM63MTHh5k
3,"[[151, 0, 162, 88, 171, 71, 47, 90, 179, 190, ...",520,1,510,luG3RmUAxxM
4,"[[162, 17, 187, 111, 211, 105, 92, 67, 203, 15...",10,0,0,PIm3cjxTpOk


# Retrive Audio Embedding and Flatten it into 1D

In [5]:
embeddings = pd.DataFrame(list(train['audio_embedding'].map(lambda x: chain.from_iterable(x))))

In [6]:
embeddings.shape

(1195, 1280)

In [7]:
embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1270,1271,1272,1273,1274,1275,1276,1277,1278,1279
0,172,34,216,110,208,46,95,66,161,125,...,0.0,135.0,133.0,151.0,0.0,3.0,206.0,101.0,104.0,255.0
1,169,20,165,102,205,62,110,103,211,187,...,0.0,0.0,119.0,205.0,27.0,151.0,226.0,44.0,0.0,255.0
2,148,8,138,60,237,48,121,108,145,177,...,0.0,62.0,79.0,204.0,0.0,74.0,243.0,255.0,95.0,255.0
3,151,0,162,88,171,71,47,90,179,190,...,0.0,255.0,207.0,52.0,178.0,129.0,186.0,0.0,0.0,255.0
4,162,17,187,111,211,105,92,67,203,152,...,62.0,224.0,15.0,172.0,0.0,2.0,255.0,144.0,34.0,255.0


# Check whether NULL Cell exist

In [8]:
embeddings.isnull().any().any()

True

In [9]:
embeddings = embeddings.fillna(-1)

# Define Model

# Prepare Train Dataset

In [4]:
xtrain = [k for k in train['audio_embedding']]
test_data = test['audio_embedding'].tolist()

In [5]:
print(len(xtrain))
print(len(test_data))

1195
1196


In [7]:
ytrain = train['is_turkey'].values

# Check the Min/Max Dimension

In [12]:
max([len(features) for features in xtrain])

10

In [13]:
min([len(features) for features in xtrain])

2

# Pad audio features

In [16]:
x_train = keras.preprocessing.sequence.pad_sequences(xtrain, maxlen=10)

In [17]:
x_train.shape

(1195, 10, 128)

In [19]:
x_train.max()

255

In [20]:
x_train.min()

0

In [21]:
type(x_train)

numpy.ndarray

In [27]:
np.isnan(x_train).any()

False

# Convert ytrain to np.array

In [28]:
y_train = np.array(ytrain)

In [29]:
y_train.shape

(1195,)

# Delcare Model

In [39]:
# https://www.kaggle.com/qqgeogor/keras-lstm-attention-glove840b-lb-0-043
class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        if mask is not None:
            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

In [51]:
# def get_model(input_dim, hidden_size=64, fc1_size=10, output_size=1, lr=1e-3):
    
#     # output shape = (None, seq_length, feature_size)
#     inputs = keras.layers.Input(input_dim[1:])

#     # output shape = (None, seq_length, hidden_size)
#     x_rnn = keras.layers.GRU(units=hidden_size)(inputs)
    
#     # output shape = (None, fc1_size)
#     x_attention = Attention(fc1_size)(x_rnn)
    
#     # output shape = (None, 1)
#     outputs = keras.layers.Dense(output_size, activation='sigmoid')(x_attention)
    
#     model = keras.models.Model(inputs=inputs, outputs=outputs)
    
#     optimizer = keras.optimizers.Adam(lr=lr)
    
#     model.compile(loss='binary_crossentropy',
#                   optimizer=optimizer,
#                  metrics=['accuracy'])
#     return model

def get_model():
    model = Sequential()
    model.add(BatchNormalization(input_shape=(10, 128)))
#     model.add(Bidirectional(RNN(64, activation='relu', return_sequences=True)))
    model.add(Bidirectional(GRU(128, dropout=0.4, recurrent_dropout=0.4, activation='relu', return_sequences=True)))
    model.add(Attention(10))
    model.add(Dense(1,activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


input_dim = x_train.shape
# model = get_model(input_dim)
model = get_model()


model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_2 (Batch (None, 10, 128)           512       
_________________________________________________________________
bidirectional_2 (Bidirection (None, 10, 256)           197376    
_________________________________________________________________
attention_4 (Attention)      (None, 256)               266       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 198,411
Trainable params: 198,155
Non-trainable params: 256
_________________________________________________________________


# 10-Fold Training

In [52]:

scores = []

# sklearn.model_selection.KFold
kf = StratifiedKFold(n_splits=10,
          shuffle=True,
          random_state=42069)

for index_fold, (index_train, index_val) in enumerate(kf.split(x_train, y_train)):
    x_train_f = x_train[index_train]
    y_train_f = y_train[index_train]
    
    x_val_f = x_train[index_val]
    y_val_f = y_train[index_val]
    
    # train
    model.fit(x_train_f, y_train_f, batch_size=256, epochs=16, verbose=1, validation_data=(x_val_f, y_val_f))
    
    print()
    
    # compute ROC-AUC score
    pred_val = model.predict([x_val_f], batch_size=512)
    
    score_auc = sklearn.metrics.roc_auc_score(y_val_f, pred_val)
    scores.append(score_auc)
    
    print('Fold {}: score={}'.format(index_fold, score_auc))
    
    
print('Average AUC score: {}'.format(np.mean(scores)))

Train on 1074 samples, validate on 121 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16

Fold 0: score=0.9901408450704225
Train on 1075 samples, validate on 120 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16

Fold 1: score=1.0
Train on 1075 samples, validate on 120 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16

Fold 2: score=1.0
Train on 1075 samples, validate on 120 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16


Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16

Fold 3: score=0.9994251221615407
Train on 1076 samples, validate on 119 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16

Fold 4: score=1.0
Train on 1076 samples, validate on 119 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16

Fold 5: score=1.0
Train on 1076 samples, validate on 119 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16

Fold 6: score=1.0
Train on 1076 samples, validate on 119 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16


Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16

Fold 7: score=1.0
Train on 1076 samples, validate on 119 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16

Fold 8: score=1.0
Train on 1076 samples, validate on 119 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16

Fold 9: score=1.0
Average AUC score: 0.9989565967231963
