In [1]:
#############################################################################
# 0. Libraries

import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import pandas_profiling as pp

from tqdm import tqdm
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from sklearn.metrics import roc_auc_score 

import tensorflow as tf
tf.keras.backend.clear_session()

physical_devices = tf.config.list_physical_devices('GPU')

try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
except:
    print('Invalid device or cannot modify virtual devices once initialized.')
    
from tensorflow.keras.utils import Sequence
from tensorflow.keras import models, layers, regularizers, constraints, metrics, losses, optimizers

#############################################################################

## References

- https://arxiv.org/abs/2010.12042

In [2]:
#############################################################################
# 1. Load Data

path = '../01_Data/01_Raw/'
path_output = '../01_Data/02_GeneratedData/'


#############################################################################

In [3]:
#############################################################################
# 2. Global Variables

SEQ_LENGTH = 100

#############################################################################

In [4]:
#############################################################################
# 3. Load Data

path_input = path_output + 'train/'

X_past_questions = np.load(path_input + 'X_past_questions.npy')
X_past_responses = np.load(path_input + 'X_past_responses.npy')
X_past_times = np.load(path_input + 'X_past_times.npy')
X_agg_content_features = np.load(path_input + 'X_agg_content_features.npy')
X_agg_user_features = np.load(path_input + 'X_agg_user_features.npy')
X_curr_question_ids = np.load(path_input + 'X_curr_question_ids.npy')
X_curr_question_tags = np.load(path_input + 'X_curr_question_tags.npy')
X_target = np.load(path_input + 'X_target.npy')

unique_users = np.load(path_input + 'unique_users.npy', allow_pickle=True)
dict_users = np.load(path_input + 'dict_users.npy', allow_pickle=True)
dict_users_inv = np.load(path_input + 'dict_users_inv.npy', allow_pickle=True)

unique_users = unique_users.flatten()[0]
dict_users = dict_users.flatten()[0]
dict_users_inv = dict_users_inv.flatten()[0]

#############################################################################

In [5]:
#############################################################################
# 4. Data Generator

class RIIDDataGenerator(Sequence):
    
    def __init__(self, users, dict_users, batch_size, training=True):
        super(RIIDDataGenerator, self).__init__()
        
        self.users = users
        self.dict_users = dict_users
        self.batch_size = batch_size
        self.training = training
        self.on_epoch_end()
        
    def __len__(self):
        self.num_steps = int(np.ceil(len(self.users) / self.batch_size))
        return self.num_steps
        
    def __getitem__(self, idx):
        indexes = self.indexes[idx*self.batch_size:(idx+1)*self.batch_size]
        list_batch_users = [self.users[k] for k in indexes]
        positions = [self.dict_users[u] for u in list_batch_users]
        
        batch = (X_past_questions[positions], X_past_responses[positions], X_past_times[positions],
                 X_agg_content_features[positions], X_agg_user_features[positions], X_curr_question_ids[positions],
                 X_curr_question_tags[positions])
        
        if self.training:
            return batch, X_target[positions]
        else:
            return batch
    
    def on_epoch_end(self):
        self.indexes = np.arange(len(self.users))
        np.random.shuffle(self.indexes)
        
        
    def generateOrderedSequences(self, list_users):
        pass
    
#############################################################################

In [6]:
# X_generator = RIIDDataGenerator(list(unique_users), dict_users, batch_size=8, training=True)

# for batch in X_generator:
#     break
    
# len(batch[0]), batch[0][0].shape, batch[1].shape

In [7]:
#############################################################################
# 5. Models

class ReturnBestEarlyStopping(tf.keras.callbacks.EarlyStopping):
    def __init__(self, **kwargs):
        super(ReturnBestEarlyStopping, self).__init__(**kwargs)

    def on_train_end(self, logs=None):
        if self.stopped_epoch > 0:
            if self.verbose > 0:
                print(f'\nEpoch {self.stopped_epoch + 1}: early stopping')
        elif self.restore_best_weights:
            if self.verbose > 0:
                print('Restoring model weights from the end of the best epoch.')
            self.model.set_weights(self.best_weights)
            

class CustomEmbedding(models.Model):
    def __init__(self, input_dim, output_dim, mask_value=-1, **kwargs):
        super(CustomEmbedding, self).__init__(**kwargs, name='CustomEmbedding')
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.mask_value = mask_value

    def build(self, input_shape):
        self.embeddings = self.add_weight(
            shape=(self.input_dim, self.output_dim),
            initializer="random_normal",
            dtype="float32")

    def call(self, inputs):
        return tf.nn.embedding_lookup(self.embeddings, inputs)

    def compute_mask(self, inputs, mask=None):
        if not self.mask_value:
            return 0
        return tf.not_equal(inputs, self.mask_value)
    

class CustomEmbeddingNormLayer(models.Model):
    def __init__(self, input_dim, output_dim, rate=0.1, mask_value=-1, **kwargs):
        super(CustomEmbeddingNormLayer, self).__init__(**kwargs, name='CustomEmbeddingNormLayer')

        self.embedding = CustomEmbedding(input_dim=input_dim, output_dim=output_dim, mask_value=mask_value)
        self.dropout = layers.Dropout(rate)
        self.layernorm = layers.LayerNormalization(epsilon=1e-6)
            

    def call(self, inputs, training):
        
        x = self.embedding(inputs)
        x = self.dropout(x, training=training)

        return x   

class MultiHeadAttention(models.Model):
    def __init__(self, d_model, num_heads, **kwargs):
        super(MultiHeadAttention, self).__init__(**kwargs, name='MultiHeadAttention')
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = layers.Dense(d_model)
        self.wk = layers.Dense(d_model)
        self.wv = layers.Dense(d_model)

        self.dense = layers.Dense(d_model)


    def split_heads(self, x, batch_size):

        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])
    

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)

        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        scaled_attention, attention_weights = self.scaled_dot_product_attention((q, k, v, mask))

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

        concat_attention = tf.reshape(scaled_attention, 
                                      (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

        return output, attention_weights
    
    def scaled_dot_product_attention(self, inputs):
        q, k, v, mask = inputs
        """Calculate the attention weights.
        q, k, v must have matching leading dimensions.
        k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
        The mask has different shapes depending on its type(padding or look ahead) 
        but it must be broadcastable for addition.

        Args:
        q: query shape == (..., seq_len_q, depth)
        k: key shape == (..., seq_len_k, depth)
        v: value shape == (..., seq_len_v, depth_v)
        mask: Float tensor with shape broadcastable 
              to (..., seq_len_q, seq_len_k). Defaults to None.

        Returns:
        output, attention_weights
        """

        matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

        # scale matmul_qk
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        # add the mask to the scaled tensor.
        if mask is not None:
            scaled_attention_logits += (mask * -1e9)  

        # softmax is normalized on the last axis (seq_len_k) so that the scores
        # add up to 1.
        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

        output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

        return output, attention_weights
    
    
class BaseModel(models.Model):
    def __init__(self, d_model, num_heads, **kwargs):
        super(BaseModel, self).__init__(**kwargs, name='BaseModel')
        
#         self.embed_question_id = CustomEmbedding(input_dim=13_523, output_dim=128, mask_value=-1)
#         self.embed_question_parts = CustomEmbedding(input_dim=7, output_dim=128, mask_value=-1)
#         self.embed_question_tags = CustomEmbedding(input_dim=189, output_dim=128, mask_value=-1)
        
#         self.embed_time_elapsed = CustomEmbedding(input_dim=300, output_dim=128, mask_value=-1)
#         self.embed_time_lag = CustomEmbedding(input_dim=1440, output_dim=128, mask_value=-1)

        self.embed_question_id = CustomEmbeddingNormLayer(input_dim=13_523, output_dim=128, mask_value=-1)
        self.embed_question_parts = CustomEmbeddingNormLayer(input_dim=7, output_dim=128, mask_value=-1)
        self.embed_question_correct_answer = CustomEmbeddingNormLayer(input_dim=4, output_dim=128, mask_value=-1)
        self.embed_question_tags = CustomEmbeddingNormLayer(input_dim=189, output_dim=128, mask_value=-1)
        
        self.embed_time_elapsed = CustomEmbeddingNormLayer(input_dim=300, output_dim=128, mask_value=-1)
        self.embed_time_lag = CustomEmbeddingNormLayer(input_dim=1440, output_dim=128, mask_value=-1)
        
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)
        
        self.layernorm4 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm5 = layers.LayerNormalization(epsilon=1e-6)
        
#         self.gru_questions = layers.GRU(32, return_sequences=False, dropout=0.2, kernel_regularizer=regularizers.l2(1e-4))
#         self.gru_responses = layers.GRU(32, return_sequences=False, dropout=0.2, kernel_regularizer=regularizers.l2(1e-4))
        self.drop_1 = layers.Dropout(0.4)
        self.dense = layers.Dense(64, activation=None, kernel_regularizer=regularizers.l2(1e-4))
        self.batch_norm = layers.BatchNormalization()
        self.drop_2 = layers.Dropout(0.3)
        
        self.dense_output = layers.Dense(1, activation='sigmoid')
        self.mask_questions = layers.Lambda(lambda x: tf.not_equal(x, -1))
        self.mask_responses = layers.Lambda(lambda x: tf.not_equal(x, -1))

        self.mha1 = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.mha2 = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        
        self.mask = layers.Lambda(lambda x: tf.not_equal(x, -1))
        
        
    def call(self, inputs, training):
        batch_past_questions, batch_past_responses, batch_past_times, \
        batch_agg_content_features, batch_agg_user_features, batch_curr_question_ids, \
        batch_curr_question_tags = inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], inputs[5], inputs[6]
        
        ## Past
        
        # Questions
        batch_past_question_id = self.embed_question_id(tf.cast(batch_past_questions[:, :, 0], tf.int64), training)
        batch_past_question_part = self.embed_question_parts(tf.cast(batch_past_questions[:, :, 1], tf.int32), training)
        batch_past_question_correct_answer = self.embed_question_correct_answer(tf.cast(batch_past_questions[:, :, 2], 
                                                                                        tf.int32), 
                                                                       training)
        
        batch_past_question_tags = self.embed_question_tags(tf.cast(batch_past_questions[:, :, 3:], tf.int32), training)     
        
        # Responses
        batch_past_user_answered_correct = tf.expand_dims(batch_past_responses[:, :, 0], -1)
        batch_past_user_answered_answer = self.embed_question_correct_answer(
                                               tf.cast(batch_past_responses[:, :, 1], tf.int32), training)
        
        # Times
        batch_past_times_elapsed = self.embed_time_elapsed(tf.cast(batch_past_times[:, :, 0], tf.int32), training)
        batch_past_times_lag = self.embed_time_lag(tf.cast(batch_past_times[:, :, 1], tf.int32), training)
         
        ## Current
        
        batch_curr_question_id = self.embed_question_id(tf.cast(batch_curr_question_ids[:, 0], tf.int64))
        batch_curr_question_part = self.embed_question_parts(tf.cast(batch_curr_question_ids[:, 1], tf.int32), training)
        batch_curr_question_tags = self.embed_question_tags(tf.cast(batch_curr_question_tags, tf.int32), training)
                
        ###
        
        embed_past_tags = tf.reduce_sum(batch_past_question_tags, axis=2)
        past_questions_embed = (batch_past_question_id + batch_past_question_part + 
                                batch_past_question_correct_answer + embed_past_tags)
        past_questions_embed = self.layernorm1(past_questions_embed)

        past_response_embed = (batch_past_user_answered_correct + batch_past_user_answered_answer + 
                               batch_past_times_elapsed + batch_past_times_lag)
        past_response_embed = self.layernorm2(past_response_embed)
     
        embed_curr_tags = tf.reduce_sum(batch_curr_question_tags, axis=1)
        curr_question_embed = (batch_curr_question_id + batch_curr_question_part + embed_curr_tags)
        curr_question_embed = self.layernorm3(curr_question_embed)
        
        mask = self.createPaddingMask(tf.cast(batch_past_questions[:, :, 0], tf.int64), mask_value=-1)
        
        past_questions_x, _ = self.mha1(v=past_questions_embed, q=past_questions_embed, k=past_questions_embed, mask=mask)
        past_response_x, _ = self.mha2(v=past_response_embed, q=past_response_embed, k=past_response_embed, mask=mask)
        
        past_questions_x = self.layernorm4(past_questions_embed + past_questions_x)
        past_response_x = self.layernorm5(past_response_embed + past_response_x)
        
        x_past_question = layers.GlobalAveragePooling1D()(past_questions_x)
        x_past_response = layers.GlobalAveragePooling1D()(past_response_x)
        
#         print(x_past_question.shape, x_past_response.shape, curr_question_embed.shape)
        
        x = tf.concat([x_past_question, x_past_response, curr_question_embed], axis=-1)
        
#         print(x.shape)
        
        x = self.drop_1(x, training=training)
        x = self.dense(x)
        x = self.batch_norm(x, training=training)
        x = tf.nn.relu(x)
        x = self.drop_2(x, training=training)
        
        prediction = self.dense_output(x)
        
        return prediction
    
    def createPaddingMask(self, seq, mask_value=-1):
        seq = tf.cast(tf.math.equal(seq, mask_value), tf.float32)
        return seq[:, tf.newaxis,  tf.newaxis, :]
        

def buildmodel(d_model, num_heads, summary=False):

    basemodel = BaseModel(d_model, num_heads)
    
    in_1 = layers.Input(shape=(SEQ_LENGTH, 3+7))
    in_2 = layers.Input(shape=(SEQ_LENGTH, 2))
    in_3 = layers.Input(shape=(SEQ_LENGTH, 2))
    
    in_4 = layers.Input(shape=(3))
    in_5 = layers.Input(shape=(3))
    
    in_6 = layers.Input(shape=(2))
    in_7 = layers.Input(shape=(7))
    
    x = basemodel((in_1, in_2, in_3, in_4, in_5, in_6, in_7))
    
    model = models.Model(inputs=[in_1, in_2, in_3, in_4, in_5, in_6, in_7],
                         outputs=[x])
    
    model.compile(optimizer=optimizers.Adam(learning_rate=8e-4), loss=losses.binary_crossentropy,
                 metrics=[metrics.AUC(), 'acc'])
    
    if summary:
        print(model.summary())
    
    return model
        
#############################################################################   

In [8]:
#############################################################################
# 6. Training

batch_size = 16
d_model = 128
num_heads= 4

callback_early_stopping = ReturnBestEarlyStopping(monitor='val_loss', patience=10, verbose=1, restore_best_weights=True)
list_callbacks = [callback_early_stopping]

kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=12)
list_history, list_models = [], []

for num_fold, (train_index, val_index) in enumerate(kf.split(list(unique_users),
                                                             X_target)):
    
    users_train_fold = np.asarray(list(unique_users))[train_index]
    users_val_fold = np.asarray(list(unique_users))[val_index]
    
    X_train_generator = RIIDDataGenerator(list(users_train_fold), dict_users, batch_size=batch_size, training=True)
    
    X_val_generator = RIIDDataGenerator(list(users_val_fold), dict_users, batch_size=batch_size, training=True)
    
    print(f'Num Fold: {num_fold + 1}')
    print(f'Train users: {len(train_index)} Val users: {len(val_index)}')
    
    model = buildmodel(d_model, num_heads, summary=False)
    
    history = model.fit(X_train_generator,
                        validation_data=X_val_generator,
                        epochs=5,
                        callbacks=list_callbacks,
                        verbose=1)
    
    print('***'*20)
    list_history.append(history)
    list_models.append(model)
    print('***'*20)
    
# tf.keras.models.save_model(list_models[0], './model/')

#############################################################################

Num Fold: 1
Train users: 6169 Val users: 1543
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Restoring model weights from the end of the best epoch.
************************************************************
************************************************************
Num Fold: 2
Train users: 6169 Val users: 1543
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Restoring model weights from the end of the best epoch.
************************************************************
************************************************************
Num Fold: 3
Train users: 6170 Val users: 1542
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Restoring model weights from the end of the best epoch.
************************************************************
************************************************************
Num Fold: 4
Train users: 6170 Val users: 1542
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Restoring model weights from the end of the best epoch.
**************************

In [10]:
# m = BaseModel()
num_iters = 1_000
list_score = []
for i, batch in enumerate(X_val_generator):
#     score = roc_auc_score(batch[1], list_models[0].predict(batch[0]))
    score = roc_auc_score(batch[1], list_models[0].predict(batch[0]))
    list_score.append(score)
#     print(score)
    if i>=num_iters:
        break
print('***'*10)
print(np.mean(list_score))

******************************
0.9312166463261825
