In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)



In [2]:
import gc
import random
from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
import datatable as dt

In [3]:
from Transformer_torch import TransformerModel
from Early_Stopping import EarlyStopping
from LRSechduler import NoamOpt

In [4]:
# Columns requested from the training CSV. Only the keys are consumed below
# (as the column selection handed to fread); the dtype strings themselves are
# not passed to the reader in this cell.
data_types_dict = dict(
    row_id='int32',
    content_type_id='bool',
    timestamp='int64',
    user_id='int32',
    content_id='int16',
    task_container_id='int16',
    answered_correctly='int8',
    prior_question_elapsed_time='int64',
)
# Name of the label column.
target = 'answered_correctly'
# Read just the selected columns with datatable, then hand over to pandas.
train_df = dt.fread('./data/sample_train.csv', columns=set(data_types_dict)).to_pandas()

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   row_id                       10000 non-null  int32  
 1   timestamp                    10000 non-null  int64  
 2   user_id                      10000 non-null  int32  
 3   content_id                   10000 non-null  int32  
 4   content_type_id              10000 non-null  bool   
 5   task_container_id            10000 non-null  int32  
 6   answered_correctly           10000 non-null  int32  
 7   prior_question_elapsed_time  9742 non-null   float64
dtypes: bool(1), float64(1), int32(5), int64(1)
memory usage: 361.5 KB


In [6]:
train_df

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time
0,0,0,115,5692,False,1,1,
1,1,56943,115,5716,False,2,1,37000.0
2,2,118363,115,128,False,0,1,55000.0
3,3,131167,115,7860,False,3,1,19000.0
4,4,137965,115,7922,False,4,1,11000.0
...,...,...,...,...,...,...,...,...
9995,9995,2868187305,91216,1124,False,775,0,18000.0
9996,9996,2868272689,91216,810,False,776,1,18000.0
9997,9997,2868367298,91216,1245,False,777,1,17000.0
9998,9998,2868439137,91216,711,False,778,1,17000.0


## Data Preprocessing

In [7]:
# Keep only the rows whose boolean content_type_id flag is False.
train_df = train_df[~train_df['content_type_id']]

In [8]:
# Order interactions chronologically within each user and renumber the rows.
train_df = (
    train_df
    .sort_values(by=['user_id', 'timestamp'])
    .reset_index(drop=True)
)

In [9]:
# The flag column is constant after the filter above, so it is dropped.
train_df = train_df.drop(columns=['content_type_id'])

In [10]:
train_df

Unnamed: 0,row_id,timestamp,user_id,content_id,task_container_id,answered_correctly,prior_question_elapsed_time
0,0,0,115,5692,1,1,
1,1,56943,115,5716,2,1,37000.0
2,2,118363,115,128,0,1,55000.0
3,3,131167,115,7860,3,1,19000.0
4,4,137965,115,7922,4,1,11000.0
...,...,...,...,...,...,...,...
9758,9995,2868187305,91216,1124,775,0,18000.0
9759,9996,2868272689,91216,810,776,1,18000.0
9760,9997,2868367298,91216,1245,777,1,17000.0
9761,9998,2868439137,91216,711,778,1,17000.0


In [11]:
train_df["content_id"].max()

13522

In [12]:
# Use whichever is larger: the highest content_id or the number of distinct
# content_ids. (On a tie the highest id is kept, as before.)
total_ex = max(train_df["content_id"].max(), train_df["content_id"].nunique())
print("number skills", total_ex)

number skills 13522


In [13]:
# Use whichever is larger: the highest task_container_id or the number of
# distinct task_container_ids. (On a tie the highest id is kept, as before.)
total_task = max(train_df["task_container_id"].max(), train_df["task_container_id"].nunique())
print("number tasks", total_task)

number tasks 4889


In [14]:
# Attach each question's `part` to its interaction rows (left join on the
# question id), then discard the helper key and the lookup table.
questions = pd.read_csv('./data/questions.csv')
train_df = train_df.merge(
    questions[['question_id', 'part']],
    left_on='content_id',
    right_on='question_id',
    how='left',
)
train_df = train_df.drop(columns=['question_id'])
del questions

# Every remaining NaN (e.g. a missing prior_question_elapsed_time) becomes 0.
train_df = train_df.fillna(0)
total_cat = train_df['part'].nunique()

# One entry per user: a 6-tuple of aligned arrays, in the order
# (content_id, part, timestamp, task_container_id,
#  prior_question_elapsed_time, answered_correctly).
group = (
    train_df[['user_id', 'content_id', 'answered_correctly', 'part', 'timestamp',
              'task_container_id', 'prior_question_elapsed_time']]
    .groupby('user_id')
    .apply(lambda r: tuple(
        r[column].values
        for column in ('content_id', 'part', 'timestamp', 'task_container_id',
                       'prior_question_elapsed_time', 'answered_correctly')
    ))
)



In [15]:
group

user_id
115      ([5692, 5716, 128, 7860, 7922, 156, 51, 50, 78...
124      ([7900, 7876, 175, 1278, 2064, 2063, 2065, 336...
2746     ([5273, 758, 5976, 236, 404, 382, 405, 873, 53...
5382     ([5000, 3944, 217, 5844, 5965, 4990, 5235, 605...
8623     ([3915, 4750, 6456, 3968, 6104, 5738, 6435, 54...
8701     ([3901, 6671, 4963, 6143, 8279, 3964, 4002, 75...
12741    ([5145, 9691, 9697, 5202, 4787, 5695, 7858, 56...
13134    ([3926, 564, 3865, 4231, 3684, 3988, 3968, 521...
24418    ([7900, 7876, 175, 1278, 2065, 2064, 2063, 336...
24600    ([7900, 7876, 175, 1278, 2064, 2065, 2063, 336...
32421    ([7900, 7876, 175, 1278, 2063, 2065, 2064, 336...
40828    ([7900, 7876, 175, 1278, 2065, 2063, 2064, 336...
44331    ([5542, 5697, 5748, 376, 5597, 6099, 4231, 689...
45001    ([7900, 7876, 175, 1278, 2065, 2063, 2064, 336...
46886    ([5059, 1207, 5880, 4777, 6417, 6407, 4527, 59...
50132    ([4485, 4913, 9261, 5681, 5165, 3903, 6411, 53...
51285    ([5692, 4512, 3566, 5811, 5132, 4466, 5

## Hyperparameters

In [16]:
# When True, the training cell uses 3 epochs and the debug checkpoint path;
# otherwise 60 epochs and the regular checkpoint path.
DEBUG = True

# Fraction of the per-user groups held out for validation by train_test_split.
TEST_SIZE = 0.05

# Chunk / padding length of a user sequence in TransformerDataset. Because
# __getitem__ shifts inputs against labels by one step, each batch carries
# MAX_SEQ - 1 positions.
MAX_SEQ = 97
# Minimum number of interactions a user (or a trailing chunk) needs to be kept.
ACCEPTED_USER_CONTENT_SIZE = 10
# Passed to TransformerModel as embed_dim and to NoamOpt as d_model.
EMBED_SIZE = 64
# Batch size of both the training and the validation DataLoader.
BATCH_SIZE = 64
# The following five values are forwarded to TransformerModel
# (dropout, heads_en, heads_de, enc_layers, dec_layers).
DROPOUT = 0.1
HEADS_EN = 4
HEADS_DE =4
ENC_LAYERS = 3
DEC_LAYERS = 3


# Adam settings handed to torch.optim.Adam inside get_std_opt.
# NOTE(review): the LR printed after each epoch (~1e-06) is far below this
# value, which suggests NoamOpt replaces it with its own schedule -- confirm.
LR = 0.0005
BETA1 = 0.9
BETA2 = 0.999
EPISLON = 1e-8
# Warm-up step count passed to NoamOpt.
warmup_steps = 4000
# Patience passed to EarlyStopping.
PATIENCE = 10

# Size arguments forwarded positionally to TransformerModel.
# total_lag: the dataset caps lag values at 3000, so 3001 presumably covers
#            the values 0..3000 -- TODO confirm against the model.
# total_p:   presumably the vocabulary size for prior_question_elapsed_time;
#            verify how the model scales that feature.
# total_in:  presumably the number of response classes (0/1) -- TODO confirm.
total_lag = 3001
total_p = 301
total_in = 2

## Data Loader

In [17]:
class TransformerDataset(Dataset):
    """Per-user interaction sequences, chunked and left-padded for the model.

    ``group`` is indexed by user id; each entry is a 6-tuple of aligned arrays
    ``(content_id, part, timestamp, task_container_id,
    prior_question_elapsed_time, answered_correctly)``.

    Users with fewer than ``ACCEPTED_USER_CONTENT_SIZE`` interactions are
    skipped. Users with more than ``max_seq`` interactions are cut into
    consecutive chunks of ``max_seq``; a trailing remainder is kept only if it
    also has at least ``ACCEPTED_USER_CONTENT_SIZE`` interactions.

    Args:
        group: mapping-like object with an ``index`` of user ids, as described
            above.
        max_seq: chunk length and padded length of every sample.
        verbose: when True (the default, matching the previous behavior) a
            progress line is printed for every user; pass False to silence it.
    """

    def __init__(self, group,  max_seq=MAX_SEQ, verbose=True):
        super(TransformerDataset, self).__init__()
        self.samples, self.max_seq = {}, max_seq
        self.user_ids = []

        for user_no, user_id in enumerate(group.index):
            if verbose:
                print(f'Processed {user_no} users')
            (content_id, part, timestamp, task_container_id,
             prior_question_elapsed_time, answered_correctly) = group[user_id]

            total_questions = len(content_id)
            if total_questions < ACCEPTED_USER_CONTENT_SIZE:
                # Too few interactions to form a useful sequence.
                continue

            features = (content_id, part, self._lag_buckets(timestamp), task_container_id,
                        prior_question_elapsed_time, answered_correctly)

            if total_questions > self.max_seq:
                last_pos = total_questions // self.max_seq
                for seq in range(last_pos):
                    start = seq * self.max_seq
                    end = start + self.max_seq
                    self._add_sample(f"{user_id}_{seq}", tuple(f[start:end] for f in features))
                # Whatever is left after the last full chunk.
                tail_start = last_pos * self.max_seq
                if total_questions - tail_start >= ACCEPTED_USER_CONTENT_SIZE:
                    self._add_sample(f"{user_id}_{last_pos + 1}",
                                     tuple(f[tail_start:] for f in features))
            else:
                self._add_sample(f'{user_id}', features)

    @staticmethod
    def _lag_buckets(timestamp):
        """Turn raw timestamps into bucketed lag values (int32 array).

        Steps, equivalent to the element-wise loops this replaces:
        1. lag[i] = timestamp[i] - timestamp[i-1]; position 0 wraps around to
           the last timestamp because of ``np.roll``.
        2. Negative lags are replaced by 3.
        3. A zero lag at position i > 0 takes the (already processed) value of
           position i-1, i.e. zeros are forward-filled.
        4. Lags are divided by 60000 and truncated to integers.
        5. Values above 3000 are capped at 3000; values above 10 are rounded
           down to a multiple of 10.
        """
        stamps = np.asarray(timestamp)
        lag = (stamps - np.roll(stamps, 1)).astype('int64')
        lag[lag < 0] = 3
        # Forward-fill zeros: every position points at the most recent
        # non-zero position (or at position 0 if there is none yet).
        source = np.where(lag != 0, np.arange(len(lag)), 0)
        lag = lag[np.maximum.accumulate(source)]

        minutes = (lag / 60000).astype('int64')
        minutes = np.where(
            minutes > 3000,
            3000,
            np.where(minutes > 10, (minutes // 10) * 10, minutes),
        )
        return minutes.astype('int32')

    def _add_sample(self, key, features):
        """Register one sample under ``key`` and remember its position."""
        self.user_ids.append(key)
        self.samples[key] = features

    def _left_pad(self, values):
        """Return an int array of length ``max_seq`` holding the last
        ``max_seq`` entries of ``values``, right-aligned and zero-padded on
        the left."""
        padded = np.zeros(self.max_seq, dtype=int)
        recent = values[-self.max_seq:]
        padded[-len(recent):] = recent
        return padded

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, index):
        """Return ``(target_id, part_id, task_id, lag_time_id, p_time_id, res, label)``.

        Every array has length ``max_seq - 1``. The first five and ``label``
        are the padded features from position 1 onward; ``res`` is the padded
        answer sequence up to the second-to-last position, so ``res[t]`` is the
        answer given one step before ``label[t]``.
        """
        content_id, part, time, task, p_time, answered_correctly = self.samples[self.user_ids[index]]

        content_id_seq = self._left_pad(content_id)
        part_id_seq = self._left_pad(part)
        time_fea = self._left_pad(time)
        task_seq = self._left_pad(task)
        p_time_seq = self._left_pad(p_time)
        answered_correctly_seq = self._left_pad(answered_correctly)

        return (
            content_id_seq[1:],
            part_id_seq[1:],
            task_seq[1:],
            time_fea[1:],
            p_time_seq[1:],
            answered_correctly_seq[:-1],
            answered_correctly_seq[1:],
        )
    

In [18]:
# Seed the split so the same users land in train / validation on every
# Restart & Run All; without it the partition changes from run to run.
SPLIT_SEED = 42
train, val = train_test_split(group, test_size=TEST_SIZE, random_state=SPLIT_SEED)

In [19]:
# Build the training samples and wrap them in a shuffling loader; the raw
# split is released afterwards.
train_dataset = TransformerDataset(train, max_seq=MAX_SEQ)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=8,
)
del train

Processed 0 users
Processed 1 users
Processed 2 users
Processed 3 users
Processed 4 users
Processed 5 users
Processed 6 users
Processed 7 users
Processed 8 users
Processed 9 users
Processed 10 users
Processed 11 users
Processed 12 users
Processed 13 users
Processed 14 users
Processed 15 users
Processed 16 users
Processed 17 users
Processed 18 users


In [20]:
BATCH_SIZE

64

In [21]:
# Validation samples are served in a fixed order. val_epoch averages per-batch
# mean losses, so reshuffling would change batch composition and make the
# reported validation loss vary for identical weights.
val_dataset = TransformerDataset(val, max_seq=MAX_SEQ)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=8)
del val

Processed 0 users
Processed 1 users


In [22]:
# Pull one batch and show the shape of each of its seven tensors.
sample_batch = next(iter(train_dataloader))
tuple(feature.shape for feature in sample_batch[:7])


(torch.Size([64, 96]),
 torch.Size([64, 96]),
 torch.Size([64, 96]),
 torch.Size([64, 96]),
 torch.Size([64, 96]),
 torch.Size([64, 96]),
 torch.Size([64, 96]))

## Loading Model 

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [24]:
total_ex

13522

In [25]:

def create_model():
    """Instantiate TransformerModel from the notebook-level size constants
    and hyperparameters."""
    return TransformerModel(
        total_ex + 1,
        total_cat,
        total_in,
        total_task,
        total_lag,
        total_p,
        embed_dim=EMBED_SIZE,
        heads_en=HEADS_EN,
        heads_de=HEADS_DE,
        max_seq=MAX_SEQ,
        dropout=DROPOUT,
        forward_expansion=1,
        enc_layers=ENC_LAYERS,
        dec_layers=DEC_LAYERS,
    )
model = create_model()
model

TransformerModel(
  (encoder): Encoder(
    (embedding_id): Embedding(13523, 64)
    (pos_embedding): Embedding(96, 64)
    (embedding_part): Embedding(8, 64)
    (layers): ModuleList(
      (0): TransformerBlock_en(
        (multi_att): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=64, out_features=64, bias=True)
        )
        (dropout): Dropout(p=0.1, inplace=False)
        (layer_normal_q): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (layer_normal_k): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (layer_normal_v): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (ffn): FFN(
          (lr1): Linear(in_features=64, out_features=64, bias=True)
          (relu): ReLU()
          (bn): BatchNorm1d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (lr2): Linear(in_features=64, out_features=64, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (layer_normal_2): 

In [26]:
np.array(np.hstack(sample_batch[0].detach().numpy())).max()

13522

In [27]:
model(sample_batch[0], sample_batch[1], sample_batch[2], sample_batch[3], sample_batch[4], sample_batch[5])[0]

tensor([[-5.5498e-01, -2.8253e-01, -6.3585e-01,  ...,  2.8382e-01,
         -3.0115e-01,  1.4852e-01],
        [-3.5992e-02,  2.9492e-01, -8.6183e-01,  ..., -8.1282e-01,
         -2.6167e-01, -4.7813e-01],
        [ 1.8178e-01, -3.3851e-01,  2.3834e-01,  ..., -4.8998e-01,
          2.2559e-01,  6.3762e-01],
        ...,
        [ 1.4564e+00,  1.0751e-01,  5.4727e-01,  ..., -9.9677e-01,
         -7.8360e-01, -2.5469e-01],
        [-5.0359e-01, -3.1266e-01,  4.3622e-02,  ..., -1.8906e-01,
          1.3953e-01, -8.2845e-01],
        [ 7.7356e-04,  7.4606e-01,  3.3155e-01,  ..., -1.6192e-01,
          5.9918e-02, -1.2333e+00]], grad_fn=<SqueezeBackward1>)

## Train 

In [28]:
# Debug runs are short and write to their own checkpoint file.
EPOCHS, MODEL_PATH = (
    (3, './transformer_debug.pth') if DEBUG else (60, './transformer.pth')
)

In [29]:
def load_from_item(item):
    """Move one batch onto the module-level ``device`` and cast it.

    ``item[0]`` .. ``item[5]`` become int64 tensors, ``item[6]`` (the label)
    becomes float32. The returned 8-tuple ends with a boolean mask that is
    True wherever the part id is non-zero.
    """
    e_id, part_id, task_id, time_id, p_time, res = (
        item[position].to(device).long() for position in range(6)
    )
    label = item[6].to(device).float()
    target_mask = part_id != 0
    return e_id, part_id, task_id, time_id, p_time, res, label, target_mask

def update_stats(tbar, train_loss, loss, output, label, num_corrects, num_total, labels, outs):
    """Fold one batch into the running evaluation statistics.

    Args:
        tbar: progress bar; its description is set to the batch loss.
        train_loss: list that receives the scalar batch loss.
        loss: scalar loss tensor for the batch.
        output: raw model logits for the batch.
        label: ground-truth targets with the same shape as ``output``.
        num_corrects: running count of correct predictions.
        num_total: running count of scored elements.
        labels: list extended with the flattened labels.
        outs: list extended with the flattened logits.

    Returns:
        The updated ``(num_corrects, num_total)``.
    """
    loss_value = loss.item()
    train_loss.append(loss_value)
    # A logit is counted as a positive prediction when its sigmoid is >= 0.5.
    pred = (torch.sigmoid(output) >= 0.5).long()
    num_corrects += (pred == label).sum().item()
    # Count elements, not rows: num_corrects above is an element count, so
    # len(label) would under-count whenever label is not 1-D.
    num_total += label.numel()
    # reshape(-1) also works for non-contiguous tensors, unlike view(-1).
    labels.extend(label.detach().reshape(-1).cpu().numpy())
    outs.extend(output.detach().reshape(-1).cpu().numpy())
    tbar.set_description('loss - {:.4f}'.format(loss_value))
    return num_corrects, num_total

def train_epoch(model, dataloader, optim, criterion,device="cpu"):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: network called as ``model(e_id, part_id, task_id, lag_time,
            p_time, res)`` and returning ``(output, extra)``.
        dataloader: iterable of batches accepted by ``load_from_item``.
        optim: optimizer wrapper exposing ``.optimizer.zero_grad()`` and
            ``.step()``.
        criterion: loss called as ``criterion(output, label)``.
        device: kept for interface compatibility; it is not read here because
            ``load_from_item`` places tensors on the module-level ``device``.

    Returns:
        The mean of the per-batch losses as a float, or NaN when the loader
        yields no batches. (Previously nothing was returned.)
    """
    model.train()

    batch_losses = []
    tbar = tqdm(dataloader)
    for item in tbar:
        e_id, part_id, task_id, lag_time, p_time, res, label, target_mask = load_from_item(item)

        optim.optimizer.zero_grad()
        output, _ = model(e_id, part_id, task_id, lag_time, p_time, res)

        # Score only the positions selected by the mask.
        output = torch.masked_select(output, target_mask)
        label = torch.masked_select(label, target_mask)

        loss = criterion(output, label)
        loss.backward()
        optim.step()

        loss_value = loss.item()
        batch_losses.append(loss_value)
        tbar.set_description('loss - {:.4f}'.format(loss_value))

    if not batch_losses:
        return float('nan')
    return sum(batch_losses) / len(batch_losses)

def val_epoch(model, val_iterator, criterion, device="cpu"):
    """Evaluate ``model`` on ``val_iterator`` without gradient tracking.

    Args:
        model: network called as ``model(e_id, part_id, task_id, lag_time,
            p_time, res)`` and returning ``(output, attention_weights)``.
        val_iterator: iterable of batches accepted by ``load_from_item``.
        criterion: loss called as ``criterion(output, label)``.
        device: kept for interface compatibility; it is not read here because
            ``load_from_item`` places tensors on the module-level ``device``.

    Returns:
        ``(loss, acc, auc)``: the mean of the per-batch losses, the fraction of
        correct predictions, and the ROC AUC of the raw outputs. All three are
        NaN when nothing was scored; ``auc`` alone is NaN when the labels
        contain a single class, where ROC AUC is undefined.
    """
    model.eval()

    val_losses = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    tbar = tqdm(val_iterator)
    for item in tbar:
        e_id, part_id, task_id, lag_time, p_time, res, label, target_mask = load_from_item(item)

        with torch.no_grad():
            output, _ = model(e_id, part_id, task_id, lag_time, p_time, res)

            # Score only the positions selected by the mask.
            output = torch.masked_select(output, target_mask)
            label = torch.masked_select(label, target_mask)

            loss = criterion(output, label)

        num_corrects, num_total = update_stats(tbar, val_losses, loss, output, label, num_corrects, num_total, labels, outs)

    if num_total == 0:
        # Empty loader (or nothing scored): report "undefined" instead of
        # dividing by zero.
        undefined = float('nan')
        return undefined, undefined, undefined

    acc = num_corrects / num_total
    if np.unique(labels).size > 1:
        auc = roc_auc_score(labels, outs)
    else:
        # Only one class present in a small validation split.
        auc = float('nan')
    loss = np.average(val_losses)

    return loss, acc, auc

In [30]:
def get_std_opt(d_model, warmup_step, LR, B1, B2, EPS):
    """Wrap an Adam optimizer over the module-level ``model`` in a NoamOpt.

    ``LR``, ``(B1, B2)`` and ``EPS`` configure Adam; ``d_model`` and
    ``warmup_step`` are handed to NoamOpt unchanged.
    """
    adam = torch.optim.Adam(
        model.parameters(),
        lr=LR,
        betas=(B1, B2),
        eps=EPS,
    )
    return NoamOpt(d_model, warmup_step, adam)

In [31]:
def do_train():
    """Train the module-level ``model`` for up to ``EPOCHS`` epochs.

    After every epoch the model is validated; the weights are written to
    ``MODEL_PATH`` whenever the validation AUC improves, the current learning
    rate is printed, and EarlyStopping (fed the validation loss) may end the
    run early.
    """
    optimizer = get_std_opt(EMBED_SIZE, warmup_steps, LR, BETA1, BETA2, EPISLON)
    criterion = nn.BCEWithLogitsLoss()
    stopper = EarlyStopping(PATIENCE, verbose=True)

    model.to(device)
    criterion.to(device)

    best_auc = 0.0
    for epoch_no in range(1, EPOCHS + 1):
        train_epoch(model, train_dataloader, optimizer, criterion, device)
        val_loss, val_acc, val_auc = val_epoch(model, val_dataloader, criterion, device)
        print(f"epoch - {epoch_no} val_loss - {val_loss:.3f} acc - {val_acc:.3f} auc - {val_auc:.3f}")

        # Checkpoint on a strictly better validation AUC.
        if val_auc > best_auc:
            print(f'epoch - {epoch_no} best model with val auc: {val_auc}')
            best_auc = val_auc
            torch.save(model.state_dict(), MODEL_PATH)

        current_lr = optimizer.optimizer.param_groups[0]['lr']
        print('LR is:', current_lr)

        stopper(val_loss, model)
        if stopper.early_stop:
            print('early stopping')
            break

In [32]:
do_train()

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


epoch - 1 val_loss - 0.721 acc - 0.463 auc - 0.517
epoch - 1 best model with val auc: 0.5169082125603865
LR is: 9.882117688026186e-07
Validation loss decreased (inf --> 0.720676).  Saving model ...


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


epoch - 2 val_loss - 0.722 acc - 0.463 auc - 0.534
epoch - 2 best model with val auc: 0.5338164251207729
LR is: 1.976423537605237e-06
EarlyStopping counter: 1 out of 10


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


epoch - 3 val_loss - 0.723 acc - 0.463 auc - 0.536
epoch - 3 best model with val auc: 0.5362318840579711
LR is: 2.964635306407856e-06
EarlyStopping counter: 2 out of 10
