# Setups

## Import

In [1]:
import numpy as np
import pandas as pd
from itertools import chain
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F

from preprocess import *

## Load Data

In [2]:
%%time
df = Preprocess().preprocess()

Wall time: 48.8 s


# Setups

## Hyperparams

In [3]:
item_ids = list(chain(*df['assessmentItemID'].values))
test_ids = list(chain(*df['testId'].values))

In [4]:
cfg = {
    'seq_len': 128,
    'tshold': 86400, # one day
    'model_cfg': {
        'num_items': len(set(item_ids)) + 1,
        'num_tests': len(set(test_ids)) + 1,
        'num_feats': len(df.columns),
        'num_consec': 2 + 1,
        'dropout': 0.1,
        'enc_layers': 2,
        'dec_layers': 2,
        'hidden': 64,
        'ff_model': 64 * 4, # hidden * 4
        'n_heads': 64 // 32,
        'tshold': 86400
    }
}

In [5]:
seq_len = 256
num_items = len(set(item_ids))
num_tests = len(set(test_ids))
num_feats = len(df.columns)

## Modeling

### Chunk

In [6]:
split = lambda data, split_size: np.split(data[:split_size], split_size // cfg['seq_len'])
pad = lambda row, pad_len: F.pad(torch.tensor(row), (0, pad_len), value=-1)

In [7]:
df.columns

Index(['assessmentItemID', 'testId', 'answerCode', 'Elapsed',
       'testConsecutive'],
      dtype='object')

**How to make Paddings**
+ `item` shift by 1, pad with 0
+ `test` shift by 1, pad with 0
+ `answer` ... 어쩌지 ㅅㅂ 그냥 shift? -> 2로 채움 일단
+ `Elapsed` 는 그냥 0으로 채워도 문제 없을 것...같기도함
+ `testConsec` 얘도 문제네... -> 2로 채움 일단

In [8]:
def shift_value(row):
    
    shifted_row = []
    for i, col in enumerate(row):
        
        if i in [0, 1]: # item, test        
            col = list(map(lambda x: x+1, col))
    
        else: # elapsed, answer, test_consec
            pass
        
        shifted_row.append(col)
        
    shifted_row = tuple(map(np.array, shifted_row))
    return shifted_row

In [9]:
def pad(row, pad_len):
    
    padded_row = []
    for i, col in enumerate(row):
        
        if i in [0, 1, 3]: # item, test, elapsed
            
            col = F.pad(torch.tensor(col), (0, pad_len), value=0).numpy()
        
        elif i in [2, 4]: # answer, test_consec
            
            col = F.pad(torch.tensor(col), (0, pad_len), value=2).numpy()
            
        else:
            pass
        
        padded_row.append(col)
        
    return padded_row

In [17]:
chunked_data = []
for row in df.values:
    
    row = shift_value(row)    
    
    if len(row[0]) >= cfg['seq_len']:
        
        split_size = len(row[0]) - len(row[0])%cfg['seq_len']        
        # TODO: simple and clean. Currently,,, only simple
        chunked_data.extend(list(zip(*map(split, row, [split_size]*num_feats))))
        
    elif len(row[0]) < seq_len:
        
        pad_len = cfg['seq_len'] - len(row[0])
        chunked_data.append(tuple(pad(row, pad_len)))
        
    else:
        pass
        
chunked_data = pd.Series(chunked_data)

### DataLoader

In [18]:
import torch
from torch.utils.data import Dataset, DataLoader

In [19]:
class IscreamDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __getitem__(self, index):
        return tuple(torch.LongTensor(feat) for feat in self.data[index])
    
    def __len__(self):
        return len(self.data)

In [20]:
def collate_fn_short(batch):
    
    batchify = lambda idx: torch.cat([b[idx].unsqueeze(1) for b in batch], dim=1)
    return [batchify(i) for i in range(num_feats)]

In [21]:
pin_memory = False

trainset = IscreamDataset(chunked_data)
train_loader = DataLoader(trainset, shuffle=False,
                          batch_size=16,
                          drop_last=True,
                          pin_memory=True,
                          collate_fn=collate_fn_short)

In [22]:
batch = next(iter(train_loader))

### Model

In [23]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [24]:
import torch.optim as optim
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from torch.autograd import Variable

import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.nn import TransformerDecoder, TransformerDecoderLayer

In [25]:
from utils import PositionalEncoding, NoamOpt

In [26]:
class Saintplus(nn.Module):
    '''
    Try to Implement SAINT+
    ENCODER::
        + AssessmentItemID
        + TestID
        + Consecutive Test
        + Positional Encoding
        
    DECODER::
        + Correctness
        + Lag Time Encoding
        + Positional Encoding
    '''
    def __init__(self, cfg):
        super(Saintplus, self).__init__()
        
        self.cfg = cfg
        hidden = cfg['hidden']
        dropout = cfg['dropout']
        # Encoders' Embeddings
        self.item_emb = nn.Embedding(cfg['num_items'], hidden)
        self.test_emb = nn.Embedding(cfg['num_tests'], hidden)
        self.cons_emb = nn.Embedding(cfg['num_consec'], hidden)
        self.pos_enc = PositionalEncoding(hidden, dropout)
        
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)
        
        
        # Decoders' Embeddings
        self.ans_emb = nn.Embedding(3, hidden)
        self.lag_emb = nn.Embedding(cfg['tshold'], hidden)
        self.pos_dec = PositionalEncoding(hidden, dropout) 
        
        self.dropout_4 = nn.Dropout(dropout)
        self.dropout_5 = nn.Dropout(dropout)
        
        self.src_mask = None
        self.trg_mask = None
        self.memory_mask = None # TODO:
        
        self.transformer = nn.Transformer(d_model=hidden,
                                         nhead=cfg['n_heads'],
                                         num_encoder_layers=cfg['enc_layers'],
                                         num_decoder_layers=cfg['dec_layers'],
                                         dim_feedforward=cfg['ff_model'],
                                         dropout=dropout,
                                         activation='relu')
        
        self.fc_out = nn.Linear(hidden, 2)
        self.softmax = nn.Softmax()
        
        
    def generate_square_subsequent_mask(self, sz, sz1=None):
        
        if sz1 == None:
            mask = torch.triu(torch.ones(sz, sz), 1)
        else:
            mask = torch.triu(torch.ones(sz, sz1), 1)
            
        return mask.masked_fill(mask==1, float('-inf'))

    
    def forward(self, item, test, ans, lag, consec):
        
        # Encoder Embeddings
        src = self.dropout_1(self.item_emb(item))
        test_emb = self.dropout_2(self.test_emb(test))
        cons_emb = self.dropout_3(self.cons_emb(consec))
        
        src = torch.add(src, test_emb)
        src = torch.add(src, cons_emb)
        src = self.pos_enc(src)
        
        # Decoder Embeddings
        trg = self.dropout_4(self.ans_emb(ans))
        lag_emb = self.dropout_5(self.lag_emb(lag))
        trg = torch.add(trg, lag_emb)
        trg = self.pos_dec(trg)
        
        if self.trg_mask is None or self.trg_mask.size(0) != len(trg):
            self.trg_mask = self.generate_square_subsequent_mask(len(trg)).to(trg.device)
            
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            self.src_mask = self.generate_square_subsequent_mask(len(src)).to(trg.device)
            
        if self.memory_mask is None or self.memory_mask.size(0) != len(trg) or self.memory_mask.size(1) != len(src):
            self.memory_mask = self.generate_square_subsequent_mask(len(trg),len(src)).to(trg.device)
        
        output = self.transformer(src, trg,
                                 src_mask=self.src_mask,
                                 tgt_mask=self.trg_mask,)
#                                  memory_mask=self.memory_mask)
        
        output = self.softmax(self.fc_out(output))
        return output

Always remember that batch consists in the following order
+ Item   -> encoder
+ Test   -> encoder
+ Ans    -> decoder
+ Lag    -> decoder
+ Consec -> encoder

In [30]:
model = Saintplus(cfg['model_cfg'])
model.to(device)


In [31]:
loss_fn = nn.BCEWithLogitsLoss()
# optimizer = NoamOpt(cfg['model_cfg']['hidden'], 1, 4000, optim.Adam(model.parameters(), lr=0))
optimizer = optim.Adam(model.parameters(), lr=1e-1)

In [32]:
pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
pytorch_total_params

6467396

In [33]:
def add_shift(var, pad_value, device):
    
    return torch.cat([torch.LongTensor(1, var.shape[1]).fill_(pad_value).to(device), var])

Saintplus(
  (item_emb): Embedding(9455, 64)
  (test_emb): Embedding(1538, 64)
  (cons_emb): Embedding(3, 64)
  (pos_enc): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (dropout_1): Dropout(p=0.1, inplace=False)
  (dropout_2): Dropout(p=0.1, inplace=False)
  (dropout_3): Dropout(p=0.1, inplace=False)
  (ans_emb): Embedding(3, 64)
  (lag_emb): Embedding(86400, 64)
  (pos_dec): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (dropout_4): Dropout(p=0.1, inplace=False)
  (dropout_5): Dropout(p=0.1, inplace=False)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): Linear(in_features=64, out_features=64, bias=True)
          )
          (linear1): Linear(in_features=64, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=256, out_fea

In [None]:
model.train()
epoch_loss = 0
answer_rates = []
for e in range(10):
    for i, batch in tqdm(enumerate(train_loader)):

        item, test, ans, lag, consec = map(lambda x: x.to(device), batch)
    #     item, test, ans, lag, consec = batch

        ans = add_shift(ans, 2, device).to(device)
        lag = add_shift(lag, 0, device).to(device)

        optimizer.zero_grad()
        output = model(item, test, ans[:-1, :], lag[:-1, :], consec)

        idx = ans[1:, :] != 2
        ans_logit = torch.Tensor([[0, 1] if logit else [1, 0] for logit in ans[1:, :][idx]]).to(device)
        loss = loss_fn(output.squeeze()[idx], ans_logit.float())

        loss.backward()

#         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        
        pred = output.squeeze()[idx]
        true = ans[1:, :][idx]
        correct = true.cpu().numpy() & pred.argmax(axis=1).cpu().numpy()
        answer_rate = correct[correct == 1].shape[0] / correct.shape[0]
        answer_rates.append(answer_rate)

        epoch_loss += loss.item()
        
    print(answer_rate)

1139it [02:53,  6.56it/s]
1it [00:00,  6.67it/s]

0.2003968253968254


1139it [02:53,  6.57it/s]
1it [00:00,  6.49it/s]

0.3834325396825397


753it [01:54,  6.51it/s]

In [50]:
pred.argmax(axis=1)

tensor([1, 1, 1,  ..., 1, 1, 1], device='cuda:0', grad_fn=<NotImplemented>)

In [51]:
true

tensor([1, 1, 1,  ..., 0, 1, 1], device='cuda:0')

In [56]:
correct = true.cpu().numpy() & pred.argmax(axis=1).cpu().numpy()

In [58]:
correct[correct == 1].shape

(639,)

In [59]:
correct.shape

(1967,)