In [62]:
import sys
import numpy as np

import os
import glob
import time
from os import listdir

import sklearn.preprocessing as preprocessing
from sklearn.metrics import roc_auc_score

import torch.nn.functional as F

import pandas as pd

from sys import getsizeof

#supress warnings
import warnings
warnings.filterwarnings("ignore")

import time
import gc
import math 
import pickle
import torch

In [55]:
group = pd.read_pickle("../data/processed/inference_group")

with open('../data/processed/inference_last_timestamp.pickle', 'rb') as handle:
    last_timestamp = pickle.load(handle)

boundaries = [120,600,1800,3600,10800,43200,86400,259200,604800]


  0%|          | 0/2500000 [01:09<?, ?it/s]


In [59]:
#Loading the model
sys.path.append('..')

from src.models.model import TransformerModel
import yaml

with open('../config.yaml') as file:
    config = yaml.load(file, Loader=yaml.FullLoader)



#Transformer hyperparameter 
d_model = config["d_model"]

decoder_layers = config["decoder_layers"]
encoder_layers = config["encoder_layers"]


correct_start_token = config["correct_start_token"]
user_answer_start_token = config["user_answer_start_token"]
seq_len = config["seq_len"]

dropout = config["dropout"]
ff_model = d_model*4
att_heads = d_model // 64


#Loading questions, and every question corresponding part
que_data = pd.read_csv( "../data/raw/questions.csv")
part_valus = que_data.part.values
unique_ques = len(que_data)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
part_valus = torch.LongTensor(part_valus).to(device)
que_emb_size = unique_ques

model = TransformerModel(que_emb_size, hidden=d_model,part_arr=part_valus, dec_layers=decoder_layers, enc_layers=encoder_layers, dropout=dropout, nheads=att_heads, ff_model=ff_model).to(device)

In [60]:
model.load_state_dict(torch.load("../models/model_best.torch"))
model.eval()

<All keys matched successfully>

In [33]:
def pred_users(vals): #Input must be (eval_batch, 3): ["user_id", "content_id", "content_type_id", "timestamp"]

    eval_batch = vals.shape[0]

    tensor_question = np.zeros((eval_batch, seq_len), dtype=np.long)
    tensor_answers = np.zeros((eval_batch, seq_len), dtype=np.long)
    tensor_ts = np.zeros((eval_batch, seq_len), dtype=np.long)
    tensor_user_answer = np.zeros((eval_batch, seq_len), dtype=np.long)


    val_len = []
    preds = []
    group_index = group.index

    for i, line in enumerate(vals):

        if line[2] == True:
            val_len.append(0)
            continue

        user_id = line[0]
        question_id = line[1]
        timestamp = get_timestamp(line[3], user_id) #Compute timestamp difference correctly
        

        que_history = np.array([], dtype=np.int32)
        answers_history = np.array([], dtype=np.int32)  
        ts_history = np.array([], dtype=np.int32)  
        user_answer_history = np.array([], dtype=np.int32)  

        if user_id in group_index:

            cap = seq_len-1
            que_history, answers_history, ts_history, user_answer_history = group[user_id]

            que_history = que_history[-cap:]
            answers_history = answers_history[-cap:]
            ts_history = ts_history[-cap:]
            user_answer_history = user_answer_history[-cap:]


        #Decoder data, add start token
        answers_history = np.concatenate(([correct_start_token],answers_history))
        user_answer_history = np.concatenate(([user_answer_start_token],user_answer_history))

        #Decoder data
        que_history = np.concatenate((que_history, [question_id]))  #Add current question
        ts_history = np.concatenate((ts_history, [timestamp]))  

        tensor_question[i][:len(que_history)] = que_history
        tensor_answers[i][:len(que_history)] = answers_history
        tensor_ts[i][:len(que_history)] = ts_history
        tensor_user_answer[i][:len(que_history)] = user_answer_history

        val_len.append(len(que_history))

    tensor_question = torch.from_numpy(tensor_question).long().T.to(device)
    tensor_answers = torch.from_numpy(tensor_answers).long().T.to(device)
    tensor_ts = torch.from_numpy(tensor_ts).long().T.to(device)
    tensor_user_answer = torch.from_numpy(tensor_user_answer).long().T.to(device)
    
    with torch.no_grad():   #Disable gradients so prediction runs faster
        out = F.sigmoid(model(tensor_question, tensor_answers, tensor_ts, tensor_user_answer)).squeeze(dim=-1).T


    for j in range(len(val_len)):
        preds.append(out[j][val_len[j]-1].item())

    return preds

In [34]:
def update_group_var(vals):
    
    global group
    
    for i, line in enumerate(vals):
        
        user_id = line[0]
        question_id = line[1]
        
        content_type_id = line[2]
        ts = get_timestamp(line[3], user_id)
        
        correct = line[4]
        user_answer = line[5]
        
        
        if content_type_id == True:
            continue

        if last_timestamp.get(user_id, -1) == -1:
            last_timestamp[user_id] = 0
        else:
            last_timestamp[user_id] = line[3]
            
        if user_id in group.index:
            questions= np.append(group[user_id][0],[question_id])
            answers= np.append(group[user_id][1],[correct])
            ts= np.append(group[user_id][2],[ts])
            user_answer= np.append(group[user_id][3],[user_answer])
            
            group[user_id] = (questions, answers, ts, user_answer)
        else:
            group[user_id] = (np.array([question_id], dtype=np.int32), np.array([correct], dtype=np.int32), np.array([ts], dtype=np.int32)
                             ,np.array([user_answer], dtype=np.int32))

In [35]:
#Re-creates the timestamp encoding
def get_timestamp(ts, user_id):
    
    if last_timestamp.get(user_id, -1) == -1:
        return 0
    
    diff = (ts - last_timestamp[user_id])/1000
    
    if diff < 0:
        return 0
    
    if diff <= 60:
        return int(diff)
    
    for i, boundary in enumerate(boundaries):
        if boundary > diff:
            break
            
    if i == len(boundaries) - 1:
        return 60+i+1
    
    return 60+i

In [36]:
#Tito's iterator: https://www.kaggle.com/its7171/time-series-api-iter-test-emulator

class Iter_Valid(object):
    def __init__(self, df, max_user=1000):
        df = df.reset_index(drop=True)
        self.df = df
        self.user_answer = df['user_answer'].astype(str).values
        self.answered_correctly = df['answered_correctly'].astype(str).values
        df['prior_group_responses'] = "[]"
        df['prior_group_answers_correct'] = "[]"
        self.sample_df = df[df['content_type_id'] == 0][['row_id']]
        self.sample_df['answered_correctly'] = 0
        self.len = len(df)
        self.user_id = df.user_id.values
        self.task_container_id = df.task_container_id.values
        self.content_type_id = df.content_type_id.values
        self.max_user = max_user
        self.current = 0
        self.pre_user_answer_list = []
        self.pre_answered_correctly_list = []

    def __iter__(self):
        return self
    
    def fix_df(self, user_answer_list, answered_correctly_list, pre_start):
        df= self.df[pre_start:self.current].copy()
        sample_df = self.sample_df[pre_start:self.current].copy()
        df.loc[pre_start,'prior_group_responses'] = '[' + ",".join(self.pre_user_answer_list) + ']'
        df.loc[pre_start,'prior_group_answers_correct'] = '[' + ",".join(self.pre_answered_correctly_list) + ']'
        self.pre_user_answer_list = user_answer_list
        self.pre_answered_correctly_list = answered_correctly_list
        return df, sample_df

    def __next__(self):
        added_user = set()
        pre_start = self.current
        pre_added_user = -1
        pre_task_container_id = -1

        user_answer_list = []
        answered_correctly_list = []
        while self.current < self.len:
            crr_user_id = self.user_id[self.current]
            crr_task_container_id = self.task_container_id[self.current]
            crr_content_type_id = self.content_type_id[self.current]
            if crr_content_type_id == 1:
                # no more than one task_container_id of "questions" from any single user
                # so we only care for content_type_id == 0 to break loop
                user_answer_list.append(self.user_answer[self.current])
                answered_correctly_list.append(self.answered_correctly[self.current])
                self.current += 1
                continue
            if crr_user_id in added_user and ((crr_user_id != pre_added_user) or (crr_task_container_id != pre_task_container_id)):
                # known user(not prev user or differnt task container)
                return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
            if len(added_user) == self.max_user:
                if  crr_user_id == pre_added_user and crr_task_container_id == pre_task_container_id:
                    user_answer_list.append(self.user_answer[self.current])
                    answered_correctly_list.append(self.answered_correctly[self.current])
                    self.current += 1
                    continue
                else:
                    return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
            added_user.add(crr_user_id)
            pre_added_user = crr_user_id
            pre_task_container_id = crr_task_container_id
            user_answer_list.append(self.user_answer[self.current])
            answered_correctly_list.append(self.answered_correctly[self.current])
            self.current += 1
        if pre_start < self.current:
            return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
        else:
            raise StopIteration()

In [41]:
validation = pd.read_pickle("../data/interim/cv_valid.pickle")

In [47]:
iter_test = Iter_Valid(validation,max_user=1000)
predicted = []
def set_predict(df):
    predicted.append(df)

In [65]:
%%time
import ast
from tqdm import tqdm

model.eval()

preds = []
pbar = tqdm(total=validation.shape[0], position=0, leave=True)
check = None

for (test_data, current_prediction_df) in iter_test:   
        
    if check is not None:
        past_vals = np.array(ast.literal_eval(test_data.iloc[0].prior_group_answers_correct)) 
        past_answers = np.array(ast.literal_eval(test_data.iloc[0].prior_group_responses))

        past_vals = np.concatenate((vals, past_vals.reshape(len(past_vals),1)), axis=1)
        past_vals = np.concatenate((past_vals, past_answers.reshape(len(past_answers),1)), axis=1)

        update_group_var(past_vals)  #Update database with the vals of the last batch        
        
    vals = test_data[["user_id","content_id","content_type_id","timestamp"]].values
    preds.extend(pred_users(vals))
    
    check = 1

    pbar.update(len(test_data))

  4%|▍         | 1158/27272 [00:08<03:16, 133.03it/s]

KeyboardInterrupt: 

In [64]:
df = validation.iloc[:len(preds)]
df["preds"] = preds

df = df[df.content_type_id == False]
print('Validation ROC:',roc_auc_score(df.answered_correctly, df.preds))

Validation ROC: 0.5181898138018086
