In [None]:
import argparse
import math
import pandas as pd
import joblib
import numpy as np

from typing import Optional
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from datetime import datetime
from sklearn import metrics
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score
from torch_geometric.data import Data
import scipy.sparse as sp

In [None]:
# ---------------------------------------------------------------------------
# Load the raw interaction log, filter it, and build per-student sequences.
# ---------------------------------------------------------------------------
read_col = ['studentId', 'skill', 'problemId', 'startTime', 'correct', 'original', 'attemptCount']
target = 'correct'
df = pd.read_csv('jxy/KT/NoteBook/ADM2017/assisment_2017_raw.csv', low_memory=False, encoding="ISO-8859-1")[read_col]
print('original df length is %d' % len(df))

# Keep only rows whose `original` flag equals 1 (the log line below calls the
# discarded rows "scaffolding problems").
df = df[df['original'].isin([1])]
print('After removing scaffolding problems, records number %d' % len(df))

# Chronological order, so that every per-student sequence built below follows
# `startTime`. Re-binding (instead of inplace=True) avoids mutating a filtered
# slice of the raw frame.
df = df.sort_values('startTime')

# Drop students with fewer than `min_inter_num` interactions.
min_inter_num = 3
users = df.groupby(['studentId'], as_index=True)
# Count with size() rather than iterating the groups: with a list-valued
# grouper, iteration yields 1-tuple keys on pandas >= 2.0, and `isin` below
# would then never match the scalar student ids (nothing would be removed).
interactions_per_user = users.size()
delete_users = interactions_per_user[interactions_per_user < min_inter_num].index.tolist()

print('deleted user number based min-inters %d' % len(delete_users))
df = df[~df['studentId'].isin(delete_users)]
# Explicit copy: new columns are assigned to this frame further down.
df = df[['studentId', 'skill', 'problemId', 'correct']].copy()
print('After deleting some users, records number %d' % len(df))


# Vocabulary of problems / skills in order of first appearance.
problems_list = df['problemId'].drop_duplicates().tolist()
skills_list = df['skill'].drop_duplicates().tolist()
print(len(problems_list))
print(len(skills_list))

# index -> raw id, and raw id -> index.
problems_dict = dict(enumerate(problems_list))
skills_dict = dict(enumerate(skills_list))
problems_re_dict = {problem: idx for idx, problem in enumerate(problems_list)}
skills_re_dict = {skill: idx for idx, skill in enumerate(skills_list)}

# Dense integer codes used as matrix / embedding indices by later cells.
df['skill_cat'] = df['skill'].apply(lambda r: skills_re_dict[r])
df['problem_cat'] = df['problemId'].apply(lambda r: problems_re_dict[r])

# skill code -> array of the distinct problem codes observed with that skill.
skill_problem = df[['skill_cat', 'problem_cat']].groupby(['skill_cat'], as_index=True).apply(lambda r: np.array(list(set(r['problem_cat'].values))))
skill_prob_dict = {skill_id: skill_problem[skill_id] for skill_id in skill_problem.index}


# studentId -> (skill codes, problem codes, correctness), in the row order of
# `df` (i.e. sorted by startTime).
user_sequence = df[['studentId', 'correct', 'skill_cat', 'problem_cat']].groupby(['studentId']).apply(
                lambda r: (r['skill_cat'].values, r['problem_cat'].values, r['correct'].values))

In [None]:
# Default look-ahead: how many later interactions each interaction is paired with.
WINDOW = 50


def co_acc_skills(user_skill, matrix_agg, matrix_cnt, skill_dict, window: Optional[int] = None):
    """Accumulate windowed skill-to-skill co-occurrence statistics.

    For every user and every ordered pair of positions ``i < j`` with
    ``j - i <= window`` the function adds

    * ``1`` to ``matrix_cnt[skills[i], skills[j]]`` and
    * ``correct[i] * correct[j]`` to ``matrix_agg[skills[i], skills[j]]``.

    Args:
        user_skill: mapping-like object exposing ``.index`` (iterable of user
            keys) whose items are tuples; element 0 is the skill-index
            sequence and element 2 the correctness sequence.
        matrix_agg: 2-D NumPy array, updated in place.
        matrix_cnt: 2-D NumPy array, updated in place.
        skill_dict: unused; kept so existing call sites keep working.
        window: look-ahead size; ``None`` (default) uses the module-level
            ``WINDOW``.

    Returns:
        ``(matrix_agg, matrix_cnt)`` - the same objects that were passed in.
    """
    if window is None:
        window = WINDOW

    for user in tqdm(user_skill.index):
        sequence = user_skill[user]
        skills = np.asarray(sequence[0])
        correct = np.asarray(sequence[2])

        for i in range(len(skills) - 1):
            later = slice(i + 1, i + 1 + window)  # slicing clamps at the sequence end
            targets = (skills[i], skills[later])
            # np.add.at is unbuffered, so a skill that occurs several times
            # inside the window is counted once per occurrence.
            np.add.at(matrix_cnt, targets, 1)
            np.add.at(matrix_agg, targets, correct[i] * correct[later])

    return matrix_agg, matrix_cnt

# ---- Skill-to-skill co-occurrence ------------------------------------------
# Placeholders that this cell creates but does not fill.
skill_mats, skill_key = [], []
skill_dict = {}
print('processing skill co-currence')

skills = skills_re_dict.keys()
mat_length = len(skills)
print(mat_length)

# Two independent square accumulators, one row/column per skill.
matrix_agg, matrix_cnt = (np.zeros((mat_length, mat_length)) for _ in range(2))

agg, cnt = co_acc_skills(user_sequence, matrix_agg, matrix_cnt, skills_re_dict)
print(agg)
print(cnt)

# Ratio of accumulated correctness products to pair counts; the 1e-8 keeps
# never-observed pairs at 0 instead of dividing by zero.
res = np.divide(agg, cnt + 1e-8)
print(res)

joblib.dump(res, 'c2c_para.pkl.zip')


In [None]:
# Width of the embeddings learned by the factorisation below.
dimensionK = 64
# Symmetrise the ratio matrix: average it with its transpose.
mat = 0.5 * (res.T + res)

class MDPreTrain(nn.Module):
    """Symmetric low-rank factorisation: one embedding per item, and the
    model output is the Gram matrix of the selected embeddings.

    Args:
        a_nums: number of rows in the embedding table.
        k: embedding width.
    """

    def __init__(self, a_nums, k):
        super().__init__()
        self.embed_a = nn.Embedding(a_nums, k)

    def forward(self, a):
        """Return ``E @ E.T`` where ``E`` holds the embeddings of indices ``a``."""
        vectors = self.embed_a(a)
        return vectors @ vectors.T

    def getemb(self, a):
        """Return the embeddings of indices ``a`` as a NumPy array on the CPU."""
        with torch.no_grad():
            return self.embed_a(a).cpu().numpy()

# ---- Fit skill embeddings so that E @ E.T approximates `mat` ---------------
# One index per row of `mat`; every step is a full-batch update.
a = torch.arange(mat.shape[0])

ajaMat = torch.tensor(mat).float()
md = MDPreTrain(mat.shape[0], dimensionK)

optimizer = torch.optim.Adam(md.parameters(), lr=1e-2)

loss_func = torch.nn.MSELoss()


for epoch in tqdm(range(0,10001)):
    prediction = md(a)
    loss = loss_func(prediction, ajaMat)
    if loss.item() < 1e-3:
        # Runtime message (Chinese): "loss is small enough to stop early".
        print('loss小到可以提前停止了')
        break
    optimizer.zero_grad()
    # The forward pass above rebuilds the graph on every iteration, so there
    # is nothing to retain between steps (retain_graph is not needed).
    loss.backward()
    optimizer.step()

    if epoch % 500 == 0:
        print('epoch:{}, loss:{}'.format(epoch, loss.item()))

skill_emb = {}
# Learned skill embeddings as a NumPy array, persisted for the later cells.
emb_res = md.getemb(a)
joblib.dump(emb_res, 'emb_c2c.pkl.zip')

In [None]:
# Default look-ahead: how many later interactions each interaction is paired with.
WINDOW = 50


def co_acc_problems(user_skill, matrix_agg, matrix_cnt, skill_dict, window: Optional[int] = None):
    """Accumulate windowed problem-to-problem co-occurrence statistics.

    For every user and every ordered pair of positions ``i < j`` with
    ``j - i <= window`` the function adds

    * ``1`` to ``matrix_cnt[problems[i], problems[j]]`` and
    * ``correct[i] * correct[j]`` to ``matrix_agg[problems[i], problems[j]]``.

    Args:
        user_skill: mapping-like object exposing ``.index`` (iterable of user
            keys) whose items are tuples; element 1 is the problem-index
            sequence and element 2 the correctness sequence.
        matrix_agg: 2-D NumPy array, updated in place.
        matrix_cnt: 2-D NumPy array, updated in place.
        skill_dict: unused; kept so existing call sites keep working.
        window: look-ahead size; ``None`` (default) uses the module-level
            ``WINDOW``.

    Returns:
        ``(matrix_agg, matrix_cnt)`` - the same objects that were passed in.
    """
    if window is None:
        window = WINDOW

    for user in tqdm(user_skill.index):
        sequence = user_skill[user]
        problems = np.asarray(sequence[1])
        correct = np.asarray(sequence[2])

        for i in range(len(problems) - 1):
            later = slice(i + 1, i + 1 + window)  # slicing clamps at the sequence end
            targets = (problems[i], problems[later])
            # np.add.at is unbuffered, so a problem that occurs several times
            # inside the window is counted once per occurrence.
            np.add.at(matrix_cnt, targets, 1)
            np.add.at(matrix_agg, targets, correct[i] * correct[later])

    return matrix_agg, matrix_cnt


# ---- Problem-to-problem co-occurrence --------------------------------------
print('processing problem co-currence')
mat_length = len(problems_list)

# Two independent square accumulators, one row/column per problem.
matrix_agg, matrix_cnt = (np.zeros((mat_length, mat_length)) for _ in range(2))

agg, cnt = co_acc_problems(user_sequence, matrix_agg, matrix_cnt, problems_re_dict)
print(agg)
print(cnt)

# Ratio of accumulated correctness products to pair counts; the 1e-8 keeps
# never-observed pairs at 0 instead of dividing by zero.
# Note: this rebinds `res`, which previously held the skill-level matrix.
res = np.divide(agg, cnt + 1e-8)
print(res)


In [None]:
# Symmetrise the problem-level ratio matrix: average it with its transpose.
mat = 0.5 * (res + res.T)

# Width of the embeddings learned below, and number of rows to factorise.
dimensionK = 64
mat_length = len(mat)

# NOTE(review): this redefines the MDPreTrain class declared in an earlier
# cell with the same behaviour; the later definition shadows the earlier one.
class MDPreTrain(nn.Module):
    """Symmetric low-rank factorisation: one embedding per item, and the
    model output is the Gram matrix of the selected embeddings.

    Args:
        a_nums: number of rows in the embedding table.
        k: embedding width.
    """

    def __init__(self, a_nums, k):
        super().__init__()
        self.embed_a = nn.Embedding(a_nums, k)

    def forward(self, a):
        """Return ``E @ E.T`` where ``E`` holds the embeddings of indices ``a``."""
        vectors = self.embed_a(a)
        return vectors @ vectors.T

    def getemb(self, a):
        """Return the embeddings of indices ``a`` as a NumPy array on the CPU."""
        with torch.no_grad():
            return self.embed_a(a).cpu().numpy()

# ---- Fit problem embeddings so that E @ E.T approximates `mat` -------------
# Use the first GPU when one is available; otherwise fall back to the CPU
# instead of failing on the first .cuda() call.
mf_device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# One index per row of `mat`; every step is a full-batch update.
a = torch.arange(mat_length, device=mf_device)

ajaMat = torch.tensor(mat).float().to(mf_device)
md = MDPreTrain(mat_length, dimensionK).to(mf_device)

optimizer = torch.optim.Adam(md.parameters(), lr=1e-2)
loss_func = torch.nn.MSELoss()


for epoch in tqdm(range(0,5001)):
    prediction = md(a)
    loss = loss_func(prediction, ajaMat)
    if loss.item() < 1e-2:
        # Runtime message (Chinese): "loss is small enough to stop early".
        print('loss小到可以提前停止了')
        break
    optimizer.zero_grad()
    # The forward pass above rebuilds the graph on every iteration, so there
    # is nothing to retain between steps (retain_graph is not needed).
    loss.backward()
    optimizer.step()

    if epoch % 500 == 0:
        print('epoch:{}, loss:{}'.format(epoch, loss.item()))

skill_emb = {}
# Learned problem embeddings as a NumPy array, persisted for the later cells.
emb_res = md.getemb(a)
joblib.dump(emb_res, 'emb_e2e.pkl.zip')

In [None]:
# ---- Attach the pre-trained embeddings to every interaction ----------------
# Row index of each saved array -> embedding as a plain Python list.
emb_res = joblib.load('emb_e2e.pkl.zip')
problem_emb = {row_idx: row.tolist() for row_idx, row in enumerate(emb_res)}
print(len(problem_emb))

emb_res = joblib.load('emb_c2c.pkl.zip')
skill_emb = {row_idx: row.tolist() for row_idx, row in enumerate(emb_res)}
print(len(skill_emb))

# Look up the embedding of every row's skill / problem code.
df['skill_emb'] = df['skill_cat'].apply(lambda code: skill_emb[code])
df['problem_emb'] = df['problem_cat'].apply(lambda code: problem_emb[code])

# studentId -> (skill codes, problem codes, correctness, skill embeddings,
# problem embeddings); this replaces the 3-field `user_sequence` built earlier.
sequence_columns = ['skill_cat', 'problem_cat', 'correct', 'skill_emb', 'problem_emb']
new_user_sequence = df[['studentId', 'skill_cat', 'problem_cat', 'correct', 'skill_emb', 'problem_emb']].groupby(['studentId']).apply(
                lambda rows: tuple(rows[col].values for col in sequence_columns))
user_sequence = new_user_sequence

In [None]:
# ---- Split students into train / validation / test -------------------------
# 80% of the students go to train+validation, the remaining 20% to test.
train_val_group_c = user_sequence.sample(frac=0.8, random_state=14)
test_group_c = user_sequence.loc[~user_sequence.index.isin(train_val_group_c.index)]

# 75% of train+validation becomes train, the rest validation.
train_group_c = train_val_group_c.sample(frac=0.75, random_state=14)
val_group_c = train_val_group_c.loc[~train_val_group_c.index.isin(train_group_c.index)]

In [None]:
class MyDataset(Dataset):
    """Next-step prediction samples built from per-user interaction sequences.

    Each entry of ``group`` is a 5-tuple
    ``(skill ids, problem ids, labels, skill embeddings, problem embeddings)``
    of equally long sequences. Sequences longer than ``max_seq`` are folded
    into consecutive chunks of ``max_seq`` items keyed ``"<user>_<chunk>"``;
    any piece (a whole sequence or the trailing remainder of a folded one)
    shorter than ``min_samples`` is discarded.

    Args:
        group: mapping-like object exposing ``.index`` (iterable of user keys)
            and ``group[user]`` returning the 5-tuple described above.
        min_samples: minimum length a stored piece must have.
        max_seq: chunk length; every returned array is padded to
            ``max_seq - 1`` steps.
        cold_start: accepted for call-site compatibility; not used.
    """

    def __init__(self, group, min_samples=3, max_seq=100, cold_start=10):
        self.max_seq = max_seq
        self.samples = {}
        self.user_ids = []
        for user_id in group.index:
            sids, pids, labels, s_emb, p_emb = group[user_id]
            fields = (sids, pids, labels, s_emb, p_emb)
            total_questions = len(labels)

            if total_questions > self.max_seq:  # Fold
                n_full, remainder = divmod(total_questions, self.max_seq)
                for seq in range(n_full):
                    start = seq * self.max_seq
                    self._add_sample(f"{user_id}_{seq}", fields, start, start + self.max_seq)
                if remainder >= min_samples:
                    self._add_sample(f"{user_id}_{n_full}", fields, n_full * self.max_seq, total_questions)
            elif total_questions >= min_samples:
                # Apply the same minimum-length rule to unfolded sequences
                # that the folded remainder is subject to.
                self._add_sample(str(user_id), fields, 0, total_questions)

    def _add_sample(self, key, fields, start, end):
        """Register ``fields[start:end]`` under ``key``."""
        self.user_ids.append(key)
        self.samples[key] = tuple(field[start:end] for field in fields)

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, index):
        """Return the padded (current step, next step) views of one sample.

        Returns:
            ``(now_s_emb, next_s_emb, now_y, next_y, now_p_emb, next_p_emb, mask)``
            where every array has ``max_seq - 1`` steps and ``mask`` is True
            for the first ``seq_len - 1`` steps, i.e. the steps whose
            next-step label comes from real data rather than padding. The
            stored skill / problem id sequences are not returned.
        """
        user_id = self.user_ids[index]
        sids, pids, labels, s_embs, p_embs = self.samples[user_id]
        seq_len = len(labels)

        tmp_y = self.init_sequence(labels)
        tmp_s_embs = self._pad_embeddings(s_embs)
        tmp_p_embs = self._pad_embeddings(p_embs)

        mask = np.concatenate((np.ones(seq_len - 1, dtype=bool), np.zeros(self.max_seq - seq_len, dtype=bool)))

        # "now" drops the last step, "next" drops the first: position t of
        # `now_*` is paired with position t + 1 of the original sequence.
        now_y, next_y = tmp_y[:-1], tmp_y[1:]
        now_s_emb, next_s_emb = tmp_s_embs[:-1], tmp_s_embs[1:]
        now_p_emb, next_p_emb = tmp_p_embs[:-1], tmp_p_embs[1:]

        return now_s_emb, next_s_emb, now_y, next_y, now_p_emb, next_p_emb, mask

    def _pad_embeddings(self, embs):
        """Stack a sequence of equal-length vectors into a zero-padded
        ``(max_seq, dim)`` float array."""
        padded = np.zeros((self.max_seq, len(embs[0])), dtype=float)
        padded[:len(embs)] = np.asarray(list(embs), dtype=float)
        return padded

    def init_sequence(self, target, dtype_=int):
        """Copy ``target`` into a zero-padded 1-D array of length ``max_seq``."""
        seq_len = len(target)
        tmp = np.zeros(self.max_seq, dtype=dtype_)
        tmp[:seq_len] = target
        return tmp

In [None]:
class BaseModel(nn.Module):
    """Next-response predictor that attends over the last ``kernel_num`` steps.

    For every step ``t`` the query is built from the *next* item
    (skill + problem embedding) and is scored against the keys of the
    history positions ``t, t-1, ..., t-(kernel_num-1)``. The softmax over
    those ``kernel_num`` scores weights per-position value projections
    (``head`` independent linear maps over slices of the label-expanded
    input), and the weighted values, concatenated with the next-item
    embedding, are mapped to the output logit.

    Args:
        s_emb_size, p_emb_size: accepted for call-site compatibility; unused.
        s_emb_dim, p_emb_dim: widths of the skill / problem embeddings fed
            to ``forward``.
        kernel_dim: width of query/key and of each per-position value;
            must be divisible by ``self.head`` (8).
        kernel_num: number of history positions attended to.
        output_dim: width of the final linear layer (squeezed when 1).
        seq_len, s_total, p_total: stored on the instance only.
        device: stored on the instance only; tensors created inside
            ``forward`` follow the device of the inputs.
    """

    def __init__(self, s_emb_size, p_emb_size, s_emb_dim, p_emb_dim, kernel_dim, kernel_num,
                 output_dim=1, seq_len = 99, s_total=111, p_total=15911, device=0):
        super().__init__()
        self.seq_len = seq_len
        self.s_total = s_total
        self.p_total = p_total

        self.bond_dim = kernel_num * kernel_dim
        # emb_extend doubles the feature width (correct half / incorrect half).
        self.kernel_indim = (p_emb_dim + s_emb_dim) * 2
        self.kernel_dim = kernel_dim
        self.kernel_num = kernel_num
        self.head = 8

        self.query_fc = nn.Linear((p_emb_dim + s_emb_dim), kernel_dim)
        self.key_fc = nn.Linear(self.kernel_indim, kernel_dim)

        # Each head projects one slice of the expanded input.
        self.head_dim = self.kernel_indim // self.head
        self.head_out = self.kernel_dim // self.head
        self.value_fc = nn.ModuleList([nn.Linear(in_features=self.head_dim, out_features=self.head_out, bias=True)
                                       for _ in range(self.head)])

        self.kernel_fc_indim = s_emb_dim + p_emb_dim + kernel_num * kernel_dim
        self.kernel_fc = nn.Linear(self.kernel_fc_indim, output_dim)
        self.device = device

    def forward(self, s_now, s_next, p_now, p_next, labels):
        """Compute one logit per step.

        Args:
            s_now, p_now: current-step skill / problem embeddings,
                ``(batch, seq, dim)``.
            s_next, p_next: next-step skill / problem embeddings, same layout.
            labels: current-step correctness, ``(batch, seq)``.

        Returns:
            Tensor of logits, ``(batch, seq)`` when ``output_dim == 1``.
        """
        batch_size, seq_len = s_now.size(0), s_now.size(1)
        input_concat = torch.cat([s_now, p_now], dim=-1).to(torch.float)
        next_concat = torch.cat([s_next, p_next], dim=-1).to(torch.float)

        query = self.query_fc(next_concat)
        input_concat = self.emb_extend(input_concat, labels).to(torch.float32)
        key = self.key_fc(input_concat)

        # score[b, i, j] = <query at step i, key at step j>
        score = torch.einsum('lik,ljk->lij', query, key)

        # For kernel i pick, at every step t, the score against position
        # t - lag (lag = kernel_num - 1 - i). Steps with fewer than `lag`
        # predecessors are masked out before the softmax.
        positions = torch.arange(seq_len, device=score.device)
        picked_scores = []
        for i in range(self.kernel_num):
            lag = self.kernel_num - i - 1
            index = (positions - lag).clamp(min=0).repeat(batch_size, 1).unsqueeze(-1)
            picked = torch.gather(score, -1, index)
            picked[:, :lag, :] = -1e10
            picked_scores.append(picked)

        score = F.softmax(torch.cat(picked_scores, dim=-1), dim=-1)
        # Broadcast each kernel weight over that kernel's kernel_dim features.
        score = score.unsqueeze(-1).repeat(1, 1, 1, self.kernel_dim).reshape(batch_size, seq_len, self.bond_dim)

        values = []
        for i in range(self.kernel_num):
            lag = self.kernel_num - i - 1
            # Shift the history right by `lag` steps, zero-filling the front.
            shifted = F.pad(input_concat, (0, 0, lag, 0))[:, :seq_len, :]
            heads = [fc(shifted[:, :, j * self.head_dim:(j + 1) * self.head_dim])
                     for j, fc in enumerate(self.value_fc)]
            values.append(torch.cat(heads, dim=-1))

        con_kernel = torch.cat(values, dim=-1)
        # Open question left by the author: sum pooling or concat here?
        con_kernel = con_kernel * score

        con_kernel = torch.tanh(con_kernel)
        out = torch.cat([con_kernel, next_concat], dim=-1).to(torch.float)

        return self.kernel_fc(out).squeeze(-1)

    def emb_extend(self, item_inputs, label_inputs):
        """Double the feature width according to the label.

        The first half of the result holds ``item_inputs * label`` and the
        second half ``item_inputs * (1 - label)``, so for 0/1 labels exactly
        one half carries the item vector and the other is zero.
        """
        label_inputs = label_inputs.unsqueeze(-1).float()
        return torch.cat([item_inputs * label_inputs, item_inputs * (1 - label_inputs)], dim=-1)

In [None]:
def process(eval, data_loader, model, optim, num, device):
    """Run one pass over ``data_loader`` and return ``(AUC, accuracy)``.

    Args:
        eval: ``'train'`` performs a gradient step per batch (BCE-with-logits
            loss, gradient norm clipped to 20); any other value evaluates
            without gradients.
        data_loader: iterable of batches whose first seven items are
            ``(now_s, next_s, now_y, next_y, now_p, next_p, mask)``.
        model: module called as ``model(now_s, next_s, now_p, next_p, now_y)``.
        optim: optimizer stepped in training mode (untouched otherwise).
        num: unused; kept so existing call sites keep working.
        device: target passed to ``Tensor.to`` for every batch tensor.

    Returns:
        ``(roc_auc_score, accuracy_score)`` over all masked positions, with
        accuracy computed on predictions thresholded at 0.5.
    """
    training = eval == 'train'
    if training:
        model.train()
    else:
        model.eval()
    loss_gate = nn.BCEWithLogitsLoss()

    prediction = []
    true = []
    binary = []
    for data in data_loader:
        input_s, next_s, label, y, input_p, next_p, mask = (data[k].to(device) for k in range(7))

        with torch.set_grad_enabled(training):
            output = model(input_s, next_s, input_p, next_p, label)
            # Keep only the positions that correspond to real (non-padded) steps.
            logits = torch.masked_select(output, mask)
            tensor_label = torch.masked_select(y, mask).to(torch.float)

            if training:
                optim.zero_grad()
                loss = loss_gate(logits, tensor_label)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 20)
                optim.step()

        probabilities = torch.sigmoid(logits).detach().cpu().numpy()
        prediction.extend(probabilities)
        true.extend(tensor_label.cpu().numpy())
        binary.extend(probabilities >= 0.5)

    return roc_auc_score(true, prediction), accuracy_score(true, binary)


In [None]:
# ---- Hyper-parameters ------------------------------------------------------
# Several of these (timy_learning_rate, grad_threshold, model_dropout,
# sequence_threshold, heads, embed_dimension, HIDDEN_DIM, HIDDEN_LAYER) are
# defined here but not referenced by the cells shown in this notebook.
num = 86

learning_rate = .1
timy_learning_rate = 1e-3
epsilon = .1
grad_threshold = 20
model_dropout = .6
epoch = 1000
epoch_test = 1          # evaluate every `epoch_test` training epochs
batch_size = 32
sequence_size = 100
sequence_threshold = 3
cold_start = 100

heads = 8
embed_dimension = 64

S_CAT_NUM = 86
P_CAT_NUM = 1183
S_EMBEDDING_SIZE = S_CAT_NUM
P_EMBEDDING_SIZE = P_CAT_NUM

HIDDEN_DIM = 64
HIDDEN_LAYER = 1
S_EMBEDDING_DIM = 64
P_EMBEDDING_DIM = 64
kernel_num = 16
kernel_dim = 64

DEVICE = 0

# ---- Data ------------------------------------------------------------------
train_dataset = MyDataset(train_group_c, max_seq=sequence_size, cold_start=cold_start)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# NOTE(review): the "valid" loader is built from test_group_c, and the running
# maxima below are therefore taken over test-set scores; val_group_c is never
# used. Confirm this is intended before reporting max_auc / max_acc.
valid_dataset = MyDataset(test_group_c, max_seq=sequence_size, cold_start=cold_start)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)


# ---- Model and optimiser ---------------------------------------------------
model = BaseModel(S_EMBEDDING_SIZE, P_EMBEDDING_SIZE, S_EMBEDDING_DIM , P_EMBEDDING_DIM, kernel_dim, kernel_num,
                  seq_len=sequence_size-1, s_total=S_CAT_NUM, p_total=P_CAT_NUM, device=DEVICE).cuda(DEVICE)

optim = torch.optim.Adam(model.parameters(), lr=learning_rate, eps=epsilon)

# ---- Training loop ---------------------------------------------------------
count_epoch_for_test = 0
max_auc = 0
max_acc = 0
for i in range(epoch):
    count_epoch_for_test += 1
    process(eval='train', data_loader = train_dataloader, num=num, model=model, optim=optim, device=DEVICE)

    if count_epoch_for_test == epoch_test:
        auc, acc = process(eval='test', data_loader = valid_dataloader, num = num, model=model, optim=optim, device=DEVICE)
        max_auc = max(max_auc, auc)
        max_acc = max(max_acc, acc)
        # All four metrics are printed with five decimals ("{:5f}" in the
        # earlier version set a field width, not a precision).
        print("epoch - {}/{} auc - {:.5f} acc - {:.5f} max - {:.5f}/ {:.5f}".format(i+1, epoch, auc, acc, max_auc, max_acc))
        count_epoch_for_test = 0