In [None]:
import pandas as pd
import joblib
import numpy as np

from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
import torch_geometric.nn as pyg_nn
import scipy.sparse as sp

# Largest startTime gap at which two interactions of one student still count
# as co-occurring (compared against raw startTime differences in
# co_acc_skills / co_acc_problems). Presumably seconds, i.e. one hour --
# TODO confirm the unit of the startTime column.
WINDOW = 60 * 60
# Output path for the skill embeddings written with joblib.dump.
c2c_emb_file = '2017_CL_c2c.pkl.zip'
# Output path handed to Egae_encode as emb_save_dir.
e2e_emb_file = '2017_CL_e2e.pkl.zip'
# Checkpoint file that Egae_encode saves to and reloads from.
model_temp_file = 'model_temp1.pkl'

# Device index passed to the .cuda(DEVICE) calls in this notebook.
DEVICE = 0

# Embedding width: used for MDPreTrain and for the GRACE output / hidden /
# projection sizes.
dimensionK = 64

In [None]:
# ---------------------------------------------------------------------------
# Load the raw interaction log and keep only the columns listed in read_col.
# ---------------------------------------------------------------------------
read_col = ['studentId', 'skill', 'problemId', 'startTime', 'correct', 'original', 'attemptCount']
target = 'correct'
df = pd.read_csv('jxy/KT/NoteBook/ADM2017/assisment_2017_raw.csv', low_memory=False, encoding="ISO-8859-1")[read_col]
print('original df length is %d' % len(df))


# Keep only rows with original == 1 (the log line below calls the dropped
# rows "scaffolding problems").
df = df[df['original'].isin([1])]
print('After removing scaffolding problems, records number %d' % len(df))

# Chronological order is required later: the co-occurrence loops stop
# scanning a student's sequence at the first pair outside WINDOW.
df = df.sort_values('startTime')

# Drop students with fewer than `min_inter_num` interactions.
# The group key is passed as a scalar and the counts are taken with .size():
# iterating a groupby built from a one-element list yields 1-tuple keys on
# pandas >= 2.0, and those tuples never match inside isin(), so the filter
# would silently remove nobody.
min_inter_num = 3
users = df.groupby('studentId', as_index=True)
inter_counts = users.size()
delete_users = inter_counts[inter_counts < min_inter_num].index.tolist()

print('deleted user number based min-inters %d' % len(delete_users))
df = df[~df['studentId'].isin(delete_users)]
# Explicit copy: new columns are added below, and assigning onto a filtered
# slice triggers SettingWithCopyWarning / may not stick.
df = df[[ 'studentId', 'skill', 'problemId', 'correct', 'startTime']].copy()
print('After deleting some users, records number %d' % len(df))


# Vocabulary of problems and skills, in order of first appearance.
problems_list = df['problemId'].drop_duplicates().tolist()
skills_list = df['skill'].drop_duplicates().tolist()
print(len(problems_list))
print(len(skills_list))

# index -> raw id, and raw id -> index.
problems_dict = dict(enumerate(problems_list))
skills_dict = dict(enumerate(skills_list))
problems_re_dict = {problem: idx for idx, problem in enumerate(problems_list)}
skills_re_dict = {skill: idx for idx, skill in enumerate(skills_list)}

# Dense integer codes used as matrix indices downstream. astype raises if any
# value failed to map (map() would otherwise leave NaN behind).
df['skill_cat'] = df['skill'].map(skills_re_dict).astype('int64')
df['problem_cat'] = df['problemId'].map(problems_re_dict).astype('int64')

# skill index -> array of the distinct problem indices observed with it.
skill_problem = df[['skill_cat', 'problem_cat']].groupby(['skill_cat'], as_index=True).apply(lambda r: np.array(list(set(r['problem_cat'].values))))
skill_prob_dict = {skill_idx: skill_problem[skill_idx] for skill_idx in skill_problem.index}


# Per student: (skill indices, problem indices, correctness, start times),
# each in the (time-sorted) row order of df.
user_sequence = df[['studentId', 'correct', 'skill_cat', 'problem_cat', 'startTime']].groupby(['studentId']).apply(
                lambda r: (r['skill_cat'].values, r['problem_cat'].values, r['correct'].values, r['startTime'].values))


In [None]:
def co_acc_skills(user_skill, matrix_agg, matrix_cnt, skill_dict):
    """Accumulate skill-to-skill co-occurrence statistics in place.

    For every student and every ordered pair of interactions (i, j) with
    i < j whose time difference is at most the module-level WINDOW, adds 1 to
    ``matrix_cnt[skill_i][skill_j]`` and ``correct_i * correct_j`` to
    ``matrix_agg[skill_i][skill_j]``.

    Args:
        user_skill: mapping with an ``.index`` of student ids whose values
            are indexable records; element 0 holds skill indices, element 2
            correctness values and element 3 timestamps. The inner scan
            stops at the first pair outside WINDOW, so timestamps are
            expected to be ascending.
        matrix_agg: square matrix receiving the correctness products.
        matrix_cnt: square matrix receiving the pair counts.
        skill_dict: unused; kept for interface compatibility.

    Returns:
        The same ``(matrix_agg, matrix_cnt)`` objects, updated in place.
    """
    for user in tqdm(user_skill.index):
        record = user_skill[user]
        skills = record[0]
        correct = record[2]
        # Read timestamps from the argument, not from the notebook-level
        # `user_sequence`, so the function works on whatever it is given.
        tsp = record[3]
        n_events = len(skills)
        for i in range(0, n_events - 1):
            for j in range(i + 1, n_events):
                if tsp[j] - tsp[i] > WINDOW:
                    break
                matrix_cnt[skills[i]][skills[j]] += 1
                matrix_agg[skills[i]][skills[j]] += correct[i] * correct[j]

    return matrix_agg, matrix_cnt

# Skill-level co-occurrence: count pairs and correctness products over all
# students, then turn them into a ratio matrix.
skill_mats, skill_key, skill_dict = [], [], {}
print('processing skill co-currence')

skills = skills_re_dict.keys()
mat_length = len(skills)
print(mat_length)

# Both accumulators are square, one row/column per skill index.
matrix_agg = np.zeros((mat_length, mat_length))
matrix_cnt = np.zeros_like(matrix_agg)
agg, cnt = co_acc_skills(user_sequence, matrix_agg, matrix_cnt, skills_re_dict)

# The small constant keeps never-observed pairs at 0 instead of dividing by 0.
res = np.divide(agg, cnt + 1e-8)

In [None]:
# Average `res` with its transpose so the matrix is symmetric.
# NOTE: `res` is re-assigned by the problem co-occurrence cell further down,
# so this line must run before that cell. `mat` is the regression target for
# MDPreTrain below and is later passed to Egae_encode as C_mat.
mat = (res.transpose(1, 0) + res) / 2

class MDPreTrain(nn.Module):
    """Embedding table whose pairwise inner products are the model output.

    Args:
        a_nums: number of rows in the embedding table.
        k: embedding dimension.
    """

    def __init__(self, a_nums, k):
        super().__init__()
        self.embed_a = nn.Embedding(a_nums, k)

    def forward(self, a):
        """Return the matrix of inner products between the embeddings of `a`."""
        vectors = self.embed_a(a)
        return vectors @ vectors.T

    def getemb(self, a):
        """Return the embeddings of `a` as a NumPy array, detached and on CPU."""
        looked_up = self.embed_a(a)
        return looked_up.detach().cpu().numpy()

# Index tensor 0..n-1: one embedding row per row of `mat`.
a = torch.arange(mat.shape[0])

# Regression target: the symmetric matrix as float32.
ajaMat = torch.tensor(mat).float()
md = MDPreTrain(mat.shape[0], dimensionK)

# Optimizer
optimizer = torch.optim.Adam(md.parameters(), lr=1e-2)
# Loss function
loss_func = torch.nn.MSELoss()

for epoch in tqdm(range(0,10001)):
    prediction = md(a)
    loss = loss_func(prediction, ajaMat)
    loss_value = loss.item()
    if loss_value < 1e-3:
        # The message says "loss is small enough to stop early".
        print('loss小到可以提前停止了 {}'.format(loss_value))
        break
    optimizer.zero_grad()
    # The graph is rebuilt by the forward pass of every iteration, so there
    # is nothing to retain between backward calls.
    loss.backward()
    optimizer.step()

    if epoch % 500 == 0:
        print('epoch:{}, loss:{}'.format(epoch, loss_value))

skill_emb = {}
emb_res = md.getemb(a)

# Persist the learned embeddings (one row per index in `a`).
print('loading in file:'+c2c_emb_file)
joblib.dump(emb_res, c2c_emb_file)

In [None]:
def co_acc_problems(user_skill, matrix_agg, matrix_cnt, skill_dict):
    """Accumulate problem-to-problem co-occurrence statistics in place.

    For each student, every ordered pair of interactions (earlier, later)
    is visited until the first pair whose time difference exceeds the
    module-level WINDOW. Each visited pair adds 1 to ``matrix_cnt`` and the
    product of the two correctness values to ``matrix_agg`` at
    ``[problem_earlier][problem_later]``.

    Args:
        user_skill: mapping with an ``.index`` of student ids whose values
            are indexable records; element 1 holds problem indices, element 2
            correctness values and element 3 timestamps.
        matrix_agg: square matrix receiving the correctness products.
        matrix_cnt: square matrix receiving the pair counts.
        skill_dict: unused.

    Returns:
        The same ``(matrix_agg, matrix_cnt)`` objects, updated in place.
    """
    for student in tqdm(user_skill.index):
        record = user_skill[student]
        item_ids, outcomes, stamps = record[1], record[2], record[3]
        n_events = len(item_ids)
        for first in range(n_events - 1):
            first_id = item_ids[first]
            later = first + 1
            # Walk forward until the sequence ends or the gap exceeds WINDOW.
            while later < n_events and not (stamps[later] - stamps[first] > WINDOW):
                second_id = item_ids[later]
                matrix_cnt[first_id][second_id] += 1
                matrix_agg[first_id][second_id] += outcomes[first] * outcomes[later]
                later += 1
    return matrix_agg, matrix_cnt


# Problem-level co-occurrence: same procedure as for skills, indexed by
# problem. Note that this re-binds mat_length, matrix_agg, matrix_cnt, agg,
# cnt and res.
print('processing problem co-currence')
mat_length = len(problems_list)

# One row/column per problem index.
matrix_agg = np.zeros((mat_length, mat_length))
matrix_cnt = np.zeros_like(matrix_agg)

agg, cnt = co_acc_problems(user_sequence, matrix_agg, matrix_cnt, problems_re_dict)
# The small constant keeps never-observed pairs at 0 instead of dividing by 0.
res = np.divide(agg, cnt + 1e-8)

In [None]:
# Cut the global problem-by-problem matrix `res` into one square block per
# skill, containing only the problems listed for that skill (rows and columns
# follow the order of skill_prob_dict[skill]).
res_list = []
for skill in skill_prob_dict.keys():
    prob_list = skill_prob_dict[skill]
    # np.ix_ selects the full cross product rows x columns in one indexing
    # operation; the result is a fresh float array, independent of `res`.
    new_res = res[np.ix_(prob_list, prob_list)].astype(float)
    res_list.append(new_res)

In [None]:
def get_all_nodes_attributes_e(i,concept_length,total_length):
    """Build the integer node-attribute matrix for one concept.

    Each of the ``total_length`` rows is the concatenation of a one-hot
    vector of width ``concept_length`` marking column ``i`` (identical for
    all rows) and a one-hot vector of width ``total_length`` marking the
    row's own position.

    Returns:
        Array of shape ``(total_length, concept_length + total_length)``.
    """
    concept_block = np.zeros((total_length, concept_length), int)
    concept_block[:, i] = 1
    identity_block = np.eye(total_length, dtype=int)
    return np.hstack((concept_block, identity_block))

def get_nodes_weights_e(sub_mat):
    """Derive thresholded edge weights and negative pairs from a square matrix.

    The matrix is averaged with its transpose and its diagonal is cleared.
    Off-diagonal entries that are exactly zero at that point are marked as
    negative pairs. The remaining weights are then thresholded at the mean of
    the non-zero entries (threshold 0 when there are none); entries below the
    threshold are set to zero.

    Returns:
        ``(edges_org, weights, edges_neg)``: the thresholded weights as a
        scipy COO matrix, the same weights as a dense array, and the negative
        pair indicator as a scipy COO matrix.
    """
    symmetric = (sub_mat + sub_mat.transpose()) / 2
    np.fill_diagonal(symmetric, 0)

    # Negative pairs are taken before thresholding, so pairs removed by the
    # threshold are neither positive nor negative.
    negatives = np.where(symmetric == 0, 1, 0)
    np.fill_diagonal(negatives, 0)

    nonzero_total = np.count_nonzero(symmetric)
    threshold = 0 if nonzero_total == 0 else np.sum(symmetric) / nonzero_total
    kept = np.where(symmetric >= threshold, symmetric, 0)

    return sp.coo_matrix(kept), kept, sp.coo_matrix(negatives)

def normalize_adj(adj, self_loop=True):
    """Symmetrically normalise an adjacency matrix by its row sums.

    Computes ``D^-1/2 * A^T * D^-1/2`` where ``D`` is the diagonal matrix of
    row sums of ``A``; rows that sum to zero get a scaling factor of 0.

    Args:
        adj: adjacency matrix. With ``self_loop=True`` it must be something
            ``np.array`` can turn into a dense 2-D array.
        self_loop: when true, the diagonal is set to 1 before normalising.
            This is done on a copy, so the caller's array is left untouched.

    Returns:
        The normalised matrix as a scipy COO matrix.
    """
    if self_loop:
        # Work on a copy: fill_diagonal writes in place and would otherwise
        # modify the array owned by the caller.
        adj = np.array(adj)
        np.fill_diagonal(adj, 1)
    adj = sp.coo_matrix(adj)
    rowsum = np.array(adj.sum(1))
    # Zero row sums produce inf here; they are reset to 0 on the next line,
    # so the divide-by-zero warning carries no information.
    with np.errstate(divide='ignore'):
        d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.

    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()

def e2edataset(mat):
    """Turn a list of square weight matrices into graph objects.

    Every matrix in ``mat`` becomes one ``Data`` graph. Node features come
    from ``get_all_nodes_attributes_e`` (built for the largest matrix and cut
    to this matrix's node count), edges and negative pairs come from
    ``get_nodes_weights_e``, and the dense weights are passed through
    ``normalize_adj``.

    Returns:
        ``(graph_list, adj_list, neg_edge_list)``: three lists aligned with
        ``mat`` holding the ``Data`` objects, the normalised adjacency
        matrices and the ``(2, n_pairs)`` long tensors of negative pairs.
    """
    def _index_pairs(coo):
        # Stack the row and column indices of a COO matrix into a 2 x nnz long tensor.
        return torch.stack((torch.tensor(coo.row), torch.tensor(coo.col))).long()

    concept_length = len(mat)
    # Feature width is driven by the largest matrix in the list.
    attr_length = max((len(block) for block in mat), default=0)

    graph_list, adj_list, neg_edge_list = [], [], []
    for concept_idx, sub_mat in enumerate(mat):
        node_attrs = get_all_nodes_attributes_e(concept_idx, concept_length, attr_length)
        edges_org, dense_weights, edges_neg = get_nodes_weights_e(sub_mat)
        normalised = normalize_adj(dense_weights)

        edge_index = _index_pairs(edges_org)
        edge_attr = torch.tensor(edges_org.data).float()
        neg_edge_index = _index_pairs(edges_neg)

        # Only the first n rows are needed, n being this matrix's node count.
        x = torch.tensor(node_attrs[:sub_mat.shape[0]]).float()

        graph_list.append(Data(x = x, edge_index = edge_index, edge_attr = edge_attr))
        adj_list.append(normalised)
        neg_edge_list.append(neg_edge_index)
    return graph_list, adj_list, neg_edge_list

# Build one graph per entry of res_list (the per-skill problem matrices).
# NOTE(review): the log line below says "C2C with 0 threshold", but the input
# is problem-level and get_nodes_weights_e thresholds at the mean of the
# non-zero weights; the message is left unchanged.
print("\nGenerate C2C with 0 threshold...")
Egraph_list, adj_list, neg_edge_list = e2edataset(res_list)

In [None]:
class GCNLayer(nn.Module):
    """Single graph-convolution layer: ``adj @ (feat @ W^T) [+ bias]``.

    Args:
        in_ft: input feature size.
        out_ft: output feature size.
        bias: when true (default) a zero-initialised bias vector is learned
            and added to the output; when false no bias parameter exists.
    """

    def __init__(self, in_ft, out_ft, bias=True):
        super(GCNLayer, self).__init__()
        self.fc = nn.Linear(in_ft, out_ft, bias=False)
        if bias:
            self.bias = nn.Parameter(torch.zeros(out_ft))
        else:
            # Registers the name so `self.bias` resolves to None and the
            # check in forward() stays valid.
            self.register_parameter('bias', None)
        for m in self.modules():
            self.weights_init(m)

    def weights_init(self, m):
        """Xavier-initialise Linear weights and zero their bias if present."""
        if isinstance(m, nn.Linear):
            torch.nn.init.xavier_uniform_(m.weight.data)
            if m.bias is not None:
                m.bias.data.fill_(0.0)

    def forward(self, feat, adj):
        """Project `feat`, then aggregate with the dense matrix `adj`."""
        feat = self.fc(feat)
        out = torch.mm(adj, feat)
        if self.bias is not None:
            out = out + self.bias
        return out

class Encoder(nn.Module):
    """Two stacked GCNLayer modules with a ReLU after the first.

    Args:
        in_channels: input feature size.
        hidden_channels: output size of the first layer.
        out_channels: output size of the second layer.
    """

    def __init__(self, in_channels: int, hidden_channels, out_channels: int):
        super().__init__()
        self.GCN1 = GCNLayer(in_channels, hidden_channels)
        self.GCN2 = GCNLayer(hidden_channels, out_channels)

    def _propagate(self, x, adj):
        # Shared forward pass: returns (activated first-layer output, second-layer output).
        hidden = torch.relu(self.GCN1(x, adj))
        return hidden, self.GCN2(hidden, adj)

    def forward(self, x, adj):
        """Return graph-level vectors obtained by summing over nodes.

        Returns:
            ``(con_sum_pooling, final_sum_pooling)``: the node-sum of both
            layers' outputs concatenated, and the node-sum of the second
            layer's output alone.
        """
        hidden, output = self._propagate(x, adj)
        final_sum_pooling = output.sum(dim=0)
        con_sum_pooling = torch.cat((hidden, output), dim=-1).sum(dim=0)
        return con_sum_pooling, final_sum_pooling

    def emb_encode(self, x, adj):
        """Return the per-node output of the second layer (no pooling)."""
        return self._propagate(x, adj)[1]

class GRACE(nn.Module):
    """Encoder plus a three-layer projection head, with a contrastive loss.

    Args:
        in_channels: input feature size of the encoder.
        out_channels: hidden and output size of the encoder.
        num_proj_hidden: width of the projection head's hidden layers.
        tau: temperature dividing the cosine similarities in `loss`.
    """

    def __init__(self, in_channels, out_channels,  num_proj_hidden: int, tau: float = 0.5):
        super().__init__()
        self.encoder = Encoder(in_channels, out_channels, out_channels)
        self.tau: float = tau

        self.num_proj_hidden = num_proj_hidden
        self.num_hidden = out_channels

        # The head ends in a fixed 10-dimensional output.
        projection_layers = [
            nn.Linear(out_channels, num_proj_hidden),
            nn.ReLU(inplace=True),
            nn.Linear(num_proj_hidden, num_proj_hidden),
            nn.ReLU(inplace=True),
            nn.Linear(num_proj_hidden, 10),
        ]
        self.proj_head = nn.Sequential(*projection_layers)

    def forward(self, attr, adj) -> torch.Tensor:
        """Project the encoder's second return value through the head."""
        _, pooled = self.encoder(attr, adj)
        return self.proj_head(pooled)

    def loss(self, x, x_aug):
        """Contrastive loss between two aligned batches of row vectors.

        For row i the positive score is ``exp(cos(x_i, x_aug_i) / tau)`` and
        the denominator is the sum of ``exp(cos(x_i, x_aug_j) / tau)`` over
        all j != i. The result is the mean of ``-log(positive / denominator)``.
        """
        batch_size, _ = x.size()
        norms = x.norm(dim=1)
        aug_norms = x_aug.norm(dim=1)
        cosine = torch.einsum('ik,jk->ij', x, x_aug) / torch.einsum('i,j->ij', norms, aug_norms)
        scores = torch.exp(cosine / self.tau)
        diag = torch.arange(batch_size)
        positives = scores[diag, diag]
        ratio = positives / (scores.sum(dim=1) - positives)
        return -torch.log(ratio).mean()


    def emb_encoder(self, attr, adj) -> torch.Tensor:
        """Return the encoder's per-node embeddings (``Encoder.emb_encode``)."""
        return self.encoder.emb_encode(attr, adj)


def gen_ran_output(attr, adj, model, vice_model):
    """Run `vice_model` after loading it with a noisy copy of `model`'s weights.

    Parameters are paired by position (both models are expected to have the
    same architecture, hence the same parameter order). Parameters whose
    top-level name is ``proj_head`` or ``proj_head2`` are shared unchanged;
    every other parameter receives Gaussian noise whose standard deviation
    equals the standard deviation of that parameter's own values.

    Args:
        attr: node features forwarded to ``vice_model``.
        adj: adjacency input forwarded to ``vice_model``.
        model: source of the weights.
        vice_model: model whose parameter data is overwritten.

    Returns:
        ``vice_model(attr, adj)``.
    """
    for (adv_name, adv_param), (name, param) in zip(vice_model.named_parameters(), model.named_parameters()):
        if name.split('.')[0] in ('proj_head', 'proj_head2'):
            adv_param.data = param.data
        else:
            weights = param.data
            # std() of a single element is NaN, so such parameters get no noise.
            scale = weights.std() if weights.numel() > 1 else weights.new_zeros(())
            # randn_like samples on the parameter's own device and dtype, so
            # this works on CPU or any GPU instead of forcing cuda:DEVICE.
            adv_param.data = weights + 1.0 * torch.randn_like(weights) * scale

    return vice_model(attr, adj)



In [None]:
# Number of training epochs run by Egae_encode.
GCA_NUM_EPOCHS = 10001
# Dataset-specific sizes. NOTE(review): presumably 86 = number of skills,
# 1183 = number of problems and 429 = size of the largest per-skill problem
# set after the filtering above -- confirm whenever the data changes.
C_NUM = 86
P_NUM = 1183
in_channes = C_NUM + 429
out_channels = dimensionK
hidden_dim = dimensionK
lr =.01
proj_hidden = dimensionK

# Fail fast when the hard-coded sizes do not fit the graphs built above.
# Either mismatch would otherwise only surface inside Egae_encode, as a shape
# error in the first GCN layer or an index error when filling the embedding
# table.
assert len(Egraph_list) > 0, 'Egraph_list is empty'
assert in_channes == Egraph_list[0].x.shape[1], (
    'in_channes=%d does not match the node feature width %d'
    % (in_channes, Egraph_list[0].x.shape[1]))
assert P_NUM >= len(problems_list), (
    'P_NUM=%d is smaller than the number of problems %d'
    % (P_NUM, len(problems_list)))

def Egae_encode(graph, neg_edge,C_mat,emb_save_dir):
    """Train a GRACE model on the given graphs and save per-problem embeddings.

    Each epoch encodes every graph twice: once with ``grace_model`` and once
    with a weight-perturbed copy (``gen_ran_output``). It then minimises the
    MSE between the Gram matrix of the unperturbed graph vectors and
    ``C_mat``, plus ``GRACE.loss`` between the two sets of graph vectors.
    The weights with the lowest loss seen are checkpointed to
    ``model_temp_file``, reloaded after training and used to compute node
    embeddings. These are accumulated per problem index, averaged where a
    problem occurs in several graphs, and written with ``joblib.dump``.

    Besides its arguments the function reads these notebook-level names:
    ``in_channes``, ``out_channels``, ``proj_hidden``, ``lr``, ``DEVICE``,
    ``GCA_NUM_EPOCHS``, ``P_NUM``, ``model_temp_file``, ``adj_list`` (aligned
    with ``graph``) and ``skill_prob_dict`` (graph position -> problem
    indices of its nodes).

    Args:
        graph: list of graph objects exposing ``.x`` and ``.cuda()``.
        neg_edge: unused.
        C_mat: square target matrix with one row per graph.
        emb_save_dir: output path for the embedding tensor.
    """
    vice_model = GRACE(in_channes,out_channels,proj_hidden).cuda(DEVICE)
    grace_model = GRACE(in_channes,out_channels,proj_hidden).cuda(DEVICE)
    optimizer = torch.optim.Adam(grace_model.parameters(), lr=lr)
    loss_func = torch.nn.MSELoss()
    target_Mat = torch.tensor(C_mat).to(torch.float32).cuda(DEVICE)

    # Move the graphs and densified adjacency matrices to the device once,
    # instead of repeating the conversion for every graph on every epoch.
    graphs_dev = [g.cuda(DEVICE) for g in graph]
    adjs_dev = [torch.tensor(adj_list[i].toarray(), dtype=torch.float32).cuda(DEVICE)
                for i in range(len(graph))]

    loss_min = 100000
    epoch_min = 0
    print('begin')
    grace_model.train()
    vice_model.train()
    for epoch in tqdm(range(1, GCA_NUM_EPOCHS + 1)):
        optimizer.zero_grad()
        raw_proj_rep = []
        ptb_proj_rep = []
        for data, adj in zip(graphs_dev, adjs_dev):
            subg_r1 = gen_ran_output(data.x, adj, grace_model, vice_model)
            subg_r2 = grace_model(data.x, adj)
            raw_proj_rep.append(subg_r2.unsqueeze(0))
            # Only grace_model is optimised and vice_model owns separate
            # Parameter objects, so the perturbed view acts as a constant;
            # detaching skips a backward pass through vice_model.
            ptb_proj_rep.append(subg_r1.detach().unsqueeze(0))

        raw_proj_rep = torch.concat(raw_proj_rep,dim=0)
        ptb_proj_rep = torch.concat(ptb_proj_rep,dim=0)

        recon = torch.matmul(raw_proj_rep,raw_proj_rep.transpose(1,0))
        loss1 = loss_func(recon, target_Mat)
        loss2 = grace_model.loss(raw_proj_rep, ptb_proj_rep)
        loss = loss1 + loss2

        loss.backward()
        optimizer.step()
        epoch_loss = loss.item()

        # Note: the checkpoint holds the weights after this step, while
        # epoch_loss was measured before it.
        if epoch_loss < loss_min:
            loss_min = epoch_loss
            torch.save(grace_model.state_dict(), model_temp_file)
            epoch_min = epoch
        if epoch % 500 == 0:
            print('Epoch={:03d}, loss={:.4f}'.format(epoch, epoch_loss))


    print('(Train) | Epoch={:03d}, loss={:.4f}'.format(epoch_min, loss_min))
    grace_model.load_state_dict(torch.load(model_temp_file))
    grace_model.eval()
    n2n_emb = torch.zeros(P_NUM, out_channels)
    countP = torch.zeros(P_NUM)
    with torch.no_grad():
        for i, (data, adj) in enumerate(zip(graphs_dev, adjs_dev)):
            prob_list = skill_prob_dict[i]
            z = grace_model.emb_encoder(data.x, adj).cpu()
            for node in range(z.shape[0]):
                n2n_emb[prob_list[node]] += z[node]
                countP[prob_list[node]] += 1

    # Average the embeddings of problems that appear in more than one graph.
    shared = countP > 1
    n2n_emb[shared] = n2n_emb[shared] / countP[shared].unsqueeze(1)

    n2n_emb = n2n_emb.cpu().detach()
    joblib.dump(n2n_emb, emb_save_dir)


# Train on the per-skill problem graphs and write the problem embeddings to
# e2e_emb_file. `mat` is the symmetrised skill matrix computed in the skill
# embedding cell; it is used as the target for the graph-level Gram matrix.
Egae_encode(Egraph_list, neg_edge_list , mat, emb_save_dir=e2e_emb_file)