In [1]:
import numpy as np
import pandas as pd
import torch
import csv, random
import torch.nn.init as init
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import RNN, LSTM, Sigmoid, Linear, Dropout, Module
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence, pack_sequence
from sklearn.metrics import roc_auc_score, accuracy_score
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.autograd import Variable
from sklearn.externals import joblib
import time, math
import json
from datetime import datetime

In [2]:
def read_data(path):
    """
    从文件中读取数据
    :param path: str 文件路径
    :return: int 知识点数量, sequence 原始数据序列
    """
    with open(path, 'r', encoding='UTF-8') as f:
        skill_dt = {}
        skill2name= {}
        seq_dt = {}
        readlines = csv.reader(f)
        for i, readline in enumerate(readlines):
            if i == 0:
                continue
            skill_id = readline[1]
            skill_name = readline[2]
            user_id = readline[0]
            correct = int(readline[3])
            if skill_id == '' or user_id == '' or correct == '':
                continue
            if skill_id not in skill2name:    
                skill2name[skill_id] = skill_name
            if skill_id not in skill_dt:
                skill_dt[skill_id] = len(skill_dt)
            if user_id in seq_dt:
                seq_dt[user_id].append((skill_dt[(skill_id)], correct))
            else:
                seq_dt[user_id] = [(skill_dt[(skill_id)], correct)]
        seq_list = list(seq_dt.values())
        return skill_dt, seq_list, skill2name


def split_dataset(seqs, val_rate=0.2, test_rate=0.1):
    """
    将数据拆分为训练集，验证集和测试集
    :param seqs: sequence 原始数据序列
    :param val_rate: float 验证集占总体数据比
    :param test_rate: float 测试集占总体数据比
    :return:
    """
    random.shuffle(seqs)
    seq_size = len(seqs)
    val_size = int(seq_size * val_rate)
    test_size = int(seq_size * test_rate)
    val_seqs = seqs[:val_size]
    test_seqs = seqs[val_size:val_size + test_size]
    train_seqs = seqs[val_size + test_size:]
    return train_seqs, val_seqs, test_seqs


class QuizDataSet(Dataset):

    def __len__(self):
        return len(self._seq_list)

    def __getitem__(self, index):
        feature_size = self._skill_size * 2
        seq = self._seq_list[index]
        seq_size = len(seq)
        # 包含初始状态x_{0}
        x = torch.zeros(seq_size, feature_size)
        # delta(t + 1)
        y_skill = torch.zeros(seq_size, self._skill_size)
        # a_{t+1}
        y_ans = torch.zeros(seq_size)
        for i, v in enumerate(seq):
            if i < seq_size - 1:
                new_feature_v = torch.zeros(feature_size)
                # skill_id * 2 + correct
                idx = int(v[0] * 2 + v[1])
                new_feature_v[idx] = 1
                x[i + 1] = new_feature_v
            new_skill_v = torch.zeros(self._skill_size)
            new_skill_v[v[0]] = 1
            y_skill[i] = new_skill_v
            y_ans[i] = v[1]
        return x, y_skill, y_ans

    def __init__(self, skill_dt, seq_list):
        self._skill_size = len(skill_dt)
        self._seq_list = seq_list


def collate(batch):
    # 压紧序列前需要先排序
    batch.sort(key=lambda x: x[0].size()[0], reverse=True)
    y_skill = []
    y_ans = []
    x = []
    for each_x, each_y, each_ans in batch:
        x.append(each_x)
        y_skill.append(each_y)
        y_ans.append(each_ans)
    # 压紧序列数据
    x_batch = pack_sequence(x)
    # delta(t+1)和a_{t+1}不用保留序列结构信息
    y_skill_batch = pack_sequence(y_skill).data
    y_ans_batch = pack_sequence(y_ans).data
#     y_ans_batch = torch.Tensor(y_ans)
    return x_batch, y_skill_batch, y_ans_batch


class DktNet(Module):
    """
    deep knowledge tracing model
    input => rnn => dropout => sigmoid => output
    """

    def __init__(self, skill_size, rnn_h_size, rnn_layer_size, dropout_rate):
        """
        :param skill_size: int 知识点数量
        :param rnn_h_size: int rnn隐藏单元数量
        :param rnn_layer_size: int rnn隐藏层数量
        :param dropout_rate: float
        """
        super(DktNet, self).__init__()
        self.hidden_size = rnn_h_size
        self.rnn_layer_size = rnn_layer_size
        self.rnn = nn.LSTM(skill_size * 2, rnn_h_size, rnn_layer_size, dropout=dropout_rate,bias=True)
        self.dropout = Dropout(p=dropout_rate)
        self.encoder = nn.Linear(skill_size * 2, skill_size * 2)
        self.linear = nn.Sequential(
                nn.Linear(rnn_h_size, rnn_h_size),
                nn.ReLU(),
                nn.Linear(rnn_h_size, skill_size)
                )
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax()
        self.weight_init(self.encoder)
        self.weight_init(self.linear)
        self.weight_init(self.rnn)
    def weight_init(self, m):
        if isinstance(m, nn.LSTM):
            for param in m.parameters():
                if len(param.shape) >= 2:
                    init.orthogonal_(param.data)
                else:
                    init.normal_(param.data)
        elif isinstance(m, nn.Linear):
            init.xavier_normal_(m.weight.data)
            init.normal_(m.bias.data)
    def init_hidden(self, batch_size):
        hidden_p = Variable(torch.zeros(self.rnn_layer_size * 1, batch_size, self.hidden_size)).cuda()
        hidden_h = Variable(torch.zeros(self.rnn_layer_size * 1, batch_size, self.hidden_size)).cuda()
        return (hidden_p, hidden_h)
    
    def forward(self, x, hidden):
#         encoder = self.encoder(x)
        rnn_output, hidden = self.rnn(x, hidden)
        # rnn_output无需保留序列结构数据，直接用rnn_output.data
        dropout_output = self.dropout(rnn_output.data)
        output = self.linear(dropout_output)
        output = self.sigmoid(output)
        return output, hidden


def compute_auc(data_loader, model):
    """
    计算验证集和测试集的auc值
    """
    y_pred = torch.tensor([])
    y_ans = torch.tensor([])
    for x_batch, y_skill_batch, y_ans_batch in data_loader:
        hidden = model.init_hidden(BATCH_SIZE)
        skill_pred, _ = model(x_batch.cuda(), hidden)
        _y_pred = (skill_pred.data * y_skill_batch.cuda()).sum(dim=1)        
#         rounded_preds = torch.round(F.sigmoid(skill_pred.squeeze(1)))
#         pred, y_pred_labels = torch.max(skill_pred, dim=-1, keepdim=True)
        y_pred = torch.cat((y_pred, _y_pred.float().cpu()))
        y_ans = torch.cat((y_ans, y_ans_batch))
    return roc_auc_score(y_ans, y_pred)

def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(F.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [3]:
# rnn隐藏单元数量
HIDDEN_SIZE = 64
# rnn隐藏层数量
HIDDEN_LAYER_SIZE = 1
DROPOUT_RATE = 0.6
EPOCHS = 10
BATCH_SIZE = 16

skill_dt, seq_list, skill2name = read_data('./data/skill_builder_filter.csv')
train_seqs, val_seqs, test_seqs = split_dataset(seq_list)
train_dataset = QuizDataSet(skill_dt, train_seqs)
val_dataset = QuizDataSet(skill_dt, val_seqs)
model = DktNet(len(skill_dt), HIDDEN_SIZE, HIDDEN_LAYER_SIZE, DROPOUT_RATE)
criterion = torch.nn.MSELoss()
# criterion = torch.nn.NLLLoss( size_average=False)
model = model.cuda()
criterion = criterion.cuda()
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(params=trainable_params, lr=0.001, betas=(
        0.9, 0.999), eps=1e-08, weight_decay=0)
# optimizer = torch.optim.Adagrad(params=trainable_params, lr=0.001)
train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=collate, shuffle=True, drop_last=True)
val_data_loader = DataLoader(val_dataset, collate_fn=collate, batch_size=BATCH_SIZE, drop_last=True)

  "num_layers={}".format(dropout, num_layers))


In [4]:
best_auc = 0
for epoch in range(EPOCHS + 1):
    start_time = time.time()
    train_y_pred = torch.tensor([])
    train_y_ans = torch.tensor([])
    batch_losses = 0.0
    for batch, batched in enumerate(train_data_loader):
        x_batch, y_skill_batch, y_ans_batch = batched
        hidden = model.init_hidden(BATCH_SIZE)
        skill_pred, hidden = model(x_batch.cuda(), hidden)
        y_pred = (skill_pred * y_skill_batch.cuda()).sum(dim=1)
#         pred, pred_labels = torch.max(y_pred, dim=-1, keepdim=True)
        loss = criterion(y_pred, y_ans_batch.cuda())
        train_y_pred = torch.cat((train_y_pred, y_pred.cpu()))
        train_y_ans = torch.cat((train_y_ans, y_ans_batch))
        optimizer.zero_grad()
        loss.backward()
#         torch.nn.utils.clip_grad_norm(model.parameters(), 1)
        optimizer.step()

        batch_losses += loss.data
        if batch % 50 == 0 and batch > 0:
            cur_loss = batch_losses[0] / 200
            elapsed = time.time() - start_time

            print('| epoch {:3d} | {:5d}/{:5d} batches  | lr {:05.5f} | ms/epoch {} | '
                  'loss {:5.5f} '.format(
                epoch, batch, len(train_data_loader), optimizer.param_groups[0]['lr'],
                str(elapsed), cur_loss))
            batch_losses = 0
            start_time = time.time()
    with torch.no_grad():
        # 验证集auc
        val_auc = compute_auc(val_data_loader, model)
        # 训练集auc
        train_auc = roc_auc_score(train_y_ans, train_y_pred.detach().numpy())
        print('epoch: {0}, train_auc: {1}, val_auc: {2}'.format(epoch + 1, train_auc, val_auc))
        if best_auc < val_auc:
            best_auc = val_auc
        else:
            print('reduce learning rate by multiply 0.7')
            optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] * 0.7



| epoch   0 |    50/  182 batches  | lr 0.00100 | ms/epoch 4.29323148727417 | loss 0.05960 
| epoch   0 |   100/  182 batches  | lr 0.00100 | ms/epoch 2.326080083847046 | loss 0.05521 
| epoch   0 |   150/  182 batches  | lr 0.00100 | ms/epoch 2.445678234100342 | loss 0.05476 
epoch: 1, train_auc: 0.5943783618779475, val_auc: 0.6429399849903316
| epoch   1 |    50/  182 batches  | lr 0.00100 | ms/epoch 2.2443785667419434 | loss 0.05555 
| epoch   1 |   100/  182 batches  | lr 0.00100 | ms/epoch 2.1359307765960693 | loss 0.05366 
| epoch   1 |   150/  182 batches  | lr 0.00100 | ms/epoch 2.3570075035095215 | loss 0.05186 
epoch: 2, train_auc: 0.6643597454116454, val_auc: 0.7216422500934945
| epoch   2 |    50/  182 batches  | lr 0.00100 | ms/epoch 2.326144218444824 | loss 0.05083 
| epoch   2 |   100/  182 batches  | lr 0.00100 | ms/epoch 2.1179893016815186 | loss 0.05002 
| epoch   2 |   150/  182 batches  | lr 0.00100 | ms/epoch 2.189312696456909 | loss 0.04979 
epoch: 3, train_auc: 0

In [5]:
torch.save(model, './chk/dkt.model')

  "type " + obj.__name__ + ". It won't be checked "


In [6]:
def vis_generate(decoder, input_str, mask, temperature=0.8):
    hidden = decoder.init_hidden(1)
    test_len = len(input_str)
    prime_input = torch.autograd.Variable(input_str[0].unsqueeze(0)).cuda()
    out, hidden = decoder(prime_input.unsqueeze(0), hidden)
    hidden_matrix = np.copy(F.avg_pool1d(hidden[0], 1).data.cpu().numpy())
    hidden_matrix = hidden_matrix.reshape((1,hidden_matrix.size))
    inp = prime_input
    for p in range(1,test_len + 1):
        print('p', p)
#         inp = torch.autograd.Variable(input_str[p].unsqueeze(0)).cuda()
        output, hidden = decoder(inp.unsqueeze(0), hidden)
        hidden_matrix = np.vstack((hidden_matrix, F.avg_pool1d(hidden[0], 1).squeeze(dim=1).data.cpu().numpy()))
        if p < (test_len):
            inp = torch.autograd.Variable(input_str[p].unsqueeze(0)).cuda()
    hidden_matrix = np.delete(hidden_matrix, 0, 0)
    df = pd.DataFrame(hidden_matrix)
    df.to_csv('paran-data-df.csv')
    np.savetxt("paren-data.csv", hidden_matrix, delimiter=",")
    np.savetxt("paren-data.tsv", hidden_matrix, delimiter="\t")

In [7]:
test = [(3, 0), (4, 0), (5, 0), (6, 0), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1)]

In [8]:
problem_dict = {v:k for k,v in skill_dt.items()}

In [9]:
[skill2name[problem_dict[x[0]]] for x in test]

['Number Line',
 'Scatter Plot',
 'Stem and Leaf Plot',
 'Table',
 'Venn Diagram',
 'Mean',
 'Median',
 'Mode',
 'Range']

In [10]:
inputs = test # test_seqs[22]
feature_size = len(skill_dt) * 2
seq_size = len(inputs)
x = torch.zeros(seq_size, feature_size)
y_skill = torch.zeros(seq_size, len(skill_dt))
y_ans = torch.zeros(seq_size)
for i, v in enumerate(inputs):
    if i < seq_size - 1:
        new_feature_v = torch.zeros(feature_size)
        # skill_id * 2 + correct
        idx = int(v[0])
        new_feature_v[idx] = 1
        x[i + 1] = new_feature_v
    new_skill_v = torch.zeros(len(skill_dt))
    new_skill_v[v[0]] = 1
    y_skill[i] = new_skill_v
    y_ans[i] = v[1]

vis_generate(model, x, y_skill)

p 1
p 2
p 3
p 4
p 5
p 6
p 7
p 8
p 9


In [11]:
hidden = model.init_hidden(1)
out, hidden = model(pack_sequence([x]).cuda(), hidden)
print('output', out)
(out * y_skill.cuda()).sum(dim=1)

output tensor([[ 0.6259,  0.3488,  0.5421,  ...,  0.5367,  0.4469,  0.5509],
        [ 0.7911,  0.5343,  0.7034,  ...,  0.5584,  0.6215,  0.6278],
        [ 0.6895,  0.5100,  0.5932,  ...,  0.5797,  0.5277,  0.5872],
        ...,
        [ 0.7023,  0.4868,  0.6094,  ...,  0.6575,  0.4924,  0.6265],
        [ 0.7959,  0.5355,  0.7325,  ...,  0.6918,  0.5520,  0.5829],
        [ 0.6647,  0.4863,  0.5249,  ...,  0.6205,  0.4996,  0.5230]], device='cuda:0')


tensor([ 0.4860,  0.8431,  0.5674,  0.7416,  0.6480,  0.4304,  0.5721,
         0.9366,  0.6307], device='cuda:0')

In [12]:
print(roc_auc_score(y_ans.detach().cpu(), (out * y_skill.cuda()).sum(dim=1).detach().cpu()))

0.5


In [13]:
criterion((out * y_skill.cuda()).sum(dim=1), y_ans.cuda())

tensor(0.2879, device='cuda:0')

In [14]:
from math import pi
import pandas as pd

from bokeh.io import show
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper, BasicTicker, PrintfTickFormatter,ColorBar
from bokeh.models import FuncTickFormatter
from bokeh.plotting import figure

data = pd.read_csv('paran-data-df.csv', index_col=0).abs()
data = data.T
data.index.name = 'cell'
data.columns.name = 'chars'


In [15]:
data2 = data.diff(axis=1).fillna(0.0)

In [16]:
%matplotlib inline


In [17]:
from bokeh.io import output_notebook
from bokeh.models import LogColorMapper, LogTicker, ColorBar
output_notebook()

In [18]:
index = {i:skill2name[problem_dict[inputs[i][0]]] for i in range(len(data.columns))}

seq = [str(i) for i in data.columns]
cell = list([str(x) for x in data.index])

df = pd.DataFrame(data.stack(), columns=['value']).reset_index()
colors = ["#313695", "#4575b4", "#74add1", "#abd9e9", "#e0f3f8", "#ffffbf", "#fee090", "#fdae61", "#f46d43", "#d73027", "#a50026"]

colors.reverse()
mapper = LinearColorMapper(palette=colors, low=-1, high=1)#low=df.value.min(), high=df.value.max())
source = ColumnDataSource(df)
TOOLS = "hover,pan,reset,save,wheel_zoom"

color_bar = ColorBar(color_mapper=mapper, ticker=BasicTicker(),
                     label_standoff=12, border_line_color=None, location=(0,0))

p = figure(title="LSTM Hidden State Activations",  x_range=cell, y_range=list(reversed(seq)), x_axis_location="above",
            plot_width=500, plot_height=300,
            tools=TOOLS, toolbar_location='below')

p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "8pt"
p.axis.major_label_standoff = 0
p.yaxis.major_label_orientation = pi / 3
p.yaxis.formatter = FuncTickFormatter(code="""
                                        var labels = %s;
                                        return labels[tick];
                                    """%index)

p.rect(x="cell", y="chars", width=1, height=1, source=source, fill_color={'field': 'value', 'transform': mapper},
                                line_color=None)

p.select_one(HoverTool).tooltips = [('value', '@value')]
p.add_layout(color_bar, 'right')
show(p)      # show the plot

In [19]:
index = {i:skill2name[problem_dict[inputs[i][0]]] for i in range(len(data2.columns))}

seq = [str(i) for i in data2.columns]
cell = list(data2.index)

df = pd.DataFrame(data2.stack(), columns=['value']).reset_index()
colors = ["#313695", "#4575b4", "#74add1", "#abd9e9", "#e0f3f8", "#ffffbf", "#fee090", "#fdae61", "#f46d43", "#d73027", "#a50026"]

colors.reverse()
mapper = LinearColorMapper(palette=colors, low=-1, high=1)#low=df.value.min(), high=df.value.max())
source = ColumnDataSource(df)
TOOLS = "hover,pan,reset,save,wheel_zoom"

color_bar = ColorBar(color_mapper=mapper, ticker=BasicTicker(),
                     label_standoff=12, border_line_color=None, location=(0,0))

p = figure(title="LSTM Hidden State Activations",  x_range=seq, y_range=list(reversed(cell)), x_axis_location="above",
            plot_width=500, plot_height=700,
            tools=TOOLS, toolbar_location='below')

p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "8pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = pi / 3
p.xaxis.formatter = FuncTickFormatter(code="""
                                        var labels = %s;
                                        return labels[tick];
                                    """%index)

p.rect(x="chars", y="cell", width=1, height=1, source=source, fill_color={'field': 'value', 'transform': mapper},
                                line_color=None)

p.select_one(HoverTool).tooltips = [('value', '@value')]
p.add_layout(color_bar, 'right')
show(p)      # show the plot