In [1]:
import numpy as np
import pandas as pd
import torch
import csv, random
import torch.nn.init as init
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import RNN, LSTM, Sigmoid, Linear, Dropout, Module
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence, pack_sequence
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.autograd import Variable
from sklearn.externals import joblib
import time, math
import json
from datetime import datetime

In [2]:
PAD = 0 
EOS = 1
def pad_text(text, pad, max_length=None):
    length = len(text)
    if max_length is not None:
        if length > max_length:
            return text[:max_length]
        else:
            return text + [pad] * (max_length - length)
    return text

def read_data(path):
    """
    从文件中读取数据
    :param path: str 文件路径
    :return: int 知识点数量, sequence 原始数据序列
    """
    with open(path, 'r', encoding='UTF-8') as f:
        skill_dt = {'PAD':0, 'EOS':1}
        skill2name= {}
        seq_dt = {}
        readlines = csv.reader(f)
        for i, readline in enumerate(readlines):
            if i == 0:
                continue
            skill_id = readline[1]
#             skill_name = readline[2]
            user_id = readline[0]
            correct = 1 if readline[5] =='True' else 0
            if skill_id == '' or user_id == '' or correct == '':
                continue
            if skill_id not in skill_dt:
                skill_dt[skill_id] = len(skill_dt)
            if user_id in seq_dt:
                seq_dt[user_id].append((skill_dt[(skill_id)], correct))
            else:
                seq_dt[user_id] = [(skill_dt[(skill_id)], correct)]
        seq_list = list(seq_dt.values())
        return skill_dt, seq_list


def split_dataset(seqs, val_rate=0.2, test_rate=0.1):
    """
    将数据拆分为训练集，验证集和测试集
    :param seqs: sequence 原始数据序列
    :param val_rate: float 验证集占总体数据比
    :param test_rate: float 测试集占总体数据比
    :return:
    """
    random.shuffle(seqs)
    seq_size = len(seqs)
    val_size = int(seq_size * val_rate)
    test_size = int(seq_size * test_rate)
    val_seqs = seqs[:val_size]
    test_seqs = seqs[val_size:val_size + test_size]
    train_seqs = seqs[val_size + test_size:]
    return train_seqs, val_seqs, test_seqs


class QuizDataSet(Dataset):

    def __len__(self):
        return len(self._seq_list)

    def __getitem__(self, index):
        feature_size = self._skill_size * 2
        data = self._seq_list[index]
        seq_size = len(data)
        # 包含初始状态x_{0}
        x = torch.zeros(seq_size, feature_size)
        skill = [1]
        # delta(t + 1)
        mask = torch.zeros((seq_size, self._skill_size))
        # a_{t+1}
        ans = torch.zeros(seq_size)
        for i, v in enumerate(data):
            if i < seq_size - 1:
                new_feature_v = torch.zeros(feature_size)
                # skill_id * 2 + correct
                idx = int(v[0] * 2 + v[1])
                new_feature_v[idx] = 1
                x[i + 1] = new_feature_v
                skill.append(v[0])
            new_skill_v = torch.zeros(self._skill_size)
            new_skill_v[v[0]] = 1
            mask[i] = new_skill_v
            ans[i] = v[1]
        return skill, x, mask, ans

    def __init__(self, skill_dt, seq_list):
        self._skill_size = len(skill_dt)
#         self._seq_list = [([x[0] for x in data],[x[1] for x in data]) for data in seq_list]
        self._seq_list = seq_list


def collate(batch):
    # 压紧序列前需要先排序
    batch.sort(key=lambda x: len(x[0]), reverse=True)
    longest = max([len(skill) for (skill, x, mask, ans) in batch])
    skill = [pad_text(skill, PAD, longest)  for (skill, x, mask, ans) in batch] 
    x = [torch.cat((x, torch.zeros(longest - x.size(0), x.size(1))), dim=0).unsqueeze(0) for (skill, x, mask, ans) in batch]
    ans = [ans for (skill, x, mask, ans) in batch]
    mask = [mask for (skill, x, mask, ans) in batch]
    skill_tensor = torch.LongTensor(skill)
    x = torch.cat(x, dim=0)
    ans_tensor = pack_sequence(ans).data
    mask_tensor = pack_sequence(mask).data
    return skill_tensor, x, mask_tensor, ans_tensor


class DktNet(Module):
    """
    deep knowledge tracing model
    input => rnn => dropout => sigmoid => output
    """

    def __init__(self, skill_dt, rnn_h_size, rnn_layer_size, embedding_size, dropout_rate):
        """
        :param skill_size: int 知识点数量
        :param rnn_h_size: int rnn隐藏单元数量
        :param rnn_layer_size: int rnn隐藏层数量
        :param dropout_rate: float
        """
        super(DktNet, self).__init__()
        self.hidden_size = rnn_h_size
        skill_size = len(skill_dt)
        self.rnn_layer_size = rnn_layer_size
        self.embedding_size = embedding_size
        embedding_weight = self._load_word2vec('./data/skill_embedding.vec', skill_dt)
        self.embedding = nn.Embedding(skill_size , embedding_size)
#         self.init_weights()
        self.embedding.weight = nn.Parameter(embedding_weight, requires_grad=True)
#         word_dict, embedding_tensor = self.load_embedding('./data/skill_embedding.vec', skill_dt)
#         self.embedding.weight = nn.Parameter(embedding_tensor.cuda())
        self.lstm = nn.LSTM(embedding_size + skill_size * 2, rnn_h_size, rnn_layer_size, dropout=dropout_rate, batch_first=True, bias=True)
        self.weight_init(self.lstm)
        self.dropout = Dropout(p=dropout_rate)
        self.linear = Linear(rnn_h_size, skill_size)
        self.weight_init(self.linear)
        self.sigmoid = Sigmoid()
        self.weight_init(self.linear)
        self.weight_init(self.lstm)
        
    def init_weights(self):
        bias = np.sqrt(3.0 / self.embedding_size)
        self.embedding.weight.data.uniform_(-bias, bias)
#         self.A.data.uniform_(-initrange, initrange)
    
    def _load_word2vec(self, emb_file, feature_map):
        
        embedding_dict = dict()
        feature_set = set([key for key in feature_map])
        for line in open(emb_file, 'r'):
            line = line.split('|')
            vector = list(map(lambda t: float(t), filter(lambda n: n and not n.isspace(), line[1].split(' '))))
            if line[0] not in feature_set:
                continue
            embedding_dict[line[0]] = vector

        word_vectors = []
        for word in feature_map.keys():

            if word in embedding_dict.keys():
                vector = embedding_dict[word]
            else:
                vector = np.random.normal(scale=0.2, size=self.embedding_size)  # random vector

            word_vectors.append(vector)

        weight = np.stack(word_vectors)
        return torch.Tensor(weight)

    def weight_init(self, m):
        if isinstance(m, nn.LSTM):
            for param in m.parameters():
                if len(param.shape) >= 2:
                    init.orthogonal_(param.data)
                else:
                    init.normal_(param.data)
        elif isinstance(m, nn.Linear):
            init.xavier_normal_(m.weight.data)
            init.normal_(m.bias.data)
        
    def init_hidden(self, batch_size):
        hidden_p = Variable(torch.zeros(self.rnn_layer_size * 1, batch_size, self.hidden_size)).cuda()
        hidden_h = Variable(torch.zeros(self.rnn_layer_size * 1, batch_size, self.hidden_size)).cuda()
        return (hidden_p, hidden_h)
    
    def forward(self, x, y,  hidden):
#         encoder = self.encoder(x)
        embedded = self.embedding(x)
        mask = x.data.gt(0).float()
#         embedded = self.dropout1(embedded)
        embedded = torch.cat((embedded, y), dim=-1)
        embedded = nn.utils.rnn.pack_padded_sequence(embedded, mask.sum(1).int(), batch_first=True)
        rnn_output, hidden = self.lstm(embedded, hidden)
        # rnn_output无需保留序列结构数据，直接用rnn_output.data
#         dropout_output = self.dropout(rnn_output.data)
        linear_output = self.linear(rnn_output.data)
        output = self.sigmoid(linear_output)
        return output, hidden


def compute_auc(data_loader, model):
    """''
    计算验证集和测试集的auc值
    """
    y_pred = torch.tensor([])
    y_ans = torch.tensor([])
    for x_batch, x, y_skill_batch, y_ans_batch in data_loader:
        hidden = model.init_hidden(BATCH_SIZE)
        skill_pred, _ = model(x_batch.cuda(), x.cuda(), hidden)
        _y_pred = (skill_pred.data * y_skill_batch.cuda()).sum(dim=1)
        y_pred = torch.cat((y_pred, _y_pred.cpu()))
        y_ans = torch.cat((y_ans, y_ans_batch))
    return roc_auc_score(y_ans, y_pred)



In [3]:
# rnn隐藏单元数量
HIDDEN_SIZE = 200
EMBEDDING_SIZE = 100
# rnn隐藏层数量
HIDDEN_LAYER_SIZE = 1
DROPOUT_RATE = 0.6
EPOCHS = 10
BATCH_SIZE = 16

skill_dt, seq_list = read_data('./data/edu_sample.csv')
train_seqs, val_seqs, test_seqs = split_dataset(seq_list)
train_dataset = QuizDataSet(skill_dt, train_seqs)
val_dataset = QuizDataSet(skill_dt, val_seqs)
model = DktNet(skill_dt, HIDDEN_SIZE, HIDDEN_LAYER_SIZE, EMBEDDING_SIZE, DROPOUT_RATE)
criterion = torch.nn.BCELoss()
# criterion = torch.nn.NLLLoss(ignore_index=0, size_average=False)
model = model.cuda()
criterion = criterion.cuda()
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(params=trainable_params, lr=0.001, betas=(
        0.9, 0.999), eps=1e-08, weight_decay=0)
# optimizer = torch.optim.Adagrad(params=trainable_params, lr=0.001)
train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=collate, shuffle=True, drop_last=True)
val_data_loader = DataLoader(val_dataset, collate_fn=collate, batch_size=BATCH_SIZE, drop_last=True)

  "num_layers={}".format(dropout, num_layers))


In [4]:
best_auc = 0
for epoch in range(EPOCHS + 1):
    start_time = time.time()
    train_y_pred = torch.tensor([])
    train_y_ans = torch.tensor([])
    batch_losses = 0.0
    for batch, batched in enumerate(train_data_loader):
        skill, x, y_skill_batch, y_ans_batch = batched
        hidden = model.init_hidden(BATCH_SIZE)
        skill, y_ans_batch = Variable(skill.cuda()),Variable(y_ans_batch.cuda())
        skill_pred, hidden = model(skill, x.cuda(), hidden)
        y_pred = (skill_pred * y_skill_batch.cuda()).sum(dim=1)
        
        loss = criterion(y_pred, y_ans_batch)
        train_y_pred = torch.cat((train_y_pred, y_pred.cpu()))
        train_y_ans = torch.cat((train_y_ans, y_ans_batch.cpu()))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_losses += loss.data
        if batch % 50 == 0 and batch > 0:
            cur_loss = batch_losses[0] / 50
            elapsed = time.time() - start_time

            print('| epoch {:3d} | {:5d}/{:5d} batches  | lr {:05.5f} | ms/epoch {} | '
                  'loss {:5.5f} '.format(
                epoch, batch, len(train_data_loader), optimizer.param_groups[0]['lr'],
                str(elapsed), cur_loss))
            batch_losses = 0
            start_time = time.time()
    with torch.no_grad():
        # 验证集auc
        val_auc = compute_auc(val_data_loader, model)
        # 训练集auc
        train_auc = roc_auc_score(train_y_ans.detach().cpu(), train_y_pred.detach().cpu(), average='micro')
        print('epoch: {0}, train_auc: {1}, val_auc: {2}'.format(epoch + 1, train_auc, val_auc))
        if best_auc < val_auc:
            best_auc = val_auc
        else:
            print('reduce learning rate by multiply 0.7')
            optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] * 0.7



| epoch   0 |    50/  429 batches  | lr 0.00100 | ms/epoch 21.739397287368774 | loss 0.55580 
| epoch   0 |   100/  429 batches  | lr 0.00100 | ms/epoch 16.11191177368164 | loss 0.43623 
| epoch   0 |   150/  429 batches  | lr 0.00100 | ms/epoch 24.076493501663208 | loss 0.41705 
| epoch   0 |   200/  429 batches  | lr 0.00100 | ms/epoch 14.4184410572052 | loss 0.42891 
| epoch   0 |   250/  429 batches  | lr 0.00100 | ms/epoch 17.459308862686157 | loss 0.42845 
| epoch   0 |   300/  429 batches  | lr 0.00100 | ms/epoch 14.911123514175415 | loss 0.42435 
| epoch   0 |   350/  429 batches  | lr 0.00100 | ms/epoch 14.64383840560913 | loss 0.43741 
| epoch   0 |   400/  429 batches  | lr 0.00100 | ms/epoch 19.61554265022278 | loss 0.44429 
epoch: 1, train_auc: 0.6888021642778738, val_auc: 0.7191843108108056
| epoch   1 |    50/  429 batches  | lr 0.00100 | ms/epoch 20.40109086036682 | loss 0.40819 
| epoch   1 |   100/  429 batches  | lr 0.00100 | ms/epoch 13.336334943771362 | loss 0.4097

| epoch  10 |   100/  429 batches  | lr 0.00100 | ms/epoch 13.111530303955078 | loss 0.35048 
| epoch  10 |   150/  429 batches  | lr 0.00100 | ms/epoch 14.284798622131348 | loss 0.36631 
| epoch  10 |   200/  429 batches  | lr 0.00100 | ms/epoch 11.347681522369385 | loss 0.36052 
| epoch  10 |   250/  429 batches  | lr 0.00100 | ms/epoch 13.1726655960083 | loss 0.36913 
| epoch  10 |   300/  429 batches  | lr 0.00100 | ms/epoch 14.828503847122192 | loss 0.36457 
| epoch  10 |   350/  429 batches  | lr 0.00100 | ms/epoch 11.136219501495361 | loss 0.34990 
| epoch  10 |   400/  429 batches  | lr 0.00100 | ms/epoch 12.181609153747559 | loss 0.37974 
epoch: 11, train_auc: 0.8026461472492974, val_auc: 0.7661171349265878
reduce learning rate by multiply 0.7


In [5]:
torch.save(model, './chk/dkt.model')

  "type " + obj.__name__ + ". It won't be checked "


In [132]:
def vis_generate(decoder, input_str, x, temperature=0.8):
    hidden = decoder.init_hidden(1)
    test_len = len(input_str)
    prime_x = torch.autograd.Variable(x[0].unsqueeze(0)).cuda()
    prime_input = torch.autograd.Variable(input_str[0].unsqueeze(0)).cuda()
    print('x', x.shape)
    print('x', prime_x.shape)
    print('prime_input', prime_input.shape)
    out, hidden = decoder(prime_input.unsqueeze(0), prime_x.unsqueeze(0), hidden)
    hidden_matrix = np.copy(F.avg_pool1d(hidden[0], 3).data.cpu().numpy())
    hidden_matrix = hidden_matrix.reshape((1,hidden_matrix.size))
    inp = prime_input
    inp_x = prime_x
    for p in range(1,test_len + 1):
        print('p', p)
#         inp = torch.autograd.Variable(input_str[p].unsqueeze(0)).cuda()
        output, hidden = decoder(inp.unsqueeze(0), inp_x.unsqueeze(0), hidden)
        hidden_matrix = np.vstack((hidden_matrix, F.avg_pool1d(hidden[0], 3).squeeze(dim=1).data.cpu().numpy()))
        if p < (test_len):
            inp = torch.autograd.Variable(input_str[p].unsqueeze(0)).cuda()
            inp_x = torch.autograd.Variable(x[p].unsqueeze(0)).cuda()
    hidden_matrix = np.delete(hidden_matrix, 0, 0)
    df = pd.DataFrame(hidden_matrix)
    df.to_csv('paran-data-df.csv')
    np.savetxt("paren-data.csv", hidden_matrix, delimiter=",")
    np.savetxt("paren-data.tsv", hidden_matrix, delimiter="\t")

In [197]:
test = [(2, 1), (31, 1), (8, 1), (14, 1), (3, 1), (4, 1), (5, 1)]

In [198]:
problem_dict = {v:k for k,v in skill_dt.items()}

In [199]:
[problem_dict[x[0]] for x in test]

['number_line',
 'understand_coins',
 'addition_1',
 'addition_2',
 'multiplication_0.5',
 'multiplication_1',
 'division_0.5']

In [200]:
inputs = test # test_seqs[22]
seq_size = len(inputs)
# 包含初始状态x_{0}
feature_size = len(skill_dt) * 2
# 包含初始状态x_{0}
x = torch.zeros(seq_size, feature_size)
skill = [1]
# delta(t + 1)
mask = torch.zeros((seq_size, len(skill_dt)))
# a_{t+1}
ans = torch.zeros(seq_size)
for i, v in enumerate(inputs):
    if i < seq_size - 1:
        new_feature_v = torch.zeros(feature_size)
        # skill_id * 2 + correct
        idx = int(v[0] * 2 + v[1])
        new_feature_v[idx] = 1
        x[i + 1] = new_feature_v
        skill.append(v[0])
    new_skill_v = torch.zeros(len(skill_dt))
    new_skill_v[v[0]] = 1
    mask[i] = new_skill_v
    ans[i] = v[1]

vis_generate(model, torch.LongTensor(skill), x )

x torch.Size([7, 1382])
x torch.Size([1, 1382])
prime_input torch.Size([1])
p 1
p 2
p 3
p 4
p 5
p 6
p 7


In [201]:
# hidden = model.init_hidden(1)
# out, hidden = model((torch.LongTensor([x[0] for x in test])).cuda(), hidden)
# print('output', out)
# torch.max(skill_pred, dim=-1, keepdim=True)

In [202]:
# criterion((out * y_skill.cuda()).sum(dim=1), y_ans.cuda())

In [203]:
from math import pi
import pandas as pd

from bokeh.io import show
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper, BasicTicker, PrintfTickFormatter,ColorBar
from bokeh.models import FuncTickFormatter
from bokeh.plotting import figure

data = pd.read_csv('paran-data-df.csv', index_col=0)
data = data.T
data.index.name = 'cell'
data.columns.name = 'chars'


In [204]:
data2 = data.diff(axis=1).fillna(0.0)

In [205]:
%matplotlib inline


In [206]:
from bokeh.io import output_notebook
from bokeh.models import LogColorMapper, LogTicker, ColorBar
output_notebook()

In [207]:
index = {i:problem_dict[test[i][0]] for i in range(len(data.columns))}

seq = [str(i) for i in data.columns]
cell = list([str(x) for x in data.index])

df = pd.DataFrame(data.stack(), columns=['value']).reset_index()
colors = ["#313695", "#4575b4", "#74add1", "#abd9e9", "#e0f3f8", "#ffffbf", "#fee090", "#fdae61", "#f46d43", "#d73027", "#a50026"]

colors.reverse()
mapper = LinearColorMapper(palette=colors, low=-1, high=1)#low=df.value.min(), high=df.value.max())
source = ColumnDataSource(df)
TOOLS = "hover,pan,reset,save,wheel_zoom"

color_bar = ColorBar(color_mapper=mapper, ticker=BasicTicker(),
                     label_standoff=12, border_line_color=None, location=(0,0))

p = figure(title="LSTM Hidden State Activations",  x_range=cell, y_range=list(reversed(seq)), x_axis_location="above",
            plot_width=500, plot_height=300,
            tools=TOOLS, toolbar_location='below')

p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "8pt"
p.axis.major_label_standoff = 0
p.yaxis.major_label_orientation = pi / 3
p.yaxis.formatter = FuncTickFormatter(code="""
                                        var labels = %s;
                                        return labels[tick];
                                    """%index)

p.rect(x="cell", y="chars", width=1, height=1, source=source, fill_color={'field': 'value', 'transform': mapper},
                                line_color=None)

p.select_one(HoverTool).tooltips = [('value', '@value')]
p.add_layout(color_bar, 'right')
show(p)      # show the plot

In [208]:
index = {i:problem_dict[test[i][0]] for i in range(len(data2.columns))}

seq = [str(i) for i in data2.columns]
cell = list(data2.index)

df = pd.DataFrame(data2.stack(), columns=['value']).reset_index()
colors = ["#313695", "#4575b4", "#74add1", "#abd9e9", "#e0f3f8", "#ffffbf", "#fee090", "#fdae61", "#f46d43", "#d73027", "#a50026"]

colors.reverse()
mapper = LinearColorMapper(palette=colors, low=-1, high=1)#low=df.value.min(), high=df.value.max())
source = ColumnDataSource(df)
TOOLS = "hover,pan,reset,save,wheel_zoom"

color_bar = ColorBar(color_mapper=mapper, ticker=BasicTicker(),
                     label_standoff=12, border_line_color=None, location=(0,0))

p = figure(title="LSTM Hidden State Activations",  x_range=seq, y_range=list(reversed(cell)), x_axis_location="above",
            plot_width=300, plot_height=500,
            tools=TOOLS, toolbar_location='below')

p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "8pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = pi / 3
p.xaxis.formatter = FuncTickFormatter(code="""
                                        var labels = %s;
                                        return labels[tick];
                                    """%index)

p.rect(x="chars", y="cell", width=1, height=1, source=source, fill_color={'field': 'value', 'transform': mapper},
                                line_color=None)

p.select_one(HoverTool).tooltips = [('value', '@value')]
p.add_layout(color_bar, 'right')
show(p)      # show the plot