## Imports

In [329]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split
import time
import math
import statistics
import pandas as pd
import matplotlib.pyplot as plt

## Data Loading

In [2]:
duolingo_dataset = pd.read_csv("./settles.acl16.learning_traces.13m.csv")
duolingo_dataset = duolingo_dataset[duolingo_dataset["learning_language"] == "en"]
duolingo_dataset.head(5)

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,lexeme_string,history_seen,history_correct,session_seen,session_correct
87,1.0,1362082504,357,u:dwbJ,en,pt,1052c3ace653dbc8923eaa183bc02b88,definition/definition<n><sg>,17,17,2,2
88,1.0,1362082504,357,u:dwbJ,en,pt,9cba1b30f88bf3c047b22cffcaf88c12,surface/surface<n><sg>,19,19,3,3
89,1.0,1362082504,357,u:dwbJ,en,pt,961cd149f20f2571419b1412d849f19a,scale/scale<n><sg>,21,20,3,3
90,0.8,1362082504,357,u:dwbJ,en,pt,5cbb1249562e95794a4c4ae0e2d8ae26,temperature/temperature<n><sg>,44,36,5,4
91,1.0,1362082504,357,u:dwbJ,en,pt,2df65bdf80d10d2b78d62cb2e0a731d8,distance/distance<n><sg>,21,20,3,3


## Data Preparation

In [13]:
duolingo_dataset['history_right'] = duolingo_dataset.apply(lambda row: math.sqrt(1 + row['history_correct']), axis=1)
duolingo_dataset['history_wrong'] = duolingo_dataset.apply(lambda row: math.sqrt(1 + row['history_seen'] - row['history_correct']), axis=1)
duolingo_dataset['time'] = duolingo_dataset.apply(lambda row: row['delta']/(60*60*24), axis=1)
duolingo_dataset.head(5)

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,lexeme_string,history_seen,history_correct,session_seen,session_correct,history_right,history_wrong,time
87,1.0,1362082504,357,u:dwbJ,en,pt,1052c3ace653dbc8923eaa183bc02b88,definition/definition<n><sg>,17,17,2,2,4.242641,1.0,0.004132
88,1.0,1362082504,357,u:dwbJ,en,pt,9cba1b30f88bf3c047b22cffcaf88c12,surface/surface<n><sg>,19,19,3,3,4.472136,1.0,0.004132
89,1.0,1362082504,357,u:dwbJ,en,pt,961cd149f20f2571419b1412d849f19a,scale/scale<n><sg>,21,20,3,3,4.582576,1.414214,0.004132
90,0.8,1362082504,357,u:dwbJ,en,pt,5cbb1249562e95794a4c4ae0e2d8ae26,temperature/temperature<n><sg>,44,36,5,4,6.082763,3.0,0.004132
91,1.0,1362082504,357,u:dwbJ,en,pt,2df65bdf80d10d2b78d62cb2e0a731d8,distance/distance<n><sg>,21,20,3,3,4.582576,1.414214,0.004132


## Data Split

In [93]:
samples = len(duolingo_dataset)
split_lengths = [int(samples*0.9), samples - int(samples*0.9)]
X = duolingo_dataset[['history_right', 'history_wrong', 'time']]
# X = duolingo_dataset[['history_right', 'history_wrong']] # HLR
# X = duolingo_dataset[['time', 'history_correct', 'history_seen']]
y = duolingo_dataset['p_recall']
dataset = TensorDataset(torch.tensor(X.values).float(), torch.tensor(y.values).float().unsqueeze(1))
train, test = random_split(dataset, split_lengths)

In [242]:
# dataset = TensorDataset(torch.tensor(X.values).float(), torch.tensor(y.values).float())
# train, test = random_split(dataset, split_lengths)

## Models

In [248]:
# min_hl = 15.0 / (24 * 60) # 15 minutes
min_hl = 10.0 / (24 * 60) # 10 minutes
# max_hl = 274.0
max_hl = 1000.0
min_hl_pow = math.log2(min_hl)
max_hl_pow = math.log2(max_hl)

class SimpleNet(nn.Module):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(3, 1)

    def forward(self, x):
        x = self.fc1(x)
        return x

class SimpleNetSigmoid(nn.Module):
    def __init__(self):
        super(SimpleNetSigmoid, self).__init__()
        self.fc1 = nn.Linear(3, 1)

    def forward(self, x):
        x = torch.sigmoid(self.fc1(x))
        return x

class LinearNet(nn.Module):
    def __init__(self):
        super(LinearNet, self).__init__()
        self.fc1 = nn.Linear(3, 10)
        self.fc2 = nn.Linear(10, 10)
        self.fc3 = nn.Linear(10, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.clamp(self.fc3(x), min=0, max=1)
        return x

class HLRWithHNet(nn.Module):
    def __init__(self):
        super(HLRWithHNet, self).__init__()
        self.fc1 = nn.Linear(3, 1)

    def forward(self, x):
        t = x[:, 2].unsqueeze(1)
        x = torch.clamp(self.fc1(x), min_hl_pow, max_hl_pow)
        h = torch.pow(2, x)
        p = torch.pow(2, -t/h)
        return p, h

class HLRNet(nn.Module):
    def __init__(self):
        super(HLRNet, self).__init__()
        self.fc1 = nn.Linear(3, 1)

    def forward(self, x):
        return torch.clamp(self.fc1(x), min_hl_pow, max_hl_pow)


## Train

In [235]:
batch_size = 64
epochs = 3
weight_decay = 0.1
lr = 0.001
alpha = 0.01
# model = HLRWithHNet()
model = LinearNet()
loss_fn = nn.MSELoss()
# loss_fn = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
train_loader = DataLoader(train, batch_size=batch_size)
test_loader = DataLoader(test, batch_size=batch_size)

print(f'{len(train)} training samples among {len(train_loader)} batches of size {batch_size}')

def accuracy(pred, y):
    return torch.abs(pred - y).mean()

epoch_loss, epoch_acc = 0, 0
for epoch in range(epochs):
    epoch_loss, epoch_acc, running_loss, running_acc, batch_count = 0, 0, 0, 0, 0
    batches = len(train_loader)
    for batch in train_loader:
        X, y = batch
        optimizer.zero_grad()
#         output, h = model(X)
        output = model(X)
        loss = loss_fn(output, y)
        loss.backward()
        optimizer.step()

        with torch.no_grad():
            acc = accuracy(output, y)
            running_loss += loss.item()
            running_acc += acc.item()
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
        batch_count += 1
        if batch_count % 19999 == 0:
            #print(f'Epoch {epoch+1} loss {running_loss/20000:.4f}, accuracy {running_acc/20000:.4f}')
            running_loss = 0
            running_acc = 0
    print(f'Total epoch {epoch+1} loss {epoch_loss/batches:.4f}, accuracy {epoch_acc/batches:.4f}')

4513311 training samples among 70521 batches of size 64
Total epoch 1 loss 0.0741, accuracy 0.1997
Total epoch 2 loss 0.0735, accuracy 0.2006
Total epoch 3 loss 0.0735, accuracy 0.2006


## Loss Functions

In [195]:
def hlr_loss_fn(alpha, y_hat, y, h_hat, h):
    return ((y_hat - y).pow(2) + alpha*(h_hat - h).pow(2)).mean()

def hlr_prediction(delta, log2h):
    h = torch.pow(2, log2h)
    neg_delta_over_h = -delta/h
    return torch.pow(2, neg_delta_over_h)

# def complete_hlr_loss_fn(delta, log2h, y):
#     h = torch.pow(2, log2h)
#     return (torch.pow(2, -delta/torch.pow(2, log2h)) - y).pow(2) + 0.01*(torch.pow(2, log2h) + delta/torch.log2(y)).pow(2)

## HLR Train

In [249]:
batch_size = 64
epochs = 3
weight_decay = 0.1
lr = 0.001
alpha = 0.01
model = HLRWithHNet()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
train_loader = DataLoader(train, batch_size=batch_size)
test_loader = DataLoader(test, batch_size=batch_size)

print(f'{len(train)} training samples among {len(train_loader)} batches of size {batch_size}')

def accuracy(pred, y):
    return torch.abs(pred - y).mean()

epoch_loss, epoch_acc = 0, 0
for epoch in range(epochs):
    epoch_loss, epoch_acc, running_loss, running_acc, batch_count = 0, 0, 0, 0, 0
    batches = len(train_loader)
    for batch in train_loader:
        X, y = batch
        optimizer.zero_grad()
#         log2h = model(X)
        prediction, pred_h = model(X)
#         print(f'h_hat: {h_hat.shape}')
        delta = X[:, 2].unsqueeze(1)
#         delta.detach()
#         print(f'd: {delta.shape}')
#         print(f'y: {y.shape}')
        h = torch.where(y == 0, torch.tensor(max_hl, dtype=torch.float), (-delta/torch.log2(y)).clamp(min_hl, max_hl))
#         h.detach()
#         print(f'pred: {prediction}, pred_h: {pred_h}, delta: {delta}, h: {h}, y: {y}')
        loss = hlr_loss_fn(alpha, prediction, y, pred_h, h)
#         loss = hlr_loss_fn(alpha, hlr_prediction(delta, log2h), y, torch.pow(2, log2h), h)
#         loss = complete_hlr_loss_fn(delta, log2h, y)
        loss.backward()
        optimizer.step()

        with torch.no_grad():
            acc = accuracy(prediction, y)
            running_loss += loss.item()
            running_acc += acc.item()
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
        batch_count += 1
        if batch_count % 19999 == 0:
            #print(f'Epoch {epoch+1} loss {running_loss/20000:.4f}, accuracy {running_acc/20000:.4f}')
            running_loss = 0
            running_acc = 0
    print(f'Total epoch {epoch+1} loss {epoch_loss/batches:.4f}, accuracy {epoch_acc/batches:.4f}')

4513311 training samples among 70521 batches of size 64
Total epoch 1 loss 640.6961, accuracy 0.2045
Total epoch 2 loss 624.0540, accuracy 0.1377
Total epoch 3 loss 624.0540, accuracy 0.1377


## LSTM Data Preparation

In [284]:
# for r in duolingo_dataset.groupby(['user_id', 'lexeme_id']).size()[:5]:
#     print(r)
# lstm_dataset = duolingo_dataset[['user_id', 'lexeme_id', 'timestamp', 'p_recall', 'delta']].sort_values(by=['user_id', 'lexeme_id', 'timestamp'])
k = 3
sequences = []
labels = []
# for row in lstm_dataset[:5].itertuples():
#     if prev_user_lexeme == (row.user_id, row.lexeme_id):
#         prev_p_deltas.append([row.p_recall, row.delta])
# count = 0
for (_, rows) in duolingo_dataset[['user_id', 'lexeme_id', 'timestamp', 'p_recall', 'time']].groupby(['user_id', 'lexeme_id']):
    if len(rows) > k:
        sequence = rows.sort_values(by=['timestamp'])[['p_recall', 'time']]
        sequence['next_time'] = sequence['time'].shift(-1)
        sequences.extend([sequence[i:i+k].values for i in range(len(sequence)-k)])
        labels.extend(sequence['p_recall'].values[k:])
#         count += 1
#     if count == 2:
#         break
lstm_X = torch.tensor(sequences, dtype=torch.float)
lstm_y = torch.tensor(labels, dtype=torch.float).unsqueeze(1)
# print(lstm_X.shape)
# print(lstm_X)
# print(lstm_y.shape)
# print(lstm_y)
print(f'LSTM X shape: {lstm_X.shape}, y shape: {lstm_y.shape}')
samples = len(sequences)
print(f'{samples} samples')
split_lengths = [int(samples*0.9), samples - int(samples*0.9)]
print(f'Split lengths: {split_lengths}')
lstm_dataset = TensorDataset(lstm_X, lstm_y)
lstm_train_data, lstm_test_data = random_split(lstm_dataset, split_lengths)

LSTM X shape: torch.Size([1395552, 3, 3]), y shape: torch.Size([1395552, 1])
1395552 samples
Split lengths: [1255996, 139556]


## LSTM Train

In [338]:
class RNN(nn.Module):
    def __init__(self, features_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        super(RNN, self).__init__()
        self.rnn = nn.LSTM(features_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
#         self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.hidden_dim = hidden_dim
        
    def forward(self, sequences):        
        output, (hidden, cell) = self.rnn(sequences)
#         hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
#         hidden = self.dropout(hidden[-1,:,:])
        output = self.dropout(output)
#         print(f'{output.shape} output, {hidden.shape} hidden, len seq {sequences.shape}')
#         hidden = hidden.view(hidden.shape[0], len(sequences), self.hidden_dim)
        output = output.view(output.shape[1], len(sequences), self.hidden_dim)[-1]
#         return self.fc(hidden)
        return self.fc(output)

INPUT_DIM = 3 # p_recall, delta, next_delta
HIDDEN_DIM = 16
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = False#True
DROPOUT = 0.0

model = RNN(INPUT_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 64
lstm_optimizer = optim.Adam(model.parameters())
lstm_criterion = nn.MSELoss()
model = model.to(device)
lstm_criterion = lstm_criterion.to(device)

def lstm_train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    running_loss, running_acc, batch_count = 0, 0, 0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        X, y = batch
        predictions = model(X)
        loss = criterion(predictions, y)
        loss.backward()
        optimizer.step()
        
        with torch.no_grad():
            acc = accuracy(predictions, y)
            running_loss += loss.item()
            running_acc += acc.item()
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
        batch_count += 1
        if batch_count % 1999 == 0:
            print(f'Epoch {epoch+1} loss {running_loss/2000:.4f}, accuracy {running_acc/2000:.4f}')
            running_loss = 0
            running_acc = 0
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

lstm_train_loader = DataLoader(lstm_train_data, batch_size=batch_size)
lstm_test_loader = DataLoader(lstm_test_data, batch_size=batch_size)

for epoch in range(epochs):
    start_time = time.time()
    train_loss, train_acc = lstm_train(model, lstm_train_loader, lstm_optimizer, lstm_criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)    
    # print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    # print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    # print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    print(f'Total epoch {epoch+1}: loss {train_loss:.4f}, accuracy {train_acc:.4f}, mins {epoch_mins}, secs {epoch_secs}')

Epoch 1 loss 0.0827, accuracy 0.1835
Epoch 1 loss 0.0762, accuracy 0.1766
Epoch 1 loss 0.0746, accuracy 0.1736
Epoch 1 loss 0.0750, accuracy 0.1745
Epoch 1 loss 0.0751, accuracy 0.1744
Epoch 1 loss 0.0747, accuracy 0.1734
Epoch 1 loss 0.0748, accuracy 0.1737
Epoch 1 loss 0.0746, accuracy 0.1738
Epoch 1 loss 0.0737, accuracy 0.1719
Total epoch 1: loss 0.0757, accuracy 0.1750, mins 8, secs 47
Epoch 2 loss 0.0751, accuracy 0.1744
Epoch 2 loss 0.0761, accuracy 0.1765
Epoch 2 loss 0.0746, accuracy 0.1736
Epoch 2 loss 0.0749, accuracy 0.1744
Epoch 2 loss 0.0751, accuracy 0.1744
Epoch 2 loss 0.0746, accuracy 0.1734
Epoch 2 loss 0.0747, accuracy 0.1737
Epoch 2 loss 0.0746, accuracy 0.1737
Epoch 2 loss 0.0737, accuracy 0.1719
Total epoch 2: loss 0.0749, accuracy 0.1741, mins 8, secs 22
Epoch 3 loss 0.0751, accuracy 0.1744
Epoch 3 loss 0.0761, accuracy 0.1765
Epoch 3 loss 0.0746, accuracy 0.1736
Epoch 3 loss 0.0749, accuracy 0.1744
Epoch 3 loss 0.0750, accuracy 0.1744
Epoch 3 loss 0.0746, accura

## LSTM Evaluation

In [340]:
preds = []
ys = []

def lstm_evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            X, y = batch
            predictions = model(X)
            loss = criterion(predictions, y)
            acc = accuracy(predictions, y)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            preds.extend(predictions.squeeze(1).tolist())
            ys.extend(y.squeeze(1).tolist())
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

lstm_test_loss, lstm_test_acc = lstm_evaluate(model, lstm_test_loader, lstm_criterion)
print(f'Final testing loss {lstm_test_loss:.4f}, accuracy {lstm_test_acc:.4f}')
with open('predictions.csv', 'w') as file:
    for line in [f'{phat},{p}\n' for phat, p in zip(preds, ys)]:
        file.write(line)

print(f'Average true p_recall: {sum(ys)/len(ys):.4f}, variance: {statistics.pvariance(ys):.12f}')
print(f'Average predicted p_recall: {sum(preds)/len(preds):.4f}, variance: {statistics.pvariance(preds):.12f}')

Final testing loss 0.0753, accuracy 0.1585
Average true p_recall: 0.8977, variance: 0.074783152983
Average predicted p_recall: 0.9201, variance: 0.000009352349


In [335]:
# model(torch.tensor([[[1, 15, 80], [1, 80, 140], [1, 140, 8]]], dtype=torch.float))
print(f'Average true p_recall: {sum(ys)/len(ys):.4f}, variance: {statistics.pvariance(ys):.12f}')

Average true p_recall: 0.8977, variance: 0.074783152983


## Evaluation

In [246]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    with torch.no_grad():
        for batch in iterator:
            X, y = batch
            predictions, h = model(X)
#             predictions = model(X)
            loss = criterion(predictions, y)
            acc = accuracy(predictions, y)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

test_loss, test_acc = evaluate(model, test_loader, loss_fn)
print(f'Regression test loss: {test_loss:.4f}, test acc: {test_acc:.4f}')

Regression test loss: 0.0806, test acc: 0.1205


In [247]:
total_p = 0
total_y = 0
with torch.no_grad():
    for batch in test_loader:
        X, y = batch
        predictions, h = model(X)
#         predictions = model(X)
        total_p += predictions.sum()
        total_y += y.sum()
print(f'Average predicted p_recall: {total_p/len(test):.4f}')
print(f'Average true p_recall: {total_y/len(test):.4f}')

Average predicted p_recall: 0.9719
Average true p_recall: 0.8977
