In [19]:
# from statsmodels.genmod.families.links import Logit
import numpy as np
import pandas as pd

import pickle, os

from scipy.sparse import load_npz, csr_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, brier_score_loss

# Remove pandas' row limit so displayed DataFrames are never truncated
# (None means "no maximum"); large frames will therefore print in full.
pd.set_option('display.max_rows', None)



In [20]:
from sklearn.model_selection import ShuffleSplit

def make_splits(df, n_splits, test_size=0.1):
  """Create reproducible user-level train/test splits.

  Args:
    df: DataFrame with a "user_id" column.
    n_splits: number of independent shuffle splits to generate.
    test_size: share (or absolute number) of unique users held out per split;
      forwarded unchanged to ShuffleSplit.

  Returns:
    A list with one ``(train_user_ids, test_user_ids)`` tuple per split. Both
    entries are arrays of actual values taken from ``df["user_id"]``.
  """
  unq_ids = df["user_id"].unique()
  splitter = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=0)
  # ShuffleSplit yields *positions* into unq_ids, not the ids themselves.
  # Callers filter rows by user id, so translate positions back to ids;
  # otherwise the split is only right when ids happen to equal 0..n-1 in order.
  return [(unq_ids[train_idx], unq_ids[test_idx])
          for train_idx, test_idx in splitter.split(unq_ids)]


def get_file_path(ds, generation_type, model_type, fold_num):
  """Return the output paths for one fold, creating their directory if needed.

  The directory ``./simulation-result/<ds>/<model_type>`` is created when it
  does not exist yet (no error if it already does).

  Returns:
    ``(model_path, csv_path)``: two paths sharing the same stem, the first
    ending in ``.sav`` and the second in ``_test.csv``.
  """
  dir_path = f"./simulation-result/{ds}/{model_type}"
  path = f"{dir_path}/{ds}_simulated_{generation_type}_model_{model_type}_{fold_num}"

  # Make sure the target directory is there before anyone writes to the paths.
  os.makedirs(dir_path, exist_ok=True)

  model_path = f"{path}.sav"
  csv_path = f"{path}_test.csv"
  return model_path, csv_path


def run_lr(df, X_fp, splits, ds, generation_model, simulated_model):
  """Fit one logistic regression per split and save the model and predictions.

  For every ``(train users, test users)`` pair in `splits`, rows of the sparse
  matrix stored at `X_fp` are selected by the user id held in column 0. Column 3
  is used as the target and columns 5 onwards as features.

  Args:
    df: DataFrame with a "user_id" column; its test rows are written out
      together with the predictions.
    X_fp: path of the ``.npz`` sparse matrix loaded with ``load_npz``.
    splits: iterable of ``(users_train, users_test)`` pairs.
    ds: dataset name, forwarded to ``get_file_path``.
    generation_model: forwarded to ``get_file_path`` as ``generation_type``.
    simulated_model: forwarded to ``get_file_path`` as ``model_type``.

  Side effects:
    For fold ``i`` the following files are written:
      ./simulation-result/<ds>/<simulated_model>/
        <ds>_simulated_<generation_model>_model_<simulated_model>_<i>.sav
        <ds>_simulated_<generation_model>_model_<simulated_model>_<i>_test.csv
    The ``.sav`` file is the pickled model; the csv is tab-separated and holds
    the test rows of `df` plus ``y_pred`` and ``y_true`` columns.
  """
  X = csr_matrix(load_npz(X_fp))
  # The user-id column does not depend on the fold, so extract it only once.
  user_ids = X[:, 0].toarray().flatten()

  for idx, (users_train, users_test) in enumerate(splits):
    train = X[np.flatnonzero(np.isin(user_ids, users_train))]
    test = X[np.flatnonzero(np.isin(user_ids, users_test))]

    # Work on a copy: adding columns to a filtered view of df would trigger
    # pandas' SettingWithCopyWarning and may silently not stick.
    test_df = df[df["user_id"].isin(users_test)].copy()

    X_train, y_train = train[:, 5:], train[:, 3].toarray().flatten()
    X_test, y_test = test[:, 5:], test[:, 3].toarray().flatten()

    model = LogisticRegression(solver="lbfgs", max_iter=1000)
    model.fit(X_train, y_train)

    # NOTE(review): this assumes the rows of X are in the same order as the
    # rows of df, so that test rows line up one-to-one -- confirm upstream.
    test_df['y_pred'] = model.predict_proba(X_test)[:, 1]
    test_df['y_true'] = y_test

    # Save model and per-row test predictions
    model_path, csv_path = get_file_path(ds, generation_model, simulated_model, idx)
    with open(model_path, 'wb') as model_file:
      pickle.dump(model, model_file)
    test_df.to_csv(csv_path, sep="\t", index=False)


In [None]:
import argparse
from random import shuffle

import pandas as pd
from sklearn.metrics import roc_auc_score, accuracy_score

import torch
import torch.nn as nn
from torch.optim import Adam
from torch.nn.utils.rnn import pad_sequence

from model_dkt2 import DKT2
from train_dkt2 import get_data, prepare_batches, compute_auc, compute_loss, train
def run_dkt(df, splits, ds, generation_model, simulated_model):
  """Train and evaluate a DKT model per split, saving model and predictions.

  Args:
    df: DataFrame with a "user_id" column (plus whatever ``train_dkt`` and
      ``eval_dkt`` read from it).
    splits: iterable of ``(users_train, users_test)`` pairs.
    ds: dataset name, forwarded to ``get_file_path``.
    generation_model: forwarded to ``get_file_path`` as ``generation_type``.
    simulated_model: forwarded to ``get_file_path`` as ``model_type``.

  Side effects:
    For each fold, pickles the trained model to the ``.sav`` path and writes
    the test rows with a ``y_pred`` column to the tab-separated csv path, both
    obtained from ``get_file_path``.
  """
  for idx, (users_train, users_test) in enumerate(splits):
    # Copy the test rows: a column is added below, and assigning into a
    # filtered view of df would raise pandas' SettingWithCopyWarning.
    test_df = df[df["user_id"].isin(users_test)].copy()
    train_df = df[df["user_id"].isin(users_train)]

    model = train_dkt(train_df, test_df)
    # NOTE(review): assumes eval_dkt returns exactly one prediction per row of
    # test_df, in row order -- confirm against get_data/prepare_batches.
    test_df['y_pred'] = eval_dkt(model, test_df)

    # Save model
    model_path, csv_path = get_file_path(ds, generation_model, simulated_model, idx)
    with open(model_path, 'wb') as model_file:
      pickle.dump(model, model_file)
    test_df.to_csv(csv_path, sep="\t", index=False)

def train_dkt(train_df, test_df, hid_size=200, embed_size=200, 
              num_hid_layers=1, drop_prob=.5, _batch_size=64,
              log_dir='runs/dkt', savedir='save/dkt',
              lr=1e-2, num_epochs=100, seed=0, dataset='simulated'):
  """Train a DKT2 model on `train_df` and return it.

  Args:
    train_df: training interactions; split into train/validation parts by
      ``get_data(train_df, train_split=0.8)``.
    test_df: only used to size the model, so that item and skill ids that
      appear in the test rows are covered as well.
    hid_size, embed_size, num_hid_layers, drop_prob: forwarded to ``DKT2``.
    _batch_size: initial batch size; halved after every RuntimeError raised
      by ``train``.
    log_dir: parent directory for the logger output.
    savedir: directory handed to ``Saver``.
    lr: Adam learning rate.
    num_epochs: forwarded to ``train``.
    seed: forwarded to ``set_random_seeds``.
    dataset: name used for the log sub-directory and the saver.

  Returns:
    The trained model (moved to the GPU with ``.cuda()``).

  Raises:
    RuntimeError: re-raised from ``train`` once the batch size cannot be
      halved any further.

  NOTE(review): ``set_random_seeds``, ``Logger`` and ``Saver`` are not imported
  in the visible cells -- confirm they are in scope before running.
  """
  set_random_seeds(seed)
  train_data, val_data = get_data(train_df, train_split=0.8)

  max_item = max(int(train_df["item_id"].max()), int(test_df["item_id"].max()))
  max_skill = max(int(train_df["skill_id"].max()), int(test_df["skill_id"].max()))

  model = DKT2(max_item, max_skill, hid_size,
                embed_size, num_hid_layers, drop_prob).cuda()
  optimizer = Adam(model.parameters(), lr=lr)

  param_str = f"{dataset}"
  # The parameter is spelled `_batch_size`; bind the name the loop works with.
  batch_size = _batch_size

  # Reduce batch size until it fits on GPU. The model and optimizer are created
  # once above, so a retry continues from whatever state the failed attempt left.
  while True:
    logger = Logger(os.path.join(log_dir, param_str))
    saver = Saver(savedir, param_str)
    try:
      # Train
      train(train_data, val_data, model, optimizer, logger, saver, num_epochs, batch_size)
      break
    except RuntimeError as e:
      print(e)
      batch_size = batch_size // 2
      if batch_size < 1:
        # Nothing left to shrink: the error is not (only) a memory problem,
        # so surface it instead of retrying forever with a batch size of 0.
        raise
      print(f'Batch does not fit on gpu, reducing size to {batch_size}')
    finally:
      # Close the logger of every attempt, not just the successful one.
      logger.close()

  return model

def eval_dkt(model, test_df, savedir='save/dkt', batch_size=64):
  """Compute DKT predictions for `test_df`.

  Args:
    model: a trained model, or a string name that is loaded through
      ``Saver(savedir, model).load()``.
    test_df: interactions to predict on; converted with
      ``get_data(test_df, train_split=1.0, randomize=False)``.
    savedir: directory handed to ``Saver`` when `model` is a string.
    batch_size: number of sequences per batch passed to ``prepare_batches``.

  Returns:
    A 1-D float64 numpy array with the sigmoid of the model outputs at every
    position whose label is >= 0, concatenated over the batches in order.

  NOTE(review): ``Saver`` is not imported in the visible cells -- confirm it is
  in scope. Inputs are moved to the GPU with ``.cuda()``, so a CUDA device is
  required.
  """
  if isinstance(model, str):
    saver = Saver(savedir, model)
    model = saver.load()
  test_data, _ = get_data(test_df, train_split=1.0, randomize=False)
  test_batches = prepare_batches(test_data, batch_size, randomize=False)

  # Predict on test set
  model.eval()
  batch_preds = []
  with torch.no_grad():
    for item_inputs, skill_inputs, label_inputs, item_ids, skill_ids, labels in test_batches:
      item_inputs = item_inputs.cuda()
      skill_inputs = skill_inputs.cuda()
      label_inputs = label_inputs.cuda()
      item_ids = item_ids.cuda()
      skill_ids = skill_ids.cuda()

      # Make the per-item prediction
      preds = model(item_inputs, skill_inputs, label_inputs, item_ids, skill_ids)
      # Negative labels presumably mark padding (TODO confirm); keep the rest.
      batch_preds.append(torch.sigmoid(preds[labels >= 0]).cpu().numpy())

  # Join once instead of re-copying the growing array for every batch. The
  # empty float64 seed keeps the result dtype and covers the no-batch case.
  return np.concatenate([np.empty(0)] + batch_preds)

In [None]:
# Full list of datasets, kept as a ready-made alternative to the short list below:
# datasets = ['algebra05', 'assistments09', 'assistments15', 'assistments17', 'bridge_algebra06', 'spanish', 'statics', 'assistments12']
datasets = ['algebra05']
n_folds = 1
for d in datasets:
  # All simulating models, kept as a ready-made alternative:
  # for simulated_model in ['bestlr', 'pfa', 'dkt']:
  for simulated_model in ['bestlr']:
    # Everything for one (dataset, simulating model) pair lives in one folder.
    dir_path = f'./simulation/{d}/{simulated_model}'
    dataset_fp = f'{dir_path}/simulated-{simulated_model}.csv'
    X_pfa_fp = f'{dir_path}/X-sscwa.npz'
    X_bestlr_fp = f'{dir_path}/X-isicsctcwa.npz'

    df = pd.read_csv(dataset_fp, sep="\t")
    splits = make_splits(df, n_folds)

    # Fit both logistic-regression variants on the same splits, bestlr first.
    for lr_name, X_fp in (('bestlr', X_bestlr_fp), ('pfa', X_pfa_fp)):
      run_lr(df, X_fp, splits, d, lr_name, simulated_model)