In [1]:
# from statsmodels.genmod.families.links import Logit
import numpy as np
import pandas as pd
from tqdm import tqdm

import pickle
import joblib

from scipy.sparse import load_npz, csr_matrix

datasets = ['algebra05', 'assistments09', 'assistments15', 'assistments17', 'bridge_algebra06', 'spanish', 'statics', 'assistments12']

pd.set_option('display.max_rows', None)
# pd.reset_option('display.max_rows')

def loading(iters):
    return tqdm(iters, desc="Running ...", ascii=False, ncols=75)

def inv_logit(z):
  return 1 / (1 + np.exp(-z))

def calc_pfa_prob(s_opp, f_opp, s_slope, f_slope, stu_intercept, kc_intercept, ovr_intercept):
  z = (s_slope * s_opp) + (f_slope * f_opp) + stu_intercept + kc_intercept + ovr_intercept
  return inv_logit(z)

In [10]:
import encode_fast as ef
from importlib import reload
reload(ef)

# ACTIVE_FEATURES = ['i', 's', 'ic', 'sc', 'tc', 'w', 'a']
ACTIVE_FEATURES = ['s', 'sc', 'w', 'a']
def simulate_bestlr(ds):
  csv_fp = f'./data/real/{ds}/preprocessed_data.csv'
  model_fp = f'./data/real/{ds}/real-pfa-model.sav'
  q_fp = f'./data/real/{ds}/q_mat.npz'

  columns = ['user_id', 'item_id', 'timestamp', 'correct', 'skill_id', 'prob']
  opps = {}
  rows = []
  df = pd.read_csv(csv_fp, delimiter='\t')
  Q_mat = load_npz(q_fp).toarray()
  
  num_items, num_skills = Q_mat.shape
  model = pickle.load(open(model_fp, 'rb'))

  for user_id, group in df.groupby('user_id'):
    if user_id not in opps:
      opps[user_id] = {
        'items': {}, 
        'skills': {
          'count': np.zeros(num_skills),
          'success': np.zeros(num_skills),
        },
        'total': { 'count': 0, 'success': 0}
        }

    for idx, row in group.iterrows():
      skill_id = row['skill_id']
      item_id = row['item_id']
      if item_id not in opps[user_id]['items']:
        opps[user_id]['items'][item_id] = { 'count': 0, 'success': 0 }

      skills = Q_mat[row['item_id']].copy()
      opps[user_id]
        
      skill_ids = np.nonzero(skills)[0]

      # X = ef.single_to_sparse(group[idx], Q_mat, ACTIVE_FEATURES)
      X = ef.single_to_sparse(df, Q_mat, user_id, 
                              item_id, 
                              opps[user_id]['items'][item_id]['count'],
                              opps[user_id]['items'][item_id]['success'],
                              skills, opps[user_id]['skills']['count'],
                              opps[user_id]['skills']['success'],
                              opps[user_id]['total']['count'], 
                              opps[user_id]['total']['success'], 
                              ACTIVE_FEATURES)

      prob = model.predict_proba(X)[0][1]
      correct = np.random.choice([1, 0], p=[prob, 1-prob])
      # correct = row['correct']

      opps[user_id]['items'][item_id]['count'] += 1
      opps[user_id]['items'][item_id]['success'] += 1 if correct == 1 else 0
      opps[user_id]['total']['count'] += 1
      opps[user_id]['total']['success'] += 1 if correct == 1 else 0

      opps[user_id]['skills']['count'] += skills
      if True or correct == 1:
        skill_ids = np.nonzero(skills)[0]
        l = len(skill_ids)
        opps[user_id]['skills']['success'] += csr_matrix((np.ones(l), (np.zeros(l), skill_ids)), shape=(1, num_skills)).toarray().reshape(-1)

      dat = [user_id, row['item_id'], row['timestamp'], correct, skill_id, prob]
      rows.append(dat)
    
  final = pd.DataFrame(rows, columns=columns)
  final.to_csv(f'./simulation/simulated-data-pfa/{ds}/{ds}.csv', index=False, sep='\t')
  # final.to_csv(f'./simulation/simulated-data/{ds}/foo.csv', index=False, sep='\t')

# simulate_bestlr('statics')

In [23]:
import os
import encode_fast as ef
# for f in os.listdir('./data'):
for f in ['algebra05', 'assistments09', 'assistments15', 'assistments17', 'bridge_algebra06', 'spanish', 'statics', 'assistments12']:
  print(f)
  simulate_bestlr(f)

# from scipy.sparse import load_npz, csr_matrix
# data_fp = "./data/statics/X-isicsctcwa.npz"
# X = csr_matrix(load_npz(data_fp))

algebra05


Running ...: 100%|███████████████████████| 567/567 [37:30<00:00,  3.97s/it]


assistments09


Running ...: 100%|█████████████████████| 3114/3114 [18:04<00:00,  2.87it/s]


In [5]:
import os

dataset = "statics"

def check_q_all_one(dataset):
  dir_path = f"./data/real/{dataset}"
  df = pd.read_csv(os.path.join(dir_path, "preprocessed_data.csv"), sep="\t")
  df = df[["user_id", "item_id", "timestamp", "correct", "skill_id"]]
  Q_mat = load_npz(os.path.join(dir_path, 'q_mat.npz')).toarray()

  if any([sum(q) != 1 for q in Q_mat]):
    print(f'{dataset}: There is != 1 - count={sum([1 for q in Q_mat if sum(q) != 1])}')
  else:
    print(f'{dataset}: Ok!')

# datasets = [f for f in os.listdir('./data/real')]
datasets = ['algebra05', 'assistments09', 'assistments15', 'assistments17', 'bridge_algebra06', 'spanish', 'statics', 'assistments12']
for d in datasets:
  check_q_all_one(d)

# check_q_all_one('algebra05')


algebra05: There is != 1 - count=59532
assistments09: There is != 1 - count=3016
assistments15: Ok!
assistments17: There is != 1 - count=697
bridge_algebra06: There is != 1 - count=1626
spanish: Ok!
statics: Ok!
assistments12: Ok!


In [None]:
from importlib import reload
import encode_fast as ef
# import encode as ec
from scipy import sparse
import os, time

reload(ef)

dataset = "algebra05"

dir_path = f"./data/real/{dataset}"
df = pd.read_csv(os.path.join(dir_path, "preprocessed_data.csv"), sep="\t")
df = df[["user_id", "item_id", "timestamp", "correct", "skill_id"]]
Q_mat = sparse.load_npz(os.path.join(dir_path, 'q_mat.npz')).toarray()

# Transform q-matrix into dictionary for fast lookup
num_items, num_skills = Q_mat.shape
Q_mat_dict = {i: set() for i in range(num_items)}
for i, j in np.argwhere(Q_mat == 1):
    Q_mat_dict[i].add(j)

active_features = ['i', 's', 'ic', 'sc', 'tc', 'w', 'a']
print('start ...')
start = time.time()
X = ef.df_to_sparse(df, Q_mat, active_features)
print(X.shape)
print('----')
print(X[-3,5:])
end = time.time()
print('Timespent: ', (end-start), 's')

# 3 seconds

In [44]:
npz_fp = './data/real/algebra05/X-isicsctcwa.npz'
orig = csr_matrix(load_npz(npz_fp))
# orig = orig[:, 5:]
print(X.shape)
print(orig.shape)

np.array_equal(X.data, orig.data)

(606983, 173458)
(606983, 173458)


True

In [65]:
dataset = "bridge_algebra06"
datasets = ['algebra05', 'assistments09', 'assistments15', 'assistments17', 'bridge_algebra06', 'spanish', 'statics', 'assistments12']
# datasets = ['statics']


for d in datasets:
  print(d)
  dir_path = f"./simulation/simulated-data-pfa/{d}/{d}.csv"
  df1 = pd.read_csv(dir_path, sep="\t")

  dir_path = f"./data/real/{d}"
  df2 = pd.read_csv(os.path.join(dir_path, "preprocessed_data.csv"), sep="\t")

  # print(sorted(df.item_id.unique()))
  # df[df.item_id == 0]
  print(df1.correct.value_counts())
  print(df2.correct.value_counts())
  print("=====")

algebra05
1    513578
0     93405
Name: correct, dtype: int64
1    458453
0    148530
Name: correct, dtype: int64
=====
assistments09
1    219344
0     58992
Name: correct, dtype: int64
1    183303
0     95033
Name: correct, dtype: int64
=====
assistments15
1    526963
0    129191
Name: correct, dtype: int64
1    479165
0    176989
Name: correct, dtype: int64
=====
assistments17
1    504531
0    430107
Name: correct, dtype: int64
0    584977
1    349661
Name: correct, dtype: int64
=====
bridge_algebra06
1    1610453
0     206940
Name: correct, dtype: int64
1    1512344
0     305049
Name: correct, dtype: int64
=====
spanish
1    498361
0     80365
Name: correct, dtype: int64
1    447372
0    131354
Name: correct, dtype: int64
=====
statics
1    159475
0     29822
Name: correct, dtype: int64
1    144883
0     44414
Name: correct, dtype: int64
=====
assistments12
1    2115594
0     566617
Name: correct, dtype: int64
1    1866608
0     815603
Name: correct, dtype: int64
=====


In [9]:
import os
import shutil

DATA_DIR = "/Volumes/NREXT/proj/research/data"
ORIG_SIM_DIR = "/Volumes/NREXT/proj/research/danny-edm/simulation"

for d in datasets:
  # src_file = f"{DATA_DIR}/real/{d}/simulated-bestlr.csv"
  src_file = f"{ORIG_SIM_DIR}/simulated-data-pfa/{d}/{d}.csv"
  dest_file = f"{DATA_DIR}/simulation/{d}/pfa/simulated-pfa.csv"

  shutil.copyfile(src_file, dest_file)
  # os.rename(src_file, dest_file)
  # for s in ['bestlr', 'pfa', 'dkt']:
  #   os.makedirs(f"{target_dir}/{s}")