In [5]:
# from statsmodels.genmod.families.links import Logit
import numpy as np
import pandas as pd
from tqdm import tqdm

from pymer4.models import Lmer
from pymer4.io import save_model, load_model


datasets = ['algebra05', 'assistments09', 'assistments15', 'assistments17', 'bridge_algebra06', 'spanish', 'statics'] # 'assistments12'

pd.set_option('display.max_rows', None)
# pd.reset_option('display.max_rows')

def loading(iters):
    return tqdm(iters, desc="Running ...", ascii=False, ncols=75)

def inv_logit(z):
  return 1 / (1 + np.exp(-z))

def calc_pfa_prob(s_opp, f_opp, s_slope, f_slope, stu_intercept, kc_intercept, ovr_intercept):
  z = (s_slope * s_opp) + (f_slope * f_opp) + stu_intercept + kc_intercept + ovr_intercept
  return inv_logit(z)

In [24]:
### RUN MODEL ###

formulas = {
  "AFM": "correct ~ opp + (opp|skill_id) + (1|user_id)",
  "PFA": "correct ~ (1|user_id) + (s_opp|skill_id) + (f_opp|skill_id)",
  "PFA1": "correct ~ (s_opp|skill_id) + (f_opp|skill_id) + (1|user_id)",
  "PFA2": "correct ~ s_opp + f_opp + (s_opp|skill_id) + (f_opp|skill_id) + (1|user_id)",
  "PFA3": "correct ~ (s_opp:skill_id) + (f_opp:skill_id) + (1|skill_id) + (1|user_id)",
  "PFA4": "correct ~ opp + (s_opp:skill_id) + (f_opp:skill_id) + (1|skill_id) + (1|user_id)", # opp = s_opp + f_opp
  "PFA5": "correct ~ (1|user_id) + (0+s_opp|skill_id) + (0+f_opp|skill_id) + (1|skill_id)", # Danny?
}

# model.fit(control="optimizer='Nelder_Mead', optCtrl = list(FtolAbs=1e-8, XtolRel=1e-8)")
def run(df, model_type="PFA"):
  model = Lmer(formulas[model_type], data=df, family="binomial")
  model.fit(control="optimizer='bobyqa', optCtrl=list(maxfun=2e5)", summarize=False)

  print(f"{model_type}: AIC: {model.AIC}, BIC: {model.BIC}, LogLik: {model.logLike}")
  return model

In [25]:
algebra_df = prepare_df('data/algebra05/preprocessed_data.csv')
model1 = run(algebra_df, "PFA1")
model2 = run(algebra_df, "PFA2")
model3 = run(algebra_df, "PFA3")
model4 = run(algebra_df, "PFA4")
model5 = run(algebra_df, "PFA5")

save_model(model1, './pfa_experiments/model1.joblib')
save_model(model2, './pfa_experiments/model2.joblib')
save_model(model3, './pfa_experiments/model3.joblib')
save_model(model4, './pfa_experiments/model4.joblib')
save_model(model5, './pfa_experiments/model5.joblib')


Model failed to converge with max|grad| = 0.00226471 (tol = 0.002, component 1) 

PFA1: AIC: 556833.1832772302, BIC: 556923.7133257347, LogLik: -278408.5916386151
Model failed to converge with max|grad| = 0.00384345 (tol = 0.002, component 1) 

PFA2: AIC: 556648.672887014, BIC: 556761.8354476446, LogLik: -278314.336443507
[1] "Model failed to converge with max|grad| = 9.1981 (tol = 0.002, component 1)"
[2] " \n"                                                                        

[1] "Model is nearly unidentifiable: very large eigenvalue\n - Rescale variables?"
[2] " \n"                                                                         

[1] "Model is nearly unidentifiable: large eigenvalue ratio\n - Rescale variables?"
[2] " \n"                                                                          

PFA3: AIC: 564260.7821094937, BIC: 564317.363389809, LogLik: -282125.39105474687
[1] "Model failed to converge with max|grad| = 313.449 (tol = 0.002, component 1)"
[2] " \n"  

: 

In [22]:
def prepare_df(fp):
  df = pd.read_csv(fp, delimiter='\t')
  df['opp'] = df.groupby(['user_id', 'skill_id']).cumcount()
  df['s_opp'] = ((df['correct'] == 1).groupby([df['user_id'], df['skill_id']]).cumsum()).fillna(0)
  df['f_opp'] = ((df['correct'] == 0).groupby([df['user_id'], df['skill_id']]).cumsum()).fillna(0)
  df.loc[df['correct'] == 1, 's_opp'] = df['s_opp'] - 1
  df.loc[df['correct'] == 0, 'f_opp'] = df['f_opp'] - 1
  return df

### Finished Running ###
 
# assistment09_df = prepare_df('data/assistments09/preprocessed_data.csv')
# assistment09_model = run(assistment09_df)

# assistment15_df = prepare_df('data/assistments15/preprocessed_data.csv')
# assistment15_model = run(assistment15_df)

# spanish_df = prepare_df('data/spanish/preprocessed_data.csv')
# spanish_model = run(spanish_df)

# assistment17_df = prepare_df('data/assistments17/preprocessed_data.csv')
# assistment17_model = run(assistment17_df)

# statics_df = prepare_df('data/statics/preprocessed_data.csv')
# statics_model = run(statics_df)

# bridge_df = prepare_df('data/bridge_algebra06/preprocessed_data.csv')
# bridge_model = run(bridge_df)

# algebra_df = prepare_df('data/algebra05/preprocessed_data.csv')
# algebra_model = run(algebra_df)

### To Run ###

# assistment12_df = prepare_df('data/assistments12/preprocessed_data.csv')
# assistment12_model = run(assistment12_df)

### Save Models and Data ###

# save_model(assistment09_model, './simulation/models/assistment09-model.joblib')
# save_model(assistment15_model, './simulation/models/assistment15-model.joblib')
# save_model(assistment17_model, './simulation/models/assistment17-model.joblib')
# save_model(spanish_model, './simulation/models/spanish-model.joblib')
# save_model(statics_model, './simulation/models/statics-model.joblib')
# save_model(bridge_model, './simulation/models/bridge-model.joblib')
# save_model(algebra_model, './simulation/models/algebra05.joblib')

# assistment09_df.to_csv('./simulation/extended-data/assistment09.csv')
# assistment15_df.to_csv('./simulation/extended-data/assistment15.csv')
# assistment17_df.to_csv('./simulation/extended-data/assistment17.csv')
# spanish_df.to_csv('./simulation/extended-data/spanish.csv')
# statics_df.to_csv('./simulation/extended-data/statics.csv')
# bridge_df.to_csv('./simulation/extended-data/bridge.csv')
# algebra_df.to_csv('./simulation/extended-data/algebra05.csv')

In [None]:
def _get_stu_and_kc_params_from_model_pfa(model, ds):
  # if ds in reverse:
  #   kc_params, stu_params = model.ranef
  # else:
  #   stu_params, kc_params = model.ranef

  stu_params, kc_params = model.ranef
  fixef = model.coefs.Estimate

  overall_int = fixef['(Intercept)']
  overall_s_slope = fixef['s_opp']
  overall_f_slope = fixef['f_opp']

  overall = pd.DataFrame([[overall_int, overall_s_slope, overall_f_slope, ds]], columns=['Intercept', 'S_Slope', 'F_Slope', 'Dataset'])

  stu_params['Dataset'] = ds
  stu_params.reset_index(inplace=True)
  stu_params = stu_params.rename(columns={
    "index": "Student",
    "X.Intercept.": "Intercept"
  })

  kc_params['Dataset'] = ds
  kc_params.reset_index(inplace=True)
  kc_params = kc_params.rename(columns={
    "index": "KC",
    "X.Intercept.": "PFA_S_Intercept",
    "X.Intercept..1": "PFA_F_Intercept",
    "s_opp": "S_Slope/KC",
    "f_opp": "F_Slope/KC",
  })

  return stu_params, kc_params, overall


def create_params_files():
  pfa_stu_params, pfa_kc_params, pfa_overall_params= [], [], []
  for ds in datasets:
    print(f"Running: {ds}")
    pfa_model = load_model(f'./simulation/models/{ds}-model.joblib')
    stu_params, kc_params, overall_params  = _get_stu_and_kc_params_from_model_pfa(pfa_model, ds)
    pfa_stu_params.append(stu_params)
    pfa_kc_params.append(kc_params)
    pfa_overall_params.append(overall_params)

  # - start - Run these to generate new model param files

  pfa_stu_params_concat = pd.concat(pfa_stu_params, axis=0).reset_index(drop=True)
  pfa_kc_params_concat = pd.concat(pfa_kc_params, axis=0).reset_index(drop=True)
  pfa_overall_params_concat = pd.concat(pfa_overall_params, axis=0).reset_index(drop=True)

  pfa_stu_params_concat.to_csv("./simulation/model-values/pfa_std_params.csv", sep=",", index=False)
  pfa_kc_params_concat.to_csv("./simulation/model-values/pfa_kc_params.csv", sep=",", index=False) 
  pfa_overall_params_concat.to_csv("./simulation/model-values/pfa_overall_params.csv", sep=",", index=False) 

# create_params_files()

In [3]:
### GET MODEL VALUES FROM MODEL VALUE CSV. ###

PFA_STU_FP = "./simulation/model-values/pfa_std_params.csv"
PFA_KC_FP = "./simulation//model-values/pfa_kc_params.csv"
PFA_OVERALL_FP = "./simulation/model-values/pfa_overall_params.csv"

def _get_kc_values(fp, get_value_func):
  df = pd.read_csv(fp, delimiter=',')
  kc_values_by_ds = {}

  for _, row in df.iterrows():
    ds = row['Dataset']
    if ds not in kc_values_by_ds:
      kc_values_by_ds[ds] = {}
    
    kc_name = row['KC']
    if kc_name in kc_values_by_ds[ds]:
      print(f"[ERROR] Duplicated KC: {kc_name}")
    kc_values_by_ds[ds][row['KC']] = get_value_func(row)
  return kc_values_by_ds

def _get_student_values(fp):
  df = pd.read_csv(fp, delimiter=',')
  stu_intercepts = {}
  for _, row in df.iterrows():
    ds = row['Dataset']
    if ds not in stu_intercepts:
      stu_intercepts[ds] = {}
    stu_intercepts[ds][row['Student']] = row['Intercept']
  return stu_intercepts

def get_pfa_kc_values():
  def _get_pfa(row):
    return {
      'intercept': row['PFA_S_Intercept'] + row['PFA_F_Intercept'],
      's_slope_kc': row['S_Slope/KC'],
      'f_slope_kc': row['F_Slope/KC']
    }
  return _get_kc_values(PFA_KC_FP, _get_pfa)

def get_pfa_student_values(): return _get_student_values(PFA_STU_FP)

def _get_overall_values(fp, get_value_func):
  df = pd.read_csv(fp, delimiter=',')
  overall_params = {}
  for _, row in df.iterrows():
    ds = row['Dataset']
    if ds not in overall_params:
      overall_params[ds] = {}

    overall_params[ds] = get_value_func(row)
  return overall_params

def get_pfa_overall_values():
  def _get_pfa(row):
    return {
      'intercept': row['Intercept'],
      's_slope': row['S_Slope'],
      'f_slope': row['F_Slope'],
    }
  return _get_overall_values(PFA_OVERALL_FP, _get_pfa)



In [None]:
kc_values_by_ds = get_pfa_kc_values()
stu_values_by_ds = get_pfa_student_values()
ovr_values_by_ds = get_pfa_overall_values()

def simulate(fp, ds):
  kc_values = kc_values_by_ds[ds]
  stu_intercepts = stu_values_by_ds[ds] 
  ovr_params = ovr_values_by_ds[ds]

  columns = ['user_id', 'item_id', 'timestamp', 'correct', 'skill_id', 's_opp', 'f_opp', 'prob']
  opps = {}
  rows = []
  df = pd.read_csv(fp, delimiter='\t')
  for _, row in loading(list(df.iterrows())):
    user_id = row['user_id']
    skill_id = row['skill_id']

    kc_value = kc_values[skill_id]
    stu_intercept = stu_intercepts[user_id]

    if user_id not in opps:
      opps[user_id] = {}
    if skill_id not in opps[user_id]:
      opps[user_id][skill_id] = {'s': 0, 'f': 0}

    prior_s = opps[user_id][skill_id]['s']
    prior_f = opps[user_id][skill_id]['f']

    dat = pd.DataFrame([[user_id, row['item_id'], row['timestamp'], None, skill_id, prior_s, prior_f, None]], columns=columns)

    print(dat)
    print(ovr_params)
    print(kc_value)
    print(stu_intercept)
    prob = calc_pfa_prob(prior_s, prior_f, 
                        ovr_params['s_slope'] + kc_value['s_slope_kc'], 
                        ovr_params['f_slope'] + kc_value['f_slope_kc'],
                        kc_value['intercept'], stu_intercept, ovr_params['intercept'])
    correct = np.random.choice([1, 0], p=[prob, 1-prob])

    # prob = model.predict(dat)[0]
    # correct = np.random.choice([1, 0], p=[prob, 1-prob])
    opps[user_id][skill_id]['s' if correct == 1 else 0] += 1

    dat['prob'] = prob
    dat['correct'] = correct
    rows.append(dat)
    
  final = pd.concat(rows, axis=0)
  final.to_csv(f'./simulation/simulated-data/{ds}.csv', index=False, sep='\t')

# simulate('data/statics/preprocessed_data.csv', 'statics')
simulate('data/spanish/preprocessed_data.csv', 'spanish')


In [4]:
# kc_values_by_ds['spanish']

import pickle
model = pickle.load(open('./data/statics/real-model.sav', 'rb'))
model

ModuleNotFoundError: No module named 'sklearn.linear_model.logistic'

In [None]:
excludes = ['statics']

for ds in datasets:
  if ds in excludes:
    continue
  simulate(f'data/{ds}/preprocessed_data.csv', ds)

In [2]:
import os

folder_path = "data"
folders = [f for f in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, f))]

print(folders)

['algebra05', 'assistments09', 'assistments12', 'assistments15', 'assistments17', 'assistments17_first_attempt', 'assistments17_single_attempt', 'bridge_algebra06', 'spanish', 'statics']


In [3]:
model = load_model(f'./simulation/models/statics-model.joblib')

In [8]:
print(model.coefs)
print(model.ranef)

             Estimate    2.5_ci   97.5_ci        SE        OR  OR_2.5_ci  \
(Intercept)  1.130310  0.941978  1.318643  0.096090  3.096618   2.565050   
s_opp        0.128398  0.077571  0.179225  0.025933  1.137006   1.080659   
f_opp       -0.206369 -0.287866 -0.124873  0.041581  0.813533   0.749862   

             OR_97.5_ci      Prob  Prob_2.5_ci  Prob_97.5_ci     Z-stat  \
(Intercept)    3.738344  0.755896     0.719499      0.788956  11.763075   
s_opp          1.196290  0.532055     0.519383      0.544687   4.951207   
f_opp          0.882609  0.448590     0.428526      0.468822  -4.963098   

                    P-val  Sig  
(Intercept)  6.049042e-32  ***  
s_opp        7.375464e-07  ***  
f_opp        6.937743e-07  ***  
[     X.Intercept.
0        0.301730
1       -0.838542
2        0.547372
3       -0.138380
4        0.232486
5       -1.704422
6       -1.444545
7       -0.340407
8       -1.003143
9       -0.291403
10       0.882705
11      -0.335110
12       0.461184
13       

In [7]:
model.fixef

[     (Intercept)     s_opp     f_opp
 0       1.432041  0.128398 -0.206369
 1       0.291769  0.128398 -0.206369
 2       1.677683  0.128398 -0.206369
 3       0.991930  0.128398 -0.206369
 4       1.362797  0.128398 -0.206369
 5      -0.574112  0.128398 -0.206369
 6      -0.314234  0.128398 -0.206369
 7       0.789904  0.128398 -0.206369
 8       0.127168  0.128398 -0.206369
 9       0.838908  0.128398 -0.206369
 10      2.013016  0.128398 -0.206369
 11      0.795200  0.128398 -0.206369
 12      1.591495  0.128398 -0.206369
 13      1.317469  0.128398 -0.206369
 14      0.682414  0.128398 -0.206369
 15      0.892277  0.128398 -0.206369
 16      0.753503  0.128398 -0.206369
 17      1.117269  0.128398 -0.206369
 18      0.581880  0.128398 -0.206369
 19      0.877707  0.128398 -0.206369
 20      0.370357  0.128398 -0.206369
 21      1.697117  0.128398 -0.206369
 22      1.866658  0.128398 -0.206369
 23      1.792020  0.128398 -0.206369
 24      1.415984  0.128398 -0.206369
 25      1.6

In [19]:
# df = pd.read_csv('simulation/simulated-data/statics.csv', delimiter='\t')
df = pd.read_csv('simulation/simulated-data/algebra05.csv', delimiter='\t')
df.s_opp.unique()
df.correct.value_counts()

correct
1    356114
0    250869
Name: count, dtype: int64

correct
1    119002
0     70295
Name: count, dtype: int64

In [20]:
df2 = pd.read_csv('data/algebra05/preprocessed_data.csv', delimiter='\t')
df.correct.value_counts()

correct
1    356114
0    250869
Name: count, dtype: int64

In [2]:
import os
files = [f'./data/{f}/real-model.sav'for f in os.listdir('./data')]
for f in files:
  size = os.path.getsize(f)
  print(f"{f} - {size/(1024*1024)}")



./data/algebra05/real-model.sav - 1.3240928649902344


FileNotFoundError: [Errno 2] No such file or directory: './data/assistments09/real-model.sav'

What do I need to do?

- Fit BestLR and the use it to generate data + know the ground truth
- Try to fit Both BestLR, PFA, and DKT on the generated data


assisstments09, assistments15, spanish, algebra05