In [1]:
# from statsmodels.genmod.families.links import Logit
import numpy as np
import pandas as pd
from tqdm import tqdm

from pymer4.models import Lmer
from pymer4.io import save_model, load_model


# Dataset names used throughout the notebook (each has a folder under data/).
# 'assistments12' is currently commented out of the list.
datasets = [
  'algebra05',
  'assistments09',
  'assistments15',
  'assistments17',
  'bridge_algebra06',
  'spanish',
  'statics',
]  # 'assistments12'

# Remove pandas' cap on the number of rows shown when a frame is displayed.
pd.options.display.max_rows = None
# pd.reset_option('display.max_rows')

def loading(iters):
    """Wrap ``iters`` in a tqdm progress bar with this notebook's fixed settings.

    Args:
        iters: Any iterable accepted by ``tqdm``.

    Returns:
        The ``tqdm`` wrapper, which yields the same items as ``iters``.
    """
    bar_options = {"desc": "Running ...", "ascii": False, "ncols": 75}
    return tqdm(iters, **bar_options)

def inv_logit(z):
  """Map a log-odds value (scalar or NumPy array) to a probability.

  Computes the logistic function ``1 / (1 + exp(-z))``.
  """
  denominator = 1 + np.exp(-z)
  return 1 / denominator

def calc_pfa_prob(s_opp, f_opp, s_slope, f_slope, stu_intercept, kc_intercept, ovr_intercept):
  """Return the PFA probability of a correct response.

  The log-odds are the success and failure opportunity counts weighted by
  their slopes, plus the student, KC and overall intercepts; the result is
  passed through ``inv_logit``.

  Args:
    s_opp: Count of prior successes.
    f_opp: Count of prior failures.
    s_slope: Weight applied to ``s_opp``.
    f_slope: Weight applied to ``f_opp``.
    stu_intercept: Student intercept.
    kc_intercept: KC (skill) intercept.
    ovr_intercept: Overall intercept.

  Returns:
    ``inv_logit`` of the summed log-odds.
  """
  practice_term = (s_slope * s_opp) + (f_slope * f_opp)
  log_odds = practice_term + stu_intercept + kc_intercept + ovr_intercept
  return inv_logit(log_odds)

In [49]:
### RUN MODEL ###


#Statics PFA no skill intr: AIC: 190050.53378084814, BIC: 190152.04450574025, LogLik: -95015.26689042407
#Statics PFA w/ skill intr: AIC: 190052.53378337782, BIC: 190164.19558075914, LogLik: -95015.26689168891
#Statics PFA no s_opp + f_opp: AIC: 190086.13940559322, BIC: 190177.49905799612, LogLik: -95034.06970279661
#Statics PFA +0 s/f intercept: AIC: 190110.54467400096, BIC: 190161.30003644704, LogLik: -95050.27233700048

#(1|user_id) + (1|skill_id) + (s_opp-1|skill_id) + (f_opp-1|skill_id)
#Statics PFA: AIC: 190110.54467400096, BIC: 190161.30003644704, LogLik: -95050.27233700048

# "correct ~ (1|user_id) + (1+s_opp|skill_id) + (0+f_opp|skill_id)",
#Statics PFA: AIC: 190111.7865006661, BIC: 190172.69293560134, LogLik: -95049.89325033304

# Napol's Formulas 12/4/2023
#formulas = {
#  "AFM": "correct ~ opp + (opp|skill_id) + (1|user_id)",
#  "PFA": "correct ~ s_opp + f_opp + (s_opp|skill_id) + (f_opp|skill_id) + (1|user_id)",
#}

# Model formulas passed to Lmer, keyed by the model type accepted by run().
formulas = dict(
  AFM="correct ~ user_id + skill_id + (opp|skill_id) ",
  # Alternative PFA formula with an extra (1|skill_id) term, kept for reference:
  # "correct ~ (1|user_id) + (1|skill_id) + (s_opp|skill_id) + (f_opp|skill_id)"
  PFA="correct ~ (1|user_id) + (s_opp|skill_id) + (f_opp|skill_id)",
)


# model.fit(control="optimizer='Nelder_Mead', optCtrl = list(FtolAbs=1e-8, XtolRel=1e-8)")
def run(df, model_type="PFA"):
  """Fit the binomial mixed model registered under ``model_type``.

  Args:
    df: Data passed to ``Lmer`` as ``data``.
    model_type: Key into the module-level ``formulas`` dict.

  Returns:
    The fitted ``Lmer`` model. Its AIC, BIC and log-likelihood are printed
    as a side effect.
  """
  formula = formulas[model_type]
  fit_control = "optimizer='bobyqa', optCtrl=list(maxfun=2e5)"

  model = Lmer(formula, data=df, family="binomial")
  model.fit(summarize=False, control=fit_control)

  print(f"{model_type}: AIC: {model.AIC}, BIC: {model.BIC}, LogLik: {model.logLike}")
  return model

In [50]:
def prepare_df(fp):
  """Load a tab-separated interaction file and add opportunity counters.

  Three columns are added, each computed per (user_id, skill_id) pair in
  file order (rows are assumed to be chronological; this is not checked):

  * ``opp``   - number of earlier rows for the pair,
  * ``s_opp`` - number of earlier rows for the pair with ``correct == 1``,
  * ``f_opp`` - number of earlier rows for the pair with ``correct == 0``.

  Args:
    fp: Path or file-like object readable by ``pd.read_csv``.

  Returns:
    The loaded DataFrame with the three extra columns.
  """
  interactions = pd.read_csv(fp, delimiter='\t')
  pair_keys = [interactions['user_id'], interactions['skill_id']]

  interactions['opp'] = interactions.groupby(['user_id', 'skill_id']).cumcount()

  for column, outcome in (('s_opp', 1), ('f_opp', 0)):
    has_outcome = interactions['correct'] == outcome
    # The running total includes the current row, so take the row's own
    # outcome back out to leave only *prior* attempts.
    running_total = has_outcome.groupby(pair_keys).cumsum().fillna(0)
    interactions[column] = running_total - has_outcome.astype('int64')

  return interactions

### Finished Running ###
 
# assistment09_df = prepare_df('data/assistments09/preprocessed_data.csv')
# assistment09_model = run(assistment09_df)

# Add the opportunity-count columns to the assistments15 data, then fit the
# model on it; run() is called with its default model_type ("PFA").
assistment15_df = prepare_df('data/assistments15/preprocessed_data.csv')
assistment15_model = run(assistment15_df)

# spanish_df = prepare_df('data/spanish/preprocessed_data.csv')
# spanish_model = run(spanish_df)

# assistment17_df = prepare_df('data/assistments17/preprocessed_data.csv')
# assistment17_model = run(assistment17_df)

#statics_df = prepare_df('data/statics/preprocessed_data.csv')
#print("PREPARED")
#statics_model = run(statics_df)

# bridge_df = prepare_df('data/bridge_algebra06/preprocessed_data.csv')
# bridge_model = run(bridge_df)

#algebra_df = prepare_df('data/algebra05/preprocessed_data.csv')
#algebra_model = run(algebra_df)

### To Run ###

# assistment12_df = prepare_df('data/assistments12/preprocessed_data.csv')
# assistment12_model = run(assistment12_df)

### Save Models and Data ###

# save_model(assistment09_model, './simulation/models/assistment09-model.joblib')
# save_model(assistment15_model, './simulation/models/assistment15-model.joblib')
# save_model(assistment17_model, './simulation/models/assistment17-model.joblib')
# save_model(spanish_model, './simulation/models/spanish-model.joblib')
# save_model(statics_model, './simulation/models/statics-model.joblib')
# save_model(bridge_model, './simulation/models/bridge-model.joblib')
# save_model(algebra_model, './simulation/models/algebra05.joblib')

# assistment09_df.to_csv('./simulation/extended-data/assistment09.csv')
# assistment15_df.to_csv('./simulation/extended-data/assistment15.csv')
# assistment17_df.to_csv('./simulation/extended-data/assistment17.csv')
# spanish_df.to_csv('./simulation/extended-data/spanish.csv')
# statics_df.to_csv('./simulation/extended-data/statics.csv')
# bridge_df.to_csv('./simulation/extended-data/bridge.csv')
# algebra_df.to_csv('./simulation/extended-data/algebra05.csv')

PFA: AIC: 695649.1045332984, BIC: 695740.2577396702, LogLik: -347816.5522666492


In [40]:
# Display the fitted model's `ranef` attribute (random-effect estimates).
assistment15_model.ranef

[     X.Intercept.
 0        0.358658
 1       -1.163268
 2        0.633782
 3        0.357401
 4        0.318584
 5       -2.104224
 6       -1.670517
 7       -0.388328
 8       -1.158435
 9       -0.390218
 10       1.302089
 11      -0.379742
 12       0.536080
 13       0.088998
 14      -0.627176
 15      -0.351749
 16      -0.460498
 17      -0.069149
 18      -0.733915
 19      -0.292656
 20      -0.864406
 21       0.520616
 22       0.930382
 23       0.644573
 24       0.153157
 25       0.633038
 26      -0.486696
 27       0.168808
 28       0.633773
 29       0.267990
 30       0.201855
 31       0.485027
 32      -0.820568
 33      -0.183453
 34      -1.205741
 35      -0.182776
 36      -0.260512
 37      -0.818273
 38       0.375220
 39      -0.084673
 40      -0.170986
 41      -0.471774
 42      -1.396116
 43       0.447464
 44       0.076344
 45      -1.031629
 46      -1.278699
 47       0.021848
 48       0.130742
 49      -0.576627
 50       0.348531
 51       0.

In [41]:
# Display the fitted model's `fixef` attribute.
assistment15_model.fixef

[     (Intercept)  s_opp:skill_id  skill_id:f_opp
 0       1.514645        0.000202       -0.000261
 1      -0.007281        0.000202       -0.000261
 2       1.789769        0.000202       -0.000261
 3       1.513388        0.000202       -0.000261
 4       1.474571        0.000202       -0.000261
 5      -0.948236        0.000202       -0.000261
 6      -0.514530        0.000202       -0.000261
 7       0.767660        0.000202       -0.000261
 8      -0.002448        0.000202       -0.000261
 9       0.765769        0.000202       -0.000261
 10      2.458077        0.000202       -0.000261
 11      0.776245        0.000202       -0.000261
 12      1.692067        0.000202       -0.000261
 13      1.244985        0.000202       -0.000261
 14      0.528811        0.000202       -0.000261
 15      0.804238        0.000202       -0.000261
 16      0.695489        0.000202       -0.000261
 17      1.086838        0.000202       -0.000261
 18      0.422072        0.000202       -0.000261


In [None]:
def _get_stu_and_kc_params_from_model_pfa(model, ds):
  # if ds in reverse:
  #   kc_params, stu_params = model.ranef
  # else:
  #   stu_params, kc_params = model.ranef

  stu_params, kc_params = model.ranef
  fixef = model.coefs.Estimate

  overall_int = fixef['(Intercept)']
  overall_s_slope = fixef['s_opp']
  overall_f_slope = fixef['f_opp']

  overall = pd.DataFrame([[overall_int, overall_s_slope, overall_f_slope, ds]], columns=['Intercept', 'S_Slope', 'F_Slope', 'Dataset'])

  stu_params['Dataset'] = ds
  stu_params.reset_index(inplace=True)
  stu_params = stu_params.rename(columns={
    "index": "Student",
    "X.Intercept.": "Intercept"
  })

  kc_params['Dataset'] = ds
  kc_params.reset_index(inplace=True)
  kc_params = kc_params.rename(columns={
    "index": "KC",
    "X.Intercept.": "PFA_S_Intercept",
    "X.Intercept..1": "PFA_F_Intercept",
    "s_opp": "S_Slope/KC",
    "f_opp": "F_Slope/KC",
  })

  return stu_params, kc_params, overall


def create_params_files():
  """Export the PFA parameters of every saved model to three CSV files.

  For each name in ``datasets`` the saved model is loaded and its student,
  KC and overall parameters are extracted. The per-dataset frames are then
  concatenated and written under ``./simulation/model-values/``.
  """
  # Output path -> list of per-dataset frames. The insertion order matches
  # the (student, KC, overall) tuple returned by the extraction helper.
  frames_by_path = {
    "./simulation/model-values/pfa_std_params.csv": [],
    "./simulation/model-values/pfa_kc_params.csv": [],
    "./simulation/model-values/pfa_overall_params.csv": [],
  }

  for ds in datasets:
    print(f"Running: {ds}")
    pfa_model = load_model(f'./simulation/models/{ds}-model.joblib')
    extracted = _get_stu_and_kc_params_from_model_pfa(pfa_model, ds)
    for frames, params in zip(frames_by_path.values(), extracted):
      frames.append(params)

  # Concatenate everything before writing anything, so a failure while
  # combining leaves no partially refreshed set of files behind.
  combined_by_path = {
    path: pd.concat(frames, axis=0).reset_index(drop=True)
    for path, frames in frames_by_path.items()
  }

  for path, combined in combined_by_path.items():
    combined.to_csv(path, sep=",", index=False)

# create_params_files()

In [3]:
### GET MODEL VALUES FROM MODEL VALUE CSV. ###

# CSV files produced by create_params_files(); the paths must match the ones
# written there. (The KC path previously contained a doubled "//" separator.)
PFA_STU_FP = "./simulation/model-values/pfa_std_params.csv"
PFA_KC_FP = "./simulation/model-values/pfa_kc_params.csv"
PFA_OVERALL_FP = "./simulation/model-values/pfa_overall_params.csv"

def _get_kc_values(fp, get_value_func):
  df = pd.read_csv(fp, delimiter=',')
  kc_values_by_ds = {}

  for _, row in df.iterrows():
    ds = row['Dataset']
    if ds not in kc_values_by_ds:
      kc_values_by_ds[ds] = {}
    
    kc_name = row['KC']
    if kc_name in kc_values_by_ds[ds]:
      print(f"[ERROR] Duplicated KC: {kc_name}")
    kc_values_by_ds[ds][row['KC']] = get_valus_slope_kce_func(row)
  return kc_values_by_ds

def _get_student_values(fp):
  df = pd.read_csv(fp, delimiter=',')
  stu_intercepts = {}
  for _, row in df.iterrows():
    ds = row['Dataset']
    if ds not in stu_intercepts:
      stu_intercepts[ds] = {}
    stu_intercepts[ds][row['Student']] = row['Intercept']
  return stu_intercepts

def get_pfa_kc_values():
  """Load the per-KC PFA parameters from ``PFA_KC_FP``.

  Returns:
    ``{dataset: {kc: params}}`` where ``params`` has the keys
    ``intercept`` (sum of the two KC intercept columns), ``s_slope_kc``
    and ``f_slope_kc``.
  """
  def _row_to_kc_params(kc_row):
    # The file stores two intercept columns per KC; they are combined
    # into a single KC intercept.
    combined_intercept = kc_row['PFA_S_Intercept'] + kc_row['PFA_F_Intercept']
    return dict(
      intercept=combined_intercept,
      s_slope_kc=kc_row['S_Slope/KC'],
      f_slope_kc=kc_row['F_Slope/KC'],
    )

  return _get_kc_values(PFA_KC_FP, _row_to_kc_params)

def get_pfa_student_values():
  """Load the per-student PFA intercepts from ``PFA_STU_FP``.

  Returns:
    ``{dataset: {student: intercept}}`` as built by ``_get_student_values``.
  """
  student_values = _get_student_values(PFA_STU_FP)
  return student_values

def _get_overall_values(fp, get_value_func):
  df = pd.read_csv(fp, delimiter=',')
  overall_params = {}
  for _, row in df.iterrows():
    ds = row['Dataset']
    if ds not in overall_params:
      overall_params[ds] = {}

    overall_params[ds] = get_value_func(row)
  return overall_params

def get_pfa_overall_values():
  """Load the overall PFA parameters from ``PFA_OVERALL_FP``.

  Returns:
    ``{dataset: params}`` where ``params`` has the keys ``intercept``,
    ``s_slope`` and ``f_slope``.
  """
  # Output key -> CSV column it is read from.
  source_columns = {
    'intercept': 'Intercept',
    's_slope': 'S_Slope',
    'f_slope': 'F_Slope',
  }

  def _row_to_overall_params(overall_row):
    return {key: overall_row[column] for key, column in source_columns.items()}

  return _get_overall_values(PFA_OVERALL_FP, _row_to_overall_params)



In [None]:
# Load the per-dataset PFA parameters once at module level; simulate() reads
# these three globals and indexes each of them by dataset name.
kc_values_by_ds = get_pfa_kc_values()
stu_values_by_ds = get_pfa_student_values()
ovr_values_by_ds = get_pfa_overall_values()

def simulate(fp, ds, seed=None):
  """Simulate responses for one dataset from the stored PFA parameters.

  The interaction file is replayed row by row. For each row the probability
  of a correct answer is computed from the student's running success and
  failure counts on that skill, an outcome is drawn from that probability,
  and the running counts are updated with the *simulated* outcome. The
  result is written to ``./simulation/simulated-data/{ds}.csv``
  (tab-separated).

  Args:
    fp: Path to the tab-separated interaction file; the columns
      ``user_id``, ``skill_id``, ``item_id`` and ``timestamp`` are read.
    ds: Dataset name used to look up parameters in the module-level
      ``kc_values_by_ds``, ``stu_values_by_ds`` and ``ovr_values_by_ds``,
      and to name the output file.
    seed: Optional seed for reproducible draws. With the default ``None``
      the draws come from NumPy's global random state, as before.

  Raises:
    KeyError: If ``ds``, a ``user_id`` or a ``skill_id`` has no stored
      parameters.
  """
  kc_values = kc_values_by_ds[ds]
  stu_intercepts = stu_values_by_ds[ds]
  ovr_params = ovr_values_by_ds[ds]

  # Both the np.random module and a RandomState instance provide .choice().
  rng = np.random if seed is None else np.random.RandomState(seed)

  columns = ['user_id', 'item_id', 'timestamp', 'correct', 'skill_id', 's_opp', 'f_opp', 'prob']
  opps = {}
  records = []
  df = pd.read_csv(fp, delimiter='\t')
  # Materialise the rows so the progress bar knows the total length.
  for _, row in loading(list(df.iterrows())):
    user_id = row['user_id']
    skill_id = row['skill_id']

    kc_value = kc_values[skill_id]
    stu_intercept = stu_intercepts[user_id]

    # Running success/failure counts for this (student, skill) pair.
    counts = opps.setdefault(user_id, {}).setdefault(skill_id, {'s': 0, 'f': 0})
    prior_s = counts['s']
    prior_f = counts['f']

    # Keyword arguments keep each intercept in its intended slot.
    prob = calc_pfa_prob(
      prior_s, prior_f,
      s_slope=ovr_params['s_slope'] + kc_value['s_slope_kc'],
      f_slope=ovr_params['f_slope'] + kc_value['f_slope_kc'],
      stu_intercept=stu_intercept,
      kc_intercept=kc_value['intercept'],
      ovr_intercept=ovr_params['intercept'],
    )
    correct = int(rng.choice([1, 0], p=[prob, 1 - prob]))

    # `correct` is 1 or 0. The earlier comparison against the string
    # 'correct' was never true, so every outcome was counted as a failure.
    counts['s' if correct == 1 else 'f'] += 1

    records.append({
      'user_id': user_id,
      'item_id': row['item_id'],
      'timestamp': row['timestamp'],
      'correct': correct,
      'skill_id': skill_id,
      's_opp': prior_s,
      'f_opp': prior_f,
      'prob': prob,
    })

  # Build the frame once instead of concatenating one tiny frame per row.
  final = pd.DataFrame(records, columns=columns)
  final.to_csv(f'./simulation/simulated-data/{ds}.csv', index=False, sep='\t')

# simulate('data/statics/preprocessed_data.csv', 'statics')
# Run the simulation for the 'spanish' dataset; simulate() writes its result
# to ./simulation/simulated-data/spanish.csv.
simulate('data/spanish/preprocessed_data.csv', 'spanish')


In [None]:
# Inspect the KC parameters loaded for the 'spanish' dataset.
kc_values_by_ds['spanish']

In [None]:
# Datasets to leave out of this batch of simulations.
excludes = ['statics']

# Simulate every remaining dataset from its preprocessed interaction file.
for ds in datasets:
  if ds not in excludes:
    simulate(f'data/{ds}/preprocessed_data.csv', ds)

In [2]:
import os

# Directory whose immediate sub-folders are listed.
folder_path = "data"

# Keep only the entries of `folder_path` that are directories.
folders = list(filter(
  lambda entry: os.path.isdir(os.path.join(folder_path, entry)),
  os.listdir(folder_path),
))

print(folders)

['algebra05', 'assistments09', 'assistments12', 'assistments15', 'assistments17', 'assistments17_first_attempt', 'assistments17_single_attempt', 'bridge_algebra06', 'spanish', 'statics']
