## One-dimensional 2PL-IRT simulation


In [1]:
import os
os.chdir("../..")
import numpy as np
import pandas as pd
import scipy.stats as stats
from config.constants import SEEDS, DATA_PATH
from src.utils.sim import save_resp_matrix
from src.utils.misc import set_random_seeds, sigmoid

In [2]:
# hyperparameters
NUM_STUDENTS = 400  # number of student abilities drawn per simulated data set
NUM_COURSES = 20  # number of (difficulty, discrimination) pairs drawn per data set
# proportion of courses average student took; create_resp_matrix marks each
# response as missing (-1) independently with probability 1 - COURSE_PROP
COURSE_PROP = 0.8

# controls IRT parameters
# NOTE: despite the *_VAR names, generate_2pl_parameters passes these values
# as scipy's `scale` argument, so they act as standard deviations.
ABILITY_VAR = 1  # spread of the zero-mean normal student abilities
DIFF_VAR = 1  # spread of the zero-mean normal course difficulties
DISC_MEAN = 1  # location of the truncated normal used for discriminations
DISC_VAR = 0.5  # spread of the truncated normal used for discriminations

In [3]:
def generate_2pl_parameters(num_students=None, num_courses=None,
                            ability_scale=None, diff_scale=None,
                            disc_mean=None, disc_scale=None):
    """Draw random 2PL-IRT parameters for one simulated data set.

    Every argument defaults to ``None``, in which case the matching
    module-level hyperparameter is used, so the zero-argument call keeps
    working.  The spreads are forwarded to scipy's ``scale`` argument, i.e.
    they are standard deviations (this also holds for the ``*_VAR``
    module constants used as fallbacks).

    Args:
        num_students: number of abilities to draw (default NUM_STUDENTS).
        num_courses: number of difficulty/discrimination pairs to draw
            (default NUM_COURSES).
        ability_scale: spread of the zero-mean normal abilities
            (default ABILITY_VAR).
        diff_scale: spread of the zero-mean normal difficulties
            (default DIFF_VAR).
        disc_mean: location of the truncated normal discriminations
            (default DISC_MEAN).
        disc_scale: spread of the truncated normal discriminations
            (default DISC_VAR).

    Returns:
        Tuple ``(thetas, diffs, discs)`` of 1-D arrays with lengths
        ``num_students``, ``num_courses`` and ``num_courses``.
        Discriminations are truncated below at zero.
    """
    if num_students is None:
        num_students = NUM_STUDENTS
    if num_courses is None:
        num_courses = NUM_COURSES
    if ability_scale is None:
        ability_scale = ABILITY_VAR
    if diff_scale is None:
        diff_scale = DIFF_VAR
    if disc_mean is None:
        disc_mean = DISC_MEAN
    if disc_scale is None:
        disc_scale = DISC_VAR

    thetas = stats.norm.rvs(loc=0.0, scale=ability_scale, size=num_students)
    diffs = stats.norm.rvs(loc=0.0, scale=diff_scale, size=num_courses)

    # truncnorm expects its clip points in standardised units,
    # (clip - loc) / scale; passing the raw offset would truncate at
    # loc - disc_mean * scale instead of at zero.
    lower = (0.0 - disc_mean) / disc_scale
    # np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0
    discs = stats.truncnorm.rvs(lower, np.inf, loc=disc_mean,
                                scale=disc_scale, size=num_courses)
    return thetas, diffs, discs

def prob_2pl_correct(theta, diff, disc):
    """Return the 2PL probability of a correct response.

    Computes ``sigmoid(disc * (theta - diff))`` using the project's
    ``sigmoid`` helper (presumably the logistic function -- confirm in
    src.utils.misc).  The arithmetic is plain ``*`` and ``-``, so scalars
    and NumPy arrays are combined with the usual broadcasting rules.
    """
    return sigmoid(disc * (theta - diff))

def create_resp_matrix(thetas, diffs, discs, prop=COURSE_PROP):
    """Simulate a response matrix with randomly missing entries.

    One row is produced per element of ``thetas`` and one column per
    element of ``diffs``.  Each entry is 1 (correct), 0 (incorrect) or
    -1 (missing); an entry is marked missing independently with
    probability ``1 - prop``.

    Assumes ``diffs`` and ``discs`` are 1-D arrays of equal length.
    """
    num_items = diffs.shape[0]
    rows = []
    for ability in thetas:
        p_correct = prob_2pl_correct(ability, diffs, discs)
        # a uniform draw at or below the success probability is a correct answer
        answers = (np.random.random(num_items) <= p_correct).astype(int)
        # drawn after the answers so the random stream order is unchanged
        missing = stats.bernoulli.rvs(1 - prop, size=num_items).astype(bool)
        answers[missing] = -1  # indicate missing responses
        rows.append(answers)
    return np.vstack(rows)

In [4]:
# simulate one data set per seed and write it, with its true parameters, to disk
for seed in SEEDS:
    set_random_seeds(seed)
    thetas, diffs, discs = generate_2pl_parameters()
    mat = create_resp_matrix(thetas, diffs, discs, prop=COURSE_PROP)

    # response matrix
    resp_path = f'{DATA_PATH["sim"]}1d_2pl/data_{seed}.csv'
    save_resp_matrix(mat, resp_path)

    # student abilities, one row per student
    theta_path = f'{DATA_PATH["sim"]}1d_2pl/theta_{seed}.csv'
    df = pd.DataFrame({"theta": thetas})
    df.to_csv(theta_path, index=False)

    # course parameters, one row per course
    course_path = f'{DATA_PATH["sim"]}1d_2pl/course_{seed}.csv'
    df = pd.DataFrame({"difficulty": diffs, "discrimination": discs})
    df.to_csv(course_path, index=False)
