## One-dimensional 1PL-IRT (Rasch) simulation


In [1]:
import os
os.chdir("../..")
import numpy as np
import pandas as pd
import scipy.stats as stats
from config.constants import SEEDS, DATA_PATH
from src.utils.sim import save_resp_matrix
from src.utils.misc import set_random_seeds, sigmoid

In [2]:
# --- simulation hyperparameters ---
# Cohort sizes to simulate: steps of 25 up to 100, then steps of 50 up to 500.
NUM_STUDENTS = [25, 50, 75, 100, *range(150, 501, 50)]
# Number of courses (items) in every simulated dataset.
NUM_COURSES = 19
# Proportion of courses the average student took.
COURSE_PROP = 0.71

# --- IRT parameter distributions ---
# Spread of the ability and difficulty distributions. Both are 1, a value for
# which variance and standard deviation coincide.
ABILITY_VAR = 1
DIFF_VAR = 1

In [3]:
def generate_1pl_parameters(num_stud):
    """Sample abilities and difficulties for a 1PL (Rasch) simulation.

    Both parameter sets are drawn from zero-mean normal distributions whose
    variances are ``ABILITY_VAR`` (abilities) and ``DIFF_VAR`` (difficulties).

    Args:
        num_stud: number of ability values to draw (one per student).

    Returns:
        A tuple ``(thetas, diffs)`` where ``thetas`` holds ``num_stud``
        ability draws and ``diffs`` holds ``NUM_COURSES`` difficulty draws.
    """
    # scipy's `scale` is the standard deviation, not the variance, so convert.
    # For a variance of 1 this yields exactly the same draws as before.
    thetas = stats.norm.rvs(loc=0.0, scale=np.sqrt(ABILITY_VAR), size=num_stud)
    diffs = stats.norm.rvs(loc=0.0, scale=np.sqrt(DIFF_VAR), size=NUM_COURSES)
    return thetas, diffs

def prob_1pl_correct(theta, diff):
    """Return the 1PL success probability: ``sigmoid`` of ability minus difficulty.

    Args:
        theta: ability value.
        diff: difficulty value(s); ``create_resp_matrix`` passes the whole
            difficulty array at once.

    Returns:
        Whatever ``sigmoid`` returns for ``theta - diff``.
    """
    ability_margin = theta - diff
    return sigmoid(ability_margin)

def create_resp_matrix(thetas, diffs, prop=COURSE_PROP):
    """Simulate a student-by-course response matrix with missing entries.

    For every ability in ``thetas`` a 0/1 response is drawn for each course
    with the success probability given by ``prob_1pl_correct``; each entry is
    then independently marked missing (``-1``) with probability ``1 - prop``.

    Args:
        thetas: iterable of ability values, one per student.
        diffs: array of course difficulties; its first dimension gives the
            number of courses.
        prop: probability that an individual response is kept (observed).

    Returns:
        Integer array with one row per student and one column per course,
        holding 1 (correct), 0 (incorrect) or -1 (missing). If ``thetas`` is
        empty, an empty array of shape ``(0, number_of_courses)`` is returned.
    """
    num_items = diffs.shape[0]
    rows = []
    # Students are processed one at a time, drawing responses before the
    # missingness mask, so that seeded runs reproduce the same data.
    for theta in thetas:
        cor_prob = prob_1pl_correct(theta, diffs)
        resp_vec = (np.random.random(num_items) <= cor_prob).astype(int)
        missing = stats.bernoulli.rvs(1 - prop, size=num_items).astype(bool)
        resp_vec[missing] = -1  # indicate missing responses
        rows.append(resp_vec)
    if not rows:
        # np.vstack raises ValueError on an empty sequence.
        return np.empty((0, num_items), dtype=int)
    return np.vstack(rows)

In [4]:
# Simulate one dataset per (cohort size, seed) pair and write it to disk.
for num_stud in NUM_STUDENTS:
    for seed in SEEDS:
        # set_random_seeds is called before any sampling for this dataset
        # (presumably so each file is reproducible from its seed alone).
        set_random_seeds(seed)
        thetas, diffs = generate_1pl_parameters(num_stud)
        mat = create_resp_matrix(thetas, diffs, prop=COURSE_PROP)
        # Share of entries that are not flagged as missing (-1).
        print(seed, "- empirical density:", np.mean(mat != -1))

        # All three output files share this size/seed suffix.
        suffix = f"_n={num_stud}_s={seed}.csv"
        resp_path = DATA_PATH["sim"] + "1d_1pl/data" + suffix
        theta_path = DATA_PATH["sim"] + "1d_1pl/theta" + suffix
        course_path = DATA_PATH["sim"] + "1d_1pl/difficulty" + suffix

        # Response matrix.
        save_resp_matrix(mat, resp_path)

        # Ground-truth abilities, one row per student.
        df = pd.DataFrame({"theta": thetas})
        df.to_csv(theta_path, index=False)

        # Ground-truth difficulties, one row per course.
        df = pd.DataFrame({"difficulty": diffs})
        df.to_csv(course_path, index=False)


0 - empirical density: 0.7031578947368421
1 - empirical density: 0.7136842105263158
2 - empirical density: 0.7452631578947368
3 - empirical density: 0.6968421052631579
4 - empirical density: 0.7073684210526315
5 - empirical density: 0.7094736842105264
6 - empirical density: 0.7178947368421053
7 - empirical density: 0.6968421052631579
8 - empirical density: 0.7178947368421053
9 - empirical density: 0.7073684210526315
10 - empirical density: 0.728421052631579
11 - empirical density: 0.6989473684210527
12 - empirical density: 0.6842105263157895
13 - empirical density: 0.7052631578947368
14 - empirical density: 0.7178947368421053
15 - empirical density: 0.7052631578947368
16 - empirical density: 0.7115789473684211
17 - empirical density: 0.7178947368421053
18 - empirical density: 0.7263157894736842
19 - empirical density: 0.728421052631579
0 - empirical density: 0.6747368421052632
1 - empirical density: 0.7021052631578948
2 - empirical density: 0.7357894736842105
3 - empirical density: 0.7