## Data Cleaning/Formatting


In [27]:
import pandas as pd

# Load the data
resp_data = pd.read_csv('resp_data 1.csv', sep=";", encoding='utf-8')
iteminfo = pd.read_csv('iteminfo.csv', sep=';', encoding='utf-8')

resp_data.drop('item_score', axis=1, inplace=True)
resp_data.drop(resp_data.columns[0], axis=1, inplace=True)
resp_data.rename(columns={'vs': 'exam_score'}, inplace=True)
resp_data.rename(columns={'toets': 'time'}, inplace=True)
resp_data['exam_score'] = resp_data['exam_score'].str.replace(',', '.').astype(float)
resp_data = resp_data.merge(iteminfo[['item_id', 'domain']], on='item_id', how='left')
time_to_exam_id = {
    'M4': 1,
    'E4': 2,
    'M5': 3,
    'E5': 4,
    'M6': 5,
    'E6': 6,
    'M7': 7,
    'E7': 8
}

# Create exam_id column as integer based on time
resp_data['exam_id'] = resp_data['time'].map(time_to_exam_id)


for i in range(7):
    
    group_size = 1070430

    # Shift person_id by group_size rows
    resp_data['person_id'] = resp_data['person_id'].shift(group_size).fillna(resp_data['person_id'])

    # Convert any float person_ids back to string (if needed)
    resp_data['person_id'] = resp_data['person_id'].astype(str)

resp_data.head()


Unnamed: 0,exam_score,school_id,person_id,time,item_id,response,domain,exam_id
0,151.802923,1,s1p1,M4,item03001,0,A,1
1,151.802923,1,s1p1,M4,item03002,0,B,1
2,151.802923,1,s1p1,M4,item03003,1,C,1
3,151.802923,1,s1p1,M4,item03004,0,A,1
4,151.802923,1,s1p1,M4,item03005,1,B,1


In [28]:
# Test: Check if all person_id values in each group are the same
group_size = 1070430
passed = True

for start in range(0, len(resp_data), group_size):
    group = resp_data.iloc[start:start+group_size]
    if group['person_id'].nunique() != 1:
        print(f"Test failed in rows {start} to {start+group_size-1}")
        passed = False

if passed:
    print("Test passed: All person_id values in each group are the same.")


s1p1_rows = resp_data[resp_data['person_id'] == 's1p1']
print(s1p1_rows)
print(f"Total rows with person_id 's1p1': {len(s1p1_rows)}")

Test failed in rows 0 to 1070429
Test failed in rows 1070430 to 2140859
Test failed in rows 2140860 to 3211289
Test failed in rows 3211290 to 4281719
Test failed in rows 4281720 to 5352149
Test failed in rows 5352150 to 6422579
Test failed in rows 6422580 to 7493009
Test failed in rows 7493010 to 8563439
         exam_score  school_id person_id time    item_id  response domain  \
0        151.802923          1      s1p1   M4  item03001         0      A   
1        151.802923          1      s1p1   M4  item03002         0      B   
2        151.802923          1      s1p1   M4  item03003         1      C   
3        151.802923          1      s1p1   M4  item03004         0      A   
4        151.802923          1      s1p1   M4  item03005         1      B   
...             ...        ...       ...  ...        ...       ...    ...   
7493035  254.980855          1      s1p1   E7  item12026         1      B   
7493036  254.980855          1      s1p1   E7  item12027         1      C   
7

In [29]:
import json

# Assume resp_data has columns: school_id, person_id, exam_id, exam_score, time, domain, response

# First, calculate items_correct per student, per exam, per domain
items_correct = (
    resp_data
    .groupby(['school_id', 'person_id', 'exam_id', 'exam_score', 'time', 'domain'])['response']
    .sum()
    .reset_index()
)

# Pivot so each domain is a key in items_correct
items_pivot = (
    items_correct
    .pivot_table(index=['school_id', 'person_id', 'exam_id', 'exam_score', 'time'],
                 columns='domain', values='response', fill_value=0)
    .reset_index()
)

# Build the nested structure
schools = []
for school_id, school_df in items_pivot.groupby('school_id'):
    students = []
    for person_id, student_df in school_df.groupby('person_id'):
        exams = []
        for _, row in student_df.iterrows():
            # Collect items_correct as a dict of domain: count
            domains = row.index[5:]  # skip first 5 columns
            items_correct_dict = {domain: int(row[domain]) for domain in domains}
            exams.append({
                "exam_id": row['exam_id'],
                "exam_score": int(row['exam_score']),
                "time": row['time'],
                "items_correct": items_correct_dict
            })
        students.append({
            "id": f"student_{person_id}",
            "key": person_id,
            "exams": exams
        })
    schools.append({
        "id": f"school_{school_id}",
        "students": students
    })

result = {"schools": schools}

# Save to JSON file
with open("resp_data.json", "w", encoding="utf-8") as f:
    json.dump(result, f, ensure_ascii=False, indent=2)

In [30]:
import json
import numpy as np

# Load the JSON data
with open("resp_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Collect all exam scores by exam_id
exam_scores = {}

for school in data["schools"]:
    for student in school["students"]:
        for exam in student["exams"]:
            exam_id = exam["exam_id"]
            score = exam["exam_score"]
            if exam_id not in exam_scores:
                exam_scores[exam_id] = []
            exam_scores[exam_id].append(score)

# Calculate mean and std for each exam_id
for exam_id, scores in exam_scores.items():
    mean = np.mean(scores)
    std = np.std(scores)
    print(f"Exam ID: {exam_id} | Mean: {mean:.2f} | Std: {std:.2f}")

Exam ID: 1 | Mean: 168.81 | Std: 28.38
Exam ID: 2 | Mean: 190.53 | Std: 27.09
Exam ID: 3 | Mean: 206.68 | Std: 27.15
Exam ID: 4 | Mean: 218.15 | Std: 26.12
Exam ID: 5 | Mean: 229.61 | Std: 25.71
Exam ID: 6 | Mean: 241.46 | Std: 26.09
Exam ID: 7 | Mean: 252.45 | Std: 26.34
Exam ID: 8 | Mean: 261.43 | Std: 25.27


In [31]:
import json
import numpy as np

# Load the JSON data
with open("resp_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Collect all items_correct per domain by exam_id
items_correct_stats = {}

for school in data["schools"]:
    for student in school["students"]:
        for exam in student["exams"]:
            exam_id = exam["exam_id"]
            items = exam["items_correct"]
            if exam_id not in items_correct_stats:
                items_correct_stats[exam_id] = {}
            for domain, value in items.items():
                if domain not in items_correct_stats[exam_id]:
                    items_correct_stats[exam_id][domain] = []
                items_correct_stats[exam_id][domain].append(value)

# Calculate mean and std for each domain in each exam_id
for exam_id, domains in items_correct_stats.items():
    print(f"Exam ID: {exam_id}")
    for domain, values in domains.items():
        mean = np.mean(values)
        std = np.std(values)
        print(f"  Domain: {domain} | Mean: {mean:.2f} | Std: {std:.2f}")

Exam ID: 1
  Domain: A | Mean: 8.49 | Std: 1.19
  Domain: B | Mean: 5.79 | Std: 1.84
  Domain: C | Mean: 6.21 | Std: 1.75
Exam ID: 2
  Domain: A | Mean: 8.97 | Std: 1.04
  Domain: B | Mean: 7.68 | Std: 1.45
  Domain: C | Mean: 6.79 | Std: 1.67
Exam ID: 3
  Domain: A | Mean: 8.40 | Std: 1.20
  Domain: B | Mean: 8.10 | Std: 1.34
  Domain: C | Mean: 7.20 | Std: 1.54
Exam ID: 4
  Domain: A | Mean: 7.77 | Std: 1.33
  Domain: B | Mean: 7.45 | Std: 1.53
  Domain: C | Mean: 6.79 | Std: 1.70
Exam ID: 5
  Domain: A | Mean: 7.63 | Std: 1.44
  Domain: B | Mean: 7.64 | Std: 1.40
  Domain: C | Mean: 7.23 | Std: 1.49
Exam ID: 6
  Domain: A | Mean: 7.74 | Std: 1.48
  Domain: B | Mean: 7.52 | Std: 1.46
  Domain: C | Mean: 7.75 | Std: 1.43
Exam ID: 7
  Domain: A | Mean: 8.56 | Std: 1.24
  Domain: B | Mean: 8.42 | Std: 1.14
  Domain: C | Mean: 7.30 | Std: 1.53
Exam ID: 8
  Domain: A | Mean: 6.87 | Std: 1.52
  Domain: B | Mean: 7.80 | Std: 1.41
  Domain: C | Mean: 7.82 | Std: 1.36


In [32]:
import json
import numpy as np

# Load the JSON data
with open("resp_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# --- Exam score statistics ---
exam_scores = {}
for school in data["schools"]:
    for student in school["students"]:
        for exam in student["exams"]:
            exam_id = exam["exam_id"]
            score = exam["exam_score"]
            if exam_id not in exam_scores:
                exam_scores[exam_id] = []
            exam_scores[exam_id].append(score)

exam_score_stats = {}
for exam_id, scores in exam_scores.items():
    exam_score_stats[exam_id] = {
        "mean": float(np.mean(scores)),
        "std": float(np.std(scores))
    }

with open("exam_score_stats.json", "w", encoding="utf-8") as f:
    json.dump(exam_score_stats, f, ensure_ascii=False, indent=2)

# --- Items correct statistics ---
items_correct_stats = {}
for school in data["schools"]:
    for student in school["students"]:
        for exam in student["exams"]:
            exam_id = exam["exam_id"]
            items = exam["items_correct"]
            if exam_id not in items_correct_stats:
                items_correct_stats[exam_id] = {}
            for domain, value in items.items():
                if domain not in items_correct_stats[exam_id]:
                    items_correct_stats[exam_id][domain] = []
                items_correct_stats[exam_id][domain].append(value)

items_correct_stats_out = {}
for exam_id, domains in items_correct_stats.items():
    items_correct_stats_out[exam_id] = {}
    for domain, values in domains.items():
        items_correct_stats_out[exam_id][domain] = {
            "mean": float(np.mean(values)),
            "std": float(np.std(values))
        }

with open("items_correct_stats.json", "w", encoding="utf-8") as f:
    json.dump(items_correct_stats_out, f, ensure_ascii=False, indent=2)
    

In [None]:
import json
import numpy as np

# Load the JSON data
with open("resp_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Collect all items_correct per domain by exam_id
items_correct_stats = {}

for school in data["schools"]:
    for student in school["students"]:
        for exam in student["exams"]:
            exam_id = exam["exam_id"]
            items = exam["items_correct"]
            if exam_id not in items_correct_stats:
                items_correct_stats[exam_id] = {}
            for domain, value in items.items():
                if domain not in items_correct_stats[exam_id]:
                    items_correct_stats[exam_id][domain] = []
                items_correct_stats[exam_id][domain].append(value)

# Calculate mean and std for each domain in each exam_id and save as JSON
items_correct_stats_out = {}
for exam_id, domains in items_correct_stats.items():
    items_correct_stats_out[exam_id] = {}
    for domain, values in domains.items():
        items_correct_stats_out[exam_id][domain] = {
            "mean": float(np.mean(values)),
            "std": float(np.std(values))
        }

with open("items_correct_stats.json", "w", encoding="utf-8") as f:
    json.dump(items_correct_stats_out, f, ensure_ascii=False, indent=2)

In [1]:
import json

# Load the JSON data
with open("resp_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Count unique students across all schools
student_ids = set()
for school in data["schools"]:
    for student in school["students"]:
        student_ids.add(student["key"])

print(f"Total number of students: {len(student_ids)}")

Total number of students: 35681
