In [2]:
import os
import sys
# NOTE: despite its name, `current_dir` is the PARENT of the working directory
# (dirname of the absolute cwd). The name is kept in case later cells use it.
# Presumably this is the directory containing the SINKT package -- confirm.
current_dir = os.path.dirname(os.path.abspath('.'))
# Append only once so that re-running this cell does not stack duplicate
# entries on sys.path.
if current_dir not in sys.path:
    sys.path.append(current_dir)
import pandas as pd
from pathlib import Path
import random
from dotenv import load_dotenv
from langsmith.wrappers import wrap_openai
from openai import OpenAI

from SINKT.agents import QuestionAgent
from SINKT.models import (
    KnowledgeGraphModel, 
    Concept,
    Relation,
)
from SINKT.utils import (get_llm, Models)
from SINKT.knowledge_graph import GraphSelector, GraphXMLBuilder
from SINKT.simulator import StudentSimulator
from SINKT.student import StudentFactory
from SINKT.population import PopulationSimulator

# Load environment variables from the .env file one level above the working
# directory.
load_dotenv('../.env')

# Fail fast if a required key is missing. Each message names the variable
# that was actually checked.
assert os.getenv("OPENAI_API_KEY"), "OPENAI_API_KEY not found"
assert os.getenv("ANTHROPIC_API_KEY"), "ANTHROPIC_API_KEY not found"
assert os.getenv("LANGSMITH_API_KEY"), "LANGSMITH_API_KEY not found"
# OpenAI client wrapped with LangSmith's `wrap_openai`.
client = wrap_openai(OpenAI())

In [3]:
def generate_synthetic_dataset(n_students=100, input_path=".", min_steps=1,
                               max_steps=10, seed=None):
    """Simulate sessions for a population of synthetic students.

    Loads a knowledge graph with ``GraphXMLBuilder.load``, then for each
    student id in ``1..n_students`` creates a profile through
    ``StudentFactory.create_student``, draws a random session length and
    collects the records returned by
    ``PopulationSimulator.run_student_session``.

    Args:
        n_students: Number of students to simulate.
        input_path: Location handed to ``GraphXMLBuilder.load``. Defaults to
            the current working directory, as in the original notebook.
        min_steps: Inclusive lower bound for the per-student session length.
        max_steps: Inclusive upper bound for the per-student session length.
        seed: Optional seed for Python's global ``random`` module, applied
            before any session length is drawn. It makes the drawn session
            lengths repeatable; whether the simulator itself draws from this
            generator is not visible here -- TODO confirm.

    Returns:
        A ``pandas.DataFrame`` built from all collected session records, or
        ``None`` when ``input_path`` does not exist.

    Raises:
        ValueError: If ``min_steps`` is greater than ``max_steps``.
    """
    if min_steps > max_steps:
        raise ValueError(
            f"min_steps ({min_steps}) must not exceed max_steps ({max_steps})"
        )

    # Validate the path before building anything that depends on it.
    input_path = Path(input_path)
    if not input_path.exists():
        print("Data path not found.")
        return None

    if seed is not None:
        random.seed(seed)

    xml_builder = GraphXMLBuilder()
    kg_model = xml_builder.load(input_path)
    sim = PopulationSimulator(kg_model)

    all_data = []
    print(f"Generating data for {n_students} students...")

    for i in range(1, n_students + 1):
        # Create unique profile
        profile = StudentFactory.create_student(i)

        # Random sequence length (min_steps..max_steps interactions, inclusive)
        n_steps = random.randint(min_steps, max_steps)

        # Run session and accumulate its records
        student_history = sim.run_student_session(profile, n_steps, explain_error=True)
        all_data.extend(student_history)

        if i % 10 == 0:
            print(f"Processed {i} students...")

    df = pd.DataFrame(all_data)
    return df

In [4]:
# Run the generator for a single student and keep the resulting frame.
df_population = generate_synthetic_dataset(n_students=1)

# The generator yields None when its data path is missing; skip reporting then.
if df_population is not None:
    # Columns shown in the preview table.
    preview_columns = [
        'student_id',
        'archetype',
        'step',
        'question_id',
        'outcome',
        'p_correct_observed',
    ]

    print("\n--- Dataset Generation Complete ---")
    print(f"Total Interactions: {len(df_population)}")
    print("\nSample Data:")
    display(df_population.head()[preview_columns])

    # Write the full dataset to CSV without the index column.
    df_population.to_csv("synthetic_student_population.csv", index=False)
    print("Saved to synthetic_student_population.csv")

Generating data for 1 students...



--- Dataset Generation Complete ---
Total Interactions: 7

Sample Data:


Unnamed: 0,student_id,archetype,step,question_id,outcome,p_correct_observed
0,S001,The Gamer,1,find,0,0.526
1,S001,The Gamer,2,servidor_ftp,0,0.612
2,S001,The Gamer,3,xubuntu,1,0.683
3,S001,The Gamer,4,metacity,1,0.532
4,S001,The Gamer,5,calc,0,0.441


Saved to synthetic_student_population.csv
