# 🧠 Synthetic Student Data Generator

This notebook generates synthetic student records from a column-metadata schema, then fine-tunes a small causal language model (GPT-2) on those records so that it can generate new ones.

In [None]:
# 📦 Install Dependencies (Uncomment if needed)
# !pip install pandas numpy faker transformers torch datasets

In [None]:
import pandas as pd
import numpy as np
from faker import Faker
import random
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset

## 📁 Define Metadata

In [None]:
# Column schema for the synthetic student table. Each key is a column name;
# each value tells generate_student_record how to produce that column's value.
metadata = {
    # Identifier built from the "SID####" template.
    "student_id": {
        "type": "string",
        "pattern": "SID####",
    },
    # Names come from Faker.
    "first_name": {
        "type": "string",
        "source": "faker.first_name",
    },
    "last_name": {
        "type": "string",
        "source": "faker.last_name",
    },
    # Numeric columns carry a [low, high] range.
    "age": {
        "type": "int",
        "range": [16, 25],
    },
    # Categorical columns list their allowed values.
    "grade": {
        "type": "categorical",
        "values": ["A", "B", "C", "D", "F"],
    },
    "gpa": {
        "type": "float",
        "range": [2.0, 4.0],
    },
    "enrollment_status": {
        "type": "categorical",
        "values": ["enrolled", "dropped", "graduated"],
    },
}

## 🤖 Generate Synthetic Data

In [None]:
# Seed both random sources so that Restart & Run All regenerates the same
# synthetic table: the `random` module drives IDs, ages, GPAs and categories,
# while Faker drives the first and last names.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Shared Faker instance used by generate_student_record for the name columns.
fake = Faker()

def _fill_pattern(pattern):
    """Expand a template such as ``"SID####"`` into a concrete string.

    Every maximal run of ``n`` ``#`` characters is replaced by one random
    ``n``-digit number without a leading zero (``####`` -> 1000..9999); every
    other character is copied through unchanged.
    """
    pieces = []
    pos = 0
    while pos < len(pattern):
        if pattern[pos] != "#":
            pieces.append(pattern[pos])
            pos += 1
            continue
        # Measure the run of '#' starting at pos.
        end = pos
        while end < len(pattern) and pattern[end] == "#":
            end += 1
        width = end - pos
        # One randint per run keeps "SID####" identical to randint(1000, 9999).
        pieces.append(str(random.randint(10 ** (width - 1), 10 ** width - 1)))
        pos = end
    return "".join(pieces)


def generate_student_record(meta):
    """Build one synthetic record from a column-metadata mapping.

    Args:
        meta: Mapping of column name -> spec dict. Supported specs:
            * ``{"type": "string", "pattern": "..."}`` - template in which
              runs of ``#`` are replaced by random digits.
            * ``{"type": "string", "source": "faker.<method>"}`` - value
              returned by calling ``<method>`` on the module-level ``fake``.
            * ``{"type": "int", "range": [low, high]}`` - random integer,
              both ends inclusive.
            * ``{"type": "float", "range": [low, high]}`` - random float
              rounded to 2 decimals.
            * ``{"type": "categorical", "values": [...]}`` - random choice.

    Returns:
        dict mapping every column in ``meta`` to a generated value, in the
        same order as ``meta``.

    Raises:
        ValueError: If a column's spec is not one of the supported forms.
            Raising here avoids silently producing records that lack a column.
    """
    record = {}
    for col, props in meta.items():
        kind = props["type"]
        if kind == "string":
            pattern = props.get("pattern")
            source = props.get("source") or ""
            if pattern is not None:
                record[col] = _fill_pattern(pattern)
            elif source.startswith("faker."):
                # Resolve "faker.first_name" -> fake.first_name, and so on.
                provider = getattr(fake, source[len("faker."):], None)
                if not callable(provider):
                    raise ValueError(f"Unknown Faker source {source!r} for column {col!r}")
                record[col] = provider()
            else:
                raise ValueError(
                    f"String column {col!r} needs a 'pattern' or a 'faker.<method>' source"
                )
        elif kind == "int":
            record[col] = random.randint(*props["range"])
        elif kind == "float":
            record[col] = round(random.uniform(*props["range"]), 2)
        elif kind == "categorical":
            record[col] = random.choice(props["values"])
        else:
            raise ValueError(f"Unsupported type {kind!r} for column {col!r}")
    return record

def generate_dataset(meta, num_rows=500):
    """Generate ``num_rows`` synthetic records and collect them in a DataFrame.

    Args:
        meta: Column-metadata mapping, passed unchanged to
            ``generate_student_record`` for every row.
        num_rows: Number of records to generate (default 500).

    Returns:
        pandas.DataFrame with one row per generated record.
    """
    rows = []
    for _ in range(num_rows):
        rows.append(generate_student_record(meta))
    return pd.DataFrame(rows)

# Build the table with generate_dataset's default row count, then preview the first rows.
df = generate_dataset(metadata)
df.head()

## 🧹 Prepare Text for Tokenization

In [None]:
# Format dataset for language model training.
# Each record is serialised as "col: val, col: val, ...". The "text" column is
# left out of the serialisation so that re-running this cell, when df already
# has a "text" column, does not embed the previous text inside the new one.
record_cols = [col for col in df.columns if col != "text"]
df["text"] = df[record_cols].apply(
    lambda row: ", ".join(f"{col}: {val}" for col, val in row.items()),
    axis=1,
)
# Only the serialised text is handed to the Hugging Face dataset.
dataset = Dataset.from_pandas(df[["text"]])

## 🧠 Tokenize and Train GPT-2 Model

In [None]:
# Checkpoint name shared by the tokenizer here and the model loaded further down.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the pad token; tokenize_function pads every
# example with padding="max_length", which uses the tokenizer's pad token.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize record text and attach causal-LM training labels.

    Args:
        examples: Mapping with a ``"text"`` entry: a list of strings when
            called through ``Dataset.map(..., batched=True)``, or a single
            string when called on one example.

    Returns:
        The tokenizer output (padded/truncated to 64 tokens) with an added
        ``"labels"`` entry. Labels copy ``input_ids`` and use ``-100`` at
        padded positions so those positions are ignored by the loss.
    """
    encoded = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=64)

    def _labels(ids, mask):
        # Use the attention mask rather than the token id to find padding:
        # the pad token is the EOS token, which may also be a real token.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    # Without labels the causal LM returns no loss and the Trainer cannot train.
    if isinstance(examples["text"], str):
        encoded["labels"] = _labels(encoded["input_ids"], encoded["attention_mask"])
    else:
        encoded["labels"] = [
            _labels(ids, mask)
            for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
        ]
    return encoded

# Tokenize every record in batches.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Pretrained causal LM to fine-tune; same checkpoint as the tokenizer.
model = AutoModelForCausalLM.from_pretrained(model_name)

# No evaluation strategy is requested: no eval_dataset is passed to the Trainer
# below, and asking for per-epoch evaluation without one makes the Trainer raise.
# logging_steps stays well below the run length (with the default 500 rows,
# batch size 8 and 3 epochs that is roughly 190 optimizer steps on one device),
# so the training loss is actually reported.
training_args = TrainingArguments(
    output_dir="./synthetic_student_model",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=20,
    save_steps=10_000
)

# NOTE(review): newer transformers releases rename the `tokenizer` argument to
# `processing_class` - confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

trainer.train()

## 🧪 Generate Example Records

In [None]:
def generate_student(prompt="Student record:", max_new_tokens=50, **generate_kwargs):
    """Generate one text continuation of ``prompt`` with the notebook's model.

    Args:
        prompt: Text the model continues. The fine-tuning text in this
            notebook has the form ``"student_id: ..., first_name: ..."``, so a
            prompt in that format (e.g. ``"student_id:"``) is closer to what
            the model was trained on than the default.
        max_new_tokens: Maximum number of tokens generated after the prompt.
        **generate_kwargs: Extra keyword arguments forwarded to
            ``model.generate`` (e.g. ``do_sample=True, top_p=0.95`` to get a
            different record on each call instead of greedy decoding).

    Returns:
        str: The decoded sequence (prompt plus continuation) with special
        tokens stripped.
    """
    # Switch to inference mode so dropout, which may still be active after
    # trainer.train(), does not perturb generation.
    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # The tokenizer pads with its EOS token; pass that id explicitly unless the
    # caller supplied their own.
    generate_kwargs.setdefault("pad_token_id", tokenizer.eos_token_id)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, **generate_kwargs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print one generated record using generate_student's default prompt and token limit.
print(generate_student())