In [None]:
# Efficient and Scalable Automatic Scoring in STEM Education
# ----------------------------------------------------------
# Activity 1: Optimizing Scoring Models (Knowledge Distillation)
# Activity 2: Scoring in Large-Scale Assessments

import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import torch

In [None]:
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# Seed the global NumPy and PyTorch generators up front so that random weight
# initialisation and sampling in later cells are repeatable on a fresh
# "Restart Kernel -> Run All".
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when PyTorch can see one; otherwise everything runs on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
# 1. Load and Prepare a Simple Dataset
# Toy corpus: each record pairs a student response with its score (1=Correct, 0=Incorrect).
# NOTE(review): only one of the eight records carries score 0, so after the split below
# one side may contain a single class -- acceptable for a demo, not for real evaluation.
scored_responses = [
    ("Plants use photosynthesis to make food.", 1),
    ("Ice melts when heated.", 1),
    ("Gravity pulls objects down.", 1),
    ("Friction slows moving things.", 1),
    ("A magnet attracts metal.", 1),
    ("Water boils at 100 degrees.", 1),
    ("The sky is blue because of scattered light.", 1),
    ("Electricity flows in a closed circuit.", 0),
]
data = pd.DataFrame(scored_responses, columns=["response", "score"])

# Hold out a quarter of the records for testing; the fixed random_state keeps the split stable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data["response"].tolist(),
    data["score"].tolist(),
    test_size=0.25,
    random_state=42,
)


In [None]:
# 2. Train the "Teacher" Model (BERT)
teacher_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode the train and test texts with identical tokenizer settings.
train_encodings, test_encodings = (
    teacher_tokenizer(split_texts, truncation=True, padding=True, max_length=64)
    for split_texts in (train_texts, test_texts)
)

class SimpleDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with per-example labels.

    ``encodings`` maps each field name to an indexable collection of per-example
    values; ``labels`` is an indexable collection of labels. Both are stored as
    given and are assumed to be aligned index-for-index.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its label under ``'labels'``."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

# Wrap the tokenized splits so the Trainer can index them example by example.
train_dataset = SimpleDataset(train_encodings, train_labels)
test_dataset = SimpleDataset(test_encodings, test_labels)

teacher_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
teacher_model.to(device)

# The `evaluation_strategy` argument was renamed to `eval_strategy` in newer transformers
# releases and the old spelling was later removed. Use whichever name the installed
# version declares so this cell runs on both old and new installations.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

teacher_args = TrainingArguments(
    output_dir='./results_teacher',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    save_strategy="no",          # no checkpoints are written for this demo
    logging_strategy="epoch",
    disable_tqdm=False,
    logging_steps=10,
    **{eval_strategy_key: "epoch"},  # evaluate on the test split after every epoch
)

teacher_trainer = Trainer(
    model=teacher_model,
    args=teacher_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

print("\nTraining teacher model (BERT)...")
teacher_trainer.train()

# Evaluate Teacher Model
teacher_preds = teacher_trainer.predict(test_dataset)
# The predictions are per-class scores; the highest-scoring class is the predicted label.
teacher_pred_labels = np.argmax(teacher_preds.predictions, axis=1)
print("\nTeacher Model Results:")
# zero_division=0: with such a small test split a class may never be predicted (or never
# occur); report 0 for the undefined metrics instead of emitting a warning per metric.
print(classification_report(test_labels, teacher_pred_labels, zero_division=0))


In [None]:
# 3. Knowledge Distillation
# The DistilBERT "student" is trained on a blend of two objectives:
#   (a) the usual cross-entropy against the human-assigned scores (hard labels), and
#   (b) a KL-divergence term that pulls its temperature-softened predictions towards the
#       BERT teacher's "soft labels" (the teacher's logits on the same training responses).

DISTILL_TEMPERATURE = 2.0  # values > 1 flatten both distributions before they are compared
DISTILL_ALPHA = 0.5        # weight of the hard-label loss; (1 - alpha) weights the soft-label loss


class DistillationDataset(SimpleDataset):
    """SimpleDataset that additionally yields the teacher's logits for each example."""

    def __init__(self, encodings, labels, teacher_logits):
        super().__init__(encodings, labels)
        self.teacher_logits = teacher_logits

    def __getitem__(self, idx):
        item = super().__getitem__(idx)
        item['teacher_logits'] = self.teacher_logits[idx]
        return item


class DistillationTrainer(Trainer):
    """Trainer whose loss mixes hard-label cross-entropy with a soft-label KL term.

    Batches without a ``teacher_logits`` entry (e.g. the evaluation split) fall back
    to the model's own hard-label loss.
    """

    def __init__(self, *args, distill_temperature=2.0, distill_alpha=0.5, **kwargs):
        super().__init__(*args, **kwargs)
        self.distill_temperature = distill_temperature
        self.distill_alpha = distill_alpha

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # **kwargs absorbs extra keyword arguments that some transformers versions pass.
        inputs = dict(inputs)
        # The student model's forward() does not accept teacher logits, so strip them first.
        teacher_logits = inputs.pop('teacher_logits', None)
        outputs = model(**inputs)
        loss = outputs.loss
        if teacher_logits is not None:
            temperature = self.distill_temperature
            soft_loss = torch.nn.functional.kl_div(
                torch.nn.functional.log_softmax(outputs.logits / temperature, dim=-1),
                torch.nn.functional.softmax(teacher_logits / temperature, dim=-1),
                reduction="batchmean",
            ) * temperature ** 2  # rescale so the soft term's gradients stay comparable across temperatures
            loss = self.distill_alpha * loss + (1.0 - self.distill_alpha) * soft_loss
        return (loss, outputs) if return_outputs else loss


student_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings_student = student_tokenizer(train_texts, truncation=True, padding=True, max_length=64)
test_encodings_student = student_tokenizer(test_texts, truncation=True, padding=True, max_length=64)

# Soft labels: the trained teacher's logits for every training response. `train_dataset`
# was built from the same `train_texts`, so row i lines up with student example i.
teacher_train_logits = torch.tensor(
    teacher_trainer.predict(train_dataset).predictions, dtype=torch.float32
)
assert len(teacher_train_logits) == len(train_labels), "teacher logits must align with training examples"

train_dataset_student = DistillationDataset(train_encodings_student, train_labels, teacher_train_logits)
test_dataset_student = SimpleDataset(test_encodings_student, test_labels)

student_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
student_model.to(device)

# The `evaluation_strategy` argument was renamed to `eval_strategy` in newer transformers
# releases and the old spelling was later removed; use whichever name is declared here.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

student_args = TrainingArguments(
    output_dir='./results_student',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    save_strategy="no",
    logging_strategy="epoch",
    disable_tqdm=False,
    logging_steps=10,
    # Keep `teacher_logits` in each batch: by default the Trainer may drop entries that
    # are not arguments of the model's forward(), which would silently disable distillation.
    remove_unused_columns=False,
    **{eval_strategy_key: "epoch"},
)

student_trainer = DistillationTrainer(
    model=student_model,
    args=student_args,
    train_dataset=train_dataset_student,
    eval_dataset=test_dataset_student,
    distill_temperature=DISTILL_TEMPERATURE,
    distill_alpha=DISTILL_ALPHA,
)

print("\nTraining student model (DistilBERT)...")
student_trainer.train()

# Evaluate Student Model
student_preds = student_trainer.predict(test_dataset_student)
student_pred_labels = np.argmax(student_preds.predictions, axis=1)
print("\nStudent Model Results:")
# zero_division=0: report 0 instead of warning when a class is absent from the tiny test split.
print(classification_report(test_labels, student_pred_labels, zero_division=0))


In [None]:
# 4. Compare Model Sizes and Inference Speeds

def model_size(model):
    """Return the number of parameters in ``model``, expressed in millions."""
    param_count = 0
    for tensor in model.parameters():
        param_count += tensor.numel()
    return param_count / 1e6

# Report the parameter count (in millions) of each model, teacher first.
teacher_params_m = model_size(teacher_model)
student_params_m = model_size(student_model)
size_report = "\n".join([
    f"\nBERT (Teacher) model size: {teacher_params_m:.1f}M parameters",
    f"DistilBERT (Student) model size: {student_params_m:.1f}M parameters",
])
print(size_report)

def inference_time(model, tokenizer, texts, repeat=10):
    """Return the mean wall-clock seconds of one single-text forward pass of ``model``.

    Each text is tokenized and scored on its own, and the whole pass over ``texts``
    is repeated ``repeat`` times. Only the forward pass is timed; tokenization and the
    copy of the inputs to the model's device are excluded.

    Args:
        model: a ``torch.nn.Module`` callable as ``model(**inputs)``; it is switched to
            eval mode as a side effect.
        tokenizer: callable returning a mapping of tensors for a single text.
        texts: iterable of input texts; must not be empty.
        repeat: number of passes over ``texts``; must be at least 1.

    Returns:
        Average seconds per forward pass as a float.

    Raises:
        ValueError: if ``texts`` is empty or ``repeat`` is smaller than 1.
    """
    texts = list(texts)
    if not texts or repeat < 1:
        raise ValueError("inference_time needs at least one text and repeat >= 1")

    # Send inputs to wherever the model actually lives instead of relying on a global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")
    on_cuda = model_device.type == "cuda"

    def encode(text):
        batch = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=64)
        return {k: v.to(model_device) for k, v in batch.items()}

    model.eval()
    total_time = 0.0
    with torch.no_grad():
        # Untimed warm-up pass so one-off start-up costs do not skew the first measurement.
        model(**encode(texts[0]))
        for _ in range(repeat):
            for text in texts:
                inputs = encode(text)
                # CUDA work is launched asynchronously: wait for pending work before starting
                # the clock and for the forward pass to finish before stopping it.
                if on_cuda:
                    torch.cuda.synchronize(model_device)
                start = time.perf_counter()
                model(**inputs)
                if on_cuda:
                    torch.cuda.synchronize(model_device)
                total_time += time.perf_counter() - start
    return total_time / (repeat * len(texts))

# Time single-response inference for each model on the held-out responses.
bert_infer = inference_time(model=teacher_model, tokenizer=teacher_tokenizer, texts=test_texts)
distilbert_infer = inference_time(model=student_model, tokenizer=student_tokenizer, texts=test_texts)

timing_report = "\n".join([
    f"\nAverage inference time per sample (Teacher BERT): {bert_infer:.4f} seconds",
    f"Average inference time per sample (Student DistilBERT): {distilbert_infer:.4f} seconds",
])
print(timing_report)

In [None]:
# 5. Activity: Mock Large-Scale Assessment

N_RESPONSES = 1000        # size of the simulated assessment
SCORING_BATCH_SIZE = 32   # responses scored per forward pass

print(f"\nSimulating large-scale scoring with {N_RESPONSES} responses...")
# Sample with replacement from the toy corpus; the fixed random_state makes the draw repeatable.
large_texts = data["response"].sample(n=N_RESPONSES, replace=True, random_state=42).tolist()

# Inference only: switch off dropout and gradient tracking explicitly rather than
# relying on whatever mode an earlier cell happened to leave the model in.
student_model.eval()
large_scores = []

start = time.perf_counter()
with torch.no_grad():
    for batch_start in range(0, len(large_texts), SCORING_BATCH_SIZE):
        batch_texts = large_texts[batch_start:batch_start + SCORING_BATCH_SIZE]
        inputs = student_tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=64)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        logits = student_model(**inputs).logits
        # .tolist() copies the predictions back to the host, so each batch has really
        # finished (also on a GPU) before the clock is read.
        large_scores.extend(logits.argmax(dim=-1).tolist())
end = time.perf_counter()
print(f"Student model scored {len(large_scores)} responses in {end - start:.2f} seconds.")

## Reflection: Scalable Automatic Scoring

- How did the accuracy of the student (DistilBERT) model compare to the teacher (BERT) model?
- What is the difference in model size and inference speed?
- Why is knowledge distillation useful for large-scale or real-time scoring?
- In what situations would you prefer a small/fast model even if it's slightly less accurate?
- What challenges might arise when scaling up to thousands or millions of responses?