In [None]:
# Fair Automatic Scoring in STEM Education
# ----------------------------------------
# Activity 1: Bias Analysis in AI Scoring
# Activity 2: Fairness in Dataset Curation

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch


In [None]:

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [None]:

# 1. Load and Preview the Dataset
# A six-row toy dataset keeps the demo fast; each row is one student answer.
# In class, replace with: pd.read_csv('data/sample_datasets/gender_science_responses.csv')
data = pd.DataFrame(
    [
        # (gender, response, score) -- score: 1=Correct, 0=Incorrect (for demo)
        ("male", "Gravity pulls the apple down.", 1),
        ("female", "Heat causes ice to melt into water.", 1),
        ("female", "Clouds form rain when they cool.", 1),
        ("male", "A plant makes food using sunlight.", 1),
        ("female", "Water boils at 100 degrees Celsius.", 1),
        ("male", "A magnet attracts metal objects.", 0),
    ],
    columns=["gender", "response", "score"],
)

# Preview the full toy dataset; display() is the notebook's rich renderer, so the
# frame shows up as a formatted table rather than plain text.
print("Sample data:")
display(data)



In [None]:
# 2. Preprocess Data

# Tokenizer for the same 'bert-base-uncased' checkpoint that is fine-tuned in step 3.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(batch):
    """Tokenize the ``'response'`` entry of ``batch`` with the notebook's tokenizer.

    Padding and truncation are enabled and ``max_length`` is 64, the same
    settings used for the train/test encodings in this cell.
    """
    encode_options = {"padding": True, "truncation": True, "max_length": 64}
    return tokenizer(batch['response'], **encode_options)

# Split responses, human scores and gender labels together so the three lists stay
# aligned row-for-row; stratifying on gender keeps both groups in train and test.
(
    train_texts, test_texts,
    train_labels, test_labels,
    train_genders, test_genders,
) = train_test_split(
    data['response'].tolist(),
    data['score'].tolist(),
    data['gender'].tolist(),
    test_size=0.33,
    random_state=42,
    stratify=data['gender'],
)

# Encode each split on its own (same tokenizer options for both).
train_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True, max_length=64)
    for texts in (train_texts, test_texts)
)

class SimpleDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is a sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under ``'labels'``."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels so Trainer can index them as tensors.
train_dataset, test_dataset = (
    SimpleDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (test_encodings, test_labels),
    )
)



In [None]:
# 3. Train/Fine-tune BERT for Automatic Scoring

# Seed torch before building the model. The classification head added on top of
# the pretrained encoder is randomly initialised, and Trainer only applies its
# own seed once it is constructed, so without this every Restart & Run All would
# start from a different head and report different fairness numbers.
torch.manual_seed(42)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

# transformers renamed `evaluation_strategy` to `eval_strategy` and later removed
# the old keyword. TrainingArguments is a dataclass, so its declared fields tell
# us which spelling this installation accepts.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results_fair',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    save_strategy="no",  # demo only: do not write checkpoints
    logging_strategy="epoch",
    disable_tqdm=False,
    logging_steps=10,
    **{eval_strategy_arg: "epoch"},  # evaluate on the test split after every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

print("Training model...")
trainer.train()


In [None]:
# 4. Bias/Fairness Analysis

# Predict on test set
preds = trainer.predict(test_dataset)
pred_labels = np.argmax(preds.predictions, axis=1)  # highest-scoring class per row
test_genders = list(test_genders)  # Ensure indexable

print("Classification report:")
# The test split is tiny, so a class can be missing from it. zero_division=0
# reports the same 0.0 the default would, without the undefined-metric warnings.
print(classification_report(test_labels, pred_labels, zero_division=0))

# Compute accuracy for each gender (Scoring Accuracy Difference)
results = pd.DataFrame({
    "gender": test_genders,
    "true_score": test_labels,
    "pred_score": pred_labels
})

# Select the score columns explicitly: applying over the whole frame would also
# hand the grouping column to the lambda, which newer pandas deprecates.
acc_by_gender = (
    results.groupby("gender")[["true_score", "pred_score"]]
    .apply(lambda group: accuracy_score(group["true_score"], group["pred_score"]))
)
print("\nAccuracy by gender:")
print(acc_by_gender)

# Gap between the best- and worst-served group (0 means equal accuracy).
diff = abs(acc_by_gender.max() - acc_by_gender.min())
print(f"\nScoring Accuracy Difference between genders: {diff:.2f}")

# Mean Score Gaps (MSG) by Gender
# Positive gap: the model scored higher than the human label; negative: lower.
results["score_gap"] = results["pred_score"] - results["true_score"]
msg_by_gender = results.groupby("gender")["score_gap"].mean()
print("\nMean Score Gap (AI-human) by gender:")
print(msg_by_gender)

# Equalized Odds (Simple version): 
# For demo: Check if model's error rates are similar for each gender.
def false_positive_rate(df):
    """Fraction of rows with ``true_score == 0`` whose ``pred_score`` is 1.

    Returns NaN when ``df`` contains no rows with ``true_score == 0``.
    """
    predicted_for_negatives = df.loc[df["true_score"] == 0, "pred_score"]
    if predicted_for_negatives.empty:
        return np.nan
    return np.mean(predicted_for_negatives == 1)

def false_negative_rate(df):
    """Fraction of rows with ``true_score == 1`` whose ``pred_score`` is 0.

    Returns NaN when ``df`` contains no rows with ``true_score == 1``.
    """
    predicted_for_positives = df.loc[df["true_score"] == 1, "pred_score"]
    if predicted_for_positives.empty:
        return np.nan
    return np.mean(predicted_for_positives == 0)

# One row per gender with that group's false-positive and false-negative rates.
# The score columns are selected explicitly so the grouping column is not passed
# to the lambda (applying over the whole frame is deprecated in newer pandas).
eo = (
    results.groupby("gender")[["true_score", "pred_score"]]
    .apply(lambda df: pd.Series({
        "FPR": false_positive_rate(df),
        "FNR": false_negative_rate(df)
    }))
)
print("\nApproximate Equalized Odds (error rates) by gender:")
print(eo)


In [None]:
# 5. Activity: Curate and Test a More Diverse Dataset

print("\nNow, try editing/adding examples below to make the dataset more balanced and diverse, then rerun the analysis above.")

# For demo, create or ask students to edit this cell:
# Each row is one (gender, response, score) example; score 1=Correct, 0=Incorrect.
# NOTE(review): nothing in the visible notebook feeds `data2` into the pipeline --
# presumably students assign it to `data` and rerun from step 2; confirm.
# NOTE(review): "nonbinary" appears only once, and the stratified split in step 2
# needs at least two rows per gender, so add another example before reusing it.
data2 = pd.DataFrame(
    [
        ("male", "Gravity pulls objects toward Earth.", 1),
        ("female", "Ice becomes water when heated.", 1),
        ("female", "Plants make oxygen.", 1),
        ("male", "A compass points north due to magnetism.", 1),
        ("female", "Sound travels through air.", 1),
        ("male", "Metal expands when heated.", 0),
        ("nonbinary", "Friction slows moving objects.", 1),
    ],
    columns=["gender", "response", "score"],
)

## Reflection: Fairness and Bias in AI Scoring

- Did the model's scoring accuracy differ by gender or other groups?
- Were the Mean Score Gaps or error rates (FPR/FNR) noticeably higher for some groups?
- How did adding or balancing data change the fairness metrics?
- What steps can we take to reduce bias in automatic scoring systems?
- Why is it important for educational AI to be fair and unbiased?