In [12]:
# make_binned_splits.py
# Creates train.csv / val.csv / test.csv with headers: content, score (1..5)
# - Bins are built from the average of the six traits
# - You control target proportions per bin via `target_props`

import os, numpy as np, pandas as pd, torch, torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, cohen_kappa_score
from transformers import AutoTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from datasets import load_dataset
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler
from tqdm import tqdm

# --------------------------
# CONFIG: tweak these freely
# --------------------------
binning = "quantile"              # "quantile" or "fixed"
n_bins = 5                        # number of bins/classes (1..5)
# If you choose "fixed", provide n_bins+1 edges covering the min..max of averages.
# Example below roughly centers around the mean ~3:
fixed_edges = [1.0, 2.2, 2.8, 3.3, 3.8, 5.01]  # inclusive-lowest; last edge should exceed max

# Desired class proportions (must sum to 1.0). Example: equal bins.
target_props = {1: 0.40, 2: 0.40, 3: 0.08, 4: 0.08, 5: 0.04}
# Another example (intentionally unbalanced):
# target_props = {1: 0.10, 2: 0.20, 3: 0.40, 4: 0.20, 5: 0.10}

subset_fraction = 1.0             # use 1.0 for all data; <1.0 to take a random subset first
oversample = True                 # if a bin is short, sample with replacement to hit target proportion
random_state = 42                 # reproducibility
train_frac, val_frac, test_frac = 0.80, 0.10, 0.10

# --------------------------
# Load & prep
# --------------------------
ds = load_dataset("tasksource/english-grading", split="train")
df = ds.to_pandas()

traits = ["cohesion","syntax","vocabulary","phraseology","grammar","conventions"]
df["avg"] = df[traits].mean(axis=1)

# Bin the averages -> integer scores 1..5
if binning == "quantile":
    # equal-frequency bins for a more even distribution
    # duplicates="drop" guards against pathological ties collapsing bins
    binned = pd.qcut(df["avg"], q=n_bins, labels=range(1, n_bins + 1), duplicates="drop")
    # If for some reason fewer than n_bins were created (rare), fall back to fixed-width bins over observed range
    if binned.isna().any() or len(binned.cat.categories) < n_bins:
        # Build fixed-width edges across the data range
        edges = np.linspace(df["avg"].min() - 1e-6, df["avg"].max() + 1e-6, n_bins + 1)
        binned = pd.cut(df["avg"], bins=edges, labels=range(1, n_bins + 1), include_lowest=True)
    df["score"] = binned.astype(int)
elif binning == "fixed":
    edges = fixed_edges
    assert len(edges) == n_bins + 1, "fixed_edges must have n_bins+1 numbers"
    df["score"] = pd.cut(df["avg"], bins=edges, labels=range(1, n_bins + 1), include_lowest=True).astype(int)
else:
    raise ValueError("binning must be 'quantile' or 'fixed'")

# Use only the two requested columns, dedupe on content
two_col = pd.DataFrame({"content": df["full_text"], "score": df["score"].astype(int)}).drop_duplicates("content")

# Optionally take a random subset first (before proportioning)
if subset_fraction < 1.0:
    two_col = two_col.sample(frac=subset_fraction, random_state=random_state).reset_index(drop=True)

# --------------------------
# Build a dataset to match target proportions
# --------------------------
# Normalize/validate proportions
keys = sorted(target_props.keys())
assert keys == list(range(1, n_bins + 1)), f"target_props must have keys 1..{n_bins}"
props = np.array([target_props[k] for k in keys], dtype=float)
props = props / props.sum()

total = len(two_col)
# Compute exact desired counts using largest remainder method
raw = props * total
base = np.floor(raw).astype(int)
remainder = total - base.sum()
# Distribute the remaining items to classes with largest fractional parts
order = np.argsort(-(raw - base))  # descending by fractional part
for i in range(remainder):
    base[order[i]] += 1
desired_counts = dict(zip(keys, base))

# Sample per class
rng = np.random.default_rng(random_state)
parts = []
for cls, n_needed in desired_counts.items():
    group = two_col[two_col["score"] == cls]
    if len(group) == 0:
        continue  # no examples of this class available
    replace = oversample and (n_needed > len(group))
    sampled = group.sample(n=min(n_needed, len(group)) if not replace else n_needed,
                           replace=replace, random_state=random_state)
    parts.append(sampled)

balanced = pd.concat(parts, axis=0).sample(frac=1.0, random_state=random_state).reset_index(drop=True)

# --------------------------
# Stratified train/val/test split (preserves your chosen proportions)
# --------------------------
train_df, temp_df = train_test_split(
    balanced, test_size=(1 - train_frac), stratify=balanced["score"], random_state=random_state
)
relative_test = test_frac / (val_frac + test_frac)  # split temp into val/test
val_df, test_df = train_test_split(
    temp_df, test_size=relative_test, stratify=temp_df["score"], random_state=random_state
)

# --------------------------
# Save CSVs
# --------------------------
train_df.to_csv("train.csv", index=False)
val_df.to_csv("val.csv", index=False)
test_df.to_csv("test.csv", index=False)

# Quick class balance report
def counts(df):
    return df["score"].value_counts().sort_index().to_dict()
print({
    "total": len(balanced),
    "target": desired_counts,
    "train": counts(train_df),
    "val": counts(val_df),
    "test": counts(test_df),
})


Repo card metadata block was not found. Setting CardData to empty.


{'total': 3911, 'target': {1: np.int64(1564), 2: np.int64(1564), 3: np.int64(313), 4: np.int64(313), 5: np.int64(157)}, 'train': {1: 1251, 2: 1251, 3: 250, 4: 250, 5: 126}, 'val': {1: 156, 2: 156, 3: 31, 4: 32, 5: 16}, 'test': {1: 157, 2: 157, 3: 32, 4: 31, 5: 15}}


In [None]:
tok = AutoTokenizer.from_pretrained("bert-tiny", local_files_only=True, use_fast=True)
device = "cuda" if torch.cuda.is_available() else "cpu"

# inverse-frequency class weights (normalized to mean=1)
cls_counts   = train_df["score"].astype(int).value_counts().sort_index()
inv          = 1.0 / cls_counts
class_weight = (inv / inv.mean()).to_dict()


class_weight_vec = torch.tensor(
    [class_weight.get(k, 1.0) for k in [1,2,3,4,5]],
    dtype=torch.float32, device=device
)

# DataLoaders that yield just indices; we fetch & chunk texts on-the-fly
train_idx = torch.arange(len(train_df))
val_idx   = torch.arange(len(val_df))

w_train = torch.tensor(train_df["score"].astype(int).map(class_weight).values, dtype=torch.double)
sampler = WeightedRandomSampler(weights=w_train, num_samples=len(w_train), replacement=True)

BATCH_ES = 8  # essays per batch (CPU-friendly)
# shuffle 
#train_dl = DataLoader(TensorDataset(train_idx), batch_size=BATCH_ES, sampler=sampler)
train_dl = DataLoader(TensorDataset(train_idx), batch_size=BATCH_ES, shuffle=True)

val_dl   = DataLoader(TensorDataset(val_idx),   batch_size=BATCH_ES, shuffle=False)

# chunking helpers
MAX_LEN = 512
STRIDE  = 128
EPOCHS = 20

def chunk_encode(text):
    enc = tok(
        text,
        max_length=MAX_LEN,
        truncation=True,
        stride=STRIDE,
        return_overflowing_tokens=True,
        return_tensors="pt",
        return_token_type_ids=True,
        padding="max_length",
    )
    if "token_type_ids" not in enc:
        enc["token_type_ids"] = torch.zeros_like(enc["input_ids"])
    return enc

@torch.no_grad()
def chunk_predict_mean(model, text):
    enc = chunk_encode(text)
    out = model(
        input_ids=enc["input_ids"].to(device),
        attention_mask=enc["attention_mask"].to(device),
        token_type_ids=enc["token_type_ids"].to(device),
    ).logits.squeeze(-1)
    return out.mean().item()

def clip_round(x): return np.clip(np.rint(x), 1, 5).astype(int)
def score_to_class(s):  # scores 1..5 → class ids 0..4
    return int(round(float(s))) - 1

def class_to_score(c):  # class ids 0..4 → scores 1..5
    return int(c) + 1

In [14]:
device = "cuda" if torch.cuda.is_available() else "cpu"


# model & opt
model = BertForSequenceClassification.from_pretrained(
    "bert-tiny", local_files_only=True, num_labels=5,ignore_mismatched_sizes=True
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
num_training_steps = len(train_dl) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=max(1, num_training_steps//10), num_training_steps=num_training_steps
)
mse = nn.MSELoss(reduction="none")
criterion = torch.nn.CrossEntropyLoss(weight=class_weight_vec, label_smoothing=0.05)

# ---- train (1 epoch), chunking + mean aggregation ----
model.train()
for epoch in range(EPOCHS):
    pbar = tqdm(train_dl, desc=f"Epoch {epoch+1}/{EPOCHS}")
    for (idx_batch,) in pbar:
        optimizer.zero_grad(set_to_none=True)

        batch_size = len(idx_batch)
        for idx in idx_batch.tolist():
            row   = train_df.iloc[idx]
            text  = row["content"]
            target = torch.tensor(score_to_class(row["score"]), dtype=torch.long, device=device)

            enc = tok(
                text,
                max_length=512, truncation=True, stride=STRIDE,
                return_overflowing_tokens=True, return_tensors="pt",
                return_token_type_ids=True, padding="max_length",
            )
            if "token_type_ids" not in enc:
                enc["token_type_ids"] = torch.zeros_like(enc["input_ids"])

            # logits per chunk: (n_chunks, 5)
            logits_chunks = model(
                input_ids=enc["input_ids"].to(device),
                attention_mask=enc["attention_mask"].to(device),
                token_type_ids=enc["token_type_ids"].to(device),
            ).logits

            # lightly weight later chunks (optional); mean is also fine
            n = logits_chunks.shape[0]
            w = torch.linspace(0.9, 1.1, steps=n, device=logits_chunks.device).unsqueeze(1)
            logits_essay = (logits_chunks * w).sum(dim=0) / w.sum()  # shape (5,)

            loss_i = criterion(logits_essay.unsqueeze(0), target.unsqueeze(0))
            (loss_i / batch_size).backward()

        optimizer.step()
        scheduler.step()
        pbar.set_postfix(loss=f"{loss_i.item():.4f}")



# ---- validate (chunked) ----
model.eval()
preds, trues = [], []
with torch.no_grad():
    for (idx_batch,) in val_dl:
        for idx in idx_batch.tolist():
            row   = val_df.iloc[idx]
            text  = row["content"]
            true_score = int(round(row["score"]))

            enc = tok(
                text,
                max_length=MAX_LEN, truncation=True, stride=STRIDE,
                return_overflowing_tokens=True, return_tensors="pt",
                return_token_type_ids=True, padding="max_length",
            )
            if "token_type_ids" not in enc:
                enc["token_type_ids"] = torch.zeros_like(enc["input_ids"])

            logits_chunks = model(
                input_ids=enc["input_ids"].to(device),
                attention_mask=enc["attention_mask"].to(device),
                token_type_ids=enc["token_type_ids"].to(device),
            ).logits  # (n_chunks, 5)

            n = logits_chunks.shape[0]
            w = torch.linspace(0.9, 1.1, steps=n, device=logits_chunks.device).unsqueeze(1)
            logits_essay = (logits_chunks * w).sum(dim=0) / w.sum()

            pred_class = int(torch.argmax(logits_essay).item())  # 0..4
            pred_score = class_to_score(pred_class)              # 1..5

            preds.append(pred_score)
            trues.append(true_score)

import numpy as np
from sklearn.metrics import accuracy_score, cohen_kappa_score
preds = np.array(preds, dtype=int); trues = np.array(trues, dtype=int)
acc = accuracy_score(trues, preds)
qwk = cohen_kappa_score(trues, preds, weights="quadratic")
print({"val_acc": round(acc,4), "QWK": round(qwk,4), "target_met_≥75%": acc >= 0.75})


# ---- save model + tokenizer + optimizer/scheduler ----
SAVE_DIR = "finetuned_bert_tiny_chunked"
os.makedirs(SAVE_DIR, exist_ok=True)
model.save_pretrained(SAVE_DIR)
tok.save_pretrained(SAVE_DIR)
torch.save({"optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
            "class_weight": class_weight},
           os.path.join(SAVE_DIR, "optimizer_scheduler.pt"))
print("Saved to", SAVE_DIR)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-tiny and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([1, 128]) in the checkpoint and torch.Size([5, 128]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/10: 100%|██████████| 391/391 [01:41<00:00,  3.84it/s, loss=1.5810]
Epoch 2/10: 100%|██████████| 391/391 [01:11<00:00,  5.50it/s, loss=1.5731]
Epoch 3/10: 100%|██████████| 391/391 [01:11<00:00,  5.44it/s, loss=2.0370]
Epoch 4/10: 100%|██████████| 391/391 [01:10<00:00,  5.52it/s, loss=1.1719]
Epoch 5/10: 100%|██████████| 391/391 [01:10<00:00,  5.54it/s, loss=2.2015]
Epoch 6/10: 100%|██████████| 391/391 [01:11<00:00,  5.49it/s, loss=2.4405]
Epoch 7/10: 100%|██████████| 391/391 [0

{'val_acc': 0.6138, 'QWK': 0.2884, 'target_met_≥75%': False}
Saved to finetuned_bert_tiny_chunked


In [15]:
import numpy as np, pandas as pd, torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoTokenizer, BertForSequenceClassification
from tqdm import tqdm

MODEL_DIR = "finetuned_bert_tiny_chunked"   # ← adjust if you used a different save path
TEST_CSV  = "test.csv"
OUT_CSV   = "submissions.csv"

# ---- load test data ----
test_df = pd.read_csv(TEST_CSV)
assert "content" in test_df.columns, "test.csv must have a 'content' column"
test_df = test_df.dropna(subset=["content"]).copy()
test_df["content"] = test_df["content"].astype(str).str.strip()

# ---- load tokenizer & model ----
tok = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True, use_fast=True)
model = BertForSequenceClassification.from_pretrained(MODEL_DIR).to(device)
model.eval()

# ---- chunking helpers ----
MAX_LEN, STRIDE = 512, 128
def chunk_encode(text):
    enc = tok(
        text,
        max_length=MAX_LEN,
        truncation=True,
        stride=STRIDE,
        return_overflowing_tokens=True,
        return_tensors="pt",
        return_token_type_ids=True,
        padding="max_length",
    )
    if "token_type_ids" not in enc:
        enc["token_type_ids"] = torch.zeros_like(enc["input_ids"])
    return enc

@torch.no_grad()
def chunk_predict_mean(text):
    enc = chunk_encode(text)
    out = model(
        input_ids=enc["input_ids"].to(device),
        attention_mask=enc["attention_mask"].to(device),
        token_type_ids=enc["token_type_ids"].to(device),
    ).logits.squeeze(-1)
    return out.mean().item()

def clip_round(x): return np.clip(np.rint(x), 1, 5).astype(int)

# ---- DataLoader of indices (fetch text on the fly) ----
idx = torch.arange(len(test_df))
test_dl = DataLoader(TensorDataset(idx), batch_size=8, shuffle=False)

# ---- inference ----
preds = []
with torch.no_grad():
    for (idx_batch,) in tqdm(test_dl, desc="Infer (chunked)"):
        for i in idx_batch.tolist():
            preds.append(chunk_predict_mean(test_df.iloc[i]["content"]))

preds = np.array(preds, dtype=np.float32)
preds_int = clip_round(preds)

# ---- save submissions ----
out = pd.DataFrame({
    "content": test_df["content"].values,
    "score": preds_int
})
out.to_csv(OUT_CSV, index=False)
print(f"Saved {OUT_CSV} with {len(out)} rows.")



Infer (chunked): 100%|██████████| 49/49 [00:02<00:00, 19.29it/s]

Saved submissions.csv with 392 rows.



