### ModernBERT Reference  
Warner et al., “Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder…”, arXiv:2412.13663 (2024).

```bibtex
@misc{modernbert,
  title={Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder for Fast, Memory Efficient, and Long Context Finetuning and Inference},
  author={Benjamin Warner and Antoine Chaffin and Benjamin Clavié and Orion Weller and Oskar Hallström and Said Taghadouini and Alexis Gallagher and Raja Biswas and Faisal Ladhak and Tom Aarsen and Nathan Cooper and Griffin Adams and Jeremy Howard and Iacopo Poli},
  year={2024},
  eprint={2412.13663},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2412.13663}
}
```

In [8]:
import os

# Environment flags are set right after `import os` (which they need) and
# before the heavier libraries below are imported.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["XLA_FLAGS"] = "--xla_gpu_cuda_data_dir=/tmp"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import datetime
import re
import unicodedata
import warnings

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

import torch
import torch.optim as optim
from torch import nn
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel, default_data_collator, get_linear_schedule_with_warmup

# TensorBoard writer (PyTorch version): one timestamped run directory per execution
logdir = os.path.join("tb_logs", "run_" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
os.makedirs(logdir, exist_ok=True)
writer = SummaryWriter(logdir)
print(f"TensorBoard logs → {logdir}")

# List every file available under the Kaggle input mount
for dirname, _, filenames in os.walk("/kaggle/input"):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Load the training CSV and show full cell contents when frames are displayed
df = pd.read_csv("/kaggle/input/map-charting-student-math-misunderstandings/train.csv")
pd.set_option("display.max_colwidth", None)

# Pick the GPU when one is visible to torch, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on:", device)

TensorBoard logs → tb_logs/run_20250803-220253
/kaggle/input/map-charting-student-math-misunderstandings/sample_submission.csv
/kaggle/input/map-charting-student-math-misunderstandings/train.csv
/kaggle/input/map-charting-student-math-misunderstandings/test.csv
Running on: cuda


In [2]:
# 0. helper to normalise whitespace/Unicode
def _clean(txt: str) -> str:
    txt = unicodedata.normalize("NFKC", txt)
    txt = re.sub(r"\s+", " ", txt)
    return txt.strip()

# 1. minimal preprocessing
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of `df` with a joint `label_str` column.

    Steps, all applied to a copy so the caller's frame is left untouched:
      * `StudentExplanation`: missing values become "", then `_clean`.
      * `QuestionText` and `MC_Answer`: `_clean`.
      * `Misconception`: missing values become "NA", values are stripped, and
        "Wrong_fraction" is renamed to "Wrong_Fraction"; rows whose `Category`
        does not end in "Misconception" are forced to "NA".
      * `label_str`: "<Category>:<Misconception>".
    """
    out = df.copy()

    # free-text columns
    out["StudentExplanation"] = out["StudentExplanation"].fillna("").apply(_clean)
    for col in ("QuestionText", "MC_Answer"):
        out[col] = out[col].apply(_clean)

    # misconception column: fill, normalise, unify the one casing variant
    misconception = out["Misconception"].fillna("NA").astype(str).str.strip()
    out["Misconception"] = misconception.replace({"Wrong_fraction": "Wrong_Fraction"})

    # only "...Misconception" categories keep a misconception name
    is_misconception = out["Category"].str.endswith("Misconception")
    out.loc[~is_misconception, "Misconception"] = "NA"

    # joint label used as the classification target
    out["label_str"] = out["Category"] + ":" + out["Misconception"]
    return out

# 2. build label maps + attach label_id
def build_label_maps(df: pd.DataFrame):
    """Create label<->id lookups from `label_str` and add an integer `label_id` column.

    Ids are assigned in sorted order of the unique `label_str` values. The
    `label_id` column is written onto the passed-in frame itself (no copy).

    Returns:
        (df, label2id, id2label) where `df` is the same object that was passed in.
    """
    id2label = dict(enumerate(sorted(df["label_str"].unique())))
    label2id = {name: idx for idx, name in id2label.items()}
    df["label_id"] = df["label_str"].map(label2id).astype(int)
    return df, label2id, id2label

# 3. run the pipeline
df = preprocess(df)
df, label2id, id2label = build_label_maps(df)

# 4. stratified K-fold on the joint label (label_id).
#    Note: this is a plain StratifiedKFold; rows are NOT grouped by QuestionId,
#    so rows from the same question can fall into different folds.
#    The filter below silences the sklearn warning whose message starts with
#    "The least populated class".
warnings.filterwarnings("ignore", message="The least populated class")

k = 5
skf = StratifiedKFold(
    n_splits=k,
    shuffle=True,
    random_state=42
)

df["fold"] = -1
fold_col = df.columns.get_loc("fold")
# split() yields *positional* row indices, so write with iloc; label-based
# .loc would only be correct while df carries a default RangeIndex.
for fold, (_, val_idx) in enumerate(skf.split(df, y=df["label_id"])):
    df.iloc[val_idx, fold_col] = fold

# sanity check: every row must have been assigned to exactly one validation fold
assert (df["fold"] >= 0).all()
print(df["fold"].value_counts().sort_index())

fold
0    7340
1    7339
2    7339
3    7339
4    7339
Name: count, dtype: int64


In [3]:
# 5. ModernBERT checkpoint and sequence settings
MODEL_NAME = "answerdotai/ModernBERT-base"
NUM_LABELS = len(label2id)   # one class per distinct joint label
MAX_LEN = 128                # padding/truncation length used by the dataset

# 6. fast tokenizer for the checkpoint above
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# 7. template builder: question, chosen answer and explanation joined by " [SEP] "
TEMPLATE = "{q} [SEP] {a} [SEP] {e}"

def build_text(row):
    """Format one row (anything indexable by column name) into the model input string."""
    fields = {
        "q": row["QuestionText"],
        "a": row["MC_Answer"],
        "e": row["StudentExplanation"],
    }
    return TEMPLATE.format(**fields)

# build the model input string for every row
df["text"] = df.apply(build_text, axis=1)

# 8. sequence-length distribution, measured with tokenizer.tokenize on each text
tok_lens = df["text"].map(lambda text: len(tokenizer.tokenize(text)))
print(tok_lens.describe(percentiles=[0.5, 0.75, 0.9, 0.95, 0.99]))

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

count    36696.000000
mean        56.639825
std         18.143780
min         19.000000
50%         55.000000
75%         67.000000
90%         80.000000
95%         92.000000
99%        108.000000
max        215.000000
Name: text, dtype: float64


In [4]:
# 9. PyTorch Dataset
class MAPDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one `text` row per access.

    Reads the `text` and `label_id` columns of `frame` once at construction.
    Tokenization uses the module-level `tokenizer` and `MAX_LEN`.
    """

    _ENCODED_KEYS = ("input_ids", "attention_mask", "token_type_ids")

    def __init__(self, frame):
        self.texts = frame["text"].tolist()
        self.labels = frame["label_id"].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # every sample is padded/truncated to exactly MAX_LEN tokens
        encoded = tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=MAX_LEN,
            return_attention_mask=True,
            return_token_type_ids=True,
        )
        item = {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in self._ENCODED_KEYS
        }
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# 10. dataframe → DataLoader
def make_loader(frame, batch_size=16, shuffle=True):
    """Wrap `frame` in a MAPDataset and return a DataLoader over it.

    The loader always uses 2 worker processes and pinned memory; only the
    batch size and shuffling are configurable.
    """
    loader_kwargs = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=2,
        pin_memory=True,
    )
    return DataLoader(MAPDataset(frame), **loader_kwargs)

# 11. choose one validation fold; all remaining folds form the training set
val_fold = 4

is_val = df["fold"] == val_fold
train_df = df[~is_val].reset_index(drop=True)
val_df = df[is_val].reset_index(drop=True)
print(len(train_df), "train rows |", len(val_df), "val rows")

# training batches are shuffled; validation keeps row order and uses a larger batch
train_loader = make_loader(train_df, batch_size=16, shuffle=True)
val_loader = make_loader(val_df, batch_size=32, shuffle=False)

29357 train rows | 7339 val rows


In [5]:
# Model-level settings for this cell
MODEL_NAME = "answerdotai/ModernBERT-base"
HIDDEN_SIZE = 768  # not read by the classifier below, which takes the width from encoder.config.hidden_size
NUM_LABELS = len(label2id)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# 12. ModernBERT + single-layer classification head
class ModernBertClassifier(nn.Module):
    """Pretrained encoder (`MODEL_NAME`) followed by dropout and one linear layer.

    The vector at sequence position 0 of the encoder's last hidden state is
    used as the pooled representation.

    Args:
        num_labels: number of output classes (defaults to `NUM_LABELS`).
        dropout: dropout probability applied before the linear head.
    """

    def __init__(self, num_labels=NUM_LABELS, dropout=0.1):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(MODEL_NAME)
        self.dropout = nn.Dropout(dropout)
        # head width is taken from the loaded checkpoint's config
        self.classifier = nn.Linear(self.encoder.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        """Run the encoder and head.

        Args:
            input_ids: token ids for the batch.
            attention_mask: optional padding mask passed to the encoder.
            token_type_ids: accepted so batches that carry this key can still be
                unpacked with `model(**batch)`, but intentionally ignored:
                ModernBERT has no segment embeddings and its forward() does not
                declare this argument, so passing it on can raise a TypeError.
            labels: optional class ids; when given, cross-entropy loss is computed.

        Returns:
            dict with "logits" and "loss" (None when `labels` is None).
        """
        out = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        cls_vec = out.last_hidden_state[:, 0]  # first-token representation
        logits = self.classifier(self.dropout(cls_vec))

        loss = None
        if labels is not None:
            loss = nn.functional.cross_entropy(logits, labels)

        return {"logits": logits, "loss": loss}

# 13. Factory that returns model, optimizer, scheduler
def build_model(total_train_steps, lr=2e-5, weight_decay=0.01, warmup_ratio=0.1):
    """Create the classifier on `device` with an AdamW optimizer and linear warmup/decay schedule.

    Args:
        total_train_steps: total number of optimizer steps the schedule spans.
        lr: peak learning rate.
        weight_decay: decay applied to all parameters except biases and
            normalisation weights.
        warmup_ratio: fraction of `total_train_steps` used for linear warmup.

    Returns:
        (model, optimizer, scheduler)
    """
    model = ModernBertClassifier().to(device)

    # Weight decay only on non-bias / non-normalisation weights.
    # "LayerNorm.weight" covers BERT-style names; "norm.weight" covers
    # ModernBERT, whose normalisation modules are named `norm`, `attn_norm`,
    # `mlp_norm` and `final_norm` and would otherwise be decayed.
    no_decay = ("bias", "LayerNorm.weight", "norm.weight")
    decay_params, no_decay_params = [], []
    for name, param in model.named_parameters():
        if any(marker in name for marker in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)

    param_groups = [
        {"params": decay_params, "weight_decay": weight_decay},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]

    optimizer = optim.AdamW(param_groups, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(total_train_steps * warmup_ratio),
        num_training_steps=total_train_steps,
    )

    return model, optimizer, scheduler

In [None]:
epochs = 5
total_steps = len(train_loader) * epochs
# Do NOT unpack the optimizer into `optim`: that name is the `torch.optim`
# module alias that build_model() uses, and rebinding it makes any later
# build_model() call (or a re-run of this cell) fail on `optim.AdamW`.
model, optimizer, sched = build_model(total_train_steps=total_steps)

for epoch in range(epochs):
    # ---- training ----
    model.train()
    running = 0.0

    for batch in train_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        out = model(**batch)
        loss = out["loss"]

        loss.backward()
        optimizer.step()
        sched.step()          # scheduler is stepped once per optimizer step
        optimizer.zero_grad()

        running += loss.item()

    # mean of per-batch losses
    avg_train = running / len(train_loader)
    writer.add_scalar("Loss/train", avg_train, epoch)

    # ---- validation ----
    model.eval()
    val_running, preds, y_true = 0.0, [], []

    with torch.no_grad():
        for batch in val_loader:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            out = model(**batch)

            val_running += out["loss"].item()
            preds.append(out["logits"].cpu().numpy())
            y_true.extend(batch["labels"].cpu().numpy())

    avg_val = val_running / len(val_loader)
    writer.add_scalar("Loss/val", avg_val, epoch)

    # quick MAP@3: reciprocal rank of the true label within the top-3 logits
    # (0 when it is not among them), averaged over validation rows
    preds = np.vstack(preds)
    top3 = preds.argsort(axis=1)[:, -3:][:, ::-1]          # best class first
    hits = top3 == np.asarray(y_true)[:, None]             # at most one True per row
    map3 = (hits / np.arange(1, top3.shape[1] + 1)).sum(axis=1).mean()
    writer.add_scalar("MAP3/val", map3, epoch)
    writer.flush()  # make this epoch's scalars visible to TensorBoard immediately

    print(f"Epoch {epoch+1}: train_loss={avg_train:.4f}  "
          f"val_loss={avg_val:.4f}  MAP@3={map3:.4f}")