### ModernBERT Reference  
Warner et al., “Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder…”, arXiv:2412.13663 (2024).

@misc{modernbert,
  title={Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder for Fast, Memory Efficient, and Long Context Finetuning and Inference},
  author={Benjamin Warner and Antoine Chaffin and Benjamin Clavié and Orion Weller and Oskar Hallström and Said Taghadouini and Alexis Gallagher and Raja Biswas and Faisal Ladhak and Tom Aarsen and Nathan Cooper and Griffin Adams and Jeremy Howard and Iacopo Poli},
  year={2024},
  eprint={2412.13663},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2412.13663 }
}

In [1]:
# imports
import os, datetime, warnings, re, unicodedata, pandas as pd
from sklearn.model_selection import StratifiedKFold
from tqdm.auto import tqdm
import random, numpy as np, torch
random.seed(42); np.random.seed(42); torch.manual_seed(42)

from torch import nn, optim
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer,
    AutoModel,
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding
)
from torch.utils.tensorboard import SummaryWriter  

logdir = f'tb_logs/run_{datetime.datetime.now():%Y%m%d-%H%M%S}'
os.makedirs(logdir, exist_ok=True)
writer = SummaryWriter(logdir)
print(f'TensorBoard logs → {logdir}')

os.environ['TOKENIZERS_PARALLELISM'] = 'false'
warnings.filterwarnings('ignore', message=r'Some known HF warning regex')

df = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/train.csv')
pd.set_option('display.max_colwidth', None)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Running on:', device)

2025-08-05 00:46:11.376048: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754354771.766379      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754354771.872150      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


TensorBoard logs → tb_logs/run_20250805-004628
Running on: cuda


In [2]:
# 0. helper to normalise whitespace/Unicode
def _clean(txt: str) -> str:
    txt = unicodedata.normalize('NFKC', txt)
    txt = re.sub(r'\s+', ' ', txt)
    return txt.strip()

# 1. minimal preprocessing
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # text fields
    df['StudentExplanation'] = (
        df['StudentExplanation'].fillna('').apply(_clean)
    )
    df['QuestionText'] = df['QuestionText'].apply(_clean)
    df['MC_Answer']    = df['MC_Answer'].apply(_clean)

    # misconception field
    df['Misconception'] = (
        df['Misconception']
          .fillna('NA')
          .astype(str)
          .str.strip()
          .replace({'Wrong_fraction': 'Wrong_Fraction'})
    )
    mask = df['Category'].str.endswith('Misconception')
    df.loc[~mask, 'Misconception'] = 'NA'

    # joint label string
    df['label_str'] = df['Category'] + ':' + df['Misconception']
    return df

# 2. build label maps + attach label_id
def build_label_maps(df: pd.DataFrame):
    labels = sorted(df['label_str'].unique())
    label2id = {lbl: i for i, lbl in enumerate(labels)}
    id2label = {i: lbl for lbl, i in label2id.items()}
    df['label_id'] = df['label_str'].map(label2id).astype(int)
    return df, label2id, id2label

# 3. run the pipeline
df = preprocess(df)                     
df, label2id, id2label = build_label_maps(df)   

# 4. stratified K-fold
warnings.filterwarnings('ignore', message='The least populated class')

k = 5
skf = StratifiedKFold(
    n_splits=k,
    shuffle=True,
    random_state=42
)

df['fold'] = -1
for fold, (_, val_idx) in enumerate(skf.split(df, y=df['label_id'])):
    df.loc[val_idx, 'fold'] = fold

# sanity check
assert (df['fold'] >= 0).all()
print(df['fold'].value_counts().sort_index())

fold
0    7340
1    7339
2    7339
3    7339
4    7339
Name: count, dtype: int64


In [3]:
# 5. loading ModernBERT tokenizer
MODEL_NAME = 'answerdotai/ModernBERT-base'
NUM_LABELS = len(label2id)
MAX_LEN = 256

# 6. fast tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True
)
SPECIAL_TOKENS = ['<Q>', '</Q>', '<A>', '</A>', '<E>', '</E>']
tokenizer.add_special_tokens({'additional_special_tokens': SPECIAL_TOKENS})
for tok in SPECIAL_TOKENS:
    assert tokenizer.convert_tokens_to_ids(tok) != tokenizer.unk_token_id, f'{tok} is UNK!'
    
# 7. template builder
TEMPLATE = '<Q> {q} </Q> <A> {a} </A> <E> {e} </E>'

def build_text(row):
    return TEMPLATE.format(
        q=row['QuestionText'],
        a=row['MC_Answer'],
        e=row['StudentExplanation']
    )

df['text'] = df.apply(build_text, axis=1)

# 8. sampling sequence lenght distribution
tok_lens = df['text'].apply(lambda s: len(tokenizer.tokenize(s)))
print(tok_lens.describe(percentiles=[.5,.75,.9,.95,.99]))

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

count    36696.000000
mean        63.591699
std         18.033133
min         26.000000
50%         62.000000
75%         74.000000
90%         87.000000
95%         98.000000
99%        114.000000
max        222.000000
Name: text, dtype: float64


In [4]:
# 9. PyTorch Dataset  ─ no token_type_ids for ModernBERT
class MAPDataset(torch.utils.data.Dataset):
    def __init__(self, frame, tokenizer):
        self.encodings = tokenizer(
            frame['text'].tolist(),
            truncation=True,
            max_length=MAX_LEN,
            return_attention_mask=True,
            return_token_type_ids=False,
        )
        self.labels = frame['label_id'].tolist()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {k: v[idx] for k, v in self.encodings.items()}  
        item['labels'] = self.labels[idx]                      
        return item

# 10. dataframe → DataLoader
collate_fn = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

def make_loader(frame, batch_size=16, shuffle=True):
    ds = MAPDataset(frame, tokenizer)
    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_fn,
        num_workers=2,
        pin_memory=True,
    )

# 11. hold-out fold
val_fold = 4

train_df = df[df['fold'] != val_fold].reset_index(drop=True)
val_df = df[df['fold'] == val_fold].reset_index(drop=True)
print(len(train_df), 'train rows |', len(val_df), 'val rows')

train_loader = make_loader(train_df, batch_size=16, shuffle=True)
val_loader = make_loader(val_df, batch_size=32, shuffle=False)

29357 train rows | 7339 val rows


In [5]:
MODEL_NAME = 'answerdotai/ModernBERT-base'
NUM_LABELS = len(label2id)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 12. ModernBERT + single-layer classification head (no token_type_ids)
class ModernBertClassifier(nn.Module):
    def __init__(self, num_labels: int = NUM_LABELS, dropout: float = 0.1):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(MODEL_NAME)
        self.encoder.resize_token_embeddings(len(tokenizer))
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.encoder.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls_vec = out.last_hidden_state[:, 0]                     
        logits = self.classifier(self.dropout(cls_vec))

        loss = None
        if labels is not None:
            loss = nn.functional.cross_entropy(logits, labels)

        return {'logits': logits, 'loss': loss}

# 13. factory → model, optimizer, scheduler
def build_model(total_train_steps, lr=2e-5, weight_decay=0.01, warmup_ratio=0.1):
    model = ModernBertClassifier().to(device)

    # weight-decay only on non-bias / non-LayerNorm parameters
    no_decay = {'bias', 'LayerNorm.weight'}
    param_groups = [
        {
            'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            'weight_decay': weight_decay,
        },
        {
            'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        },
    ]

    optimizer = optim.AdamW(param_groups, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(total_train_steps * warmup_ratio),
        num_training_steps=total_train_steps,
    )
    return model, optimizer, scheduler

In [6]:
# 14. training the model + val metrics MAP@3 — AMP version
%load_ext tensorboard
%tensorboard --logdir tb_logs --host 0.0.0.0

epochs = 10
total_steps = len(train_loader) * epochs
model, optim, sched = build_model(total_train_steps=total_steps)
scaler = torch.cuda.amp.GradScaler()                     

for epoch in range(epochs):
    model.train()
    running = 0.0

    for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}', leave=False):
        batch = {k: v.to(device) for k, v in batch.items()}

        with torch.cuda.amp.autocast():                  
            out  = model(**batch)
            loss = out['loss']

        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  
        scaler.step(optim)
        scaler.update()
        sched.step()
        optim.zero_grad(set_to_none=True)

        running += loss.item()

    avg_train = running / len(train_loader)
    writer.add_scalar('Loss/train', avg_train, epoch)

    # validation
    model.eval()
    val_running, preds, y_true = 0.0, [], []
    with torch.no_grad(), torch.cuda.amp.autocast():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            out = model(**batch)

            val_running += out['loss'].item()
            preds.append(out['logits'].float().cpu())    
            y_true.extend(batch['labels'].cpu())

    avg_val = val_running / len(val_loader)
    writer.add_scalar('Loss/val', avg_val, epoch)

    # MAP@3
    preds = torch.vstack(preds)                          
    top3 = torch.topk(preds, 3, dim=1).indices.cpu()    
    hits = (top3 == torch.tensor(y_true).unsqueeze(1))  
    ranks = hits.float() * (1 / (torch.arange(1, 4).float()))  
    map3 = ranks.sum(dim=1).mean().item()

    writer.add_scalar('MAP3/val', map3, epoch)
    print(f'Epoch {epoch+1}: train_loss={avg_train:.4f}  '
          f'val_loss={avg_val:.4f}  MAP@3={map3:.4f}')

<IPython.core.display.Javascript object>

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
  scaler = torch.cuda.amp.GradScaler()


Epoch 1/10:   0%|          | 0/1835 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast():
  with torch.no_grad(), torch.cuda.amp.autocast():


Epoch 1: train_loss=1.4822  val_loss=0.7932  MAP@3=0.8328


Epoch 2/10:   0%|          | 0/1835 [00:00<?, ?it/s]

Epoch 2: train_loss=0.6155  val_loss=0.5464  MAP@3=0.8878


Epoch 3/10:   0%|          | 0/1835 [00:00<?, ?it/s]

Epoch 3: train_loss=0.4916  val_loss=0.4720  MAP@3=0.9071


Epoch 4/10:   0%|          | 0/1835 [00:00<?, ?it/s]

Epoch 4: train_loss=0.4162  val_loss=0.4586  MAP@3=0.9119


Epoch 5/10:   0%|          | 0/1835 [00:00<?, ?it/s]

Epoch 5: train_loss=0.3946  val_loss=0.4448  MAP@3=0.9143


Epoch 6/10:   0%|          | 0/1835 [00:00<?, ?it/s]

Epoch 6: train_loss=0.3529  val_loss=0.4118  MAP@3=0.9224


Epoch 7/10:   0%|          | 0/1835 [00:00<?, ?it/s]

Epoch 7: train_loss=0.2616  val_loss=0.4121  MAP@3=0.9251


Epoch 8/10:   0%|          | 0/1835 [00:00<?, ?it/s]

Epoch 8: train_loss=0.1991  val_loss=0.4206  MAP@3=0.9275


Epoch 9/10:   0%|          | 0/1835 [00:00<?, ?it/s]

Epoch 9: train_loss=0.1639  val_loss=0.4427  MAP@3=0.9257


Epoch 10/10:   0%|          | 0/1835 [00:00<?, ?it/s]

Epoch 10: train_loss=0.1323  val_loss=0.4576  MAP@3=0.9280
