### Multi‑Column RoBERTa Fine‑Tuning (no text concatenation)

This notebook fine‑tunes **`roberta-base`** to predict **`experience_level`** (`junior` / `mid` / `senior`) from **`cleaned_resumes.csv`** (in the same folder).

Core design choices:
- Each CSV column is **tokenized separately** (we never concatenate columns into one long string).
- Each column gets its own RoBERTa encoding, then we **aggregate column embeddings with column‑level self‑attention**.
- Training prints accuracy + a confusion matrix for quick comparison between runs.

Next cell: imports + experiment settings (hyperparameters, device, seeds).


In [None]:
import csv
import random
from collections import Counter
from pathlib import Path

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, RobertaModel, get_linear_schedule_with_warmup

import matplotlib.pyplot as plt

# --- Data location and prediction target ---
LABEL_COL = "experience_level"
DATA_PATH = Path("cleaned_resumes.csv")  # resolved against the current working directory

# --- Pretrained encoder and per-column token budget ---
MODEL_NAME = "roberta-base"
MAX_LEN = 128

# --- Training hyperparameters ---
SEED = 42
TRAIN_RATIO = 0.8
BATCH_SIZE = 4
EPOCHS = 5
LR = 2e-5
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.1
GRAD_CLIP_NORM = 1.0

# Use the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Seed Python's and PyTorch's generators so repeated runs start from the same state.
random.seed(SEED)
torch.manual_seed(SEED)
if DEVICE.type == "cuda":
    torch.cuda.manual_seed_all(SEED)

print("device:", DEVICE)
print("data:", DATA_PATH.resolve())


### Load the CSV and quickly sanity‑check it

This cell reads `cleaned_resumes.csv`, verifies that the label column (`experience_level`) exists, and prints a quick snapshot: the number of rows, the feature columns, the label balance, the eight columns with the highest share of empty values, and a short preview of the first two rows.


In [None]:
def _is_empty(v: object) -> bool:
    """Return True when *v* is ``None`` or its string form is blank once stripped."""
    if v is None:
        return True
    return not str(v).strip()


def read_csv_rows(path: Path) -> tuple[list[str], list[dict[str, str]]]:
    """Load a headered CSV file fully into memory.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A ``(fieldnames, rows)`` pair: the header names in file order, and one
        dict per data row keyed by those header names.

    Raises:
        ValueError: If the file has no header row (e.g. it is empty).
    """
    # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also strips a
    # leading byte-order mark; without that, the BOM ends up inside the first
    # header name and lookups by column name fail.
    # newline="" leaves line-ending handling to the csv module.
    with path.open(newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        header = reader.fieldnames
        if header is None:
            raise ValueError("CSV has no header row")
        return list(header), list(reader)


# Fail fast, reporting the absolute path so a wrong working directory is obvious.
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Missing file: {DATA_PATH.resolve()}")

columns, rows = read_csv_rows(DATA_PATH)

if LABEL_COL not in columns:
    raise ValueError(f"Label column '{LABEL_COL}' not found. Columns: {columns}")

# Every column other than the label is treated as a model input.
FEATURE_COLUMNS = [c for c in columns if c != LABEL_COL]

# Drop rows that carry no label. csv.DictReader fills truncated rows with None
# (which str() would turn into a bogus "none" class) and blank cells would
# become an "" class; neither is a real target value.
total_rows = len(rows)
rows = [r for r in rows if not _is_empty(r.get(LABEL_COL))]
dropped_rows = total_rows - len(rows)
if dropped_rows:
    print(f"dropped rows with empty label: {dropped_rows}")

# Normalise labels so that e.g. "Senior " and "senior" count as the same class.
labels = [str(r[LABEL_COL]).strip().lower() for r in rows]
label_counts = Counter(labels)

print("rows:", len(rows))
print("feature_columns (count):", len(FEATURE_COLUMNS))
print("feature_columns:", FEATURE_COLUMNS)
print("label_counts:", dict(label_counts))

# Fraction of rows in which each column is blank; max(1, ...) avoids dividing
# by zero when the file has a header but no usable data rows.
empty_frac = {
    c: sum(_is_empty(r.get(c, "")) for r in rows) / max(1, len(rows))
    for c in columns
}

# The eight columns with the highest blank fraction.
mostly_empty = sorted(empty_frac.items(), key=lambda kv: kv[1], reverse=True)[:8]
print("\nmost_empty_columns (top 8):")
for c, frac in mostly_empty:
    print(f"- {c}: {frac:.1%}")

# Show the raw label plus the first 80 characters of the first three feature
# columns for up to two rows.
print("\nexample rows (label + 3 columns):")
preview_cols = FEATURE_COLUMNS[:3]
for i, r in enumerate(rows[:2]):
    preview = {c: str(r.get(c, ""))[:80] for c in preview_cols}
    print(f"[{i}] label={r[LABEL_COL]!r} preview={preview}")


### Tokenize *each column separately* and build tensors

This cell initializes the tokenizer and tokenizes every feature column independently into `input_ids`/`attention_mask` with shape **[N, num_columns, MAX_LEN]**. It also creates a **stratified** train/validation split so class balance stays intact.


In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Map class names to integer ids; sorting keeps the mapping stable across runs.
label_names = sorted(label_counts.keys())
label2id = {name: i for i, name in enumerate(label_names)}
id2label = {i: name for name, i in label2id.items()}

y = torch.tensor([label2id[l] for l in labels], dtype=torch.long)

# One list of strings per feature column; missing/None cells become "".
texts_by_col = {
    c: [str(r.get(c, "") or "") for r in rows]
    for c in FEATURE_COLUMNS
}

input_ids_cols = []
attention_mask_cols = []

# Tokenize each column on its own. Padding every column to MAX_LEN gives all
# per-column tensors the same width so they can be stacked below.
for c in FEATURE_COLUMNS:
    enc = tokenizer(
        texts_by_col[c],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
    )
    input_ids_cols.append(torch.tensor(enc["input_ids"], dtype=torch.long))
    attention_mask_cols.append(torch.tensor(enc["attention_mask"], dtype=torch.long))

# Stack along a new axis 1 -> [num_rows, num_columns, MAX_LEN].
input_ids = torch.stack(input_ids_cols, dim=1)
attention_mask = torch.stack(attention_mask_cols, dim=1)

print("input_ids:", tuple(input_ids.shape))
print("attention_mask:", tuple(attention_mask.shape))
print("labels:", tuple(y.shape))

# Stratified split: shuffle and cut each class separately with a dedicated,
# seeded RNG so the split does not depend on other uses of the global RNG.
rng = random.Random(SEED)
indices_by_label = {name: [] for name in label_names}
for i, lab in enumerate(labels):
    indices_by_label[lab].append(i)

train_idx: list[int] = []
val_idx: list[int] = []

for idxs in indices_by_label.values():
    rng.shuffle(idxs)
    cut = int(len(idxs) * TRAIN_RATIO)
    # A very small class can round down to zero training rows, which would
    # leave the model with no example of it. Keep at least one in train
    # (unless TRAIN_RATIO is 0, i.e. an empty training set was requested).
    if cut == 0 and TRAIN_RATIO > 0:
        cut = 1
    train_idx.extend(idxs[:cut])
    val_idx.extend(idxs[cut:])

# The per-class extends leave each list grouped by class; shuffle to mix them.
rng.shuffle(train_idx)
rng.shuffle(val_idx)

train_input_ids = input_ids[train_idx]
train_attention_mask = attention_mask[train_idx]
train_y = y[train_idx]

val_input_ids = input_ids[val_idx]
val_attention_mask = attention_mask[val_idx]
val_y = y[val_idx]

print("\ntrain size:", len(train_idx), Counter([labels[i] for i in train_idx]))
print("val size:", len(val_idx), Counter([labels[i] for i in val_idx]))


### Create datasets + dataloaders

This cell wraps the tokenized tensors into `Dataset` objects and builds `DataLoader`s that return batches shaped **[batch, num_columns, MAX_LEN]** for both `input_ids` and `attention_mask`.


In [None]:
class ResumeTensorDataset(Dataset):
    """Map-style dataset over pre-built tensors that share their first dimension.

    Item ``i`` is the tuple ``(input_ids[i], attention_mask[i], labels[i])``.
    """

    def __init__(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, labels: torch.Tensor):
        """Store the tensors after checking that they describe the same examples.

        Args:
            input_ids: Token ids, indexed by example along dimension 0.
            attention_mask: Attention mask, indexed by example along dimension 0.
            labels: Target values, indexed by example along dimension 0.

        Raises:
            ValueError: If the three tensors differ in size along dimension 0.
        """
        # The dataset length comes from `labels` alone, so a mismatch would
        # otherwise surface as an IndexError mid-epoch or as silently ignored rows.
        n_labels = int(labels.shape[0])
        n_ids = int(input_ids.shape[0])
        n_mask = int(attention_mask.shape[0])
        if n_ids != n_labels or n_mask != n_labels:
            raise ValueError(
                "input_ids, attention_mask and labels must have the same number of rows; "
                f"got {n_ids}, {n_mask} and {n_labels}"
            )
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self) -> int:
        """Return the number of examples (size of ``labels`` along dimension 0)."""
        return int(self.labels.shape[0])

    def __getitem__(self, idx: int):
        """Return ``(input_ids[idx], attention_mask[idx], labels[idx])``."""
        return self.input_ids[idx], self.attention_mask[idx], self.labels[idx]


# Wrap each split's tensors in a dataset.
train_ds = ResumeTensorDataset(
    input_ids=train_input_ids,
    attention_mask=train_attention_mask,
    labels=train_y,
)
val_ds = ResumeTensorDataset(
    input_ids=val_input_ids,
    attention_mask=val_attention_mask,
    labels=val_y,
)

# Only the training loader shuffles; validation keeps a fixed order.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

# Pull a single training batch and print its shapes as a smoke test.
first_batch = next(iter(train_loader))
batch_input_ids, batch_attention_mask, batch_labels = first_batch
print("batch input_ids:", tuple(batch_input_ids.size()))
print("batch attention_mask:", tuple(batch_attention_mask.size()))
print("batch labels:", tuple(batch_labels.size()))
