### Multi‑Column RoBERTa Fine‑Tuning (no text concatenation)

This notebook fine‑tunes **`roberta-base`** to predict **`experience_level`** (`junior` / `mid` / `senior`) from **`cleaned_resumes.csv`** (in the same folder).

Core design choices:
- Each CSV column is **tokenized separately** (we never concatenate columns into one long string).
- Each column gets its own RoBERTa encoding, then we **aggregate column embeddings with column‑level self‑attention**.
- Training prints accuracy + a confusion matrix for quick comparison between runs.

Next cell: imports + experiment settings (hyperparameters, device, seeds).


In [1]:
import csv
import random
from collections import Counter
from pathlib import Path

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, RobertaModel, get_linear_schedule_with_warmup

import matplotlib.pyplot as plt

# Paths / columns
DATA_PATH = Path("cleaned_resumes.csv")
LABEL_COL = "experience_level"

# Model / tokenization
MODEL_NAME = "roberta-base"
MAX_LEN = 128

# Training
SEED = 42
TRAIN_RATIO = 0.8
BATCH_SIZE = 4
EPOCHS = 5
LR = 2e-5
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.1
GRAD_CLIP_NORM = 1.0

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print("device:", DEVICE)
print("data:", DATA_PATH.resolve())


Skipping import of cpp extensions due to incompatible torch version 2.6.0+cu124 for torchao version 0.14.1             Please see https://github.com/pytorch/ao/issues/2919 for more info


device: cuda
data: C:\Users\Rane\Desktop\GenAI Baseline\SeniortyPrediction\Smaller Models\cleaned_resumes.csv


### Load the CSV and quickly sanity‑check it

This cell reads `cleaned_resumes.csv`, verifies the label column (`experience_level`) exists, and prints a quick snapshot: number of rows, feature columns, label balance, and which columns are mostly empty.


In [2]:
def _is_empty(v: object) -> bool:
    return v is None or str(v).strip() == ""


def read_csv_rows(path: Path) -> tuple[list[str], list[dict[str, str]]]:
    with path.open(newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        if reader.fieldnames is None:
            raise ValueError("CSV has no header row")
        rows = list(reader)
        return list(reader.fieldnames), rows


if not DATA_PATH.exists():
    raise FileNotFoundError(f"Missing file: {DATA_PATH.resolve()}")

columns, rows = read_csv_rows(DATA_PATH)

if LABEL_COL not in columns:
    raise ValueError(f"Label column '{LABEL_COL}' not found. Columns: {columns}")

FEATURE_COLUMNS = [c for c in columns if c != LABEL_COL]

labels = [str(r[LABEL_COL]).strip().lower() for r in rows]
label_counts = Counter(labels)

print("rows:", len(rows))
print("feature_columns (count):", len(FEATURE_COLUMNS))
print("feature_columns:", FEATURE_COLUMNS)
print("label_counts:", dict(label_counts))

empty_frac = {
    c: sum(_is_empty(r.get(c, "")) for r in rows) / max(1, len(rows))
    for c in columns
}

mostly_empty = sorted(empty_frac.items(), key=lambda kv: kv[1], reverse=True)[:8]
print("\nmost_empty_columns (top 8):")
for c, frac in mostly_empty:
    print(f"- {c}: {frac:.1%}")

print("\nexample rows (label + 3 columns):")
preview_cols = FEATURE_COLUMNS[:3]
for i in range(min(2, len(rows))):
    r = rows[i]
    preview = {c: str(r.get(c, ""))[:80] for c in preview_cols}
    print(f"[{i}] label={r[LABEL_COL]!r} preview={preview}")


rows: 2100
feature_columns (count): 16
feature_columns: ['name', 'email', 'summary', 'linkedin', 'github', 'experience', 'education', 'skills', 'projects', 'certifications', 'summary_count', 'last_experience_only', 'total_experience_time', 'last_experience_time', 'job title', 'target_experience_text']
label_counts: {'senior': 700, 'mid': 700, 'junior': 700}

most_empty_columns (top 8):
- name: 0.0%
- email: 0.0%
- summary: 0.0%
- linkedin: 0.0%
- github: 0.0%
- experience: 0.0%
- education: 0.0%
- skills: 0.0%

example rows (label + 3 columns):
[0] label='senior' preview={'name': 'Brenda Garza', 'email': 'williamsrichard@example.org', 'summary': 'Passionate Deep Learning Engineer with expertise in neural network design, train'}
[1] label='senior' preview={'name': 'Michele Clark', 'email': 'kennethpark@example.org', 'summary': 'Blockchain Developer with experience in smart contract development and integrati'}


### Tokenize *each column separately* and build tensors

This cell initializes the tokenizer and tokenizes every feature column independently into `input_ids`/`attention_mask` with shape **[N, num_columns, MAX_LEN]**. It also creates a **stratified** train/validation split so class balance stays intact.


In [3]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

label_names = sorted(label_counts.keys())
label2id = {name: i for i, name in enumerate(label_names)}
id2label = {i: name for name, i in label2id.items()}

y = torch.tensor([label2id[l] for l in labels], dtype=torch.long)

texts_by_col = {
    c: [str(r.get(c, "") or "") for r in rows]
    for c in FEATURE_COLUMNS
}

input_ids_cols = []
attention_mask_cols = []

for c in FEATURE_COLUMNS:
    enc = tokenizer(
        texts_by_col[c],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
    )
    input_ids_cols.append(torch.tensor(enc["input_ids"], dtype=torch.long))
    attention_mask_cols.append(torch.tensor(enc["attention_mask"], dtype=torch.long))

input_ids = torch.stack(input_ids_cols, dim=1)
attention_mask = torch.stack(attention_mask_cols, dim=1)

print("input_ids:", tuple(input_ids.shape))
print("attention_mask:", tuple(attention_mask.shape))
print("labels:", tuple(y.shape))

rng = random.Random(SEED)
indices_by_label = {name: [] for name in label_names}
for i, lab in enumerate(labels):
    indices_by_label[lab].append(i)

train_idx: list[int] = []
val_idx: list[int] = []

for lab, idxs in indices_by_label.items():
    rng.shuffle(idxs)
    cut = int(len(idxs) * TRAIN_RATIO)
    train_idx.extend(idxs[:cut])
    val_idx.extend(idxs[cut:])

rng.shuffle(train_idx)
rng.shuffle(val_idx)

train_input_ids = input_ids[train_idx]
train_attention_mask = attention_mask[train_idx]
train_y = y[train_idx]

val_input_ids = input_ids[val_idx]
val_attention_mask = attention_mask[val_idx]
val_y = y[val_idx]

print("\ntrain size:", len(train_idx), Counter([labels[i] for i in train_idx]))
print("val size:", len(val_idx), Counter([labels[i] for i in val_idx]))


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

input_ids: (2100, 16, 128)
attention_mask: (2100, 16, 128)
labels: (2100,)

train size: 1680 Counter({'junior': 560, 'mid': 560, 'senior': 560})
val size: 420 Counter({'senior': 140, 'mid': 140, 'junior': 140})


### Create datasets + dataloaders

This cell wraps the tokenized tensors into `Dataset` objects and builds `DataLoader`s that return batches shaped **[batch, num_columns, MAX_LEN]** for both `input_ids` and `attention_mask`.


In [4]:
class ResumeTensorDataset(Dataset):
    def __init__(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, labels: torch.Tensor):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self) -> int:
        return int(self.labels.shape[0])

    def __getitem__(self, idx: int):
        return self.input_ids[idx], self.attention_mask[idx], self.labels[idx]


train_ds = ResumeTensorDataset(train_input_ids, train_attention_mask, train_y)
val_ds = ResumeTensorDataset(val_input_ids, val_attention_mask, val_y)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

batch_input_ids, batch_attention_mask, batch_labels = next(iter(train_loader))
print("batch input_ids:", tuple(batch_input_ids.shape))
print("batch attention_mask:", tuple(batch_attention_mask.shape))
print("batch labels:", tuple(batch_labels.shape))


batch input_ids: (4, 16, 128)
batch attention_mask: (4, 16, 128)
batch labels: (4,)


### Define the multi‑column RoBERTa model (column‑level self‑attention)

This cell defines a model that:
- Runs RoBERTa on **all columns in one batched forward pass** (faster than looping columns)
- Treats the resulting per‑column embeddings as a short sequence and applies **self‑attention across columns**
- Pools across columns and predicts `experience_level`.


In [5]:
class ColumnSelfAttentionBlock(nn.Module):
    def __init__(self, hidden_size: int, num_heads: int, dropout: float):
        super().__init__()
        self.mha = nn.MultiheadAttention(
            embed_dim=hidden_size,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=True,
        )
        self.ln1 = nn.LayerNorm(hidden_size)
        self.ff = nn.Sequential(
            nn.Linear(hidden_size, hidden_size * 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size * 4, hidden_size),
        )
        self.ln2 = nn.LayerNorm(hidden_size)
        self.drop = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        attn_out, _ = self.mha(x, x, x, need_weights=False)
        x = self.ln1(x + self.drop(attn_out))
        ff_out = self.ff(x)
        x = self.ln2(x + self.drop(ff_out))
        return x


class MultiColumnRobertaClassifier(nn.Module):
    def __init__(
        self,
        model_name: str,
        num_columns: int,
        num_labels: int,
        col_attn_heads: int = 4,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.encoder = RobertaModel.from_pretrained(model_name)
        hidden = int(self.encoder.config.hidden_size)

        self.col_pos = nn.Embedding(num_columns, hidden)
        self.col_block = ColumnSelfAttentionBlock(hidden, col_attn_heads, dropout)

        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden, hidden // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden // 2, num_labels),
        )

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        b, c, l = input_ids.shape

        flat_ids = input_ids.view(b * c, l)
        flat_mask = attention_mask.view(b * c, l)

        out = self.encoder(input_ids=flat_ids, attention_mask=flat_mask)
        cls = out.last_hidden_state[:, 0, :]
        cols = cls.view(b, c, -1)

        pos = self.col_pos(torch.arange(c, device=input_ids.device)).unsqueeze(0)
        x = cols + pos
        x = self.col_block(x)
        pooled = x.mean(dim=1)

        return self.classifier(pooled)


NUM_COLUMNS = int(train_input_ids.shape[1])
NUM_LABELS = len(label2id)

model = MultiColumnRobertaClassifier(
    model_name=MODEL_NAME,
    num_columns=NUM_COLUMNS,
    num_labels=NUM_LABELS,
).to(DEVICE)

with torch.no_grad():
    batch_input_ids, batch_attention_mask, _ = next(iter(train_loader))
    logits = model(batch_input_ids.to(DEVICE), batch_attention_mask.to(DEVICE))
    print("logits:", tuple(logits.shape))


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


logits: (4, 3)


### Train and validate

This cell fine‑tunes the whole model end‑to‑end (RoBERTa + column‑attention layers), evaluates on the validation split each epoch, and saves the best checkpoint by validation accuracy.


In [6]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

total_steps = max(1, EPOCHS * len(train_loader))
warmup_steps = int(WARMUP_RATIO * total_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())


@torch.no_grad()
def evaluate(m: nn.Module, loader: DataLoader, num_labels: int):
    m.eval()

    total_loss = 0.0
    total_correct = 0
    total_n = 0
    conf = torch.zeros((num_labels, num_labels), dtype=torch.long)

    for input_ids_b, attention_mask_b, labels_b in loader:
        input_ids_b = input_ids_b.to(DEVICE)
        attention_mask_b = attention_mask_b.to(DEVICE)
        labels_b = labels_b.to(DEVICE)

        with torch.cuda.amp.autocast(enabled=scaler.is_enabled()):
            logits = m(input_ids_b, attention_mask_b)
            loss = loss_fn(logits, labels_b)

        total_loss += float(loss.item()) * int(labels_b.shape[0])
        preds = logits.argmax(dim=1)
        total_correct += int((preds == labels_b).sum().item())
        total_n += int(labels_b.shape[0])

        for t, p in zip(labels_b.view(-1), preds.view(-1)):
            conf[int(t), int(p)] += 1

    avg_loss = total_loss / max(1, total_n)
    acc = total_correct / max(1, total_n)
    return avg_loss, acc, conf


BEST_CKPT_PATH = Path("best_multicolumn_roberta.pt")

best_val_acc = -1.0

for epoch in range(1, EPOCHS + 1):
    model.train()

    running_loss = 0.0
    running_correct = 0
    running_n = 0

    for input_ids_b, attention_mask_b, labels_b in train_loader:
        input_ids_b = input_ids_b.to(DEVICE)
        attention_mask_b = attention_mask_b.to(DEVICE)
        labels_b = labels_b.to(DEVICE)

        optimizer.zero_grad(set_to_none=True)

        with torch.cuda.amp.autocast(enabled=scaler.is_enabled()):
            logits = model(input_ids_b, attention_mask_b)
            loss = loss_fn(logits, labels_b)

        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP_NORM)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        running_loss += float(loss.item()) * int(labels_b.shape[0])
        preds = logits.argmax(dim=1)
        running_correct += int((preds == labels_b).sum().item())
        running_n += int(labels_b.shape[0])

    train_loss = running_loss / max(1, running_n)
    train_acc = running_correct / max(1, running_n)

    val_loss, val_acc, _ = evaluate(model, val_loader, NUM_LABELS)

    print(
        f"Epoch {epoch}/{EPOCHS} | "
        f"train_loss={train_loss:.4f} train_acc={train_acc:.3f} | "
        f"val_loss={val_loss:.4f} val_acc={val_acc:.3f}"
    )

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(
            {
                "model_state_dict": model.state_dict(),
                "label2id": label2id,
                "feature_columns": FEATURE_COLUMNS,
                "model_name": MODEL_NAME,
                "max_len": MAX_LEN,
            },
            BEST_CKPT_PATH,
        )

print("\nbest_val_acc:", best_val_acc)
print("saved:", BEST_CKPT_PATH.resolve())


  scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())
  with torch.cuda.amp.autocast(enabled=scaler.is_enabled()):
  with torch.cuda.amp.autocast(enabled=scaler.is_enabled()):


Epoch 1/5 | train_loss=1.1033 train_acc=0.319 | val_loss=1.1018 val_acc=0.333
Epoch 2/5 | train_loss=1.1024 train_acc=0.314 | val_loss=1.1008 val_acc=0.333


KeyboardInterrupt: 

### Load best checkpoint + show confusion matrix

This final cell reloads the best saved model (by validation accuracy), re-runs evaluation on the validation set, and visualizes the confusion matrix over `junior/mid/senior`.


In [None]:
ckpt = torch.load(BEST_CKPT_PATH, map_location=DEVICE)
model.load_state_dict(ckpt["model_state_dict"])

val_loss, val_acc, conf = evaluate(model, val_loader, NUM_LABELS)

print("val_loss:", val_loss)
print("val_acc:", val_acc)
print("confusion_matrix:\n", conf)

fig, ax = plt.subplots(figsize=(6, 6))
im = ax.imshow(conf.numpy(), cmap="Blues")

ax.set_xticks(range(NUM_LABELS))
ax.set_yticks(range(NUM_LABELS))
ax.set_xticklabels([id2label[i] for i in range(NUM_LABELS)], rotation=45, ha="right")
ax.set_yticklabels([id2label[i] for i in range(NUM_LABELS)])

for i in range(NUM_LABELS):
    for j in range(NUM_LABELS):
        ax.text(j, i, int(conf[i, j]), ha="center", va="center")

ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Multi‑Column RoBERTa – Confusion Matrix (Validation)")
fig.colorbar(im, ax=ax)
plt.tight_layout()
plt.show()
