In [2]:
# Install dependencies (run once per session):
# !pip install torch transformers pandas scikit-learn matplotlib seaborn

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
from transformers import (
    DistilBertTokenizerFast,
    DistilBertModel,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# === 1. Load the CSV file ===
# Path of the resume table. The constant is named EXCEL_FILE, but the file is
# parsed with pd.read_csv below.
EXCEL_FILE = "./cleaned_resumes.csv"

# === 2. Global configuration ===
BATCH_SIZE = 16
EPOCHS = 10
MAX_LEN = 128  # Per column max length
LEARNING_RATE = 2e-5
# Run on the GPU whenever PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {DEVICE}")

# === 3. Load and preprocess the data ===
# Rows without a target label are discarded right after loading.
df = pd.read_csv(EXCEL_FILE).dropna(subset=['experience_level'])

# Every column except the label becomes its own, separately tokenized input.
# NOTE(review): this keeps *all* non-label columns, including identifier-like
# ones (name, email, ...) if present in the CSV - confirm that is intended.
FEATURE_COLUMNS = df.columns.drop('experience_level').tolist()
print(f"Feature columns: {FEATURE_COLUMNS}")

# === 4. Create tokenizer and label mapping ===
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Sorting the distinct labels makes the label <-> id assignment deterministic.
label_names = sorted(df["experience_level"].unique())
label2id = dict(zip(label_names, range(len(label_names))))
id2label = dict(enumerate(label_names))

print(f"Labels: {label_names}")
print(f"Total samples: {len(df)}")

# === 5. Custom Dataset Class - Multi-Column Approach ===
class MultiColumnResumeDataset(Dataset):
    """Dataset yielding one tokenized encoding per feature column plus a class id.

    Args:
        dataframe: Source rows. Must contain every name in ``feature_columns``
            and an ``experience_level`` column holding the target label.
        tokenizer: Callable following the Hugging Face tokenizer call
            convention (accepts a list of strings and returns a mapping with
            ``input_ids`` and ``attention_mask`` tensors).
        max_len: Length each column's token sequence is truncated/padded to.
        feature_columns: Column names to encode, in output order.
        label_map: Optional mapping from ``experience_level`` values to
            integer ids. When omitted, the module-level ``label2id`` is looked
            up at access time, which is what this class always did before.

    Each item is ``(column_encodings, label)`` where ``column_encodings`` is a
    list (one entry per feature column) of
    ``{"input_ids": Tensor, "attention_mask": Tensor}`` dicts.
    """

    def __init__(self, dataframe, tokenizer, max_len, feature_columns, label_map=None):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.feature_columns = feature_columns
        self.label_map = label_map
        # Positional access in __getitem__ needs a clean 0..n-1 index.
        self.dataframe = dataframe.reset_index(drop=True)

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]

        # Prefer the explicitly supplied mapping; fall back to the global one
        # so existing callers that never passed a mapping keep working.
        mapping = label2id if self.label_map is None else self.label_map
        label = mapping[row["experience_level"]]

        # Missing cells become empty strings so every column still yields an
        # encoding of the same length.
        texts = [
            str(row[col]) if pd.notna(row[col]) else ""
            for col in self.feature_columns
        ]
        if not texts:
            return [], label

        # One batched tokenizer call for all columns of this row instead of
        # one call per column; each output row equals the single-text result
        # because every sequence is padded/truncated to the same max_length.
        encoding = self.tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        column_encodings = [
            {"input_ids": ids, "attention_mask": mask}
            for ids, mask in zip(encoding["input_ids"], encoding["attention_mask"])
        ]

        return column_encodings, label

# === 6. Custom Model - Processes Each Column Separately ===
class MultiColumnDistilBERT(nn.Module):
    """Classifier that encodes each text column with a shared DistilBERT.

    Every column is encoded independently; the per-column first-token
    embeddings are combined with a learned softmax weighting over columns and
    the pooled vector is fed to a small MLP head.

    Args:
        num_columns: Number of text columns per sample.
        num_labels: Number of output classes.
        hidden_size: Width of the encoder's hidden states; must match the
            loaded checkpoint (768 for ``distilbert-base-uncased``).
    """

    def __init__(self, num_columns, num_labels, hidden_size=768):
        super().__init__()
        self.num_columns = num_columns

        # One encoder shared by all columns. A separate encoder per column is
        # a possible alternative at the cost of num_columns times the
        # encoder parameters.
        self.distilbert = DistilBertModel.from_pretrained("distilbert-base-uncased")

        # Scores each column embedding; softmax over columns gives its weight.
        self.column_attention = nn.Linear(hidden_size, 1)

        # Classification head
        self.classifier = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_size // 2, num_labels)
        )

    def forward(self, column_encodings_batch):
        """Return class logits for a batch.

        Args:
            column_encodings_batch: List (one entry per sample) of lists (one
                entry per column) of ``{"input_ids", "attention_mask"}``
                dicts. Only the first ``num_columns`` entries of each sample
                are used. Tensors are assumed to be 1-D and of equal length
                so they can be stacked.

        Returns:
            Tensor of shape ``[batch_size, num_labels]``.

        Raises:
            IndexError: If a sample provides fewer than ``num_columns``
                encodings.
        """
        batch_size = len(column_encodings_batch)

        # Flatten sample-major (sample 0 col 0, sample 0 col 1, ...) so the
        # shared encoder runs once over all batch_size * num_columns
        # sequences instead of once per column.
        flat_columns = [
            column
            for sample in column_encodings_batch
            for column in sample[: self.num_columns]
        ]
        if len(flat_columns) != batch_size * self.num_columns:
            raise IndexError(
                f"every sample must provide at least {self.num_columns} column encodings"
            )

        input_ids = torch.stack([column["input_ids"] for column in flat_columns])
        attention_mask = torch.stack([column["attention_mask"] for column in flat_columns])

        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        # First-token ([CLS]) embedding of every sequence: [batch * columns, hidden]
        cls_embeddings = outputs.last_hidden_state[:, 0, :]

        # Undo the flattening: [batch_size, num_columns, hidden_size]
        stacked_embeddings = cls_embeddings.reshape(batch_size, self.num_columns, -1)

        # Softmax across the column axis turns scores into per-column weights.
        attention_scores = self.column_attention(stacked_embeddings)  # [batch_size, num_columns, 1]
        attention_weights = torch.softmax(attention_scores, dim=1)

        # Weighted sum of column embeddings: [batch_size, hidden_size]
        weighted_embedding = torch.sum(stacked_embeddings * attention_weights, dim=1)

        return self.classifier(weighted_embedding)

# === 7. Custom collate function ===
def collate_fn(batch):
    """Split a list of ``(column_encodings, label)`` samples into two parts.

    The per-sample encodings are kept as a plain Python list (they are nested
    lists of dicts, so no stacking happens here); the labels are packed into
    a single tensor.
    """
    encodings_per_sample = []
    targets = []
    for sample in batch:
        encodings_per_sample.append(sample[0])
        targets.append(sample[1])
    return encodings_per_sample, torch.tensor(targets)

# === 8. Split dataset into Training and Validation ===
full_dataset = MultiColumnResumeDataset(df, tokenizer, MAX_LEN, FEATURE_COLUMNS)
n_total = len(full_dataset)
n_train = int(0.8 * n_total)
n_val = n_total - n_train  # remainder, so the two parts always sum to n_total

# Seed the split: the best checkpoint is selected on the validation part, so
# the same partition must be reproducible in later sessions (otherwise a
# reloaded model could be "validated" on rows it was trained on).
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_ds, val_ds = random_split(
    full_dataset, [n_train, n_val], generator=split_generator
)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, collate_fn=collate_fn)

print(f"Training samples: {n_train}, Validation samples: {n_val}")

# === 9. Initialize model and optimizer ===
model = MultiColumnDistilBERT(
    num_columns=len(FEATURE_COLUMNS),
    num_labels=len(label2id)
).to(DEVICE)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.CrossEntropyLoss()

# === 10. Set up Learning Rate Scheduler with warm-up ===
# The scheduler is stepped once per batch, so the horizon is counted in batches.
total_steps = EPOCHS * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),  # first 10% of steps ramp the LR up
    num_training_steps=total_steps
)

# === 11. Mixed Precision (FP16) setup ===
# AMP is only enabled on CUDA; without it no gradient scaler is created.
use_amp = torch.cuda.is_available()
scaler = torch.amp.GradScaler('cuda') if use_amp else None

# === 12. Training Loop ===
def _move_batch_to_device(encodings_batch, device):
    """Move every column tensor of every sample onto `device` (in place)."""
    for sample in encodings_batch:
        for column in sample:
            column["input_ids"] = column["input_ids"].to(device)
            column["attention_mask"] = column["attention_mask"].to(device)
    return encodings_batch


def _forward_loss(encodings_batch, targets):
    """Run the model and loss, under autocast only when AMP is enabled."""
    with torch.amp.autocast(DEVICE.type, enabled=use_amp):
        batch_logits = model(encodings_batch)
        batch_loss = loss_fn(batch_logits, targets)
    return batch_logits, batch_loss


best_val_acc = 0.0

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_train_loss = 0.0

    for column_encodings_batch, labels in train_loader:
        _move_batch_to_device(column_encodings_batch, DEVICE)
        labels = labels.to(DEVICE)
        optimizer.zero_grad()

        logits, loss = _forward_loss(column_encodings_batch, labels)

        if use_amp:
            scale_before = scaler.get_scale()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            # GradScaler silently skips optimizer.step() when it finds
            # inf/NaN gradients and then lowers its scale; a lowered scale
            # therefore means no parameter update happened this iteration.
            optimizer_stepped = scaler.get_scale() >= scale_before
        else:
            loss.backward()
            optimizer.step()
            optimizer_stepped = True

        # Only advance the LR schedule for iterations that really updated the
        # weights, so the scheduler never runs ahead of the optimizer.
        if optimizer_stepped:
            scheduler.step()
        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)

    # === Validation Phase ===
    model.eval()
    val_loss_total = 0.0
    # Reset every epoch: after the loop these hold the LAST epoch's results.
    all_true = []
    all_pred = []

    with torch.no_grad():
        for column_encodings_batch, labels in val_loader:
            _move_batch_to_device(column_encodings_batch, DEVICE)
            labels = labels.to(DEVICE)

            logits, loss = _forward_loss(column_encodings_batch, labels)

            val_loss_total += loss.item()
            preds = torch.argmax(logits, dim=1).cpu().tolist()
            all_pred.extend(preds)
            all_true.extend(labels.cpu().tolist())

    avg_val_loss = val_loss_total / len(val_loader)
    val_acc = accuracy_score(all_true, all_pred) * 100

    print(
        f"Epoch {epoch}/{EPOCHS} | "
        f"Train Loss: {avg_train_loss:.4f} | "
        f"Val Loss: {avg_val_loss:.4f} | "
        f"Val Acc: {val_acc:.2f}%"
    )

    # === Save best model by Val Accuracy ===
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_distilbert_multicolumn.pt")

# === 13. Final Confusion Matrix ===
# all_true / all_pred are re-created at the start of every validation phase,
# so this matrix describes the last epoch that ran - which is not necessarily
# the epoch whose weights were saved as the best checkpoint.
class_ids = [*label2id.values()]
conf_mat = confusion_matrix(all_true, all_pred, labels=class_ids)
print(f"\n✅ Best Multi-Column DistilBERT Val Accuracy: {best_val_acc:.2f}%")
print("Final Confusion Matrix (Validation):")
print(conf_mat)

# Rows are true classes, columns are predicted classes, both in label_names order.
fig, ax = plt.subplots(figsize=(6, 5))
heatmap_options = {
    "annot": True,
    "fmt": "d",
    "cmap": "Blues",
    "xticklabels": label_names,
    "yticklabels": label_names,
}
sns.heatmap(conf_mat, ax=ax, **heatmap_options)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Multi-Column DistilBERT – Confusion Matrix (Validation)")
plt.show()

Using device: cuda
Feature columns: ['name', 'email', 'summary', 'linkedin', 'github', 'experience', 'total_years_experience', 'education', 'skills', 'projects', 'certifications']
Labels: ['junior', 'mid', 'senior']
Total samples: 2100
Training samples: 1680, Validation samples: 420


KeyboardInterrupt: 