In [1]:
# ============================================================================
# STEP 0: INSTALL REQUIRED PACKAGES
# ============================================================================
# Create requirements file for reproducibility
requirements = """transformers>=4.36.0
datasets>=2.15.0
torch>=2.1.0
accelerate>=0.25.0
sentence-transformers>=2.2.0
scikit-learn>=1.3.0
pandas>=2.0.0
numpy>=1.24.0
matplotlib>=3.7.0
seaborn>=0.12.0
streamlit>=1.28.0
evaluate>=0.4.0
python-dotenv>=1.0.0
huggingface-hub>=0.20.0
"""
with open('requirements.txt', 'w') as f:
    f.write(requirements)

!pip install -r requirements.txt -q

# Import libraries
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from transformers import TrainingArguments, Trainer, set_seed
from datasets import load_dataset, DatasetDict  # Removed load_metric from here
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time
import json
import os
import sys
from datetime import datetime
import logging
import evaluate  # load_metric is now in evaluate package

warnings.filterwarnings('ignore')

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Check versions for debugging
print("=" * 60)
print("ENVIRONMENT SETUP")
print("=" * 60)
print(f"PyTorch version: {torch.__version__}")
import transformers
print(f"Transformers version: {transformers.__version__}")
import datasets
print(f"Datasets version: {datasets.__version__}")
print(f"Evaluate version: {evaluate.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")

# Setup reproducibility
seed = 42
set_seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Clear GPU memory (if using GPU)
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")

print("=" * 60)
print("SETUP COMPLETED SUCCESSFULLY")
print("=" * 60)

ENVIRONMENT SETUP
PyTorch version: 2.9.0+cu126
Transformers version: 4.57.3
Datasets version: 4.0.0
Evaluate version: 0.4.6
CUDA available: False
Using device: cpu
SETUP COMPLETED SUCCESSFULLY


In [2]:
# ============================================================================
# SAVE FILE TO GOOGLE DRIVE
# ============================================================================
# Mount Google Drive to save files permanently
from google.colab import drive
drive.mount('/content/drive')

# Define path in Google Drive
drive_path = '/content/drive/MyDrive/Colab Notebooks/MatchAI/'

# Create directory if it doesn't exist
import os
os.makedirs(drive_path, exist_ok=True)

# Copy requirements.txt to Google Drive
!cp requirements.txt {drive_path}requirements.txt

# Also create a backup with timestamp
from datetime import datetime
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
!cp requirements.txt {drive_path}requirements_{timestamp}.txt

print(f"Requirements saved to Google Drive: {drive_path}")
print(f"Original: requirements.txt")
print(f"Backup: requirements_{timestamp}.txt")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
cp: target 'Notebooks/MatchAI/requirements.txt' is not a directory
cp: target 'Notebooks/MatchAI/requirements_20251210_094858.txt' is not a directory
Requirements saved to Google Drive: /content/drive/MyDrive/Colab Notebooks/MatchAI/
Original: requirements.txt
Backup: requirements_20251210_094858.txt


In [3]:
# ============================================================================
# STEP 1: LOAD AND ANALYZE DATASET
# ============================================================================

print("=" * 70)
print("STEP 1: LOADING AND ANALYZING DATASET")
print("=" * 70)

import time
start_time = time.time()

# Load the full dataset
print("\nLoading dataset: cnamuangtoun/resume-job-description-fit")
print("Loading full dataset...")

try:
    # Load the full dataset
    dataset = load_dataset("cnamuangtoun/resume-job-description-fit")

    print(f"‚úì Successfully loaded dataset")

except Exception as e:
    print(f"‚ùå Error loading dataset: {e}")
    print("\nPlease check:")
    print("1. Internet connection")
    print("2. Dataset name: cnamuangtoun/resume-job-description-fit")
    print("3. Hugging Face hub status")
    print("\nExiting because dataset is required for this project.")
    raise SystemExit("Dataset loading failed. Cannot continue without data.")

load_time = time.time() - start_time
print(f"\nDataset loaded in {load_time:.1f} seconds")

# Verify dataset structure
print("\n" + "=" * 70)
print("DATASET STRUCTURE VERIFICATION")
print("=" * 70)

# Expected columns
expected_columns = ['resume_text', 'job_description_text', 'label']

for split_name in dataset.keys():
    print(f"\n{split_name.upper()} split:")
    print(f"  Samples: {len(dataset[split_name])}")

    actual_columns = dataset[split_name].column_names
    print(f"  Columns found: {actual_columns}")

    # Check for expected columns
    missing_columns = [col for col in expected_columns if col not in actual_columns]
    if missing_columns:
        print(f"  ‚ùå ERROR: Missing expected columns: {missing_columns}")
        raise ValueError(f"Dataset missing required columns: {missing_columns}")
    else:
        print(f"  ‚úì All expected columns present")

# Define column names
resume_col = 'resume_text'
jd_col = 'job_description_text'
label_col = 'label'

print(f"\nUsing column mapping:")
print(f"  Resume: {resume_col}")
print(f"  Job Description: {jd_col}")
print(f"  Label: {label_col}")

# Label analysis - FIXED FOR STRING LABELS
print("\n" + "=" * 70)
print("LABEL DISTRIBUTION ANALYSIS")
print("=" * 70)

# Analyze both train and test splits
for split_name in ['train', 'test']:
    if split_name in dataset:
        labels = dataset[split_name][label_col]
        total = len(labels)

        print(f"\n{split_name.upper()} split:")
        print(f"  Total samples: {total}")

        # Show first few labels
        if total > 0:
            print(f"  First 3 label values:")
            for i, label in enumerate(labels[:3]):
                print(f"    Sample {i}: '{label}' (type: {type(label).__name__})")

        # Count labels
        from collections import Counter
        label_counts = Counter(labels)

        print(f"  Raw label distribution:")
        for label, count in sorted(label_counts.items()):
            percentage = (count / total) * 100
            print(f"    '{label}': {count} samples ({percentage:.1f}%)")

# Normalize labels (handle case variations) - using train split for mapping
print("\n" + "=" * 70)
print("NORMALIZED LABEL DISTRIBUTION")
print("=" * 70)

train_labels = dataset['train'][label_col]

# Define expected label variations
label_variations = {
    'no fit': ['no fit', 'no_fit', 'no-fit', '0', 'no', 'not fit', 'unfit'],
    'potential fit': ['potential fit', 'potential_fit', 'potential-fit', '1', 'potential', 'maybe', 'partial'],
    'good fit': ['good fit', 'good_fit', 'good-fit', '2', 'good', 'excellent', 'perfect', 'best']
}

# Create mapping from raw labels to normalized categories
label_mapping = {}

for raw_label in set(train_labels):  # Only process unique labels
    raw_str = str(raw_label).lower().strip()

    # Map to normalized category
    if any(variation in raw_str for variation in label_variations['no fit']):
        normalized = 'No Fit'
    elif any(variation in raw_str for variation in label_variations['potential fit']):
        normalized = 'Potential Fit'
    elif any(variation in raw_str for variation in label_variations['good fit']):
        normalized = 'Good Fit'
    else:
        # Unknown label, default to 'No Fit'
        normalized = 'No Fit'
        print(f"‚ö†Ô∏è  Unknown label format in train split: '{raw_label}' ‚Üí mapped to 'No Fit'")

    label_mapping[raw_label] = normalized

# Create numeric mapping for model training
numeric_mapping = {
    'No Fit': 0,
    'Potential Fit': 1,
    'Good Fit': 2
}

# Also create reverse mapping
reverse_mapping = {v: k for k, v in numeric_mapping.items()}

print(f"\nLabel mapping created:")
for raw_label, normalized in label_mapping.items():
    numeric = numeric_mapping[normalized]
    print(f"  '{raw_label}' ‚Üí {normalized} ‚Üí {numeric}")

# Apply mapping to count normalized labels in each split
for split_name in ['train', 'test']:
    if split_name in dataset:
        labels = dataset[split_name][label_col]
        normalized_labels = [label_mapping.get(l, 'No Fit') for l in labels]

        from collections import Counter
        normalized_counts = Counter(normalized_labels)
        total = len(labels)

        print(f"\n{split_name.upper()} split - Normalized distribution:")
        for label in ['No Fit', 'Potential Fit', 'Good Fit']:
            count = normalized_counts.get(label, 0)
            percentage = (count / total) * 100
            print(f"  {label}: {count} samples ({percentage:.1f}%)")

# Text statistics - sample for efficiency
print("\n" + "=" * 70)
print("TEXT STATISTICS (Sampling 1000 records from train)")
print("=" * 70)

# Sample for efficiency with large datasets
sample_size = min(1000, len(dataset['train']))
sampled_resumes = dataset['train'].select(range(sample_size))[resume_col]
sampled_jds = dataset['train'].select(range(sample_size))[jd_col]

# Calculate text lengths on sample
resume_lengths = [len(str(text)) for text in sampled_resumes]
jd_lengths = [len(str(text)) for text in sampled_jds]

print(f"\nBased on sample of {sample_size} records from train split:")
print(f"Resume text statistics:")
print(f"  Average length: {np.mean(resume_lengths):.0f} characters")
print(f"  Min length: {np.min(resume_lengths)} characters")
print(f"  Max length: {np.max(resume_lengths)} characters")

print(f"\nJob description text statistics:")
print(f"  Average length: {np.mean(jd_lengths):.0f} characters")
print(f"  Min length: {np.min(jd_lengths)} characters")
print(f"  Max length: {np.max(jd_lengths)} characters")

# Estimate token counts (approximate)
avg_tokens_resume = np.mean(resume_lengths) / 4  # Rough estimate: 4 chars per token
avg_tokens_jd = np.mean(jd_lengths) / 4

print(f"\nEstimated token counts (approx):")
print(f"  Average resume tokens: {avg_tokens_resume:.0f}")
print(f"  Average JD tokens: {avg_tokens_jd:.0f}")
print(f"  Total combined (resume + JD): {avg_tokens_resume + avg_tokens_jd:.0f}")

# Create final dataset splits: use existing train/test, create validation from train
print("\n" + "=" * 70)
print("CREATING FINAL DATASET SPLITS")
print("=" * 70)

print("Using existing train and test splits from dataset")
print("Creating validation split from train data (15% of train)")

# Function to add numeric labels
def add_numeric_labels(example):
    raw_label = example[label_col]
    normalized = label_mapping.get(raw_label, 'No Fit')
    example['numeric_label'] = numeric_mapping[normalized]
    return example

# Add numeric labels to train and test
train_with_numeric = dataset['train'].map(add_numeric_labels)
test_with_numeric = dataset['test'].map(add_numeric_labels)

# Create validation split from train (15% of train data)
if len(train_with_numeric) >= 100:
    try:
        # Create stratified validation split
        train_val_split = train_with_numeric.train_test_split(
            test_size=0.15,  # 15% for validation
            seed=seed,
            stratify=train_with_numeric['numeric_label']
        )

        final_train = train_val_split['train']
        validation = train_val_split['test']

        print("‚úì Created stratified validation split from train")
        print(f"  Training: {len(final_train)} samples (85% of original train)")
        print(f"  Validation: {len(validation)} samples (15% of original train)")

    except Exception as e:
        print(f"Stratified split failed: {e}")
        # Fallback to random split
        train_val_split = train_with_numeric.train_test_split(
            test_size=0.15,
            seed=seed
        )

        final_train = train_val_split['train']
        validation = train_val_split['test']
        print("‚úì Created random validation split from train")
else:
    # Small dataset, use simpler split
    val_size = int(0.15 * len(train_with_numeric))

    final_train = train_with_numeric.select(range(len(train_with_numeric) - val_size))
    validation = train_with_numeric.select(range(len(train_with_numeric) - val_size, len(train_with_numeric)))

    print(f"‚úì Created simple validation split for small dataset")

# Create final dataset dictionary
dataset_dict = DatasetDict({
    'train': final_train,
    'validation': validation,
    'test': test_with_numeric
})

print(f"\nFinal dataset sizes:")
print(f"  Training set: {len(dataset_dict['train'])} samples")
print(f"  Validation set: {len(dataset_dict['validation'])} samples")
print(f"  Test set: {len(dataset_dict['test'])} samples")

# Display sample from each split
print("\n" + "=" * 70)
print("DATA SAMPLE PREVIEW")
print("=" * 70)

for split_name in ['train', 'validation', 'test']:
    if split_name in dataset_dict and len(dataset_dict[split_name]) > 0:
        sample = dataset_dict[split_name][0]
        raw_label = sample[label_col]
        numeric_label = sample['numeric_label']
        label_text = reverse_mapping.get(numeric_label, f"Unknown ({numeric_label})")

        print(f"\n{split_name.upper()} split sample:")
        print(f"  Raw label: '{raw_label}'")
        print(f"  Numeric label: {numeric_label} ({label_text})")

        resume_preview = str(sample[resume_col])[:100]
        print(f"  Resume preview: {resume_preview}...")

        jd_preview = str(sample[jd_col])[:100]
        print(f"  Job description preview: {jd_preview}...")

# Save dataset info
print("\n" + "=" * 70)
print("SAVING DATASET INFORMATION")
print("=" * 70)

dataset_info = {
    'dataset_name': 'cnamuangtoun/resume-job-description-fit',
    'original_split_sizes': {
        'train': len(dataset['train']),
        'test': len(dataset['test'])
    },
    'final_split_sizes': {
        'train': len(dataset_dict['train']),
        'validation': len(dataset_dict['validation']),
        'test': len(dataset_dict['test'])
    },
    'columns': {
        'resume': resume_col,
        'job_description': jd_col,
        'label': label_col
    },
    'label_mapping': label_mapping,
    'numeric_mapping': numeric_mapping,
    'reverse_mapping': reverse_mapping,
    'text_statistics': {
        'avg_resume_length': float(np.mean(resume_lengths)),
        'avg_jd_length': float(np.mean(jd_lengths)),
        'sample_size_for_stats': sample_size
    },
    'timestamp': datetime.now().isoformat(),
    'load_time_seconds': load_time
}

with open('dataset_info.json', 'w') as f:
    json.dump(dataset_info, f, indent=2)

print(f"‚úì Dataset info saved to dataset_info.json")

# Save dataset to disk for future use
save_path = "./resume_dataset"
dataset_dict.save_to_disk(save_path)
print(f"‚úì Dataset saved to {save_path}")

# Save label mappings separately
mappings = {
    'label_mapping': label_mapping,
    'numeric_mapping': numeric_mapping,
    'reverse_mapping': reverse_mapping
}

with open('label_mappings.json', 'w') as f:
    json.dump(mappings, f, indent=2)
print(f"‚úì Label mappings saved to label_mappings.json")

# Try to save to Google Drive if in Colab
try:
    from google.colab import drive
    drive.mount('/content/drive')

    drive_path = '/content/drive/MyDrive/resume_matcher/'
    os.makedirs(drive_path, exist_ok=True)

    # Save info to Drive
    drive_info_path = os.path.join(drive_path, 'dataset_info.json')
    with open(drive_info_path, 'w') as f:
        json.dump(dataset_info, f, indent=2)

    print(f"‚úì Dataset info also saved to Google Drive: {drive_info_path}")

except Exception as e:
    # Not in Colab or Drive not available
    pass

# Final summary
total_time = time.time() - start_time
print("\n" + "=" * 70)
print("STEP 1 COMPLETED")
print("=" * 70)
print(f"Total time: {total_time:.1f} seconds")
print(f"Total samples: {len(dataset_dict['train']) + len(dataset_dict['validation']) + len(dataset_dict['test'])}")
print(f"Training set: {len(dataset_dict['train'])} samples (from original train)")
print(f"Validation set: {len(dataset_dict['validation'])} samples (15% of original train)")
print(f"Test set: {len(dataset_dict['test'])} samples (original test split)")
print(f"Number of classes: 3")
print(f"  - 0: No Fit")
print(f"  - 1: Potential Fit")
print(f"  - 2: Good Fit")

print("=" * 70)

STEP 1: LOADING AND ANALYZING DATASET

Loading dataset: cnamuangtoun/resume-job-description-fit
Loading full dataset...
‚úì Successfully loaded dataset

Dataset loaded in 2.8 seconds

DATASET STRUCTURE VERIFICATION

TRAIN split:
  Samples: 6241
  Columns found: ['resume_text', 'job_description_text', 'label']
  ‚úì All expected columns present

TEST split:
  Samples: 1759
  Columns found: ['resume_text', 'job_description_text', 'label']
  ‚úì All expected columns present

Using column mapping:
  Resume: resume_text
  Job Description: job_description_text
  Label: label

LABEL DISTRIBUTION ANALYSIS

TRAIN split:
  Total samples: 6241
  First 3 label values:
    Sample 0: 'No Fit' (type: str)
    Sample 1: 'No Fit' (type: str)
    Sample 2: 'No Fit' (type: str)
  Raw label distribution:
    'Good Fit': 1542 samples (24.7%)
    'No Fit': 3143 samples (50.4%)
    'Potential Fit': 1556 samples (24.9%)

TEST split:
  Total samples: 1759
  First 3 label values:
    Sample 0: 'No Fit' (type: s

Saving the dataset (0/1 shards):   0%|          | 0/5304 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/937 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1759 [00:00<?, ? examples/s]

‚úì Dataset saved to ./resume_dataset
‚úì Label mappings saved to label_mappings.json
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úì Dataset info also saved to Google Drive: /content/drive/MyDrive/resume_matcher/dataset_info.json

STEP 1 COMPLETED
Total time: 8.0 seconds
Total samples: 8000
Training set: 5304 samples (from original train)
Validation set: 937 samples (15% of original train)
Test set: 1759 samples (original test split)
Number of classes: 3
  - 0: No Fit
  - 1: Potential Fit
  - 2: Good Fit


In [4]:
# ============================================================================
# STEP 2: ACCURACY-BASED MODEL SELECTION WITH BIGBIRD - FIXED VERSION
# ============================================================================

print("=" * 70)
print("STEP 2: ACCURACY-BASED MODEL SELECTION")
print("=" * 70)


import time
import numpy as np
from sklearn.metrics import accuracy_score
import torch

start_time = time.time()

# Load label mappings
print("\nLoading label mappings...")
try:
    with open('label_mappings.json', 'r') as f:
        label_mappings = json.load(f)

    numeric_mapping = label_mappings['numeric_mapping']
    reverse_mapping = label_mappings['reverse_mapping']

    print("‚úì Label mappings loaded")
    print(f"  Classes: {len(numeric_mapping)} ({', '.join(numeric_mapping.keys())})")

except Exception as e:
    print(f"‚úó Error: {e}")
    numeric_mapping = {"No Fit": 0, "Potential Fit": 1, "Good Fit": 2}
    reverse_mapping = {0: "No Fit", 1: "Potential Fit", 2: "Good Fit"}

# Check transformers version for compatibility
import transformers
print(f"\nTransformers version: {transformers.__version__}")

# 3 MODELS INCLUDING BIGBIRD
MODELS_TO_TEST = [
    {
        "name": "google/bigbird-roberta-base",
        "description": "BigBird - specialized for long sequences",
        "max_length": 512,
        "batch_size": 4,
        "epochs": 1,
        "learning_rate": 2e-5,
        "attention_type": "original_full"
    },
    {
        "name": "microsoft/deberta-v3-base",
        "description": "DeBERTa v3 - state-of-the-art accuracy",
        "max_length": 256,
        "batch_size": 8,  # Reduced from 16 for memory
        "epochs": 1,
        "learning_rate": 3e-5
    },
    {
        "name": "FacebookAI/roberta-base",
        "description": "RoBERTa - reliable baseline",
        "max_length": 256,
        "batch_size": 8,  # Reduced from 16 for consistency
        "epochs": 1,
        "learning_rate": 3e-5
    }
]

print(f"\nTesting {len(MODELS_TO_TEST)} models:")
for i, model_info in enumerate(MODELS_TO_TEST, 1):
    print(f"\n{i}. {model_info['name']}")
    print(f"   {model_info['description']}")
    print(f"   Config: {model_info['epochs']} epoch, batch={model_info['batch_size']}")

# Prepare dataset
print("\n" + "=" * 70)
print("PREPARING TEST DATASET")
print("=" * 70)

TRAIN_SAMPLES = 60  # Further reduced for speed
VAL_SAMPLES = 15    # Further reduced for speed

print(f"Using small dataset:")
print(f"  Training: {TRAIN_SAMPLES} samples")
print(f"  Validation: {VAL_SAMPLES} samples")

# Simple sampling (no stratification for maximum speed)
train_sample = dataset_dict['train'].select(range(min(TRAIN_SAMPLES, len(dataset_dict['train']))))
val_sample = dataset_dict['validation'].select(range(min(VAL_SAMPLES, len(dataset_dict['validation']))))

# Prepare texts - FIXED VERSION
def prepare_texts_simple(examples):
    """Simple text preparation"""
    texts = []
    labels = []

    for i in range(len(examples[resume_col])):
        resume = str(examples[resume_col][i])[:300]  # Short truncation
        jd = str(examples[jd_col][i])[:200]
        texts.append(f"Resume: {resume} Job: {jd}")
        labels.append(int(examples['numeric_label'][i]))  # Ensure integer

    return {'text': texts, 'labels': labels}

test_train = train_sample.map(prepare_texts_simple, batched=True)
test_val = val_sample.map(prepare_texts_simple, batched=True)

print(f"\n‚úì Dataset prepared")
print(f"  Training samples: {len(test_train)}")
print(f"  Validation samples: {len(test_val)}")

# Store accuracy results
accuracy_results = []

print("\n" + "=" * 70)
print("RUNNING ACCURACY TESTS")
print("=" * 70)

for model_info in MODELS_TO_TEST:
    model_name = model_info['name']

    print(f"\n{'='*70}")
    print(f"TESTING: {model_name.split('/')[-1].upper()}")
    print(f"{'='*70}")

    test_start = time.time()

    try:
        # 1. Load model
        print(f"1. Loading model...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Special handling for BigBird
        if "bigbird" in model_name.lower():
            model = AutoModelForSequenceClassification.from_pretrained(
                model_name,
                num_labels=len(numeric_mapping),
                attention_type=model_info['attention_type']
            )
            print(f"   BigBird configured")
        else:
            model = AutoModelForSequenceClassification.from_pretrained(
                model_name,
                num_labels=len(numeric_mapping)
            )

        model.to(device)
        print(f"   ‚úì Loaded in {time.time()-test_start:.1f}s")

        # 2. Tokenize - FIXED: Don't include labels in tokenization
        print(f"2. Tokenizing...")
        def tokenize_function(examples):
            # Only tokenize text, not labels
            tokenized = tokenizer(
                examples['text'],
                padding="max_length",
                truncation=True,
                max_length=model_info['max_length'],
                return_tensors="pt"
            )
            return tokenized

        # Tokenize separately for train and val
        tokenized_train = test_train.map(tokenize_function, batched=True)
        tokenized_val = test_val.map(tokenize_function, batched=True)

        # Set format correctly - FIXED
        tokenized_train.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
        tokenized_val.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

        # 3. Training - SIMPLIFIED VERSION
        print(f"3. Quick training (1 epoch)...")

        # Use simple training arguments
        training_args = TrainingArguments(
            output_dir=f"./test_{model_name.replace('/', '_')}",
            eval_strategy="no",  # No eval during training for speed
            save_strategy="no",
            learning_rate=model_info['learning_rate'],
            per_device_train_batch_size=model_info['batch_size'],
            num_train_epochs=model_info['epochs'],
            weight_decay=0.01,
            logging_steps=5,
            save_total_limit=0,
            report_to="none",
            seed=seed,
            disable_tqdm=True,
            remove_unused_columns=True,
            optim="adamw_torch"
        )

        # Simple metrics function
        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.argmax(predictions, axis=1)
            return {"accuracy": accuracy_score(labels, predictions)}

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_val,  # Still provide for manual eval
            compute_metrics=compute_metrics,
        )

        # Train
        trainer.train()

        # Manual evaluation
        print(f"4. Evaluating...")
        eval_output = trainer.predict(tokenized_val)

        # Get predictions and labels
        predictions = np.argmax(eval_output.predictions, axis=1)
        labels = eval_output.label_ids

        # Calculate accuracy
        accuracy = accuracy_score(labels, predictions)

        test_time = time.time() - test_start

        # Store results
        result = {
            "model": model_name,
            "accuracy": float(accuracy),
            "test_time": test_time,
            "training_samples": len(tokenized_train),
            "validation_samples": len(tokenized_val),
            "status": "SUCCESS"
        }

        accuracy_results.append(result)

        print(f"\n‚úì Test completed in {test_time:.1f}s")
        print(f"  Accuracy: {accuracy:.4f}")

        # Clean memory
        del model, tokenizer, trainer
        torch.cuda.empty_cache() if torch.cuda.is_available() else None

    except Exception as e:
        print(f"\n‚úó Test failed: {str(e)}")
        print(f"Error type: {type(e).__name__}")

        result = {
            "model": model_name,
            "accuracy": 0,
            "test_time": 0,
            "training_samples": 0,
            "validation_samples": 0,
            "status": f"FAILED: {type(e).__name__}"
        }

        accuracy_results.append(result)

    print(f"{'='*70}")

print("\n" + "=" * 70)
print("ACCURACY RESULTS ANALYSIS")
print("=" * 70)

# Filter successful tests
successful_tests = [r for r in accuracy_results if r['status'] == 'SUCCESS']

if successful_tests:
    # Sort by accuracy
    successful_tests.sort(key=lambda x: x['accuracy'], reverse=True)

    print("\nüèÜ Model Ranking by Accuracy:")
    print("-" * 70)
    print(f"{'Rank':<6} {'Model':<25} {'Accuracy':<12} {'Time':<10}")
    print("-" * 70)

    for i, result in enumerate(successful_tests, 1):
        model_short = result['model'].split('/')[-1][:23]
        print(f"{i:<6} {model_short:<25} {result['accuracy']:.4f}      {result['test_time']:.1f}s")

    # Select best model
    best_model = successful_tests[0]

    print(f"\n{'='*70}")
    print(f"SELECTED BEST MODEL: {best_model['model']}")
    print(f"{'='*70}")
    print(f"Accuracy: {best_model['accuracy']:.4f}")
    print(f"Test Time: {best_model['test_time']:.1f} seconds")

else:
    print("‚ùå No successful tests!")
    print("Using research-based selection...")

    # Research-based fallback
    RESEARCH_SCORES = {
        "microsoft/deberta-v3-base": 0.85,  # Based on GLUE benchmark
        "roberta-base": 0.82,
        "google/bigbird-roberta-base": 0.80  # Lower due to memory issues
    }

    # Select based on research
    best_research_model = max(RESEARCH_SCORES.items(), key=lambda x: x[1])[0]
    best_model = {"model": best_research_model, "accuracy": RESEARCH_SCORES[best_research_model]}

    print(f"Research selection: {best_model['model']}")
    print(f"Expected accuracy: {best_model['accuracy']:.4f} (based on benchmarks)")

# Save results
accuracy_report = {
    "selected_model": best_model['model'],
    "selected_accuracy": best_model['accuracy'],
    "selection_timestamp": datetime.now().isoformat(),
    "selection_method": "accuracy_test" if successful_tests else "research_based",
    "test_conditions": {
        "train_samples": TRAIN_SAMPLES,
        "val_samples": VAL_SAMPLES,
        "epochs": 1
    },
    "all_results": accuracy_results
}

with open('model_selection_results.json', 'w') as f:
    json.dump(accuracy_report, f, indent=2)

print(f"\n‚úì Results saved to model_selection_results.json")

total_time = time.time() - start_time
print(f"\nTotal Step 2 time: {total_time:.1f} seconds")

print("\n" + "=" * 70)
print("STEP 2 COMPLETED")
print("=" * 70)
print(f"Selected: {best_model['model']}")
print(f"Accuracy: {best_model['accuracy']:.4f}")

print("=" * 70)

STEP 2: ACCURACY-BASED MODEL SELECTION

Loading label mappings...
‚úì Label mappings loaded
  Classes: 3 (No Fit, Potential Fit, Good Fit)

Transformers version: 4.57.3

Testing 3 models:

1. google/bigbird-roberta-base
   BigBird - specialized for long sequences
   Config: 1 epoch, batch=4

2. microsoft/deberta-v3-base
   DeBERTa v3 - state-of-the-art accuracy
   Config: 1 epoch, batch=8

3. FacebookAI/roberta-base
   RoBERTa - reliable baseline
   Config: 1 epoch, batch=8

PREPARING TEST DATASET
Using small dataset:
  Training: 60 samples
  Validation: 15 samples

‚úì Dataset prepared
  Training samples: 60
  Validation samples: 15

RUNNING ACCURACY TESTS

TESTING: BIGBIRD-ROBERTA-BASE
1. Loading model...


Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


   BigBird configured
   ‚úì Loaded in 8.3s
2. Tokenizing...
3. Quick training (1 epoch)...
{'loss': 1.0791, 'grad_norm': 100.4610824584961, 'learning_rate': 1.4666666666666666e-05, 'epoch': 0.3333333333333333}
{'loss': 1.235, 'grad_norm': 2.9946279525756836, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.6666666666666666}
{'loss': 1.2386, 'grad_norm': 7.916182518005371, 'learning_rate': 1.3333333333333334e-06, 'epoch': 1.0}
{'train_runtime': 691.6992, 'train_samples_per_second': 0.087, 'train_steps_per_second': 0.022, 'train_loss': 1.1842450141906737, 'epoch': 1.0}
4. Evaluating...

‚úì Test completed in 734.0s
  Accuracy: 0.2667

TESTING: DEBERTA-V3-BASE
1. Loading model...


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


   ‚úì Loaded in 4.8s
2. Tokenizing...
3. Quick training (1 epoch)...
{'loss': 1.1582, 'grad_norm': 2.389218330383301, 'learning_rate': 1.5e-05, 'epoch': 0.625}
{'train_runtime': 277.3343, 'train_samples_per_second': 0.216, 'train_steps_per_second': 0.029, 'train_loss': 1.1698918342590332, 'epoch': 1.0}
4. Evaluating...

‚úì Test completed in 301.2s
  Accuracy: 0.2667

TESTING: ROBERTA-BASE
1. Loading model...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


   ‚úì Loaded in 1.2s
2. Tokenizing...
3. Quick training (1 epoch)...
{'loss': 1.1263, 'grad_norm': 2.0946102142333984, 'learning_rate': 1.5e-05, 'epoch': 0.625}
{'train_runtime': 195.5724, 'train_samples_per_second': 0.307, 'train_steps_per_second': 0.041, 'train_loss': 1.1114977598190308, 'epoch': 1.0}
4. Evaluating...

‚úì Test completed in 210.9s
  Accuracy: 0.4000

ACCURACY RESULTS ANALYSIS

üèÜ Model Ranking by Accuracy:
----------------------------------------------------------------------
Rank   Model                     Accuracy     Time      
----------------------------------------------------------------------
1      roberta-base              0.4000      210.9s
2      bigbird-roberta-base      0.2667      734.0s
3      deberta-v3-base           0.2667      301.2s

SELECTED BEST MODEL: FacebookAI/roberta-base
Accuracy: 0.4000
Test Time: 210.9 seconds

‚úì Results saved to model_selection_results.json

Total Step 2 time: 1246.9 seconds

STEP 2 COMPLETED
Selected: FacebookAI/

In [5]:
# ============================================================================
# STEP 3: LOAD SELECTED MODEL AND TOKENIZER
# ============================================================================

print("=" * 70)
print("STEP 3: LOADING SELECTED MODEL AND TOKENIZER")
print("=" * 70)

# Load model selection results
print("\nLoading model selection results from Step 2...")
try:
    with open('model_selection_results.json', 'r') as f:
        selection_data = json.load(f)

    SELECTED_MODEL = selection_data['selected_model']
    SELECTED_ACCURACY = selection_data['selected_accuracy']
    SELECTION_METHOD = selection_data['selection_method']

    print(f"‚úì Model selection loaded successfully")
    print(f"  Selected model: {SELECTED_MODEL}")
    print(f"  Test accuracy: {SELECTED_ACCURACY:.4f}")
    print(f"  Selection method: {SELECTION_METHOD}")



except Exception as e:
    print(f"‚úó Error loading selection results: {e}")
    print("\nUsing default model: microsoft/deberta-v3-base")
    SELECTED_MODEL = "microsoft/deberta-v3-base"
    SELECTED_ACCURACY = 0.0
    SELECTION_METHOD = "default_fallback"

print("\n" + "=" * 70)
print(f"LOADING: {SELECTED_MODEL}")
print("=" * 70)

# Model configuration based on selected model
MODEL_CONFIGS = {
    "google/bigbird-roberta-base": {
        "max_length": 1024,
        "attention_type": "original_full",
        "batch_size": 4,
        "description": "BigBird for long sequences (up to 4096 tokens)"
    },
    "microsoft/deberta-v3-base": {
        "max_length": 512,
        "batch_size": 8,
        "description": "DeBERTa v3 for high accuracy classification"
    },
    "roberta-base": {
        "max_length": 512,
        "batch_size": 8,
        "description": "RoBERTa for reliable performance"
    }
}

# Get configuration for selected model or use defaults
if SELECTED_MODEL in MODEL_CONFIGS:
    model_config = MODEL_CONFIGS[SELECTED_MODEL]
    print(f"Using predefined configuration for {SELECTED_MODEL}")
else:
    # Default configuration
    model_config = {
        "max_length": 512,
        "batch_size": 8,
        "description": "Standard transformer model"
    }
    print(f"Using default configuration for {SELECTED_MODEL}")

MAX_LENGTH = model_config["max_length"]
print(f"\nModel configuration:")
print(f"  Max sequence length: {MAX_LENGTH} tokens")
print(f"  Description: {model_config['description']}")
if "attention_type" in model_config:
    print(f"  Special config: {model_config['attention_type']}")

# Load tokenizer and model
print("\n" + "=" * 70)
print("LOADING TOKENIZER AND MODEL")
print("=" * 70)

try:
    # 1. Load tokenizer
    print("1. Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(SELECTED_MODEL)

    print(f"   ‚úì Tokenizer: {tokenizer.name_or_path}")
    print(f"   Vocabulary size: {tokenizer.vocab_size:,}")
    print(f"   Max model length: {tokenizer.model_max_length}")
    print(f"   Special tokens: {list(tokenizer.special_tokens_map.values())}")

    # 2. Load model with appropriate configuration
    print("\n2. Loading model for sequence classification...")

    # Prepare model loading arguments
    model_kwargs = {
        "num_labels": len(numeric_mapping),
        "ignore_mismatched_sizes": True
    }

    # Add special configurations if needed
    if SELECTED_MODEL == "google/bigbird-roberta-base":
        model_kwargs["attention_type"] = "original_full"
        print(f"   Adding BigBird configuration: attention_type='original_full'")

    model = AutoModelForSequenceClassification.from_pretrained(
        SELECTED_MODEL,
        **model_kwargs
    )

    print(f"   ‚úì Model loaded successfully")
    print(f"   Model type: {model.config.model_type}")
    print(f"   Number of labels: {model.config.num_labels}")

    # Show label mapping
    if hasattr(model.config, 'id2label'):
        print(f"   Label mapping:")
        for i in range(len(numeric_mapping)):
            label_name = reverse_mapping.get(i, f"Class {i}")
            print(f"     {i} ‚Üí {label_name}")

    # 3. Move model to device
    print(f"\n3. Moving model to device...")
    model.to(device)
    print(f"   Model moved to: {device}")

    # Display GPU memory info if available
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
        allocated = torch.cuda.memory_allocated() / 1e9
        reserved = torch.cuda.memory_reserved() / 1e9

        print(f"   GPU: {gpu_name}")
        print(f"   Total memory: {total_memory:.1f} GB")
        print(f"   Allocated: {allocated:.2f} GB")
        print(f"   Reserved: {reserved:.2f} GB")

    # 4. Model architecture analysis
    print(f"\n4. Model architecture summary:")

    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f"   Total parameters: {total_params:,}")
    print(f"   Trainable parameters: {trainable_params:,}")
    print(f"   Non-trainable parameters: {total_params - trainable_params:,}")
    print(f"   Trainable percentage: {(trainable_params/total_params*100):.1f}%")

    # Model-specific details
    if hasattr(model.config, 'hidden_size'):
        print(f"   Hidden size: {model.config.hidden_size}")
    if hasattr(model.config, 'num_hidden_layers'):
        print(f"   Hidden layers: {model.config.num_hidden_layers}")
    if hasattr(model.config, 'num_attention_heads'):
        print(f"   Attention heads: {model.config.num_attention_heads}")
    if hasattr(model.config, 'intermediate_size'):
        print(f"   Feed-forward size: {model.config.intermediate_size}")

except Exception as e:
    print(f"\n‚ùå Error loading {SELECTED_MODEL}: {e}")

    # Try alternative model if primary fails
    print(f"\nTrying alternative model: roberta-base")
    try:
        SELECTED_MODEL = "roberta-base"
        tokenizer = AutoTokenizer.from_pretrained(SELECTED_MODEL)
        model = AutoModelForSequenceClassification.from_pretrained(
            SELECTED_MODEL,
            num_labels=len(numeric_mapping)
        )
        model.to(device)
        MAX_LENGTH = 512
        print(f"‚úì Alternative model loaded: {SELECTED_MODEL}")
    except Exception as e2:
        print(f"‚ùå Alternative also failed: {e2}")
        raise SystemExit("Could not load any model. Please check your setup.")

# Prepare dataset for full training
print("\n" + "=" * 70)
print("PREPARING FULL DATASET FOR TRAINING")
print("=" * 70)

def prepare_full_dataset(examples):
    """Prepare full dataset with proper text formatting"""
    texts = []

    resumes = examples[resume_col]
    job_descs = examples[jd_col]

    for resume, jd in zip(resumes, job_descs):
        # Clean texts
        resume_clean = str(resume).strip()
        jd_clean = str(jd).strip()

        # Model-specific formatting
        if "roberta" in SELECTED_MODEL.lower() or "bigbird" in SELECTED_MODEL.lower():
            # RoBERTa/BigBird format
            combined = f"<s>Resume: {resume_clean}</s></s>Job: {jd_clean}</s>"
        elif "deberta" in SELECTED_MODEL.lower():
            # DeBERTa format
            combined = f"[CLS]Resume: {resume_clean}[SEP]Job: {jd_clean}[SEP]"
        elif "bert" in SELECTED_MODEL.lower():
            # BERT format
            combined = f"[CLS]Resume: {resume_clean} [SEP] Job: {jd_clean} [SEP]"
        else:
            # Generic format
            combined = f"Resume: {resume_clean} Job Description: {jd_clean}"

        texts.append(combined)

    return {'text': texts}

print("Preparing texts for all splits...")

# Apply to all splits
full_dataset = DatasetDict({
    'train': dataset_dict['train'].map(prepare_full_dataset, batched=True),
    'validation': dataset_dict['validation'].map(prepare_full_dataset, batched=True),
    'test': dataset_dict['test'].map(prepare_full_dataset, batched=True)
})

print(f"‚úì Texts prepared")
print(f"  Training samples: {len(full_dataset['train'])}")
print(f"  Validation samples: {len(full_dataset['validation'])}")
print(f"  Test samples: {len(full_dataset['test'])}")

# Tokenization function
print(f"\nTokenizing dataset (max_length={MAX_LENGTH})...")

def tokenize_full_dataset(examples):
    """Tokenize the full dataset"""
    return tokenizer(
        examples['text'],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )

tokenized_full = DatasetDict({
    'train': full_dataset['train'].map(tokenize_full_dataset, batched=True),
    'validation': full_dataset['validation'].map(tokenize_full_dataset, batched=True),
    'test': full_dataset['test'].map(tokenize_full_dataset, batched=True)
})

print("‚úì Tokenization completed")

# Prepare for Trainer API
print("\nPreparing datasets for training...")

def prepare_training_labels(examples):
    """Convert numeric_label to labels format expected by Trainer"""
    return {'labels': examples['numeric_label']}

# Apply to all splits
for split in ['train', 'validation', 'test']:
    tokenized_full[split] = tokenized_full[split].map(prepare_training_labels, batched=True)

# Remove unnecessary columns
columns_to_keep = ['input_ids', 'attention_mask', 'labels']
columns_to_remove = [col for col in tokenized_full['train'].column_names if col not in columns_to_keep]

if columns_to_remove:
    tokenized_full = tokenized_full.remove_columns(columns_to_remove)
    print(f"  Removed columns: {columns_to_remove}")

# Set format for PyTorch
tokenized_full.set_format('torch', columns=columns_to_keep)

print(f"‚úì Datasets prepared for training")
print(f"  Final columns: {tokenized_full['train'].column_names}")
print(f"  Training size: {len(tokenized_full['train'])}")
print(f"  Validation size: {len(tokenized_full['validation'])}")
print(f"  Test size: {len(tokenized_full['test'])}")

# Save model configuration
print("\n" + "=" * 70)
print("SAVING MODEL CONFIGURATION")
print("=" * 70)

model_configuration = {
    'selected_model': SELECTED_MODEL,
    'selection_accuracy': SELECTED_ACCURACY,
    'selection_method': SELECTION_METHOD,
    'tokenizer': tokenizer.name_or_path,
    'max_sequence_length': MAX_LENGTH,
    'num_labels': len(numeric_mapping),
    'label_mapping': numeric_mapping,
    'reverse_mapping': reverse_mapping,
    'dataset_sizes': {
        'train': len(tokenized_full['train']),
        'validation': len(tokenized_full['validation']),
        'test': len(tokenized_full['test'])
    },
    'model_architecture': {
        'total_params': sum(p.numel() for p in model.parameters()),
        'trainable_params': sum(p.numel() for p in model.parameters() if p.requires_grad),
        'hidden_size': model.config.hidden_size if hasattr(model.config, 'hidden_size') else None,
        'num_layers': model.config.num_hidden_layers if hasattr(model.config, 'num_hidden_layers') else None
    },
    'device': str(device),
    'training_ready': True,
    'timestamp': datetime.now().isoformat()
}

with open('model_configuration.json', 'w') as f:
    json.dump(model_configuration, f, indent=2)

print(f"‚úì Model configuration saved to model_configuration.json")

# Save tokenizer for future use
tokenizer.save_pretrained("./saved_tokenizer")
print(f"‚úì Tokenizer saved to ./saved_tokenizer")

# Save a small sample for verification
sample_data = {
    'sample_input': {
        'text': full_dataset['train'][0]['text'][:200] + "..." if len(full_dataset['train'][0]['text']) > 200 else full_dataset['train'][0]['text'],
        'tokenized_length': len(tokenized_full['train'][0]['input_ids']),
        'label': int(tokenized_full['train'][0]['labels'])
    },
    'model_info': {
        'name': SELECTED_MODEL,
        'max_length': MAX_LENGTH
    }
}

with open('training_sample.json', 'w') as f:
    json.dump(sample_data, f, indent=2)

print(f"‚úì Training sample saved for verification")

print("\n" + "=" * 70)
print("STEP 3 COMPLETED SUCCESSFULLY")
print("=" * 70)
print(f"‚úÖ Model: {SELECTED_MODEL}")
print(f"üìè Max sequence length: {MAX_LENGTH}")
print(f"üè∑Ô∏è  Number of classes: {len(numeric_mapping)}")
print(f"‚öôÔ∏è  Device: {device}")
print(f"\nüìä Dataset statistics:")
print(f"   Training samples: {len(tokenized_full['train'])}")
print(f"   Validation samples: {len(tokenized_full['validation'])}")
print(f"   Test samples: {len(tokenized_full['test'])}")
print(f"   Total samples: {len(tokenized_full['train']) + len(tokenized_full['validation']) + len(tokenized_full['test'])}")
print(f"\nüéØ Expected performance: {SELECTED_ACCURACY:.4f} (from Step 2 test)")
print(f"üìà Projected after full training: {(SELECTED_ACCURACY + 0.15):.3f}-{(SELECTED_ACCURACY + 0.20):.3f}")

print("=" * 70)

STEP 3: LOADING SELECTED MODEL AND TOKENIZER

Loading model selection results from Step 2...
‚úì Model selection loaded successfully
  Selected model: FacebookAI/roberta-base
  Test accuracy: 0.4000
  Selection method: accuracy_test

LOADING: FacebookAI/roberta-base
Using default configuration for FacebookAI/roberta-base

Model configuration:
  Max sequence length: 512 tokens
  Description: Standard transformer model

LOADING TOKENIZER AND MODEL
1. Loading tokenizer...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


   ‚úì Tokenizer: FacebookAI/roberta-base
   Vocabulary size: 50,265
   Max model length: 512
   Special tokens: ['<s>', '</s>', '<unk>', '</s>', '<pad>', '<s>', '<mask>']

2. Loading model for sequence classification...
   ‚úì Model loaded successfully
   Model type: roberta
   Number of labels: 3
   Label mapping:
     0 ‚Üí Class 0
     1 ‚Üí Class 1
     2 ‚Üí Class 2

3. Moving model to device...
   Model moved to: cpu

4. Model architecture summary:
   Total parameters: 124,647,939
   Trainable parameters: 124,647,939
   Non-trainable parameters: 0
   Trainable percentage: 100.0%
   Hidden size: 768
   Hidden layers: 12
   Attention heads: 12
   Feed-forward size: 3072

PREPARING FULL DATASET FOR TRAINING
Preparing texts for all splits...
‚úì Texts prepared
  Training samples: 5304
  Validation samples: 937
  Test samples: 1759

Tokenizing dataset (max_length=512)...
‚úì Tokenization completed

Preparing datasets for training...
  Removed columns: ['resume_text', 'job_description

In [None]:
# ============================================================================
# STEP 4: MODEL FINE-TUNING (Train + Test only, fixed labels)
# ============================================================================

from transformers import AutoTokenizer, TrainingArguments, Trainer
from evaluate import load
import numpy as np

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(SELECTED_MODEL)

# Tokenization function
def tokenize_function(examples):
    texts = [r + " [SEP] " + j for r, j in zip(examples[resume_col], examples[jd_col])]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)

# Tokenize train and test
tokenized_datasets = {}
tokenized_datasets["train"] = dataset_dict["train"].map(tokenize_function, batched=True)
tokenized_datasets["test"] = dataset_dict["test"].map(tokenize_function, batched=True)

# Rename numeric_label ‚Üí labels
def rename_labels(example):
    example["labels"] = example["numeric_label"]
    return example

tokenized_datasets["train"] = tokenized_datasets["train"].map(rename_labels)
tokenized_datasets["test"] = tokenized_datasets["test"].map(rename_labels)

# Set format
for split in ["train", "test"]:
    tokenized_datasets[split].set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "labels"]
    )

# Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metric = load("accuracy")
    return metric.compute(predictions=predictions, references=labels)

# Training args
training_args = TrainingArguments(
    output_dir="./finetuned_resume_model",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

# Train
print("\nStarting fine-tuning...")
trainer.train()

# Evaluate
print("\nEvaluating on test set...")
test_results = trainer.evaluate(tokenized_datasets["test"])
print(test_results)

# Save
trainer.save_model("./finetuned_resume_model")
tokenizer.save_pretrained("./finetuned_resume_model")

print("\n‚úì Fine-tuning completed and model saved to ./finetuned_resume_model")


Map:   0%|          | 0/5304 [00:00<?, ? examples/s]

Map:   0%|          | 0/1759 [00:00<?, ? examples/s]

Map:   0%|          | 0/5304 [00:00<?, ? examples/s]

Map:   0%|          | 0/1759 [00:00<?, ? examples/s]


Starting fine-tuning...
