In [None]:
# Install the packages this notebook imports, using the %pip magic.
# NOTE(review): no versions are pinned, so a fresh environment may resolve to
# different releases than the ones this notebook was developed with — consider pinning.
%pip install torch transformers accelerate datasets evaluate numpy pandas jupyter scikit-learn

# Imports


In [4]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import evaluate
import numpy as np
import torch
from sklearn.metrics import hamming_loss, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score, jaccard_score

# Load Data

In [5]:
# Read the processed parquet file into a DataFrame (path is relative to the notebook's working directory).
df = pd.read_parquet('../data/processed/cleaned_classified_data.parquet')

In [None]:
# Keep only the three columns the classifier uses, taken by position (0, 1 and 9)
# from the loaded frame, and rename them to url / title / category.
# The guard keeps this cell safe to re-run: once `df` has been trimmed it only has
# three columns, so selecting position 9 a second time would raise an IndexError.
selected_columns = ['url', 'title', 'category']
if list(df.columns) != selected_columns:
    df = df.iloc[:, [0, 1, 9]].copy()
    df.columns = selected_columns

dataset = Dataset.from_pandas(df)

# Hold out 10% of the rows for validation; the fixed seed makes the split reproducible.
split_dataset = dataset.train_test_split(test_size=0.1, seed=1)
dataset_dict = DatasetDict({
    'train': split_dataset['train'],
    'validation': split_dataset['test']
})
print(f"Train size: {len(dataset_dict['train'])}, Validation size: {len(dataset_dict['validation'])}")

# Load Pre-trained Model

In [None]:
# AutoTokenizer and AutoModelForSequenceClassification are already imported in the
# imports cell at the top of the notebook, so they are not re-imported here.

# Base checkpoint to fine-tune (an embedding-oriented ModernBERT checkpoint).
model_path = "Alibaba-NLP/gte-modernbert-base"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Define the labels. The position of a label in this list is its index in the
# multi-hot target vector and in the model's output logits.
labels = [
    "News", "Entertainment", "Shop", "Chat", "Education",
    "Government", "Health", "Technology", "Work", "Travel", "Uncategorized"
]
num_labels = len(labels)
id2label = dict(enumerate(labels))
label2id = {label: index for index, label in id2label.items()}

# Sequence-classification model configured for multi-label classification,
# with one output per entry of `labels`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification"
)


# Trainable Parameters

In [None]:
# Freeze all base model parameters initially. Parameters outside
# `model.base_model` are not touched by this cell.
for param in model.base_model.parameters():
    param.requires_grad = False

# Read the encoder depth from the model instead of hard-coding it, so this cell
# keeps working if `model_path` is pointed at a checkpoint with a different depth
# (the ModernBERT base checkpoint used here has 22 layers).
num_layers = len(model.base_model.layers)
num_unfrozen_layers = 4

# Unfreeze the last `num_unfrozen_layers` encoder layers
first_unfrozen = max(num_layers - num_unfrozen_layers, 0)
for layer in model.base_model.layers[first_unfrozen:]:
    for param in layer.parameters():
        param.requires_grad = True

# Unfreeze the final_norm layer
for param in model.base_model.final_norm.parameters():
    param.requires_grad = True

# Data Pre-processing

In [None]:
def _parse_categories(category):
    """Normalize one raw `category` value into a set of clean label names.

    Accepts a comma-separated string (optionally wrapped in brackets, e.g.
    "[News, Shop]" or "['News', 'Shop']") or a list/tuple of names. Any other
    value (e.g. None) yields an empty set.
    """
    if isinstance(category, str):
        # Remove surrounding brackets (if any) then split on comma
        raw_items = category.strip().strip("[]").split(',')
    elif isinstance(category, (list, tuple)):
        raw_items = category
    else:
        return set()
    # Strip whitespace and quote characters so that stringified lists such as
    # "['News', 'Shop']" still match the plain names in `labels`; without this
    # every such row would silently get an all-zero target vector.
    names = {str(item).strip().strip("'\"").strip() for item in raw_items if item is not None}
    names.discard("")
    return names


def preprocess_function(examples):
    """Tokenize a batch and attach multi-hot label vectors.

    Args:
        examples: batch mapping with 'url', 'title' and 'category' columns
            (as passed by `Dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output for the "url:title" texts (truncated to 512 tokens)
        with an added 'labels' entry: one list of 0.0/1.0 floats per example,
        ordered like the global `labels` list.
    """
    texts = [f"{url}:{title}" for url, title in zip(examples['url'], examples['title'])]
    tokenized_inputs = tokenizer(texts, truncation=True, max_length=512)

    label_vectors = []
    for category in examples['category']:
        cats = _parse_categories(category)
        label_vectors.append([1.0 if label in cats else 0.0 for label in labels])

    tokenized_inputs['labels'] = label_vectors
    return tokenized_inputs

tokenized_data = dataset_dict.map(preprocess_function, batched=True)



In [19]:
# Collator built from the tokenizer; it is passed to the Trainer(s) below to assemble batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define Evaluation Metrics

In [20]:
# NOTE(review): these two `evaluate` metric handles are not referenced by any code
# visible in this notebook (compute_metrics below calls scikit-learn directly) —
# confirm nothing else needs them before removing.
f1 = evaluate.load("f1")
roc_auc = evaluate.load("roc_auc", "multilabel")
# (The function below was generated by an LLM.)
def compute_metrics(eval_pred):
    """Compute multi-label classification metrics from logits and multi-hot targets.

    Args:
        eval_pred: pair ``(logits, true_labels)``. Both are used as 2-D arrays
            with one row per example and one column per entry of the global
            ``labels`` list.

    Returns:
        dict mapping metric name to a value rounded to 3 decimals: Hamming loss,
        exact-match ratio, micro/macro/weighted precision, recall and F1,
        micro/macro ROC-AUC and PR-AUC, micro/macro Jaccard, plus one
        ``f1_<label>`` entry per label.
    """
    # Unpack evaluation predictions
    logits, true_labels = eval_pred
    # Convert logits to probabilities using sigmoid
    probabilities = 1 / (1 + np.exp(-logits))
    # Threshold probabilities at 0.5 to get binary predictions
    predictions = (probabilities >= 0.5).astype(np.int32)
    true_labels_int = true_labels.astype(np.int32)

    metrics = {
        # Hamming Loss: fraction of labels incorrectly predicted
        "hamming_loss": round(hamming_loss(true_labels_int, predictions), 3),
        # Subset Accuracy (Exact Match Ratio)
        "exact_match": round(accuracy_score(true_labels_int, predictions), 3),
    }

    # Precision / recall / F1 under each averaging scheme.
    # All of them take the 2-D indicator matrices. Flattening to 1-D and asking
    # for average='micro' makes scikit-learn treat the input as a two-class
    # problem and average over both the 0 and the 1 class, which collapses
    # precision, recall and F1 to plain accuracy (i.e. 1 - Hamming loss).
    # zero_division=0 keeps the value scikit-learn already uses for labels with
    # no predicted/true positives, without emitting a warning each evaluation.
    for average in ("micro", "macro", "weighted"):
        metrics[f"precision_{average}"] = round(
            precision_score(true_labels_int, predictions, average=average, zero_division=0), 3
        )
        metrics[f"recall_{average}"] = round(
            recall_score(true_labels_int, predictions, average=average, zero_division=0), 3
        )
        metrics[f"f1_{average}"] = round(
            f1_score(true_labels_int, predictions, average=average, zero_division=0), 3
        )

    # For threshold-free scores, pooling every (example, label) pair into one
    # binary problem is exactly the micro average.
    true_labels_flat = true_labels_int.ravel()
    probabilities_flat = probabilities.ravel()

    # ROC-AUC
    metrics["roc_auc_micro"] = round(roc_auc_score(true_labels_flat, probabilities_flat), 3)
    # ROC-AUC is undefined for a label whose column contains only one class
    # (e.g. a rare label absent from the validation split), so the macro average
    # is taken over the labels where both classes are present.
    has_both_classes = true_labels_int.min(axis=0) != true_labels_int.max(axis=0)
    per_label_auc = [
        roc_auc_score(true_labels_int[:, column], probabilities[:, column])
        for column in np.flatnonzero(has_both_classes)
    ]
    roc_auc_macro = float(np.mean(per_label_auc)) if per_label_auc else float("nan")
    metrics["roc_auc_macro"] = round(roc_auc_macro, 3)

    # PR-AUC
    metrics["pr_auc_micro"] = round(average_precision_score(true_labels_flat, probabilities_flat), 3)
    metrics["pr_auc_macro"] = round(
        average_precision_score(true_labels_int, probabilities, average='macro'), 3
    )

    # Jaccard Similarity (indicator matrices, for the same reason as precision/recall/F1)
    metrics["jaccard_micro"] = round(
        jaccard_score(true_labels_int, predictions, average='micro', zero_division=0), 3
    )
    metrics["jaccard_macro"] = round(
        jaccard_score(true_labels_int, predictions, average='macro', zero_division=0), 3
    )

    # F1 scores for each label (using the global list 'labels')
    for i, label_name in enumerate(labels):
        f1_label = f1_score(true_labels_int[:, i], predictions[:, i], average='binary', zero_division=0)
        metrics[f"f1_{label_name}"] = round(f1_label, 3)

    return metrics


# Training Parameters

In [21]:
# Hyperparameters
lr = 2e-5
batch_size = 16
num_epochs = 3

# Log, evaluate and checkpoint once per epoch; when training finishes, reload the
# checkpoint with the highest macro-averaged F1 (the "f1_macro" metric).
training_args = TrainingArguments(
    output_dir="data/models/URL-TITLE-classifier",
    num_train_epochs=num_epochs,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    load_best_model_at_end=True,
)

# Train


In [None]:
# Allow faster, reduced-precision float32 matrix multiplications
# (only has an effect on GPUs that support it).
torch.set_float32_matmul_precision('high')

# Fine-tune `model` on the training split, evaluating on the validation split.
# The tokenizer is passed as `processing_class`, matching the second Trainer in
# this notebook; the older `tokenizer=` keyword is deprecated in recent
# transformers releases.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
# Load the original pre-trained ModernBERT model
# NOTE(review): presumably this checkpoint ships without a trained classification
# head, so the baseline's head is freshly initialised — confirm before reading
# too much into the baseline numbers.
original_model = AutoModelForSequenceClassification.from_pretrained(
    model_path, num_labels=num_labels, id2label=id2label, label2id=label2id, problem_type="multi_label_classification"
)

# Create a Trainer for the original model (no training, just evaluation)
original_trainer = Trainer(
    model=original_model,
    args=training_args,
    eval_dataset=tokenized_data["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Evaluate both models on validation data
print("Evaluating Your Trained Model...")
your_metrics = trainer.evaluate()
print("Evaluating Original Pre-trained Model...")
original_metrics = original_trainer.evaluate()

# Display comparison
eval_prefix = "eval_"
eval_keys = [key for key in your_metrics if key.startswith(eval_prefix)]
# Size the first column to the longest metric name (at least 15 characters) so
# that long names such as "precision_weighted" do not push the values out of line.
name_width = max([15] + [len(key) - len(eval_prefix) for key in eval_keys])

print("\nComparison of Models on Validation Data:")
print(f"{'Metric':<{name_width}} {'Your Model':<15} {'Original Model':<15}")
print("-" * (name_width + 32))
for key in eval_keys:
    metric_name = key[len(eval_prefix):]
    # A metric reported for only one of the two runs is shown as nan instead of raising KeyError.
    original_value = original_metrics.get(key, float("nan"))
    print(f"{metric_name:<{name_width}} {your_metrics[key]:<15.3f} {original_value:<15.3f}")

In [None]:
# Run the fine-tuned model over the validation split one example at a time and
# export per-example true labels, predictions and probabilities to a CSV file.
validation_dataset = tokenized_data["validation"]
urls = validation_dataset["url"]
titles = validation_dataset["title"]

# Set model to evaluation mode
model.eval()

# Lists to store data
texts = []
urls_list = []
titles_list = []
true_labels_list = []
predicted_labels_list = []
prediction_probs_list = []

# Process examples individually to avoid padding issues
device = model.device if hasattr(model, 'device') else next(model.parameters()).device
num_examples = len(validation_dataset)

# Disable gradient calculation for inference
with torch.no_grad():
    for i in range(num_examples):
        # Fetch the row once; every dataset index operation re-reads the row.
        example = validation_dataset[i]

        # Add a batch dimension and move the inputs to the model's device
        input_ids = torch.tensor(example["input_ids"]).unsqueeze(0).to(device)
        attention_mask = torch.tensor(example["attention_mask"]).unsqueeze(0).to(device)

        # Get model predictions
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

        # Convert logits to probabilities using sigmoid (dropping the batch dimension)
        prob = torch.sigmoid(logits[0])

        # Get predicted class (0 or 1) for each label
        pred = (prob >= 0.5).int()

        # Store original URL and title
        urls_list.append(urls[i])
        titles_list.append(titles[i])

        # Decode the (possibly truncated) text the model actually saw
        texts.append(tokenizer.decode(example["input_ids"], skip_special_tokens=True))

        # True labels are only stored, never fed to the model, so they stay on the CPU
        true_labels_list.append(torch.tensor(example["labels"]).numpy())
        predicted_labels_list.append(pred.cpu().numpy())
        prediction_probs_list.append(prob.cpu().numpy())

        # Print progress
        if (i + 1) % 100 == 0:
            print(f"Processed {i+1}/{num_examples} examples")

# Create DataFrame with results
results_df = pd.DataFrame({
    'url': urls_list,
    'title': titles_list,
    'combined_text': texts,
})

# Add true labels and predictions for each category
for i, label_name in enumerate(labels):
    results_df[f'true_{label_name}'] = [label_array[i] for label_array in true_labels_list]
    results_df[f'pred_{label_name}'] = [pred_array[i] for pred_array in predicted_labels_list]
    results_df[f'prob_{label_name}'] = [prob_array[i] for prob_array in prediction_probs_list]

# Save to CSV
results_df.to_csv('validation_results.csv', index=False)
print("Results saved to validation_results.csv")