In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (only works inside a Colab runtime).
drive.mount('/content/drive')
# Folder that later cells use for checkpoints, joblib artifacts and saved figures.
# NOTE(review): the value already ends with '/', and later cells build paths as
# f"{DRIVE_PATH}/<file>", so the resulting paths contain a doubled slash.
DRIVE_PATH = '/content/drive/MyDrive/how_not_to_train_a_model/'

import os
import re
import copy
import math
import random
import gc
import shutil
import time
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd
from scipy.stats import pointbiserialr

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords, wordnet
from textblob import TextBlob
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.compose import ColumnTransformer
from sklearn.utils.class_weight import compute_class_weight

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.isotonic import IsotonicRegression
from sklearn.calibration import calibration_curve

from sklearn.metrics import (
    classification_report, confusion_matrix, ConfusionMatrixDisplay,
    roc_auc_score, roc_curve, auc,
    precision_recall_curve, average_precision_score,
    accuracy_score, precision_score, recall_score, f1_score
)

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW
from tqdm.auto import tqdm

import joblib


In [None]:

def clear_gpu_memory():
    """Drop unreachable Python objects, then release cached CUDA memory.

    The garbage collector runs first so that tensors kept alive only by
    reference cycles are freed before the caching allocator is asked to give
    its unoccupied blocks back; calling ``empty_cache()`` before collecting
    would leave those blocks in the cache. The CUDA call is skipped on
    CPU-only runtimes.

    Returns:
        None
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Indexable collection of raw texts; each item is passed through ``str``.
        labels: Indexable collection of integer class labels aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, max_length=..., padding='max_length',
            return_tensors='pt')`` whose result exposes ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Length every example is truncated/padded to.
        augmentation: Optional callable ``text -> text``; when set it is applied
            to an item with probability 0.5 each time the item is fetched.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512, augmentation=None):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.augmentation = augmentation

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with flattened ``input_ids``/``attention_mask`` and a long ``label``."""
        sample_text = str(self.texts[idx])
        sample_label = self.labels[idx]

        # The random draw only happens when an augmentation callable was supplied.
        if self.augmentation and random.random() < 0.5:
            sample_text = self.augmentation(sample_text)

        tokenizer_kwargs = dict(
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
        )
        encoded = self.tokenizer(sample_text, **tokenizer_kwargs)

        # return_tensors='pt' yields a leading batch axis; flatten() removes it.
        item = {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(sample_label, dtype=torch.long)
        return item


In [None]:
def evaluate(model, dataloader, device):
    """Run the model over a dataloader without gradients and collect outputs.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...)`` whose
            output exposes ``.logits``.
        dataloader: Yields dict batches with ``'input_ids'``, ``'attention_mask'``
            and ``'label'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(predictions, true_labels, probs, mean_loss)`` where
        ``predictions`` are argmax class indices, ``probs`` are the softmax
        values of class index 1, and ``mean_loss`` is the summed cross-entropy
        batch loss divided by ``len(dataloader)``.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss()
    predictions, true_labels, probs = [], [], []
    running_loss = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=True, position=0):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            running_loss += criterion(logits, labels).item()

            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            # Column 1 of the softmax is the score of the positive class.
            probs.extend(torch.softmax(logits, dim=1)[:, 1].cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    return predictions, true_labels, probs, running_loss / len(dataloader)

def plot_results(true_labels, predictions, probs, model_name):
    """Print and plot the confusion matrix and precision-recall curve for one model.

    Both figures are saved under ``DRIVE_PATH`` using ``model_name`` lower-cased
    with '-' replaced by '_' as the file-name prefix, and then shown.

    Args:
        true_labels: Ground-truth binary labels.
        predictions: Hard class predictions.
        probs: Positive-class scores used for ROC AUC and the PR curve.
        model_name: Display name used in titles and file names.

    Returns:
        Tuple ``(roc_auc, avg_precision)``.
    """
    class_names = ['Negative', 'Positive']
    file_stem = model_name.lower().replace('-', '_')

    cm = confusion_matrix(true_labels, predictions)
    print("\nConfusion Matrix:")
    print(cm)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix ({model_name})')
    plt.tight_layout()
    plt.savefig(f"{DRIVE_PATH}/{file_stem}_confusion_matrix.png")
    plt.show()

    roc_auc = roc_auc_score(true_labels, probs)
    print(f"\nROC AUC: {roc_auc:.4f}")

    pr_precision, pr_recall, _ = precision_recall_curve(true_labels, probs)
    avg_precision = average_precision_score(true_labels, probs)

    plt.figure(figsize=(10, 6))
    plt.plot(pr_recall, pr_precision, lw=2, label=f'PR curve (AP = {avg_precision:.4f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve ({model_name})')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f"{DRIVE_PATH}/{file_stem}_pr_curve.png")
    plt.show()

    return roc_auc, avg_precision

In [None]:
def evaluate_with_features(model, dataloader, device):
    """Run the model over a dataloader and collect one feature vector per example.

    The model is called with ``output_hidden_states=True``. The feature vector
    is ``outputs.pooler_output`` when that attribute exists and is not None;
    otherwise it is the first-token slice of the last hidden state.

    Args:
        model: Model whose output exposes ``.logits`` and ``.hidden_states``.
        dataloader: Yields dict batches with ``'input_ids'``, ``'attention_mask'``
            and ``'label'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(predictions, true_labels, features, mean_loss)`` where
        ``features`` is a list with one numpy vector per example and
        ``mean_loss`` is the summed batch loss divided by ``len(dataloader)``.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss()
    predictions, true_labels, features = [], [], []
    running_loss = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="evaluating", leave=True, position=0):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            output_hidden_states=True)
            logits = outputs.logits

            pooled = getattr(outputs, "pooler_output", None)
            if pooled is not None:
                batch_features = pooled
            else:
                # Fall back to position 0 of the final hidden layer.
                batch_features = outputs.hidden_states[-1][:, 0, :]

            running_loss += criterion(logits, labels).item()

            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
            features.extend(batch_features.cpu().numpy())

    return predictions, true_labels, features, running_loss / len(dataloader)


In [None]:
# Registry of the backbones used in the ensemble: short key -> display name and
# the identifier passed to `from_pretrained`.
MODEL_CONFIGS = {
    key: {'name': display_name, 'model_id': model_id}
    for key, display_name, model_id in (
        ('bert-uncased', 'BERT-Uncased', 'bert-base-uncased'),
        ('bert-cased', 'BERT-Cased', 'bert-base-cased'),
        ('roberta', 'RoBERTa', 'roberta-base'),
        ('deberta', 'DeBERTa', 'microsoft/deberta-base'),
    )
}


In [None]:

def _texts_and_labels(frame):
    """Split a header-less frame into texts, binary labels and raw scores.

    Column 5 holds the text and column 6 the raw score. Scores are parsed with
    ``errors='coerce'`` (unparseable values become NaN) and the binary label is
    1 when the score is >= 2, else 0.
    """
    raw_scores = pd.to_numeric(frame[6], errors='coerce')
    binary = (raw_scores >= 2).astype(int)
    return frame[5].tolist(), binary.tolist(), raw_scores.tolist()


train_df = pd.read_csv('/content/Train_Set.csv', header=None)
train_texts, train_labels, train_labels_cont = _texts_and_labels(train_df)

dev_df = pd.read_csv('/content/Dev_Set.csv', header=None)
dev_texts, dev_labels, dev_labels_cont = _texts_and_labels(dev_df)

print(f"loaded {len(train_texts)} train examples and {len(dev_texts)} dev examples")


In [None]:
def get_model_predictions(model_configs, model_paths, texts, labels, batch_size=16, seed=3):
    """Build the stacked input matrix by running every checkpoint over ``texts``.

    For each ``(config, path)`` pair the base architecture is instantiated from
    ``config['model_id']``, the state dict at ``path`` is loaded into it, and
    ``evaluate_with_features`` is run. Despite the function name, what gets
    stacked is the per-example feature vector returned by that helper, not a
    class probability.

    Args:
        model_configs: Sequence of dicts with ``'name'`` and ``'model_id'`` and
            optionally ``'tokenizer_id'`` (defaults to ``'model_id'``).
        model_paths: Checkpoint paths aligned with ``model_configs``.
        texts: Input texts.
        labels: Labels aligned with ``texts``; returned unchanged.
        batch_size: DataLoader batch size.
        seed: Accepted but not used by this function.

    Returns:
        Tuple ``(X_ensemble, labels, model_names)`` where ``X_ensemble`` is the
        column-wise stack of each model's features and ``model_names`` has one
        entry per checkpoint.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    feature_blocks = []
    model_names = []

    for config, model_path in zip(model_configs, model_paths):
        model_name, model_id = config['name'], config['model_id']
        tokenizer_id = config.get('tokenizer_id', model_id)

        print(f"loading {model_name} from {model_path}")

        tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
        model = AutoModelForSequenceClassification.from_pretrained(
            model_id,
            num_labels=2,
            problem_type="single_label_classification"
        )
        # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.to(device)
        model.eval()

        loader = DataLoader(TextDataset(texts, labels, tokenizer), batch_size=batch_size)
        _, _, model_features, _ = evaluate_with_features(model, loader, device)
        feature_blocks.append(model_features)

        if 'best_model' in model_path:
            short_name = f"{model_name}_best"
        else:
            short_name = os.path.basename(model_path).replace('.pt', '')
        model_names.append(short_name)

        # Move the weights off the GPU and drop the reference before the next load.
        model.cpu()
        del model
        clear_gpu_memory()

    return np.column_stack(feature_blocks), labels, model_names

# Checkpoints selected for the ensemble (one per backbone).
best_models = [
    "/content/drive/MyDrive/how_not_to_train_a_model/bert_cased_BERT-Cased_sampler_focal_synonym_exponential_sched_model.pt",
    "/content/drive/MyDrive/how_not_to_train_a_model/bert_uncased_BERT-Uncased_weighted_loss_cross_entropy_contrastive0.4783889164833044_cosine_sched_model.pt",
    "/content/drive/MyDrive/how_not_to_train_a_model/deberta_DeBERTa_sampler_focal_synonym_exponential_sched_model.pt",
    "/content/drive/MyDrive/how_not_to_train_a_model/roberta_RoBERTa_sampler_focal_synonym_exponential_sched_model.pt"
]

# Substring of the checkpoint path -> key in MODEL_CONFIGS, checked in this order.
_PATH_TAG_TO_CONFIG_KEY = (
    ('bert_uncased', 'bert-uncased'),
    ('bert_cased', 'bert-cased'),
    ('roberta', 'roberta'),
    ('deberta', 'deberta'),
)

available_paths = []
available_configs = []

for model_path in best_models:
    if not os.path.exists(model_path):
        print(f"Skipping missing checkpoint: {model_path}")
        continue

    config_key = next(
        (key for tag, key in _PATH_TAG_TO_CONFIG_KEY if tag in model_path),
        None,
    )
    if config_key is None:
        # Resolve the config BEFORE recording the path: get_model_predictions zips
        # the two lists, so a path without a config would shift every later pair.
        print(f"Skipping checkpoint with unrecognised backbone: {model_path}")
        continue

    available_paths.append(model_path)
    available_configs.append(MODEL_CONFIGS[config_key])

if not available_paths:
    # Fail here with a clear message instead of inside np.column_stack later on.
    raise FileNotFoundError("None of the ensemble checkpoints listed in best_models were found")

print(f"Found {len(available_paths)} best models for ensemble:")
for path in available_paths:
    print(f"  - {os.path.basename(path)}")

# Stacker inputs for the train split (also defines y_train and model_names).
X_ensemble_train, y_train, model_names = get_model_predictions(
    available_configs, available_paths,
    train_texts,
    train_labels,
    batch_size=16,
    seed=3
)

# Stacker inputs for the dev split.
X_ensemble_dev, y_dev, _ = get_model_predictions(
    available_configs, available_paths,
    dev_texts,
    dev_labels,
    batch_size=16,
    seed=3
)

In [None]:
# Stacking classifier: L2-regularised logistic regression over the stacked
# per-model outputs, with the inverse regularisation strength C chosen by
# 5-fold cross-validation on the training split, scored by F1.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(
    LogisticRegression(penalty='l2', solver='liblinear', random_state=3),
    param_grid,
    cv=5,
    scoring='f1'
)
grid.fit(X_ensemble_train, y_train)


ensemble_model = grid.best_estimator_
print(f"\nBest ensemble parameters: {grid.best_params_}")
print(f"Cross-validation F1 score (training): {grid.best_score_:.4f}")

# Hard predictions and positive-class probabilities on both splits.
ensemble_preds_train = ensemble_model.predict(X_ensemble_train)
ensemble_probs_train = ensemble_model.predict_proba(X_ensemble_train)[:, 1]

ensemble_preds_dev = ensemble_model.predict(X_ensemble_dev)
ensemble_probs_dev = ensemble_model.predict_proba(X_ensemble_dev)[:, 1]

# Dev-set metrics of the stacker before any calibration.
uncalibrated_roc_auc = roc_auc_score(y_dev, ensemble_probs_dev)
uncalibrated_f1 = f1_score(y_dev, ensemble_preds_dev)
print(f"\nBefore calibration - ROC AUC: {uncalibrated_roc_auc:.4f}, F1: {uncalibrated_f1:.4f}")
print("\nUncalibrated classification report on dev set:")
print(classification_report(y_dev, ensemble_preds_dev))

# Reliability diagram of the uncalibrated probabilities (10 bins).
# fraction_pos / mean_pred_val are reused by the comparison plot further down.
plt.figure(figsize=(10, 6))
fraction_pos, mean_pred_val = calibration_curve(y_dev, ensemble_probs_dev, n_bins=10)
plt.plot(mean_pred_val, fraction_pos, 's-', label='Before calibration')
plt.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
plt.xlabel('Mean predicted probability')
plt.ylabel('Fraction of positives')
plt.title('Calibration Curve - Before Calibration')
plt.legend()
plt.grid()
plt.savefig(f"{DRIVE_PATH}/before_calibration_curve.png")
plt.show()


# Isotonic calibration of the stacker's dev probabilities.
# NOTE(review): the calibrator is fitted on the dev set and then applied to the
# same dev set, and the decision threshold below is also tuned on it, so every
# "after calibration" number in this cell is an in-sample (optimistic) estimate.
calibrator = IsotonicRegression(out_of_bounds='clip')
calibrator.fit(ensemble_probs_dev.reshape(-1, 1), y_dev)

calibrated_probs_dev = calibrator.predict(ensemble_probs_dev.reshape(-1, 1))

# Sweep 200 evenly spaced thresholds in [0, 1] and record the dev F1 of each.
thresholds = np.linspace(0, 1, 200)
f1_scores = []
for threshold in thresholds:
    preds = calibrated_probs_dev >= threshold
    f1 = f1_score(y_dev, preds)
    f1_scores.append(f1)

# np.argmax returns the first maximum, i.e. the lowest threshold among ties.
best_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[best_idx]
best_f1 = f1_scores[best_idx]

print(f"Optimal threshold: {optimal_threshold:.4f}")
print(f"Best F1 score with calibration: {best_f1:.4f}")

# Boolean predictions at the selected threshold.
calibrated_preds_dev = calibrated_probs_dev >= optimal_threshold

calibrated_roc_auc = roc_auc_score(y_dev, calibrated_probs_dev)
calibrated_f1 = f1_score(y_dev, calibrated_preds_dev)
print(f"\nAfter calibration - ROC AUC: {calibrated_roc_auc:.4f}, F1: {calibrated_f1:.4f}")
print("\nCalibrated classification report on dev set:")
print(classification_report(y_dev, calibrated_preds_dev))

# Figure: dev F1 as a function of the decision threshold, with the chosen one marked.
plt.figure(figsize=(10, 6))
plt.plot(thresholds, f1_scores)
plt.axvline(x=optimal_threshold, color='r', linestyle='--',
            label=f'Optimal threshold: {optimal_threshold:.4f}')
plt.xlabel('Threshold')
plt.ylabel('F1 Score')
plt.title('F1 Score vs Threshold (after calibration)')
plt.legend()
plt.grid()
plt.savefig(f"{DRIVE_PATH}/f1_vs_threshold_calibrated.png")
plt.show()

# Figure: reliability diagram before vs after calibration.
# mean_pred_val / fraction_pos come from the earlier "before calibration" plot.
plt.figure(figsize=(10, 6))
fraction_pos_cal, mean_pred_val_cal = calibration_curve(y_dev, calibrated_probs_dev, n_bins=10)
plt.plot(mean_pred_val, fraction_pos, 's-', label='Before calibration')
plt.plot(mean_pred_val_cal, fraction_pos_cal, 's-', label='After calibration')
plt.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
plt.xlabel('Mean predicted probability')
plt.ylabel('Fraction of positives')
plt.title('Calibration Curve - Before vs After Calibration')
plt.legend()
plt.grid()
plt.savefig(f"{DRIVE_PATH}/calibration_curve_comparison.png")
plt.show()


# Figure: ROC curves before vs after calibration (dashed diagonal = chance).
plt.figure(figsize=(10, 6))
fpr, tpr, _ = roc_curve(y_dev, ensemble_probs_dev)
plt.plot(fpr, tpr, label=f'Before calibration (AUC = {uncalibrated_roc_auc:.4f})')
fpr_cal, tpr_cal, _ = roc_curve(y_dev, calibrated_probs_dev)
plt.plot(fpr_cal, tpr_cal, label=f'After calibration (AUC = {calibrated_roc_auc:.4f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Before vs After Calibration')
plt.legend()
plt.grid()
plt.savefig(f"{DRIVE_PATH}/roc_curve_comparison.png")
plt.show()


# Figure: precision-recall curves before vs after calibration.
plt.figure(figsize=(10, 6))
precision, recall, _ = precision_recall_curve(y_dev, ensemble_probs_dev)
ap_score = average_precision_score(y_dev, ensemble_probs_dev)
plt.plot(recall, precision, label=f'Before calibration (AP = {ap_score:.4f})')

precision_cal, recall_cal, _ = precision_recall_curve(y_dev, calibrated_probs_dev)
ap_score_cal = average_precision_score(y_dev, calibrated_probs_dev)
plt.plot(recall_cal, precision_cal, label=f'After calibration (AP = {ap_score_cal:.4f})')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve - Before vs After Calibration')
plt.legend()
plt.grid()
plt.savefig(f"{DRIVE_PATH}/pr_curve_comparison.png")
plt.show()


# Figure: side-by-side dev confusion matrices.
# Left panel: the stacker's default predictions; right panel: predictions from
# the calibrated probabilities at optimal_threshold.
plt.figure(figsize=(16, 6))
plt.subplot(1, 2, 1)
conf_matrix = confusion_matrix(y_dev, ensemble_preds_dev)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Negative', 'Positive'],
            yticklabels=['Negative', 'Positive'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix - Before Calibration')

plt.subplot(1, 2, 2)
conf_matrix_cal = confusion_matrix(y_dev, calibrated_preds_dev)
sns.heatmap(conf_matrix_cal, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Negative', 'Positive'],
            yticklabels=['Negative', 'Positive'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix - After Calibration')
plt.tight_layout()
plt.savefig(f"{DRIVE_PATH}/confusion_matrix_comparison.png")
plt.show()


# Persist the pieces needed at inference time: the stacker, the isotonic
# calibrator and the decision threshold applied to the calibrated scores.
calibrated_model = {
    'base_model': ensemble_model,
    'calibrator': calibrator,
    'optimal_threshold': optimal_threshold
}
calibrated_output_path = f"{DRIVE_PATH}/pcl_calibrated_ensemble.joblib"
joblib.dump(calibrated_model, calibrated_output_path)
print(f"\nCalibrated ensemble model saved to '{calibrated_output_path}'")

# model_names holds one short name per checkpoint, in the order they were stacked.
# NOTE(review): that is one entry per model, not necessarily one per column of
# X_ensemble_train - confirm before treating this list as per-column feature names.
feature_names_path = f"{DRIVE_PATH}/pcl_best_models_feature_names.joblib"
joblib.dump(model_names, feature_names_path)
print(f"Feature mapping saved to '{feature_names_path}'")



In [None]:


# Regression variant of the stacker: Ridge regression on the raw (continuous)
# score, with alpha chosen by 5-fold CV on negative mean squared error.
# NOTE(review): train_labels_cont was parsed with errors='coerce', so it may
# contain NaN, which Ridge.fit would reject - confirm the column is clean.
param_grid_cont = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
grid_cont = GridSearchCV(
    Ridge(random_state=3),
    param_grid_cont,
    cv=5,
    scoring='neg_mean_squared_error'
)
grid_cont.fit(X_ensemble_train, train_labels_cont)

ensemble_model_cont = grid_cont.best_estimator_

print(f"\nBest regression ensemble parameters: {grid_cont.best_params_}")
# best_score_ is the negated MSE, hence the sign flip when printing.
print(f"Cross-validation MSE score (training): {-grid_cont.best_score_:.4f}")

ensemble_preds_cont_train = ensemble_model_cont.predict(X_ensemble_train)
ensemble_preds_cont_dev = ensemble_model_cont.predict(X_ensemble_dev)

# Binarise the predicted score at 1.5 (the binary labels were built as score >= 2).
binary_preds_cont_dev = ensemble_preds_cont_dev >= 1.5
uncalibrated_f1_cont = f1_score(y_dev, binary_preds_cont_dev)
print(f"\nRegression approach - Before calibration - F1: {uncalibrated_f1_cont:.4f}")
print("\nUncalibrated regression classification report on dev set:")
print(classification_report(y_dev, binary_preds_cont_dev))

# Map the predicted score onto the binary dev labels with isotonic regression.
# NOTE(review): fitted and evaluated on the same dev set, as is the threshold
# search below, so the reported F1 is an in-sample estimate.
calibrator_cont = IsotonicRegression(out_of_bounds='clip')
calibrator_cont.fit(ensemble_preds_cont_dev.reshape(-1, 1), y_dev)
calibrated_preds_cont_dev = calibrator_cont.predict(ensemble_preds_cont_dev.reshape(-1, 1))

# Sweep 200 thresholds across the observed range of calibrated values.
thresholds_cont = np.linspace(min(calibrated_preds_cont_dev), max(calibrated_preds_cont_dev), 200)
f1_scores_cont = []
for threshold in thresholds_cont:
    preds = calibrated_preds_cont_dev >= threshold
    f1 = f1_score(y_dev, preds)
    f1_scores_cont.append(f1)

best_idx_cont = np.argmax(f1_scores_cont)
optimal_threshold_cont = thresholds_cont[best_idx_cont]
best_f1_cont = f1_scores_cont[best_idx_cont]

print(f"Optimal threshold for regression outputs: {optimal_threshold_cont:.4f}")
print(f"Best F1 score with regression + calibration: {best_f1_cont:.4f}")

# Boolean dev predictions used by the error-analysis cells that follow.
calibrated_binary_preds_cont_dev = calibrated_preds_cont_dev >= optimal_threshold_cont
calibrated_f1_cont = f1_score(y_dev, calibrated_binary_preds_cont_dev)

print(f"\nAfter calibration - Regression F1: {calibrated_f1_cont:.4f}")
print("\nCalibrated regression classification report on dev set:")
print(classification_report(y_dev, calibrated_binary_preds_cont_dev))

# Figure: dev F1 vs threshold for the regression stacker, chosen threshold marked.
plt.figure(figsize=(10, 6))
plt.plot(thresholds_cont, f1_scores_cont, label='Regression approach')
plt.axvline(x=optimal_threshold_cont, color='g', linestyle='--',
            label=f'Regression optimal: {optimal_threshold_cont:.4f}')
plt.xlabel('Threshold')
plt.ylabel('F1 Score')
plt.title('F1 Score vs Threshold - Regression')
plt.legend()
plt.grid()
plt.savefig(f"{DRIVE_PATH}/f1_vs_threshold_comparison.png")
plt.show()


# Figure: calibrated classification stacker (left) vs calibrated regression
# stacker (right) on the dev set. An explicit two-axes figure is created here;
# selecting subplot (1, 2, 2) without first creating the figure and the left
# panel produced a default-sized figure whose left half was empty, even though
# the file name announces a regression-vs-classification comparison.
class_names = ['Negative', 'Positive']
fig, (ax_cls, ax_reg) = plt.subplots(1, 2, figsize=(16, 6))

# Left: predictions of the classification stacker at its tuned threshold
# (calibrated_preds_dev is produced by the classification calibration cell).
conf_matrix_cls_cal = confusion_matrix(y_dev, calibrated_preds_dev)
sns.heatmap(conf_matrix_cls_cal, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax_cls)
ax_cls.set_xlabel('Predicted Label')
ax_cls.set_ylabel('True Label')
ax_cls.set_title('Classification Approach - After Calibration')

# Right: predictions of the regression stacker at its tuned threshold.
conf_matrix_cont_cal = confusion_matrix(y_dev, calibrated_binary_preds_cont_dev)
sns.heatmap(conf_matrix_cont_cal, annot=True, fmt='d', cmap='Greens',
            xticklabels=class_names, yticklabels=class_names, ax=ax_reg)
ax_reg.set_xlabel('Predicted Label')
ax_reg.set_ylabel('True Label')
ax_reg.set_title('Regression Approach - After Calibration')

fig.tight_layout()
fig.savefig(f"{DRIVE_PATH}/confusion_matrix_regression_vs_classification.png")
plt.show()

# Persist the regression stacker with its calibrator and decision threshold,
# using the same dict layout as the classification ensemble artifact.
calibrated_model_cont = {
    'base_model': ensemble_model_cont,
    'calibrator': calibrator_cont,
    'optimal_threshold': optimal_threshold_cont
}
calibrated_output_path_cont = f"{DRIVE_PATH}/pcl_calibrated_regression_ensemble.joblib"
joblib.dump(calibrated_model_cont, calibrated_output_path_cont)
print(f"\nCalibrated regression ensemble model saved to '{calibrated_output_path_cont}'")


In [None]:
# Error analysis: accuracy of the calibrated regression ensemble, broken down
# by the raw (continuous) dev score.
unique_scores = sorted(set(dev_labels_cont))
acc_by_score = {}
count_by_score = {}

for score in unique_scores:
    # Positions of the dev examples carrying this raw score.
    score_indices = [i for i, x in enumerate(dev_labels_cont) if x == score]
    preds = [calibrated_binary_preds_cont_dev[i] for i in score_indices]
    truth = [y_dev[i] for i in score_indices]

    acc = accuracy_score(truth, preds)
    acc_by_score[score] = acc
    count_by_score[score] = len(score_indices)

# One row per raw score: [score, number of examples, accuracy formatted to 4 dp].
score_table = []
for score in unique_scores:
    score_table.append([
        score,
        count_by_score[score],
        f"{acc_by_score[score]:.4f}"
    ])

print("\nAccuracy by Raw Score:")
print("-" * 40)
print(f"{'Raw Score':<10} {'Count':>8} {'Accuracy':>10}")
print("-" * 40)
for row in score_table:
    print(f"{row[0]:<10} {row[1]:>8} {row[2]:>10}")
print("-" * 40)


In [None]:
# Recompute accuracy per raw score (same computation as the table cell) so this
# cell can be run on its own, then plot it.
unique_scores = sorted(set(dev_labels_cont))
acc_by_score = {}
count_by_score = {}

for score in unique_scores:
    score_indices = [i for i, x in enumerate(dev_labels_cont) if x == score]
    preds = [calibrated_binary_preds_cont_dev[i] for i in score_indices]
    truth = [y_dev[i] for i in score_indices]

    acc = accuracy_score(truth, preds)
    acc_by_score[score] = acc
    count_by_score[score] = len(score_indices)

plt.figure(figsize=(12, 6))
scores = list(acc_by_score.keys())
acc_values = list(acc_by_score.values())
counts = [count_by_score[s] for s in scores]

# Marker area grows with the number of examples for that score; the largest
# group gets max_size, and every marker is at least min_size.
max_size = 300
min_size = 30
normalized_sizes = [min_size + (c / max(counts)) * (max_size - min_size) for c in counts]

plt.scatter(scores, acc_values, s=normalized_sizes, alpha=0.7)
plt.plot(scores, acc_values, 'b--', alpha=0.5)

# Label every point with its example count, 10 points above the marker.
for i, (score, acc) in enumerate(zip(scores, acc_values)):
    plt.annotate(f"n={counts[i]}", (score, acc),
                 textcoords="offset points",
                 xytext=(0,10),
                 ha='center')

plt.xlabel('Label')
plt.ylabel('Accuracy')
plt.title('Accuracy vs Label')
plt.grid(True, linestyle='--', alpha=0.7)
plt.savefig(f"{DRIVE_PATH}/accuracy_vs_raw_score.png")
plt.show()

# Error analysis: metrics of the calibrated regression ensemble per category.
# Column 3 of the raw dev CSV holds the category. This must run while dev_df is
# still the frame loaded from Dev_Set.csv: a later cell rebinds dev_df to a
# frame that only has 'text' / 'label' / 'label_cont' columns.
categories = dev_df[3].tolist()
unique_categories = sorted(set(categories))

f1_by_category = {}
count_by_category = {}
prec_by_category = {}
rec_by_category = {}
acc_by_category = {}

for category in unique_categories:
    category_indices = [i for i, x in enumerate(categories) if x == category]
    preds = [calibrated_binary_preds_cont_dev[i] for i in category_indices]
    truth = [y_dev[i] for i in category_indices]

    try:
        # zero_division=0 scores a category without positive labels/predictions
        # as 0 without emitting a warning, matching precision and recall below.
        f1 = f1_score(truth, preds, zero_division=0)
        prec = precision_score(truth, preds, zero_division=0)
        rec = recall_score(truth, preds, zero_division=0)
        acc = accuracy_score(truth, preds)
    except ValueError as err:
        # Keep the best-effort fallback for inputs the metrics reject, but say
        # so, and no longer swallow unrelated errors or KeyboardInterrupt.
        print(f"Could not score category {category!r}: {err}")
        f1, prec, rec, acc = 0, 0, 0, 0

    f1_by_category[category] = f1
    prec_by_category[category] = prec
    rec_by_category[category] = rec
    acc_by_category[category] = acc
    count_by_category[category] = len(category_indices)

# One row per category: [name, count, F1, precision, recall, accuracy].
category_table = []
for category in unique_categories:
    category_table.append([
        category,
        count_by_category[category],
        f"{f1_by_category[category]:.4f}",
        f"{prec_by_category[category]:.4f}",
        f"{rec_by_category[category]:.4f}",
        f"{acc_by_category[category]:.4f}"
    ])

# Best F1 first.
category_table.sort(key=lambda x: float(x[2]), reverse=True)

print("Performance by Category:")
print("-" * 80)
print(f"{'Category':<15} {'Count':>8} {'F1 Score':>10} {'Precision':>10} {'Recall':>10} {'Accuracy':>10}")
print("-" * 80)
for row in category_table:
    print(f"{row[0]:<15} {row[1]:>8} {row[2]:>10} {row[3]:>10} {row[4]:>10} {row[5]:>10}")
print("-" * 80)

# Error analysis: metrics of the calibrated regression ensemble per text length.
# Length is the whitespace-token count; non-string entries count as length 0.
dev_text_lengths = [len(text.split()) if isinstance(text, str) else 0 for text in dev_texts]

sorted_lengths = sorted(dev_text_lengths)
bins = [0, 10, 25, 50, 100, 200, 500]
# Add an overflow bin whenever some text reaches the last edge. The comparison
# is >= because np.digitize places a value equal to the last edge past the
# final bin; with a strict > a text of exactly 500 words had no bin and was
# silently left out of the table.
if sorted_lengths and sorted_lengths[-1] >= bins[-1]:
    bins.append(sorted_lengths[-1] + 1)

bin_labels = [f"{bins[i]}-{bins[i+1]-1}" for i in range(len(bins)-1)]

# digitize returns 1-based bin numbers for in-range values; shift to 0-based.
length_bins = np.digitize(dev_text_lengths, bins) - 1

f1_by_length = {}
count_by_length = {}
prec_by_length = {}
rec_by_length = {}
acc_by_length = {}
error_examples_by_length = {}

for bin_idx in range(len(bin_labels)):
    bin_indices = [i for i, b in enumerate(length_bins) if b == bin_idx]
    if not bin_indices:
        continue

    preds = [calibrated_binary_preds_cont_dev[i] for i in bin_indices]
    truth = [y_dev[i] for i in bin_indices]

    try:
        # zero_division=0 scores an all-negative bin as 0 without a warning.
        f1 = f1_score(truth, preds, zero_division=0)
        prec = precision_score(truth, preds, zero_division=0)
        rec = recall_score(truth, preds, zero_division=0)
        acc = accuracy_score(truth, preds)
    except ValueError as err:
        # Best-effort fallback kept, but reported and limited to metric errors.
        print(f"Could not score length bin {bin_labels[bin_idx]!r}: {err}")
        f1, prec, rec, acc = 0, 0, 0, 0

    f1_by_length[bin_labels[bin_idx]] = f1
    prec_by_length[bin_labels[bin_idx]] = prec
    rec_by_length[bin_labels[bin_idx]] = rec
    acc_by_length[bin_labels[bin_idx]] = acc
    count_by_length[bin_labels[bin_idx]] = len(bin_indices)

    # Dev-set positions of the misclassified examples in this bin.
    error_indices = [bin_indices[i] for i, (p, t) in enumerate(zip(preds, truth)) if p != t]
    error_examples_by_length[bin_labels[bin_idx]] = len(error_indices)

# One row per non-empty bin: [range, count, F1, precision, recall, accuracy, error rate].
length_table = []
for bin_label in bin_labels:
    if bin_label in f1_by_length:
        error_rate = error_examples_by_length[bin_label] / count_by_length[bin_label]
        length_table.append([
            bin_label,
            count_by_length[bin_label],
            f"{f1_by_length[bin_label]:.4f}",
            f"{prec_by_length[bin_label]:.4f}",
            f"{rec_by_length[bin_label]:.4f}",
            f"{acc_by_length[bin_label]:.4f}",
            f"{error_rate:.4f}"
        ])

print("\nPerformance by Text Length:")
print("-" * 95)
print(f"{'Length Range':<15} {'Count':>8} {'F1 Score':>10} {'Precision':>10} {'Recall':>10} {'Accuracy':>10} {'Error Rate':>10}")
print("-" * 95)
for row in length_table:
    print(f"{row[0]:<15} {row[1]:>8} {row[2]:>10} {row[3]:>10} {row[4]:>10} {row[5]:>10} {row[6]:>10}")
print("-" * 95)

In [None]:

def predict_with_calibrated_regression_ensemble(texts, calibrated_model, model_configs, model_paths):
    """Score new texts with a saved stacker + calibrator + threshold bundle.

    Args:
        texts: Texts to score.
        calibrated_model: Dict with ``'base_model'`` (object with ``predict``),
            ``'calibrator'`` (object with ``predict``) and ``'optimal_threshold'``.
        model_configs: Backbone configs forwarded to ``get_model_predictions``.
        model_paths: Checkpoint paths forwarded to ``get_model_predictions``.

    Returns:
        Tuple ``(calibrated_binary, calibrated_preds)``: the boolean decisions
        (calibrated value >= threshold) and the calibrated values themselves.
    """
    # Labels are unknown at inference time; a list of zeros satisfies the dataset API.
    placeholder_labels = [0] * len(texts)
    stacked_inputs = get_model_predictions(
        model_configs, model_paths, texts, placeholder_labels
    )[0]

    raw_scores = calibrated_model['base_model'].predict(stacked_inputs)
    calibrated_preds = calibrated_model['calibrator'].predict(raw_scores.reshape(-1, 1))

    return calibrated_preds >= calibrated_model['optimal_threshold'], calibrated_preds

In [None]:
def get_model_predictions_probs(model_configs, model_paths, texts, labels, batch_size=16, seed=3):
    """Stack each checkpoint's positive-class probability into one matrix.

    Same loading procedure as ``get_model_predictions``, but inference goes
    through ``evaluate``, so each model contributes a single column holding the
    softmax value of class index 1.

    Args:
        model_configs: Sequence of dicts with ``'name'`` and ``'model_id'`` and
            optionally ``'tokenizer_id'`` (defaults to ``'model_id'``).
        model_paths: Checkpoint paths aligned with ``model_configs``.
        texts: Input texts.
        labels: Labels aligned with ``texts``; returned unchanged.
        batch_size: DataLoader batch size.
        seed: Accepted but not used by this function.

    Returns:
        Tuple ``(X_ensemble, labels, model_names)``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    prob_columns = []
    model_names = []

    for config, model_path in zip(model_configs, model_paths):
        model_name, model_id = config['name'], config['model_id']
        tokenizer_id = config.get('tokenizer_id', model_id)
        print(f"loading {model_name} from {model_path}")

        tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
        model = AutoModelForSequenceClassification.from_pretrained(
            model_id,
            num_labels=2,
            problem_type="single_label_classification"
        )
        # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.to(device)
        model.eval()

        loader = DataLoader(TextDataset(texts, labels, tokenizer), batch_size=batch_size)
        _, _, positive_probs, _ = evaluate(model, loader, device)
        prob_columns.append(positive_probs)

        if 'best_model' in model_path:
            short_name = f"{model_name}_best"
        else:
            short_name = os.path.basename(model_path).replace('.pt', '')
        model_names.append(short_name)

        # Move the weights off the GPU and drop the reference before the next load.
        model.cpu()
        del model
        clear_gpu_memory()

    return np.column_stack(prob_columns), labels, model_names

def find_optimal_threshold(probs, y_true, num_thresholds=200):
    """Pick the decision threshold in [0, 1] that maximises F1.

    Args:
        probs: Scores for the positive class; any array-like (a plain list is
            accepted and converted to a numpy array).
        y_true: Ground-truth binary labels aligned with ``probs``.
        num_thresholds: Number of evenly spaced candidate thresholds.

    Returns:
        Tuple ``(optimal_threshold, best_f1)``. On ties the lowest threshold
        wins, because ``np.argmax`` returns the first maximum.
    """
    # Lists do not support elementwise `>=` against a scalar; arrays do.
    probs = np.asarray(probs)
    thresholds = np.linspace(0, 1, num_thresholds)
    # zero_division=0 yields the same 0.0 score as the default when a threshold
    # produces no positive predictions, but without an UndefinedMetricWarning.
    f1_scores = [
        f1_score(y_true, (probs >= thresh).astype(int), zero_division=0)
        for thresh in thresholds
    ]
    best_idx = np.argmax(f1_scores)
    return thresholds[best_idx], f1_scores[best_idx]





In [None]:
# Pipeline 1: logistic regression on the stacked per-model features, followed
# by isotonic calibration and an F1-optimal threshold.
#
# The inputs are exactly what get_model_predictions already produced for the
# same checkpoints, texts and labels when X_ensemble_train / X_ensemble_dev
# were built, so those matrices are reused instead of reloading every
# checkpoint and re-running inference over both splits.
X_train_feat, X_dev_feat = X_ensemble_train, X_ensemble_dev
# get_model_predictions returns the labels it was given unchanged.
y_train, y_dev = train_labels, dev_labels

lr1 = LogisticRegression(penalty='l2', solver='liblinear', random_state=3)
lr1.fit(X_train_feat, y_train)
dev_probs1 = lr1.predict_proba(X_dev_feat)[:, 1]

# NOTE(review): the calibrator and the threshold are both fitted on the dev set
# that is scored below, so these numbers are in-sample estimates.
calibrator1 = IsotonicRegression(out_of_bounds='clip')
calibrator1.fit(dev_probs1, y_dev)
calibrated_dev_probs1 = calibrator1.predict(dev_probs1)

opt_thresh1, best_f11 = find_optimal_threshold(calibrated_dev_probs1, y_dev)
dev_preds1 = (calibrated_dev_probs1 >= opt_thresh1).astype(int)

print("pipeline 1 results:")
print("optimal threshold: {:.4f}, best f1: {:.4f}".format(opt_thresh1, best_f11))
print("roc auc:", roc_auc_score(y_dev, calibrated_dev_probs1))
print("f1 score:", f1_score(y_dev, dev_preds1))
print(classification_report(y_dev, dev_preds1))


In [None]:


# Pipeline 2: one positive-class probability per model as stacker input.
X_train_prob, y_train, _ = get_model_predictions_probs(
    available_configs, available_paths, train_texts, train_labels, batch_size=16, seed=3
)
X_dev_prob, y_dev, _ = get_model_predictions_probs(
    available_configs, available_paths, dev_texts, dev_labels, batch_size=16, seed=3
)

lr2 = LogisticRegression(penalty='l2', solver='liblinear', random_state=3)
lr2.fit(X_train_prob, y_train)
dev_probs2 = lr2.predict_proba(X_dev_prob)[:, 1]

# Calibrate the stacker output.
# NOTE(review): fitted on the same dev set that is evaluated below.
calibrator2 = IsotonicRegression(out_of_bounds='clip')
calibrator2.fit(dev_probs2, y_dev)
calibrated_dev_probs2 = calibrator2.predict(dev_probs2)

opt_thresh2, best_f12 = find_optimal_threshold(calibrated_dev_probs2, y_dev)
dev_preds2 = (calibrated_dev_probs2 >= opt_thresh2).astype(int)

print("pipeline 2 results:")
print("optimal threshold: {:.4f}, best f1: {:.4f}".format(opt_thresh2, best_f12))
print("roc auc:", roc_auc_score(y_dev, calibrated_dev_probs2))
print("f1 score:", f1_score(y_dev, dev_preds2))
print(classification_report(y_dev, dev_preds2))


# Pipeline 3: calibrate each model's probability column first (isotonic
# regression fitted on the train split, applied to train and dev), then stack.
calibrated_train_cols = []
calibrated_dev_cols = []
for i in range(X_train_prob.shape[1]):
    iso = IsotonicRegression(out_of_bounds='clip')
    iso.fit(X_train_prob[:, i], y_train)
    calibrated_train_cols.append(iso.predict(X_train_prob[:, i]))
    calibrated_dev_cols.append(iso.predict(X_dev_prob[:, i]))

X_train_prob_cal = np.column_stack(calibrated_train_cols)
X_dev_prob_cal = np.column_stack(calibrated_dev_cols)

lr3 = LogisticRegression(penalty='l2', solver='liblinear', random_state=3)
lr3.fit(X_train_prob_cal, y_train)
dev_probs3 = lr3.predict_proba(X_dev_prob_cal)[:, 1]

# No post-hoc calibration here; only the threshold is tuned (on the dev set).
opt_thresh3, best_f13 = find_optimal_threshold(dev_probs3, y_dev)
dev_preds3 = (dev_probs3 >= opt_thresh3).astype(int)

print("pipeline 3 results:")
print("optimal threshold: {:.4f}, best f1: {:.4f}".format(opt_thresh3, best_f13))
print("roc auc:", roc_auc_score(y_dev, dev_probs3))
print("f1 score:", f1_score(y_dev, dev_preds3))
print(classification_report(y_dev, dev_preds3))

In [None]:
# Baseline features: rebuild tidy frames from the lists loaded earlier.
# NOTE: this rebinds train_df / dev_df, replacing the raw CSV frames.
train_df = pd.DataFrame({
    'text': train_texts,
    'label': train_labels,
    'label_cont': train_labels_cont
})

dev_df = pd.DataFrame({
    'text': dev_texts,
    'label': dev_labels,
    'label_cont': dev_labels_cont
})

# Assign the filled column back instead of calling fillna(inplace=True) on
# df['text']: that form is chained assignment, which pandas warns about and
# which does not update the parent frame under Copy-on-Write.
train_df['text'] = train_df['text'].fillna('')
dev_df['text'] = dev_df['text'].fillna('')

stop_words = set(stopwords.words('english'))

feature_cols = [
    'length', 'word_count', 'unique_word_count', 'average_word_length',
    'sentence_count', 'average_sentence_length', 'lexical_diversity',
    'stopword_count', 'stopword_ratio', 'content_ratio',
    'polarity', 'subjectivity'
]


def _build_text_features(frame, stop_words, feature_cols):
    """Return a copy of ``frame`` with the hand-crafted text features added.

    Args:
        frame: DataFrame with 'text', 'label' and 'label_cont' columns.
        stop_words: Set of lower-case stop words.
        feature_cols: Names of the feature columns whose NaNs are set to 0.

    Returns:
        New DataFrame holding the three input columns plus: character length,
        word / unique-word counts, mean word length, sentence count and mean
        sentence length (split on runs of '.', '!' or '?'), lexical diversity,
        stop-word count / ratio, content ratio, and TextBlob polarity /
        subjectivity.
    """
    fe = frame[['text', 'label', 'label_cont']].copy()

    # Tokenise once and reuse the pieces for every derived feature.
    words = [str(text).split() for text in fe['text']]
    lowered_words = [str(text).lower().split() for text in fe['text']]
    sentence_parts = [re.split(r'[.!?]+', str(text)) for text in fe['text']]
    sentence_lengths = [
        [len(part.split()) for part in parts if part.strip()]
        for parts in sentence_parts
    ]

    fe['length'] = fe['text'].apply(len)
    fe['word_count'] = [len(ws) for ws in words]
    fe['unique_word_count'] = [len(set(ws)) for ws in words]
    fe['average_word_length'] = [
        np.mean([len(w) for w in ws]) if ws else 0 for ws in words
    ]
    # Number of separators found: the split yields one more piece than that.
    fe['sentence_count'] = [len(parts) - 1 for parts in sentence_parts]
    fe['average_sentence_length'] = [
        np.mean(lengths) if lengths else 0 for lengths in sentence_lengths
    ]

    # Ratios are defined as 0 for texts without any word.
    has_words = fe['word_count'] > 0
    fe['lexical_diversity'] = (fe['unique_word_count'] / fe['word_count']).where(has_words, 0)
    fe['stopword_count'] = [
        sum(1 for w in ws if w in stop_words) for ws in lowered_words
    ]
    fe['stopword_ratio'] = (fe['stopword_count'] / fe['word_count']).where(has_words, 0)
    fe['content_ratio'] = 1 - fe['stopword_ratio']

    # Run the sentiment analysis once per text and read both fields from it.
    sentiments = [TextBlob(str(text)).sentiment for text in fe['text']]
    fe['polarity'] = [s.polarity for s in sentiments]
    fe['subjectivity'] = [s.subjectivity for s in sentiments]

    fe[feature_cols] = fe[feature_cols].fillna(0)
    return fe


train_fe = _build_text_features(train_df, stop_words, feature_cols)
dev_fe = _build_text_features(dev_df, stop_words, feature_cols)

# Word uni/bi-gram TF-IDF fitted on the train texts only; the dev texts are
# transformed with the train vocabulary.
tfidf_vectorizer = TfidfVectorizer(
    min_df=5, max_df=0.8,
    ngram_range=(1, 2),
    sublinear_tf=True
)

X_tfidf_train = tfidf_vectorizer.fit_transform(train_fe['text'])
X_tfidf_dev = tfidf_vectorizer.transform(dev_fe['text'])

X_features_train = train_fe[feature_cols].values
X_features_dev = dev_fe[feature_cols].values

# Standardise the hand-crafted features with statistics from the train split.
scaler = StandardScaler()
X_features_train_scaled = scaler.fit_transform(X_features_train)
X_features_dev_scaled = scaler.transform(X_features_dev)

# `sparse` is needed by the NaN clean-up below; it was referenced without ever
# being imported, so that branch raised NameError whenever it ran.
from scipy import sparse
from scipy.sparse import hstack
X_combined_train = hstack([X_tfidf_train, X_features_train_scaled])
X_combined_dev = hstack([X_tfidf_dev, X_features_dev_scaled])

# Dense copies are kept under their original names.
# NOTE(review): these can be large (rows x vocabulary); drop them if no later
# cell needs dense input.
X_train_dense = X_combined_train.toarray()
X_dev_dense = X_combined_dev.toarray()

# Replace any NaN that slipped through and rebuild the sparse matrices.
if np.isnan(X_train_dense).any() or np.isnan(X_dev_dense).any():
    X_train_dense = np.nan_to_num(X_train_dense)
    X_dev_dense = np.nan_to_num(X_dev_dense)
    X_combined_train = sparse.csr_matrix(X_train_dense)
    X_combined_dev = sparse.csr_matrix(X_dev_dense)

# Class-weighted logistic regression over [TF-IDF | scaled features].
log_reg = LogisticRegression(
    solver='liblinear',
    C=1.0,
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
log_reg.fit(X_combined_train, train_fe['label'])

# Second predict_proba column = probability of the second entry in classes_.
dev_probs = log_reg.predict_proba(X_combined_dev)[:, 1]

# Sweep 100 evenly spaced cut-offs in [0, 1] and score each with F1 on dev.
thresholds = np.linspace(0, 1, 100)
f1_scores = [
    f1_score(dev_fe['label'], dev_probs >= cutoff)
    for cutoff in thresholds
]

# First cut-off achieving the maximum F1 (argmax returns the earliest tie).
best_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[best_idx]
best_f1 = f1_scores[best_idx]

# NOTE: the threshold is chosen on the same dev split that is reported below,
# so these dev metrics are optimistic.
dev_preds = dev_probs >= optimal_threshold
print(classification_report(dev_fe['label'], dev_preds))

# Bundle everything needed to reproduce the LR predictions at inference time:
# the fitted vectorizer and scaler, the classifier, the decision threshold and
# the ordered list of hand-crafted feature columns.
model_path = f"{DRIVE_PATH}/bow_tfidf_model.joblib"
bow_model = dict(
    vectorizer=tfidf_vectorizer,
    scaler=scaler,
    model=log_reg,
    threshold=optimal_threshold,
    feature_cols=feature_cols,
)
joblib.dump(bow_model, model_path)

# --- Confusion matrix of the thresholded LR predictions on the dev split ---
conf_matrix = confusion_matrix(dev_fe['label'], dev_preds)
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Negative', 'Positive'],
    yticklabels=['Negative', 'Positive'],
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')
cm_ax.set_title('Confusion Matrix')
cm_fig.tight_layout()
cm_fig.savefig(f"{DRIVE_PATH}/bow_tfidf_confusion_matrix.png")
plt.show()

# --- Dev F1 as a function of the decision threshold, with the chosen one marked ---
f1_fig, f1_ax = plt.subplots(figsize=(10, 6))
f1_ax.plot(thresholds, f1_scores)
f1_ax.axvline(
    x=optimal_threshold,
    color='r',
    linestyle='--',
    label=f'Optimal threshold: {optimal_threshold:.4f}',
)
f1_ax.set_xlabel('Threshold')
f1_ax.set_ylabel('F1 Score')
f1_ax.set_title('F1 Score vs Classification Threshold')
f1_ax.legend()
f1_ax.grid(True)
f1_fig.savefig(f"{DRIVE_PATH}/bow_tfidf_f1_vs_threshold.png")
plt.show()



# Show one randomly chosen dev example that the thresholded LR model got wrong.
# Indices are 0-based row positions in dev_fe, which is why dev_fe is read
# with .iloc and the prediction arrays are indexed directly.
misclass_indices = np.where(dev_fe['label'] != dev_preds)[0]

if misclass_indices.size == 0:
    # np.random.choice raises ValueError on an empty array, so report instead.
    print("No misclassified dev examples at the current threshold.")
else:
    random_idx = np.random.choice(misclass_indices)

    misclass_text = dev_fe.iloc[random_idx]['text']
    true_label = dev_fe.iloc[random_idx]['label']
    pred_label = dev_preds[random_idx]
    # Probability from the positive-class column of predict_proba; it is not
    # the probability of whichever label was predicted.
    pred_prob = dev_probs[random_idx]

    print(f"Text: {misclass_text[:500]}...")
    print(f"True label: {true_label} | Predicted: {pred_label} | Confidence: {pred_prob:.4f}")

In [None]:
# Re-runnable cell: each execution prints a different randomly chosen dev
# example on which the thresholded predictions disagree with the label.
# Indices are 0-based row positions in dev_fe (hence .iloc).
misclass_indices = np.where(dev_fe['label'] != dev_preds)[0]

if misclass_indices.size == 0:
    # Nothing to sample from; np.random.choice would raise ValueError here.
    print("No misclassified dev examples at the current threshold.")
else:
    random_idx = np.random.choice(misclass_indices)

    misclass_text = dev_fe.iloc[random_idx]['text']
    true_label = dev_fe.iloc[random_idx]['label']
    pred_label = dev_preds[random_idx]
    # dev_probs holds the positive-class probability, whatever label was predicted.
    pred_prob = dev_probs[random_idx]

    print(f"Text: {misclass_text[:500]}...")
    print(f"True label: {true_label} | Predicted: {pred_label} | Confidence: {pred_prob:.4f}")

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes baseline on the TF-IDF matrix alone (no hand-crafted
# features). Labels are taken from train_fe / dev_fe because X_tfidf_train and
# X_tfidf_dev were built from those frames' 'text' columns; reading labels from
# a different frame risks silently misaligned rows if the two ever diverge.
nb_classifier = MultinomialNB(alpha=0.1)
nb_classifier.fit(X_tfidf_train, train_fe['label'])

# Second predict_proba column = probability of the second entry in classes_.
nb_probs = nb_classifier.predict_proba(X_tfidf_dev)[:, 1]

# Sweep 100 evenly spaced cut-offs in [0, 1] and keep the dev F1 for each one.
thresholds = np.linspace(0, 1, 100)
nb_f1_scores = []
for threshold in thresholds:
    preds = nb_probs >= threshold
    f1 = f1_score(dev_fe['label'], preds)
    nb_f1_scores.append(f1)

# First cut-off achieving the maximum F1 (argmax returns the earliest tie).
best_idx = np.argmax(nb_f1_scores)
nb_optimal_threshold = thresholds[best_idx]
nb_best_f1 = nb_f1_scores[best_idx]

# NOTE: threshold selection and this report use the same dev split, so the
# reported numbers are optimistic.
nb_preds = nb_probs >= nb_optimal_threshold
print(classification_report(dev_fe['label'], nb_preds))

# Confusion matrix of the thresholded NB predictions on the dev split.
plt.figure(figsize=(8, 6))
conf_matrix_nb = confusion_matrix(dev_fe['label'], nb_preds)
sns.heatmap(conf_matrix_nb, annot=True, fmt='d', cmap='Greens',
           xticklabels=['Negative', 'Positive'],
           yticklabels=['Negative', 'Positive'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Naive Bayes Confusion Matrix')
plt.tight_layout()
plt.savefig(f"{DRIVE_PATH}/naive_bayes_confusion_matrix.png")
plt.show()

# Dev F1 of the Naive Bayes model across decision thresholds, with the
# selected threshold marked by a dashed vertical line.
nb_f1_fig, nb_f1_ax = plt.subplots(figsize=(10, 6))
nb_f1_ax.plot(thresholds, nb_f1_scores)
nb_f1_ax.axvline(
    x=nb_optimal_threshold,
    color='r',
    linestyle='--',
    label=f'Optimal threshold: {nb_optimal_threshold:.4f}',
)
nb_f1_ax.set_xlabel('Threshold')
nb_f1_ax.set_ylabel('F1 Score')
nb_f1_ax.set_title('Naive Bayes: F1 Score vs Classification Threshold')
nb_f1_ax.legend()
nb_f1_ax.grid(True)
nb_f1_fig.savefig(f"{DRIVE_PATH}/naive_bayes_f1_vs_threshold.png")
plt.show()

# Persist the Naive Bayes bundle: the shared fitted TF-IDF vectorizer, the
# classifier itself and the decision threshold selected on the dev split.
nb_model_path = f"{DRIVE_PATH}/naive_bayes_model.joblib"
nb_model = dict(
    vectorizer=tfidf_vectorizer,
    model=nb_classifier,
    threshold=nb_optimal_threshold,
)
joblib.dump(nb_model, nb_model_path)

# Overlay the LR and NB dev F1 curves on one axis; each model's selected
# threshold is drawn as a dashed vertical line (red = LR, green = NB).
cmp_fig, cmp_ax = plt.subplots(figsize=(10, 6))
cmp_ax.plot(thresholds, f1_scores, label='Logistic Regression')
cmp_ax.plot(thresholds, nb_f1_scores, label='Naive Bayes')
cmp_ax.axvline(
    x=optimal_threshold,
    color='r',
    linestyle='--',
    label=f'LR threshold: {optimal_threshold:.4f}',
)
cmp_ax.axvline(
    x=nb_optimal_threshold,
    color='g',
    linestyle='--',
    label=f'NB threshold: {nb_optimal_threshold:.4f}',
)
cmp_ax.set_xlabel('Threshold')
cmp_ax.set_ylabel('F1 Score')
cmp_ax.set_title('F1 Score Comparison: Logistic Regression vs Naive Bayes')
cmp_ax.legend()
cmp_ax.grid(True)
cmp_fig.savefig(f"{DRIVE_PATH}/model_comparison_f1_vs_threshold.png")
plt.show()

In [None]:

# Read the header-less, tab-separated test file; columns are therefore
# addressed by integer position.
test_df = pd.read_csv(
    '/content/task4_test.tsv',
    sep='\t',
    header=None,
)
# Column 4 is used as the input text.
# NOTE(review): confirm against the dataset layout that column 4 is the text field.
test_texts = test_df[4].tolist()
# Preview the first ten texts as the cell output.
test_texts[:10]

In [None]:
# Run the available base models over the test texts. The test split carries no
# labels here, so a list of zeros of matching length is passed in the label
# position; only the first of the three returned values is kept.
# NOTE(review): presumably the first return value is the per-model prediction
# matrix fed to the ensemble -- confirm against get_model_predictions.
placeholder_test_labels = [0] * len(test_texts)
X_ensemble_test, _, _ = get_model_predictions(
    available_configs,
    available_paths,
    test_texts,
    placeholder_test_labels,
    seed=3,
    batch_size=16,
)

In [None]:
# Test split: ensemble score -> calibrator -> boolean decision at the tuned
# threshold. Despite its name, calibrated_preds_test already holds the
# thresholded booleans rather than calibrated scores.
ensemble_preds_cont_test = ensemble_model_cont.predict(X_ensemble_test)
calibrated_scores_test = calibrator_cont.predict(ensemble_preds_cont_test.reshape(-1, 1))
calibrated_preds_test = calibrated_scores_test >= optimal_threshold_cont

binary_preds_test = calibrated_preds_test.astype(int)

# One 0/1 prediction per line, written to the current working directory.
with open('test.txt', 'w') as f:
    f.writelines(f"{pred}\n" for pred in binary_preds_test)


# Dev split: reuse the boolean decisions computed earlier in the notebook.
binary_preds_dev = calibrated_binary_preds_cont_dev.astype(int)

with open('dev.txt', 'w') as f:
    f.writelines(f"{pred}\n" for pred in binary_preds_dev)