In [None]:
!pip install optuna kaleido

In [None]:
from google.colab import drive

import os
import re
import copy
import math
import random
import gc
import shutil
import time
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd
from scipy.stats import pointbiserialr

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords, wordnet
from textblob import TextBlob
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.compose import ColumnTransformer
from sklearn.utils.class_weight import compute_class_weight

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import (
    classification_report, confusion_matrix,
    roc_auc_score, roc_curve, auc,
    precision_recall_curve, average_precision_score,
    accuracy_score, precision_score, recall_score, f1_score,
    ConfusionMatrixDisplay
)

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW
from tqdm.auto import tqdm

import joblib

import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

# Google Drive folder this notebook writes to: model checkpoints, plots,
# the Optuna SQLite storage and the CSV reports are all saved under it.
DRIVE_PATH = '/content/drive/MyDrive/how_not_to_train_a_model'
# Mount Drive inside the Colab VM so that DRIVE_PATH is reachable.
drive.mount('/content/drive')


In [None]:

def clear_gpu_memory():
    """Free unreachable Python objects, then release cached CUDA memory.

    The garbage collector runs first on purpose: tensors that are only kept
    alive by reference cycles are destroyed by ``gc.collect()``, which hands
    their blocks back to PyTorch's caching allocator. Emptying the cache
    afterwards can then actually return that memory to the GPU driver.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


In [None]:
class FocalLoss(nn.Module):
    """Focal loss on top of per-sample cross-entropy.

    Each sample's cross-entropy is scaled by ``alpha * (1 - p_t) ** gamma``,
    where ``p_t`` is the probability assigned to the true class, so that
    confidently-correct samples contribute less.

    Args:
        alpha: Scalar multiplier applied to every sample.
        gamma: Exponent of the ``(1 - p_t)`` modulating factor.
        reduction: ``'mean'``, ``'sum'``, or anything else for the
            unreduced per-sample tensor.
    """

    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss of logits ``inputs`` against class ids ``targets``."""
        per_sample_ce = F.cross_entropy(inputs, targets, reduction='none')
        # exp(-CE) recovers the softmax probability of the true class.
        true_class_prob = torch.exp(-per_sample_ce)
        per_sample_focal = self.alpha * (1 - true_class_prob) ** self.gamma * per_sample_ce

        if self.reduction == 'mean':
            return per_sample_focal.mean()
        if self.reduction == 'sum':
            return per_sample_focal.sum()
        return per_sample_focal

class ContrastiveLoss(nn.Module):
    """Pull positive-class embeddings together, push negatives away.

    The embedding of a sample is the L2-normalised first-token vector of the
    model's last hidden state. Positives are samples with label 1, negatives
    samples with label 0.

    Args:
        margin: Positive/negative pairs closer than this distance are penalised.
    """

    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def forward(self, outputs, labels):
        """Return the contrastive loss for one batch.

        Args:
            outputs: Model output exposing ``hidden_states`` and ``logits``.
            labels: Tensor of 0/1 class ids, one per sample.

        Returns:
            A scalar tensor; exactly ``0.0`` when the batch lacks either class.
        """
        first_token = outputs.hidden_states[-1][:, 0, :]
        unit_vectors = F.normalize(first_token, p=2, dim=1)

        positives = (labels == 1).nonzero(as_tuple=True)[0]
        negatives = (labels == 0).nonzero(as_tuple=True)[0]

        # Both terms need at least one sample of each class.
        if len(positives) == 0 or len(negatives) == 0:
            return torch.tensor(0.0, device=outputs.logits.device)

        anchors = unit_vectors[positives]
        others = unit_vectors[negatives]

        same_class_dist = torch.cdist(anchors, anchors, p=2)
        cross_class_dist = torch.cdist(anchors, others, p=2)

        # Zero out the diagonal so a sample is not paired with itself.
        n_pos = same_class_dist.size(0)
        off_diagonal = torch.ones_like(same_class_dist) - torch.eye(n_pos, device=same_class_dist.device)
        same_class_dist = same_class_dist * off_diagonal

        # The 1e-6 keeps the division defined when there is a single positive.
        pull_term = same_class_dist.sum() / (n_pos * (n_pos - 1) + 1e-6)

        n_cross_pairs = cross_class_dist.size(0) * cross_class_dist.size(1)
        push_term = F.relu(self.margin - cross_class_dist).sum() / (n_cross_pairs + 1e-6)

        return pull_term + push_term

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of texts (items are passed through ``str``).
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, max_length=..., padding='max_length',
            return_tensors='pt')`` returning ``input_ids`` and ``attention_mask``.
        max_length: Length every sequence is truncated/padded to.
        augmentation: Optional ``text -> text`` callable, applied to an item
            with probability 0.5 each time it is fetched.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512, augmentation=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.augmentation = augmentation

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` tensors for item ``idx``."""
        sample_text = str(self.texts[idx])
        sample_label = self.labels[idx]

        # The coin is only flipped when an augmentation is configured.
        if self.augmentation and random.random() < 0.5:
            sample_text = self.augmentation(sample_text)

        encoded = self.tokenizer(
            sample_text,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )

        # The tokenizer returns a batch of one; flatten drops the batch axis.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(sample_label, dtype=torch.long)
        return item


In [None]:
def synonym_replacement(text, n=1):
    """Replace up to ``n`` randomly chosen words with a WordNet lemma.

    Words are obtained with ``text.split()``, so the result is re-joined with
    single spaces. A chosen word is left untouched when WordNet has no synset
    for it. The candidate pool holds every lemma of every synset of the word
    (underscores turned into spaces).

    Args:
        text: Input sentence.
        n: Maximum number of word positions to replace.

    Returns:
        The augmented sentence.
    """
    tokens = text.split()
    augmented = list(tokens)

    chosen_positions = random.sample(range(len(tokens)), min(n, len(tokens)))

    for position in chosen_positions:
        candidates = [
            lemma.name().replace('_', ' ')
            for synset in wordnet.synsets(tokens[position])
            for lemma in synset.lemmas()
        ]
        if candidates:
            augmented[position] = random.choice(candidates)

    return ' '.join(augmented)

def random_insertion(text, n=1):
    """Insert ``n`` copies of randomly picked original words at random positions.

    Words come from ``text.split()`` and the result is joined with single
    spaces. Text without any word yields an empty string.

    Args:
        text: Input sentence.
        n: Number of insertions to perform.

    Returns:
        The augmented sentence.
    """
    source_words = text.split()
    augmented = list(source_words)

    # Nothing to duplicate: skip the loop (and any RNG use) entirely.
    if not source_words:
        return ' '.join(augmented)

    last_source_index = len(source_words) - 1
    for _ in range(n):
        # randint is inclusive, so len(augmented) means "append at the end".
        insert_at = random.randint(0, len(augmented))
        picked = random.randint(0, last_source_index)
        augmented.insert(insert_at, source_words[picked])

    return ' '.join(augmented)

def random_deletion(text, p=0.1):
    """Drop each word of ``text`` independently with probability ``p``.

    Words come from ``text.split()`` and survivors are joined with single
    spaces. If every word gets dropped, one original word picked at random is
    returned so the result is never empty for non-empty input.

    Args:
        text: Input sentence.
        p: Per-word deletion probability.

    Returns:
        The augmented sentence. Text with zero or one word is returned
        unchanged.
    """
    words = text.split()

    # Zero words (empty / whitespace-only text) must short-circuit as well:
    # otherwise the fallback below would call random.choice on an empty list.
    if len(words) <= 1:
        return text

    kept = [word for word in words if random.random() > p]

    if not kept:
        return random.choice(words)

    return ' '.join(kept)

def get_augmentation(aug_type):
    """Look up an augmentation function by name.

    Args:
        aug_type: ``'synonym'``, ``'insertion'`` or ``'deletion'``.

    Returns:
        The matching ``text -> text`` function, or ``None`` for any other key.
    """
    registry = dict(
        synonym=synonym_replacement,
        insertion=random_insertion,
        deletion=random_deletion,
    )
    try:
        return registry[aug_type]
    except KeyError:
        return None

In [None]:
def train_epoch(model, dataloader, optimizer, loss_fn, device, use_contrastive=False, contrastive_weight=0.1, epoch=None, scheduler=None):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Sequence-classification model called with ``input_ids``,
            ``attention_mask`` and ``output_hidden_states``; its output must
            expose ``logits`` (and ``hidden_states`` when contrastive is on).
        dataloader: Yields dicts with ``'input_ids'``, ``'attention_mask'``
            and ``'label'`` tensors.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Callable ``(logits, labels) -> scalar loss``.
        device: Device the batch tensors are moved to.
        use_contrastive: Add ``contrastive_weight * ContrastiveLoss`` to the
            classification loss.
        contrastive_weight: Weight of the contrastive term.
        epoch: Unused; accepted for call-site compatibility.
        scheduler: Optional LR scheduler. Only a ``OneCycleLR`` is stepped
            here (once per batch); other schedulers are left to the caller.

    Returns:
        Mean training loss per batch.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no batches.
    """
    model.train()
    total_loss = 0
    processed_batches = 0

    contrastive_loss_fn = ContrastiveLoss() if use_contrastive else None
    step_scheduler_per_batch = isinstance(scheduler, torch.optim.lr_scheduler.OneCycleLR)

    progress_bar = tqdm(dataloader, desc="Training", leave=True, position=0)
    for batch in progress_bar:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Labels are deliberately not passed to the model: the loss is
        # computed below with loss_fn, so a model-internal loss would be
        # wasted work.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=use_contrastive
        )

        # Always start from the classification loss so that `loss` is defined
        # whether or not the contrastive term is enabled.
        loss = loss_fn(outputs.logits, labels)

        if contrastive_loss_fn is not None:
            loss = loss + contrastive_weight * contrastive_loss_fn(outputs, labels)

        loss.backward()
        optimizer.step()

        if step_scheduler_per_batch:
            scheduler.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        processed_batches += 1
        progress_bar.set_postfix({'loss': batch_loss})

    return total_loss / processed_batches

def evaluate(model, dataloader, device):
    """Run the model over ``dataloader`` without gradients and collect outputs.

    Args:
        model: Sequence-classification model whose output exposes ``logits``.
        dataloader: Yields dicts with ``'input_ids'``, ``'attention_mask'``
            and ``'label'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        ``(predictions, true_labels, probs, mean_loss)`` where ``predictions``
        are arg-max class ids, ``probs`` the softmax probability of class 1,
        and ``mean_loss`` the unweighted cross-entropy averaged over batches.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss()

    predicted_classes = []
    gold_labels = []
    positive_probs = []
    loss_sum = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=True, position=0):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

            loss_sum += criterion(logits, labels).item()

            predicted_classes.extend(torch.argmax(logits, dim=1).cpu().numpy())
            positive_probs.extend(torch.softmax(logits, dim=1)[:, 1].cpu().numpy())
            gold_labels.extend(labels.cpu().numpy())

    mean_loss = loss_sum / len(dataloader)
    return predicted_classes, gold_labels, positive_probs, mean_loss

def plot_results(true_labels, predictions, probs, model_name):
    """Print and plot the confusion matrix and precision-recall curve.

    Both figures are saved under ``DRIVE_PATH`` (file names are prefixed with
    the lower-cased model name, dashes turned into underscores) and shown.

    Args:
        true_labels: Ground-truth class ids.
        predictions: Predicted class ids.
        probs: Predicted probability of the positive class.
        model_name: Display name used in titles and file names.

    Returns:
        ``(roc_auc, avg_precision)``.
    """
    class_names = ['Negative', 'Positive']

    matrix = confusion_matrix(true_labels, predictions)
    print("\nConfusion Matrix:")
    print(matrix)

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix ({model_name})')
    plt.tight_layout()
    file_prefix = model_name.lower().replace('-', '_')
    plt.savefig(f"{DRIVE_PATH}/{file_prefix}_confusion_matrix.png")
    plt.show()

    roc_auc = roc_auc_score(true_labels, probs)
    print(f"\nROC AUC: {roc_auc:.4f}")

    pr_precision, pr_recall, _ = precision_recall_curve(true_labels, probs)
    avg_precision = average_precision_score(true_labels, probs)

    plt.figure(figsize=(10, 6))
    plt.plot(pr_recall, pr_precision, lw=2, label=f'PR curve (AP = {avg_precision:.4f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve ({model_name})')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f"{DRIVE_PATH}/{file_prefix}_pr_curve.png")
    plt.show()

    return roc_auc, avg_precision

In [None]:


# Make sure the output folder exists before anything is written to it.
os.makedirs(DRIVE_PATH, exist_ok=True)

def save_best_model_after_study(study, model_name, drive_path=DRIVE_PATH):
    """Copy the best trial's checkpoint to ``<model>_best_model.pt``.

    A checkpoint belongs to the best trial when its file name ends in
    ``.pt``, contains the model identifier (lower-cased ``model_name`` with
    dashes replaced by underscores) and contains ``trial<N>`` where ``N`` is
    the best trial's number. When several files qualify, the most recently
    modified one is used. A ``<model>_best_parms.txt`` file with the trial
    number, score and parameters is written next to the copy.

    Args:
        study: Optuna study (``study.best_trial`` is read).
        model_name: Display name of the model, e.g. ``'BERT-Uncased'``.
        drive_path: Folder that holds the checkpoints and receives the copy.

    Returns:
        Path of the copied checkpoint, or ``None`` if no checkpoint matched.
    """
    best_trial = study.best_trial

    model_identifier = model_name.lower().replace('-', '_')
    # The negative look-ahead stops "trial1" from matching "trial10", "trial11", ...
    trial_pattern = re.compile(rf"trial{best_trial.number}(?!\d)")

    candidates = [
        file_name
        for file_name in os.listdir(drive_path)
        if file_name.endswith('.pt')
        and model_identifier in file_name
        and trial_pattern.search(file_name)
    ]

    if not candidates:
        print(f"\ncould not find the model file for the best trial #{best_trial.number} and model {model_identifier}")
        return None

    # os.listdir order is arbitrary; prefer the checkpoint written last.
    best_model_file = max(
        candidates,
        key=lambda file_name: os.path.getmtime(os.path.join(drive_path, file_name)),
    )

    source_path = os.path.join(drive_path, best_model_file)
    best_model_name = f"{model_identifier}_best_model.pt"
    target_path = os.path.join(drive_path, best_model_name)

    shutil.copy(source_path, target_path)

    print(f"\nbest model from trial #{best_trial.number} saved as: {best_model_name}")
    print(f"\troc auc: {best_trial.value:.4f}")
    print(f"\tmodel path: {target_path}")

    meta_path = os.path.join(drive_path, f"{model_identifier}_best_parms.txt")
    with open(meta_path, 'w') as f:
        f.write(f"best trial: #{best_trial.number}\n")
        f.write(f"roc auc: {best_trial.value:.4f}\n")
        f.write("parameters:\n")
        for key, value in best_trial.params.items():
            f.write(f"  {key}: {value}\n")

    return target_path


In [None]:
def setup_training(balance_method, y_train, train_dataset, batch_size, loss_type, focal_alpha=None, focal_gamma=None):
    """Build the training DataLoader and loss for the chosen balancing strategy.

    Args:
        balance_method: ``'sampler'`` draws training samples with a
            ``WeightedRandomSampler`` so both classes carry equal total
            weight; any other value shuffles normally and (for cross-entropy)
            weights the loss by inverse class frequency instead.
        y_train: Binary (0/1) training labels; list, array or Series.
        train_dataset: Dataset wrapped by the DataLoader.
        batch_size: Batch size of the DataLoader.
        loss_type: ``'focal'`` for ``FocalLoss``; anything else for
            ``nn.CrossEntropyLoss``.
        focal_alpha: ``alpha`` for ``FocalLoss``; ``None`` keeps its default.
        focal_gamma: ``gamma`` for ``FocalLoss``; ``None`` keeps its default.

    Returns:
        ``(train_dataloader, loss_fn)``.

    Note:
        ``FocalLoss`` takes no per-class weights, so with ``loss_type='focal'``
        and a non-sampler ``balance_method`` no class re-weighting is applied.
    """
    # Convert once: with a plain list, `y_train == 0` is the scalar False and
    # the boolean mask below would silently select nothing.
    labels = np.asarray(y_train)
    class_counts = np.bincount(labels)
    total_samples = len(labels)

    if balance_method == 'sampler':
        # Class 1 keeps weight 1; class 0 is scaled by n1 / n0 so that each
        # class ends up with the same total sampling weight.
        sample_weights = np.ones(total_samples)
        sample_weights[labels == 0] = class_counts[1] / class_counts[0]
        sampler = WeightedRandomSampler(
            torch.as_tensor(sample_weights, dtype=torch.double),
            total_samples,
        )
        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)

        class_weights = None
    else:
        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        # "Balanced" weighting: n_samples / (n_classes * class_count).
        class_weights = torch.tensor(
            [total_samples / (len(class_counts) * count) for count in class_counts],
            dtype=torch.float,
        )

    if loss_type == 'focal':
        # Only forward values that were actually supplied, so the defaults of
        # this function fall back to FocalLoss's own defaults.
        focal_kwargs = {}
        if focal_alpha is not None:
            focal_kwargs['alpha'] = focal_alpha
        if focal_gamma is not None:
            focal_kwargs['gamma'] = focal_gamma
        loss_fn = FocalLoss(**focal_kwargs)
    else:
        loss_fn = nn.CrossEntropyLoss(weight=class_weights)

    return train_dataloader, loss_fn

def _build_scheduler(optimizer, scheduler_params, lr, steps_per_epoch, num_epochs):
    """Create the LR scheduler described by ``scheduler_params``.

    Returns ``None`` when ``scheduler_params['scheduler_type']`` is not one of
    ``'step'``, ``'exponential'``, ``'cosine'``, ``'reduce_on_plateau'`` or
    ``'one_cycle'``.
    """
    scheduler_type = scheduler_params.get('scheduler_type')
    schedulers = torch.optim.lr_scheduler

    if scheduler_type == 'step':
        return schedulers.StepLR(
            optimizer,
            step_size=scheduler_params.get('step_size', 1),
            gamma=scheduler_params.get('gamma', 0.1)
        )
    if scheduler_type == 'exponential':
        return schedulers.ExponentialLR(
            optimizer,
            gamma=scheduler_params.get('gamma', 0.9)
        )
    if scheduler_type == 'cosine':
        return schedulers.CosineAnnealingLR(
            optimizer,
            T_max=scheduler_params.get('T_max', 10),
            eta_min=scheduler_params.get('eta_min', 1e-6)
        )
    if scheduler_type == 'reduce_on_plateau':
        return schedulers.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=scheduler_params.get('factor', 0.1),
            patience=scheduler_params.get('patience', 2),
            threshold=scheduler_params.get('threshold', 0.01),
            threshold_mode='rel',
            verbose=True
        )
    if scheduler_type == 'one_cycle':
        # OneCycleLR is stepped once per batch, hence the total step budget.
        return schedulers.OneCycleLR(
            optimizer,
            max_lr=scheduler_params.get('max_lr', lr * 10),
            total_steps=steps_per_epoch * num_epochs,
            pct_start=scheduler_params.get('pct_start', 0.3),
            div_factor=scheduler_params.get('div_factor', 25.0),
            final_div_factor=scheduler_params.get('final_div_factor', 1e4)
        )
    return None


def train_evaluate_model(model_config, train_texts, train_labels, dev_texts, dev_labels,
                         balance_method='weighted_loss',
                         loss_type='cross_entropy',
                         augmentation=None,
                         use_contrastive=False,
                         contrastive_weight=0.1,
                         focal_alpha=1,
                         focal_gamma=2,
                         num_epochs=10,
                         batch_size=16,
                         lr=2e-5,
                         seed=3,
                         patience=3,
                         val_size=0.1,
                         trial=None,
                         scheduler_params=None):
    """Fine-tune one model configuration and score it on the dev set.

    The training data is split (stratified) into train/validation; the
    validation loss drives early stopping, best-checkpoint selection and
    Optuna pruning. The best checkpoint is then evaluated on the dev set,
    plotted, and its ``state_dict`` saved under ``DRIVE_PATH``.

    Args:
        model_config: Dict with ``'name'``, ``'model_id'`` and optionally
            ``'tokenizer_id'`` (defaults to ``model_id``).
        train_texts, train_labels: Training texts and binary labels.
        dev_texts, dev_labels: Held-out texts and labels used for the report.
        balance_method: Forwarded to ``setup_training``.
        loss_type: Forwarded to ``setup_training``.
        augmentation: Name understood by ``get_augmentation`` or ``None``.
        use_contrastive: Add the contrastive term during training.
        contrastive_weight: Weight of the contrastive term.
        focal_alpha, focal_gamma: Focal-loss parameters.
        num_epochs: Maximum number of epochs.
        batch_size: Batch size for all loaders.
        lr: Learning rate of the optimizer.
        seed: Seed for Python, NumPy and torch RNGs and for the split.
        patience: Epochs without validation improvement before stopping.
        val_size: Fraction of the training data held out for validation.
        trial: Optional Optuna trial used for reporting and pruning. When
            given, its number is embedded in the checkpoint file name as
            ``trial<N>`` so every trial keeps its own file.
        scheduler_params: Optional dict with ``'scheduler_type'`` plus that
            scheduler's settings.

    Returns:
        Dict describing the run (configuration, ``roc_auc``,
        ``avg_precision``, ``model_path``, ``best_val_loss``,
        ``epochs_trained``, ...).

    Raises:
        optuna.exceptions.TrialPruned: When the trial's pruner asks to stop.
    """
    # Augmentation relies on Python's `random`, so seed it as well.
    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    model_name = model_config['name']
    model_id = model_config['model_id']
    tokenizer_id = model_config.get('tokenizer_id', model_id)

    # A params dict without a 'scheduler_type' key is treated as "no scheduler".
    scheduler_type = (scheduler_params or {}).get('scheduler_type', 'none')
    use_scheduler = scheduler_type != 'none'

    config_str = f"{model_name}_{balance_method}_{loss_type}"

    if augmentation:
        config_str += f"_{augmentation}"

    if use_contrastive:
        config_str += f"_contrastive{contrastive_weight}"

    if use_scheduler:
        config_str += f"_{scheduler_type}_sched"

    print(f"\ntraining configuration: {config_str}")

    X_train_full, y_train_full = train_texts, train_labels
    X_test, y_test = dev_texts, dev_labels

    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full, y_train_full, test_size=val_size, random_state=seed, stratify=y_train_full
    )

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_id,
        num_labels=2,
        problem_type="single_label_classification",
        output_hidden_states=use_contrastive
    )

    aug_function = get_augmentation(augmentation) if augmentation else None

    train_dataset = TextDataset(X_train, y_train, tokenizer, augmentation=aug_function)
    val_dataset = TextDataset(X_val, y_val, tokenizer)
    test_dataset = TextDataset(X_test, y_test, tokenizer)

    train_dataloader, loss_fn = setup_training(balance_method, y_train, train_dataset, batch_size, loss_type, focal_alpha, focal_gamma)

    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"using device: {device}")

    model.to(device)
    # Moves the class-weight tensor of a weighted cross-entropy to the device;
    # a no-op for losses without tensors.
    loss_fn.to(device)

    optimizer = AdamW(model.parameters(), lr=lr)

    scheduler = None
    if use_scheduler:
        scheduler = _build_scheduler(optimizer, scheduler_params, lr, len(train_dataloader), num_epochs)
        print(f"Using {scheduler_type} scheduler")

    best_val_loss = float('inf')
    best_model_state = None
    patience_counter = 0
    epochs_trained = 0

    for epoch in range(num_epochs):
        print(f"\nepoch {epoch+1}/{num_epochs}")

        train_loss = train_epoch(
            model,
            train_dataloader,
            optimizer,
            loss_fn,
            device,
            use_contrastive=use_contrastive,
            contrastive_weight=contrastive_weight,
            epoch=epoch,
            scheduler=scheduler
        )
        epochs_trained = epoch + 1
        print(f"training loss: {train_loss:.4f}")

        _, _, _, val_loss = evaluate(model, val_dataloader, device)
        print(f"validation loss: {val_loss:.4f}")

        if trial is not None:
            # Pruners rank intermediate values using the study direction.
            # Validation loss is "lower is better", so it has to be negated
            # when the study maximises its objective.
            maximizing = trial.study.direction == optuna.study.StudyDirection.MAXIMIZE
            trial.report(-val_loss if maximizing else val_loss, epoch)
            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Snapshot on the CPU so the best weights do not occupy a second
            # model's worth of GPU memory.
            best_model_state = {
                key: value.detach().cpu().clone()
                for key, value in model.state_dict().items()
            }
            patience_counter = 0
            print(f"new best validation loss: {best_val_loss:.4f}")
        else:
            patience_counter += 1
            print(f"no improvement for {patience_counter} epochs (best: {best_val_loss:.4f})")

        # OneCycleLR is stepped per batch inside train_epoch.
        if scheduler and not isinstance(scheduler, torch.optim.lr_scheduler.OneCycleLR):
            if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                scheduler.step(val_loss)
            else:
                scheduler.step()

            current_lr = optimizer.param_groups[0]['lr']
            print(f"Current learning rate: {current_lr:.8f}")

        if patience_counter >= patience:
            print(f"early stopping triggered after {epoch+1} epochs")
            break

        clear_gpu_memory()

    if best_model_state is None:
        # No epoch ever improved on +inf (zero epochs, or a NaN validation
        # loss): fall back to the weights the model currently holds.
        best_model_state = {
            key: value.detach().cpu().clone()
            for key, value in model.state_dict().items()
        }
    else:
        model.load_state_dict(best_model_state)

    print("\nevaluating on dev (test) set")
    predictions, true_labels, probs, _ = evaluate(model, test_dataloader, device)
    clear_gpu_memory()

    print("\nclassification report:")
    report = classification_report(true_labels, predictions)
    print(report)

    roc_auc, avg_precision = plot_results(true_labels, predictions, probs, model_name)

    # Tag the file with the trial number: it keeps trials that share a
    # configuration string from overwriting each other and lets the best
    # trial's checkpoint be located by number afterwards.
    trial_tag = f"_trial{trial.number}" if trial is not None else ""
    output_name = f"{model_name.lower().replace('-', '_')}_{config_str}{trial_tag}_model.pt"
    model_path = f"{DRIVE_PATH}/{output_name}"
    torch.save(best_model_state, model_path)
    print(f"best model saved to {model_path}")

    model.cpu()
    del model
    clear_gpu_memory()

    result = {
        'model_name': model_name,
        'balance_method': balance_method,
        'loss_type': loss_type,
        'augmentation': augmentation,
        'contrastive': use_contrastive,
        'contrastive_weight': contrastive_weight if use_contrastive else 0,
        'focal_alpha': focal_alpha if loss_type == 'focal' else 0,
        'focal_gamma': focal_gamma if loss_type == 'focal' else 0,
        'roc_auc': roc_auc,
        'avg_precision': avg_precision,
        'model_path': model_path,
        'best_val_loss': best_val_loss,
        'epochs_trained': epochs_trained,
        'config_str': config_str
    }

    if scheduler_params:
        result['scheduler_type'] = scheduler_params.get('scheduler_type', 'none')

        if scheduler_params.get('scheduler_type') != 'none':
            for param_name, param_value in scheduler_params.items():
                if param_name != 'scheduler_type':
                    result[f'scheduler_{param_name}'] = param_value

    return result

In [None]:
# For every scheduler type: (key inside scheduler_params, Optuna parameter
# name / param_space key) pairs, in the order in which they are sampled.
_SCHEDULER_SEARCH_KEYS = {
    'step': (('step_size', 'step_size'), ('gamma', 'scheduler_gamma')),
    'exponential': (('gamma', 'scheduler_gamma'),),
    'cosine': (('T_max', 'T_max'), ('eta_min', 'eta_min')),
    'reduce_on_plateau': (
        ('factor', 'factor'),
        ('patience', 'scheduler_patience'),
        ('threshold', 'threshold'),
    ),
    'one_cycle': (
        ('max_lr', 'max_lr'),
        ('pct_start', 'pct_start'),
        ('div_factor', 'div_factor'),
        ('final_div_factor', 'final_div_factor'),
    ),
}


def model_optuna_optimization(model_config, train_texts, train_labels, dev_texts, dev_labels, param_space, n_trials=10, seed=3, study_name=None):
    """Tune one model with Optuna, maximising dev-set ROC AUC.

    Each trial samples a training configuration, runs
    ``train_evaluate_model`` and returns its ROC AUC. Per-trial records are
    appended to ``<study_name>_results.csv``; the study itself lives in a
    SQLite file under ``DRIVE_PATH`` and is resumed if it already exists.
    After the study a summary CSV, plots, the full trials table and a copy of
    the best checkpoint are written.

    Args:
        model_config: Dict with ``'name'`` and ``'model_id'``.
        train_texts, train_labels: Training data.
        dev_texts, dev_labels: Held-out data scored for the objective.
        param_space: Search space. Must contain ``'balance_method'``,
            ``'loss_type'``, ``'scheduler_type'``, ``'num_epochs'``,
            ``'patience'`` and the choice lists of every scheduler type that
            can be sampled. ``'augmentation'``, ``'use_contrastive'`` and
            ``'batch_size'`` are optional choice lists.
        n_trials: Number of trials to run in this call.
        seed: Seed for the RNGs and the TPE sampler.
        study_name: Study/storage name; generated from the model name and a
            timestamp when omitted.

    Returns:
        ``(results_df, best_params, best_result, best_model_path)``.
        ``best_result`` is this call's record of the best trial, or a
        minimal summary (``trial_number``, ``roc_auc`` and the sampled
        parameters) when the best trial comes from an earlier run.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    model_name = model_config['name']
    if study_name is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        study_name = f"{model_name.lower().replace('-', '_')}_{timestamp}"

    print(f"starting optuna optimization for {model_name}")
    print(f"study name: {study_name}")

    storage_name = f"sqlite:///{DRIVE_PATH}/{study_name}.db"
    results = []

    def persist(record):
        """Store one trial record and rewrite the running results CSV."""
        results.append(record)
        pd.DataFrame(results).to_csv(f"{DRIVE_PATH}/{study_name}_results.csv", index=False)

    def objective(trial):
        balance_method = trial.suggest_categorical('balance_method', param_space['balance_method'])
        loss_type = trial.suggest_categorical('loss_type', param_space['loss_type'])
        # Optional entries: fall back to the built-in choices when absent.
        augmentation = trial.suggest_categorical(
            'augmentation',
            param_space.get('augmentation', [None, 'synonym', 'insertion', 'deletion']),
        )
        use_contrastive = trial.suggest_categorical(
            'use_contrastive',
            param_space.get('use_contrastive', [False, True]),
        )

        scheduler_type = trial.suggest_categorical('scheduler_type', param_space['scheduler_type'])

        scheduler_params = {'scheduler_type': scheduler_type}
        for config_key, search_key in _SCHEDULER_SEARCH_KEYS.get(scheduler_type, ()):
            scheduler_params[config_key] = trial.suggest_categorical(search_key, param_space[search_key])

        contrastive_weight = None
        focal_alpha = None
        focal_gamma = None

        # Conditional parameters are only sampled when they take effect.
        if use_contrastive:
            contrastive_weight = trial.suggest_float('contrastive_weight', 0.05, 0.5, log=True)

        if loss_type == 'focal':
            focal_alpha = trial.suggest_float('focal_alpha', 0.25, 2.0, log=True)
            focal_gamma = trial.suggest_float('focal_gamma', 1.0, 5.0)

        lr = trial.suggest_float('learning_rate', 1e-6, 1e-4, log=True)
        batch_size = trial.suggest_categorical('batch_size', param_space.get('batch_size', [8, 16, 24]))

        print(f"\n===== {model_name}: Trial {trial.number + 1}/{n_trials} =====")
        print(f"balance method: {balance_method}")
        print(f"loss type: {loss_type}")
        print(f"augmentation: {augmentation}")
        print(f"contrastive learning: {use_contrastive}")
        print(f"learning rate: {lr:.6f}")
        print(f"batch size: {batch_size}")
        print(f"scheduler type: {scheduler_type}")
        if scheduler_type != 'none':
            print(f"scheduler params: {scheduler_params}")

        if contrastive_weight is None:
            contrastive_weight = 0.1
        if focal_alpha is None:
            focal_alpha = 1.0
        if focal_gamma is None:
            focal_gamma = 2.0

        # Fields shared by the success and the failure record of this trial.
        trial_record = {
            'trial_number': trial.number,
            'balance_method': balance_method,
            'loss_type': loss_type,
            'augmentation': augmentation,
            'use_contrastive': use_contrastive,
            'contrastive_weight': contrastive_weight,
            'focal_alpha': focal_alpha,
            'focal_gamma': focal_gamma,
            'learning_rate': lr,
            'batch_size': batch_size,
            'scheduler_type': scheduler_type,
        }
        trial_record.update(
            (f'scheduler_{param_name}', param_value)
            for param_name, param_value in scheduler_params.items()
            if param_name != 'scheduler_type'
        )

        start_time = time.time()

        try:
            result = train_evaluate_model(
                model_config,
                train_texts, train_labels,
                dev_texts, dev_labels,
                balance_method=balance_method,
                loss_type=loss_type,
                augmentation=augmentation,
                use_contrastive=use_contrastive,
                contrastive_weight=contrastive_weight,
                focal_alpha=focal_alpha,
                focal_gamma=focal_gamma,
                num_epochs=param_space['num_epochs'],
                batch_size=batch_size,
                lr=lr,
                patience=param_space['patience'],
                seed=seed,
                trial=trial,
                scheduler_params=scheduler_params
            )
        except optuna.exceptions.TrialPruned:
            # Pruning is a control-flow signal for Optuna, not a failure: it
            # must reach study.optimize so the trial is recorded as pruned.
            raise
        except Exception as e:
            # Best effort: log the failure, keep the study running and give
            # the trial the worst possible score.
            print(f"trial failed with error: {str(e)}")

            error_result = dict(trial_record)
            error_result['error'] = str(e)
            error_result['execution_time'] = time.time() - start_time
            persist(error_result)

            return float('-inf')

        result['execution_time'] = time.time() - start_time
        result.update(trial_record)
        persist(result)

        return result['roc_auc']

    sampler = optuna.samplers.TPESampler(seed=seed)
    pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)

    study = optuna.create_study(
        study_name=study_name,
        storage=storage_name,
        direction="maximize",
        sampler=sampler,
        pruner=pruner,
        load_if_exists=True
    )

    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    best_trial = study.best_trial

    print(f"\n===== {model_name} Optuna Optimization Complete =====")
    print(f"best trial: #{best_trial.number}")
    print(f"best roc auc: {best_trial.value:.4f}")
    print("best parameters:")
    for key, value in best_trial.params.items():
        print(f"  {key}: {value}")

    # The best trial may stem from an earlier run of a resumed study, in
    # which case this call holds no detailed record for it.
    best_result = next(
        (res for res in results if res.get('trial_number') == best_trial.number),
        None,
    )
    if best_result is None:
        best_result = {
            'trial_number': best_trial.number,
            'roc_auc': best_trial.value,
            **best_trial.params
        }

    best_params = best_trial.params.copy()

    # Conditional parameters that were not sampled get the same defaults the
    # objective used for them.
    if 'contrastive_weight' not in best_params and best_params.get('use_contrastive', False):
        best_params['contrastive_weight'] = 0.1
    if 'focal_alpha' not in best_params and best_params.get('loss_type') == 'focal':
        best_params['focal_alpha'] = 1.0
    if 'focal_gamma' not in best_params and best_params.get('loss_type') == 'focal':
        best_params['focal_gamma'] = 2.0

    for _, search_key in _SCHEDULER_SEARCH_KEYS.get(best_params.get('scheduler_type'), ()):
        if search_key not in best_params:
            best_params[search_key] = param_space[search_key][0]

    study_info = {
        'study_name': study_name,
        'n_trials': n_trials,
        'best_trial': best_trial.number,
        'best_score': best_trial.value,
        'best_params': best_params
    }

    pd.DataFrame([study_info]).to_csv(f"{DRIVE_PATH}/{study_name}_summary.csv", index=False)

    # Plots are a nice-to-have: a failure here must not lose the study results.
    try:
        fig = optuna.visualization.plot_optimization_history(study)
        fig.write_image(f"{DRIVE_PATH}/{study_name}_history.png")

        fig = optuna.visualization.plot_param_importances(study)
        fig.write_image(f"{DRIVE_PATH}/{study_name}_importances.png")

        fig = optuna.visualization.plot_parallel_coordinate(study)
        fig.write_image(f"{DRIVE_PATH}/{study_name}_parallel.png")

        param_importances = optuna.importance.get_param_importances(study)
        top_params = list(param_importances.keys())[:3]
        for param in top_params:
            fig = optuna.visualization.plot_slice(study, params=[param])
            fig.write_image(f"{DRIVE_PATH}/{study_name}_slice_{param}.png")

        print("visualizations saved successfully")
    except Exception as e:
        print(f"visualization generation failed: {str(e)}")

    all_trials_df = study.trials_dataframe()
    all_trials_df.to_csv(f"{DRIVE_PATH}/{study_name}_all_trials.csv", index=False)

    best_model_path = save_best_model_after_study(study, model_config['name'])

    return pd.DataFrame(results), best_params, best_result, best_model_path

In [None]:
# Registry of the pretrained encoders compared in this notebook.
# Each entry is passed as `model_config` to the training functions:
#   'name'     - display name; also used (lower-cased, '-' -> '_') in output file names
#   'model_id' - checkpoint id handed to AutoModelForSequenceClassification.from_pretrained
# An optional 'tokenizer_id' may be added; train_evaluate_model falls back to
# 'model_id' when it is absent.
MODEL_CONFIGS = {
    'bert-uncased': {
        'name': 'BERT-Uncased',
        'model_id': 'bert-base-uncased',
    },
    'bert-cased': {
        'name': 'BERT-Cased',
        'model_id': 'bert-base-cased',
    },
    'roberta': {
        'name': 'RoBERTa',
        'model_id': 'roberta-base',
    },
    'deberta': {
        'name': 'DeBERTa',
        'model_id': 'microsoft/deberta-base',
    }
}


In [None]:
def _load_split(csv_path):
    """Read one header-less CSV split and return ``(frame, texts, labels)``.

    Column 5 holds the text and column 6 a numeric score; a row is labelled
    1 when its score is >= 2 and 0 otherwise. Scores that cannot be parsed
    become NaN and therefore end up with label 0.
    """
    frame = pd.read_csv(csv_path, header=None)
    texts = frame[5].tolist()
    scores = pd.to_numeric(frame[6], errors='coerce')
    labels = (scores >= 2).astype(int).tolist()
    return frame, texts, labels


train_df, train_texts, train_labels = _load_split('/content/Train_Set.csv')
dev_df, dev_texts, dev_labels = _load_split('/content/Dev_Set.csv')

print(f"loaded {len(train_texts)} train examples and {len(dev_texts)} dev examples")


In [None]:
# Search space handed to model_optuna_optimization as `param_space`.
# List values are sets of choices; 'num_epochs' and 'patience' are fixed
# scalars forwarded unchanged to train_evaluate_model.
# NOTE(review): model_optuna_optimization samples 'contrastive_weight',
# 'focal_alpha', 'focal_gamma' and the learning rate from ranges written in
# its own body (trial.suggest_float), so the corresponding entries below are
# not what bounds those parameters - confirm which source is intended.
PARAM_SPACE = {
    'balance_method': ['weighted_loss', 'sampler'],
    'loss_type': ['cross_entropy', 'focal'],
    'augmentation': [None, 'synonym', 'insertion', 'deletion'],
    'use_contrastive': [False, True],
    'contrastive_weight': [0.1, 0.5],
    'focal_alpha': [0.25, 1.0],
    'focal_gamma': [2.0, 5.0],
    'num_epochs': 10,
    'batch_size': [8, 16, 24],
    'lr': [1e-6, 1e-4],
    'patience': 3,

    # Which LR scheduler to use; the groups below are only sampled for the
    # matching scheduler type.
    'scheduler_type': ['none', 'step', 'exponential', 'cosine', 'reduce_on_plateau', 'one_cycle'],

    # 'step' uses both entries; 'exponential' uses 'scheduler_gamma' only.
    'step_size': [1, 2, 3],
    'scheduler_gamma': [0.1, 0.3, 0.5],


    # 'cosine'
    'T_max': [5, 8, 10],
    'eta_min': [1e-7, 1e-6, 1e-5],

    # 'reduce_on_plateau'
    'factor': [0.1, 0.2, 0.5],
    'scheduler_patience': [1, 2, 3],
    'threshold': [0.01, 0.05],

    # 'one_cycle'
    'max_lr': [5e-5, 1e-4, 2e-4],
    'pct_start': [0.1, 0.3, 0.5],
    'div_factor': [10.0, 25.0],
    'final_div_factor': [100.0, 1000.0]
}

In [None]:

# Run the hyper-parameter search for BERT-Uncased. The study name is fixed,
# so re-running this cell resumes the stored study instead of starting over.
results_bert_uncased, best_params_bert_uncased, best_result_bert_uncased, _ = model_optuna_optimization(
    MODEL_CONFIGS['bert-uncased'],
    train_texts,
    train_labels,
    dev_texts,
    dev_labels,
    PARAM_SPACE,
    n_trials=5,
    seed=3,
    study_name="bert_uncased_optimization"
)

print("\nBest parameters for BERT-Uncased:")
for key, value in best_params_bert_uncased.items():
    print(f"  {key}: {value}")

# The best-result record is not guaranteed to carry every metric: when the
# best trial belongs to an earlier run of the resumed study only a summary is
# returned, and the record of a failed trial has no scores at all. Print what
# is available instead of raising KeyError.
best_roc_auc = best_result_bert_uncased.get('roc_auc')
if best_roc_auc is not None:
    print(f"Best ROC AUC: {best_roc_auc:.4f}")
best_avg_precision = best_result_bert_uncased.get('avg_precision')
if best_avg_precision is not None:
    print(f"Best Avg Precision: {best_avg_precision:.4f}")
best_model_file = best_result_bert_uncased.get('model_path')
if best_model_file is not None:
    print(f"Model saved at: {best_model_file}")


In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

# Fine-tuned weights to evaluate.
# NOTE(review): the path is hard-coded; presumably it is the checkpoint
# reported by the BERT-Uncased search -- confirm before re-running.
model_path = '/content/drive/MyDrive/how_not_to_train_a_model/bert_uncased_BERT-Uncased_weighted_loss_cross_entropy_contrastive0.4783889164833044_cosine_sched_model.pt'
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Load the saved state dict into the freshly built two-label model, then move
# the model itself to `device`. `map_location` only controls where the loaded
# tensors are placed; without the explicit `.to(device)` the model stays on CPU.
state_dict = torch.load(model_path, map_location=device)
model.load_state_dict(state_dict)
model.to(device)
model.eval()

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def get_probabilities(texts, batch_size=32):
    """Return the positive-class probability for every item in ``texts``.

    Args:
        texts: Sequence of inputs; each item is converted with ``str()``
            before tokenisation.
        batch_size: Number of texts tokenised and scored per forward pass.

    Returns:
        A NumPy array with one probability per input text (empty when
        ``texts`` is empty).
    """
    probs = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = [str(text) for text in texts[start:start + batch_size]]
            inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt')
            # Tokeniser output is created on CPU; place it wherever the model lives.
            inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
            logits = model(**inputs).logits
            # The head emits one logit per class, so the class-1 probability is
            # the softmax over both logits, not an independent sigmoid of one.
            batch_probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
            probs.extend(batch_probs)
    return np.array(probs)


# Imported here because the file's top-level sklearn.metrics import list does
# not include this function.
from sklearn.metrics import precision_recall_fscore_support

# Positive-class probabilities for the dev set.
dev_probs = get_probabilities(dev_texts)

# Sweep decision thresholds and record binary precision / recall / F1 at each.
thresholds = np.arange(0.1, 0.95, 0.05)
results = {}
for thresh in thresholds:
    preds = (dev_probs >= thresh).astype(int)
    # zero_division=0 keeps the reported value at 0 when a threshold yields no
    # positive predictions, without emitting a warning for every such step.
    precision, recall, f1, _ = precision_recall_fscore_support(
        dev_labels, preds, average='binary', zero_division=0
    )
    results[thresh] = {'precision': precision, 'recall': recall, 'f1': f1}

print("threshold tuning results:")
for thresh, metrics in results.items():
    print(f"threshold: {thresh:.2f}, precision: {metrics['precision']:.3f}, recall: {metrics['recall']:.3f}, f1: {metrics['f1']:.3f}")

# Full precision/recall trade-off. precision_recall_curve returns one more
# precision/recall value than thresholds, so the final point is dropped to
# align the arrays for plotting.
precision_vals, recall_vals, thresholds_curve = precision_recall_curve(dev_labels, dev_probs)
plt.plot(thresholds_curve, precision_vals[:-1], label="precision")
plt.plot(thresholds_curve, recall_vals[:-1], label="recall")
plt.xlabel("threshold")
plt.ylabel("score")
plt.title("precision and recall vs. threshold")
plt.legend()
plt.show()


In [None]:

# Run the Optuna search for the 'bert-cased' configuration on the
# train/dev split loaded earlier (5 trials, seed 3).
_cased_search = model_optuna_optimization(
    MODEL_CONFIGS['bert-cased'],
    train_texts, train_labels,
    dev_texts, dev_labels,
    PARAM_SPACE,
    n_trials=5,
    seed=3,
    study_name="bert_cased_optimization",
)

# The call yields four values; the last one is not used in this cell.
(
    results_bert_cased,
    best_params_bert_cased,
    best_result_bert_cased,
    _,
) = _cased_search

# Summarise the winning trial: its parameters, dev metrics and checkpoint path.
print("\nBest parameters for BERT-cased:")
for key, value in best_params_bert_cased.items():
    print(f"  {key}: {value}")

print(f"Best ROC AUC: {best_result_bert_cased['roc_auc']:.4f}")
print(f"Best Avg Precision: {best_result_bert_cased['avg_precision']:.4f}")
print(f"Model saved at: {best_result_bert_cased['model_path']}")
