In [1]:
import os
import glob
import math
from transformers import get_cosine_schedule_with_warmup
import torch
from collections import Counter
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
)
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def clear_memory():
    """Release Python garbage and then the accelerator allocator caches.

    Garbage collection runs first: the MPS/CUDA caching allocators can only
    hand back blocks whose tensors have already been destroyed, so collecting
    afterwards would leave memory held by unreachable reference cycles cached.
    """
    gc.collect()
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [3]:
# --- Configuration ---
# Single source of run settings; the helper functions in later cells read this dict.
config = dict(
    data_dir="data",
    model_name="dbmdz/bert-base-turkish-cased",
    batch_size=4,
    num_epochs=15,
    max_length=412,
    num_folds=5,
    early_stopping_patience=3,
    output_dir="dbmdz/bert-base-turkish-cased-finetuned",
    # Learning-rate settings (min_learning_rate is not read by any code visible in this notebook)
    initial_learning_rate=3e-5,
    min_learning_rate=1e-5,
    warmup_ratio=0.2,
    # Weight-decay settings, consumed by get_weight_decay
    initial_weight_decay=0.01,
    final_weight_decay=0.001,
    weight_decay_schedule="linear",  # any other value selects the cosine branch
)

# Shared loss used by both the training and the evaluation loops.
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

# Output folders are keyed by a filesystem-safe version of the model name.
model_name = "_".join(config["model_name"].split("/"))
os.makedirs(os.path.join("fold_metrics", model_name), exist_ok=True)
os.makedirs(os.path.join("plots", model_name), exist_ok=True)

# --- Device Setup ---
# Preference order: Apple MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Module-level trackers; best_f1 is overwritten by cross_validate.
best_params = None
best_f1 = 0.0

Using device: mps


# --- Load and Preprocess Dataset ---

In [4]:
def load_dataset(data_dir):
    """Read every ``.txt`` file under ``data_dir/<author>/`` into memory.

    Args:
        data_dir: Directory containing one sub-directory per author.

    Returns:
        A tuple ``(texts, labels, author_to_label)`` where ``texts`` is a list
        of file contents, ``labels`` the matching integer class ids, and
        ``author_to_label`` maps each author directory name to its id. Ids are
        assigned in sorted directory-name order and are contiguous from 0.

    Files that cannot be read are reported on stdout and skipped.
    """
    # Only real sub-directories count as authors; a stray file next to them
    # must not consume a label index and leave a gap in the class ids.
    authors = sorted(
        entry
        for entry in os.listdir(data_dir)
        if not entry.startswith('.') and os.path.isdir(os.path.join(data_dir, entry))
    )
    author_to_label = {author: idx for idx, author in enumerate(authors)}

    texts, labels = [], []
    for author, label in author_to_label.items():
        author_dir = os.path.join(data_dir, author)
        # os.listdir order is filesystem-dependent; sorting keeps the sample
        # order (and therefore seeded cross-validation splits) reproducible.
        for file_name in sorted(os.listdir(author_dir)):
            if not file_name.endswith('.txt'):
                continue
            file_path = os.path.join(author_dir, file_name)
            try:
                # NOTE(review): ISO-8859-1 has no Turkish-specific letters; if the
                # corpus is really ISO-8859-9 or UTF-8 the text is mis-decoded — confirm.
                with open(file_path, 'r', encoding='ISO-8859-1') as file:
                    content = file.read()
            except (OSError, UnicodeError) as e:
                print(f"Error reading {file_path}: {e}")
                continue
            texts.append(content)
            labels.append(label)
    return texts, labels, author_to_label

# Load the corpus once at module level; `texts` and `labels` are later passed to
# cross_validate, and `author_to_label` records which integer id each author received.
texts, labels, author_to_label = load_dataset(config["data_dir"])
print(f"Loaded {len(texts)} samples from {config['data_dir']}.")
print(f"Authors: {author_to_label}")
# Per-label sample counts, as a quick class-balance check.
print(Counter(labels))


Loaded 1500 samples from data.
Authors: {'AHMET ÇAKAR': 0, 'ALİ SİRMEN': 1, 'ATAOL BEHRAMOĞLU': 2, 'ATİLLA DORSAY': 3, 'AYKAN SEVER': 4, 'AZİZ ÜSTEL': 5, 'CAN ATAKLI': 6, 'DENİZ GÖKÇE': 7, 'EMRE KONGAR': 8, 'GÖZDE BEDELOĞLU': 9, 'HASAN PULUR': 10, 'HİKMET ÇETİNKAYA': 11, 'MEHMET ALİ BİRAND': 12, 'MEHMET DEMİRKOL': 13, 'MELTEM GÜRLE': 14, 'MERYEM KORAY': 15, 'MÜMTAZ SOYSAL': 16, 'NAZAN BEKİROĞLU': 17, 'NAZIM ALPMAN': 18, 'NEDİM HAZAR': 19, 'NEŞE YAŞIN': 20, 'OKAY KARACAN': 21, 'ÖZGE BAŞAK TANELİ': 22, 'REHA MUHTAR': 23, 'RIDVAN DİLMEN': 24, 'RUHAT MENGİ': 25, 'SELİM İLERİ': 26, 'TARHAN ERDEM': 27, 'UFUK BOZKIR': 28, 'YAŞAR SEYMAN': 29}
Counter({0: 50, 1: 50, 2: 50, 3: 50, 4: 50, 5: 50, 6: 50, 7: 50, 8: 50, 9: 50, 10: 50, 11: 50, 12: 50, 13: 50, 14: 50, 15: 50, 16: 50, 17: 50, 18: 50, 19: 50, 20: 50, 21: 50, 22: 50, 23: 50, 24: 50, 25: 50, 26: 50, 27: 50, 28: 50, 29: 50})


# --- Data Augmentation ---

# --- Tokenization ---

In [5]:
# Tokenizer shared by every TextDataset built in train_and_evaluate.
tokenizer = AutoTokenizer.from_pretrained(config["model_name"])
# NOTE(review): this module-level `model` is not referenced by any other code
# visible in this notebook (cross_validate loads a fresh model per fold), so
# this load may be removable — confirm before deleting.
model = AutoModelForSequenceClassification.from_pretrained(
    config["model_name"],
    # Size the classification head from the loaded data instead of a literal
    # so it cannot drift from the number of author folders.
    num_labels=len(author_to_label),
    problem_type="single_label_classification"
)
#print(model.config)

class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on demand.

    Args:
        texts: Sequence of raw strings.
        labels: Sequence of integer class ids aligned with ``texts``.
        tokenizer: Callable accepting ``(text, max_length=..., truncation=...,
            padding=..., return_tensors=...)`` and returning a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors of shape
            ``(1, max_length)``.
        max_length: Fixed sequence length every item is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``.

        Tensors are returned on the CPU. The training and evaluation loops in
        this notebook already call ``.to(device)`` on each batch, and keeping
        device transfers out of the dataset lets it work with DataLoader
        worker processes.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        return {
            # squeeze(0) drops only the leading batch axis; a bare squeeze()
            # would also collapse the sequence axis when its length is 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def save_fold_metrics(true_labels, predictions, num_classes, fold):
    """Compute per-class precision/recall/F1 for one fold and write them to CSV.

    NOTE(review): a function with this same name is defined again in a later
    cell of this notebook; after a top-to-bottom run the later definition is
    the one in effect. Keep only one.

    Args:
        true_labels: Ground-truth class ids.
        predictions: Predicted class ids, aligned with ``true_labels``.
        num_classes: Total number of classes; ids are taken to be ``0..num_classes-1``.
        fold: Fold identifier used in the output file name.

    Returns:
        ``(precision_avg, recall_avg, f1_avg)``: unweighted means of the
        per-class scores.
    """
    model_name = config["model_name"].replace("/", "_")
    metrics_dir = os.path.join("fold_metrics", model_name)
    os.makedirs(metrics_dir, exist_ok=True)

    # Pin the label set so each score array always has num_classes entries.
    # Without it a class missing from both inputs is dropped and the
    # DataFrame below fails on mismatched column lengths.
    score_kwargs = dict(labels=np.arange(num_classes), average=None, zero_division=0)
    precision = precision_score(true_labels, predictions, **score_kwargs)
    recall = recall_score(true_labels, predictions, **score_kwargs)
    f1 = f1_score(true_labels, predictions, **score_kwargs)

    precision_avg = precision.mean()
    recall_avg = recall.mean()
    f1_avg = f1.mean()

    # One row per class plus a trailing 'Average' row.
    metrics_df = pd.DataFrame({
        'Class': [f'Class {i+1}' for i in range(num_classes)] + ['Average'],
        'Precision': list(precision) + [precision_avg],
        'Recall': list(recall) + [recall_avg],
        'F1-Score': list(f1) + [f1_avg]
    })

    output_path = os.path.join(metrics_dir, f"performance_metrics_fold_{fold}.csv")
    metrics_df.to_csv(output_path, index=False)
    print(f"Metrics for fold {fold} saved to {output_path}")

    return precision_avg, recall_avg, f1_avg

In [7]:
def generate_combined_plots(train_metrics, val_metrics, all_labels, all_preds, num_classes, fold):
    """Render a four-panel summary figure for one fold and save it as a PNG.

    Panels: loss curves, precision/recall/F1 curves, per-class score bars and
    the confusion matrix.

    Args:
        train_metrics: One dict per epoch with keys ``'loss'``, ``'precision'``,
            ``'recall'`` and ``'f1'``.
        val_metrics: Same structure as ``train_metrics``, one dict per epoch.
        all_labels: Ground-truth class ids for the per-class and confusion panels.
        all_preds: Predicted class ids aligned with ``all_labels``.
        num_classes: Number of classes; ids are taken to be ``0..num_classes-1``.
        fold: Fold identifier used in the output file name.
    """
    model_name = config["model_name"].replace("/", "_")
    plots_dir = os.path.join("plots", model_name)
    os.makedirs(plots_dir, exist_ok=True)
    output_path = os.path.join(plots_dir, f'combined_metrics_fold_{fold}.png')

    epochs = range(1, len(train_metrics) + 1)
    class_ids = np.arange(num_classes)
    class_names = [f'Class {i+1}' for i in range(num_classes)]

    fig = plt.figure(figsize=(20, 15))
    # Close this specific figure even when plotting or saving raises, so a
    # failing fold does not leave a large figure alive in the kernel.
    try:
        gs = fig.add_gridspec(3, 2)

        # 1. Training and Validation Loss
        ax1 = fig.add_subplot(gs[0, 0])
        ax1.plot(epochs, [m['loss'] for m in train_metrics], 'b-', label='Training Loss', marker='o')
        ax1.plot(epochs, [m['loss'] for m in val_metrics], 'r-', label='Validation Loss', marker='o')
        ax1.set_title('Loss Progress')
        ax1.set_xlabel('Epochs')
        ax1.set_ylabel('Loss')
        ax1.legend()
        ax1.grid(True)

        # 2. All Metrics Progress (solid = train, dashed = validation)
        ax2 = fig.add_subplot(gs[0, 1])
        ax2.plot(epochs, [m['precision'] for m in train_metrics], 'b-', label='Train Precision', marker='o')
        ax2.plot(epochs, [m['recall'] for m in train_metrics], 'g-', label='Train Recall', marker='o')
        ax2.plot(epochs, [m['f1'] for m in train_metrics], 'r-', label='Train F1', marker='o')
        ax2.plot(epochs, [m['precision'] for m in val_metrics], 'b--', label='Val Precision', marker='s')
        ax2.plot(epochs, [m['recall'] for m in val_metrics], 'g--', label='Val Recall', marker='s')
        ax2.plot(epochs, [m['f1'] for m in val_metrics], 'r--', label='Val F1', marker='s')
        ax2.set_title('Metrics Progress')
        ax2.set_xlabel('Epochs')
        ax2.set_ylabel('Score')
        ax2.legend()
        ax2.grid(True)

        # 3. Per-Class Performance; labels are pinned so every class gets a bar
        class_precision = precision_score(all_labels, all_preds, labels=class_ids, average=None, zero_division=0)
        class_recall = recall_score(all_labels, all_preds, labels=class_ids, average=None, zero_division=0)
        class_f1 = f1_score(all_labels, all_preds, labels=class_ids, average=None, zero_division=0)

        width = 0.25
        ax3 = fig.add_subplot(gs[1, :])
        ax3.bar(class_ids - width, class_precision, width, label='Precision', color='blue', alpha=0.7)
        ax3.bar(class_ids, class_recall, width, label='Recall', color='green', alpha=0.7)
        ax3.bar(class_ids + width, class_f1, width, label='F1-score', color='red', alpha=0.7)
        ax3.set_ylabel('Scores')
        ax3.set_title('Per-Class Performance')
        ax3.set_xticks(class_ids)
        ax3.set_xticklabels(class_names, rotation=45)
        ax3.legend()
        ax3.grid(True)

        # 4. Confusion Matrix
        ax4 = fig.add_subplot(gs[2, :])
        cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
        print(f"Confusion Matrix Shape: {cm.shape}")
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
        disp.plot(ax=ax4, cmap='Blues', xticks_rotation=45)
        ax4.set_title('Confusion Matrix')

        # Adjust layout and save
        fig.tight_layout()
        fig.savefig(output_path, dpi=300, bbox_inches='tight')
    finally:
        plt.close(fig)
    print(f"Plots saved to {output_path}")


In [8]:
def get_scheduler(optimizer, num_training_steps):
    """Build the warmup-then-cosine learning-rate schedule for ``optimizer``.

    The warmup length is ``config["warmup_ratio"]`` of ``num_training_steps``,
    truncated to an integer. The step counts and the configured initial
    learning rate are printed before the scheduler is returned.
    """
    warmup_steps = int(num_training_steps * config["warmup_ratio"])

    # Diagnostic output
    for message in (
        f"Total training steps: {num_training_steps}",
        f"Warmup steps: {warmup_steps}",
        f"Initial lr: {config['initial_learning_rate']}",
    ):
        print(message)

    schedule_kwargs = {
        "num_warmup_steps": warmup_steps,
        "num_training_steps": num_training_steps,
        "num_cycles": 0.5,
    }
    return get_cosine_schedule_with_warmup(optimizer, **schedule_kwargs)

def get_weight_decay(epoch, num_epochs):
    """Return the weight decay for ``epoch`` on the configured schedule.

    The value moves from ``config["initial_weight_decay"]`` at progress 0
    towards ``config["final_weight_decay"]`` at progress 1, where progress is
    ``epoch / num_epochs``. ``config["weight_decay_schedule"] == "linear"``
    interpolates linearly; any other value uses a half-cosine ramp.

    Args:
        epoch: Zero-based epoch index.
        num_epochs: Total number of epochs (must be non-zero).

    Returns:
        The weight decay as a float.
    """
    start = config["initial_weight_decay"]
    end = config["final_weight_decay"]
    # With a zero-based epoch index progress tops out at (num_epochs-1)/num_epochs,
    # so the final value is approached but never reached during training.
    progress = epoch / num_epochs

    if config["weight_decay_schedule"] == "linear":
        fraction = progress
    else:  # cosine
        # (1 - cos(pi * p)) / 2 rises from 0 at p=0 to 1 at p=1. The previous
        # (1 + cos) form ran the other way, returning the final value at the
        # start and the initial value at the end.
        fraction = (1 - math.cos(math.pi * progress)) / 2
    return start + (end - start) * fraction
    
def calculate_training_steps(train_dataloader, num_epochs):
    """Return the number of optimizer steps for a full training run.

    Computed as ``len(train_dataloader) * num_epochs`` floor-divided by the
    gradient-accumulation factor.

    The factor is read from ``config["gradient_accumulation_steps"]`` and
    defaults to 1 when the key is absent (the notebook's ``config`` does not
    define it, so an unconditional lookup raised ``KeyError``). Values below 1
    are clamped to 1 to avoid dividing by zero.
    """
    accumulation_steps = max(1, int(config.get("gradient_accumulation_steps", 1)))
    return len(train_dataloader) * num_epochs // accumulation_steps

In [9]:
def save_fold_metrics(true_labels, predictions, num_classes, fold):
    """Compute per-class precision/recall/F1 for one fold and write them to CSV.

    NOTE(review): this redefines a function of the same name from an earlier
    cell of this notebook and shadows it; this is the definition in effect
    after a top-to-bottom run. Keep only one.

    Args:
        true_labels: Ground-truth class ids.
        predictions: Predicted class ids, aligned with ``true_labels``.
        num_classes: Total number of classes; ids are taken to be ``0..num_classes-1``.
        fold: Fold identifier used in the output file name.

    Returns:
        ``(precision_avg, recall_avg, f1_avg)``: unweighted means of the
        per-class scores.
    """
    model_name = config["model_name"].replace("/", "_")
    metrics_dir = os.path.join("fold_metrics", model_name)
    os.makedirs(metrics_dir, exist_ok=True)

    # Pin the label set so each score array always has num_classes entries.
    # Without it a class missing from both inputs is dropped and the
    # DataFrame below fails on mismatched column lengths.
    score_kwargs = dict(labels=np.arange(num_classes), average=None, zero_division=0)
    precision = precision_score(true_labels, predictions, **score_kwargs)
    recall = recall_score(true_labels, predictions, **score_kwargs)
    f1 = f1_score(true_labels, predictions, **score_kwargs)

    precision_avg = precision.mean()
    recall_avg = recall.mean()
    f1_avg = f1.mean()

    # One row per class plus a trailing 'Average' row.
    metrics_df = pd.DataFrame({
        'Class': [f'Class {i+1}' for i in range(num_classes)] + ['Average'],
        'Precision': list(precision) + [precision_avg],
        'Recall': list(recall) + [recall_avg],
        'F1-Score': list(f1) + [f1_avg]
    })

    output_path = os.path.join(metrics_dir, f"performance_metrics_fold_{fold}.csv")
    metrics_df.to_csv(output_path, index=False)
    print(f"Metrics for fold {fold} saved to {output_path}")

    return precision_avg, recall_avg, f1_avg

# --- Training Loop ---

In [10]:
# Model configuration reused by cross_validate for every fold's fresh model.
# The checkpoint name and class count are derived from `config` and the loaded
# data rather than repeated as literals, so they cannot drift apart.
config_model = AutoConfig.from_pretrained(config["model_name"])
config_model.num_labels = len(author_to_label)
# Both dropout probabilities are set to 0.25 for fine-tuning.
config_model.hidden_dropout_prob = 0.25
config_model.attention_probs_dropout_prob = 0.25


def train_epoch(model, dataloader, optimizer, scheduler, device, epoch_num, num_epochs):
    """Train ``model`` for one pass over ``dataloader`` and return epoch metrics.

    Args:
        model: Model whose forward output exposes ``.logits``.
        dataloader: Yields dicts with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
        optimizer: Optimizer; every param group's ``weight_decay`` is overwritten
            with this epoch's scheduled value.
        scheduler: Learning-rate scheduler, stepped once per batch.
        device: Device the batch tensors are moved to.
        epoch_num: Zero-based epoch index, passed to ``get_weight_decay``.
        num_epochs: Total number of epochs, passed to ``get_weight_decay``.

    Returns:
        Dict with mean ``'loss'``, macro ``'precision'``/``'recall'``/``'f1'``
        over the epoch's predictions, the last ``'learning_rate'`` and the
        ``'weight_decay'`` that was applied.
    """
    model.train()
    total_loss = 0
    all_preds, all_labels = [], []
    progress_bar = tqdm(dataloader, desc="Training", unit="batch")

    # Update weight decay for this epoch
    current_weight_decay = get_weight_decay(epoch_num, num_epochs)
    for param_group in optimizer.param_groups:
        param_group['weight_decay'] = current_weight_decay

    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Labels are not passed to the model: the loss comes from the shared
        # label-smoothed `criterion`, so a model-internal loss would be
        # computed only to be thrown away.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = criterion(logits, labels)
        loss.backward()

        # Gradient clipping for stability
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()  # learning rate is updated per batch, not per epoch

        batch_loss = loss.item()
        total_loss += batch_loss

        # Collect predictions for metrics
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        current_lr = scheduler.get_last_lr()[0]
        progress_bar.set_postfix({
            'loss': f'{batch_loss:.4f}',
            'lr': f'{current_lr:.2e}',
            'wd': f'{current_weight_decay:.2e}'
        })

    # zero_division=0 keeps the score at 0 for classes that were never
    # predicted, without emitting a warning on every early epoch.
    precision = precision_score(all_labels, all_preds, average='macro', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='macro', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)

    return {
        'loss': total_loss / len(dataloader),
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'learning_rate': scheduler.get_last_lr()[0],
        'weight_decay': current_weight_decay
    }

def evaluate_epoch(model, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` without gradient tracking.

    Args:
        model: Model whose forward output exposes ``.logits``.
        dataloader: Yields dicts with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
        device: Device the batch tensors are moved to.

    Returns:
        Dict with mean ``'loss'``, ``'accuracy'``, macro ``'precision'``/
        ``'recall'``/``'f1'``, plus the raw ``'predictions'`` and
        ``'true_labels'`` lists in dataloader order.
    """
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # The loss comes from the shared `criterion`; labels are not
            # passed to the model so no unused internal loss is computed.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)
            total_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # zero_division=0 scores never-predicted classes as 0 without a warning.
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='macro', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='macro', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)

    return {
        'loss': total_loss / len(dataloader),
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'predictions': all_preds,
        'true_labels': all_labels
    }

def train_and_evaluate(model, train_data, train_labels, val_data, val_labels, fold):
    """Fine-tune ``model`` on one fold with early stopping and report the best epoch.

    For each epoch the model is trained and then validated. Whenever the
    validation macro-F1 improves, a checkpoint (model, optimizer and scheduler
    state plus ``config``) is written to
    ``config['output_dir']/best_model_fold_{fold}.pt``. Training stops after
    ``config['early_stopping_patience']`` consecutive epochs without
    improvement. Plots and a per-class CSV are produced from the best epoch.

    Args:
        model: Sequence-classification model to train.
        train_data: Training texts.
        train_labels: Training class ids aligned with ``train_data``.
        val_data: Validation texts.
        val_labels: Validation class ids aligned with ``val_data``.
        fold: Fold identifier used in log messages and file names.

    Returns:
        The validation-metrics dict (as returned by ``evaluate_epoch``) of the
        best epoch.

    Raises:
        RuntimeError: If no epoch was run (``config['num_epochs']`` is 0).
    """
    # Create datasets and dataloaders
    model = model.to(device)
    clear_memory()
    train_dataset = TextDataset(train_data, train_labels, tokenizer, config["max_length"])
    val_dataset = TextDataset(val_data, val_labels, tokenizer, config["max_length"])

    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config['batch_size'])

    # The scheduler is stepped once per batch, so its horizon is batches * epochs.
    num_training_steps = len(train_loader) * config['num_epochs']
    num_warmup_steps = int(num_training_steps * config['warmup_ratio'])

    optimizer = optim.AdamW(
        model.parameters(),
        lr=config['initial_learning_rate'],
        weight_decay=config['initial_weight_decay'],
        betas=(0.9, 0.999),
        eps=1e-8
    )

    # Learning rate scheduler with warmup and cosine decay
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    best_val_f1 = 0
    # Starts as None so the first evaluated epoch always becomes the baseline;
    # otherwise a run whose F1 never rose above 0 left this name unbound.
    best_metrics = None
    early_stopping_counter = 0
    train_metrics_history = []
    val_metrics_history = []

    print(f"\n{'='*50}")
    print(f"Training Fold {fold}")
    print(f"{'='*50}")
    print(f"Total steps: {num_training_steps}, Warmup steps: {num_warmup_steps}")

    for epoch in range(config['num_epochs']):
        print(f"\nEpoch {epoch + 1}/{config['num_epochs']}")
        print("-" * 30)

        # Training
        train_metrics = train_epoch(
            model, train_loader, optimizer, scheduler, device,
            epoch, config['num_epochs']
        )
        train_metrics_history.append(train_metrics)
        clear_memory()

        # Validation
        val_metrics = evaluate_epoch(model, val_loader, device)
        val_metrics_history.append(val_metrics)
        clear_memory()

        # Print metrics
        print(f"\nTraining Metrics:")
        print(f"Loss: {train_metrics['loss']:.4f}")
        print(f"Precision: {train_metrics['precision']:.4f}")
        print(f"Recall: {train_metrics['recall']:.4f}")
        print(f"F1-Score: {train_metrics['f1']:.4f}")
        print(f"Learning Rate: {train_metrics['learning_rate']:.2e}")
        print(f"Weight Decay: {train_metrics['weight_decay']:.2e}")

        print(f"\nValidation Metrics:")
        print(f"Loss: {val_metrics['loss']:.4f}")
        print(f"Accuracy: {val_metrics['accuracy']:.4f}")
        print(f"Precision: {val_metrics['precision']:.4f}")
        print(f"Recall: {val_metrics['recall']:.4f}")
        print(f"F1-Score: {val_metrics['f1']:.4f}")

        # Early stopping check
        if best_metrics is None or val_metrics['f1'] > best_val_f1:
            best_val_f1 = val_metrics['f1']
            early_stopping_counter = 0
            best_metrics = val_metrics

            # Save the best model for this fold
            os.makedirs(config['output_dir'], exist_ok=True)
            model_save_path = os.path.join(config['output_dir'], f'best_model_fold_{fold}.pt')
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'best_f1': best_val_f1,
                'config': config
            }, model_save_path)
            print(f"\nSaved best model for fold {fold} with F1: {best_val_f1:.4f}")
        else:
            early_stopping_counter += 1

        if early_stopping_counter >= config['early_stopping_patience']:
            print("\nEarly stopping triggered")
            break

    if best_metrics is None:
        raise RuntimeError("No epochs were run; check config['num_epochs'].")

    # Take the class count from the model's own config when it has one, and
    # fall back to the notebook's previous fixed value of 30 otherwise.
    num_classes = getattr(getattr(model, "config", None), "num_labels", 30)

    # Generate visualizations from the best epoch's validation predictions
    generate_combined_plots(
        train_metrics_history,
        val_metrics_history,
        best_metrics['true_labels'],
        best_metrics['predictions'],
        num_classes,
        fold
    )

    # Save fold metrics (the returned averages are not needed here)
    save_fold_metrics(
        best_metrics['true_labels'],
        best_metrics['predictions'],
        num_classes,
        fold
    )

    return best_metrics

def cross_validate(texts, labels):
    """Run stratified k-fold training and return the fold-averaged metrics.

    A fresh model is loaded for each of ``config['num_folds']`` folds and
    handed to ``train_and_evaluate``. Per-fold results are written to
    ``fold_results.csv`` and the module-level ``best_f1`` is set to the mean F1.

    Returns:
        Dict with the mean ``'loss'``, ``'accuracy'``, ``'precision'``,
        ``'recall'`` and ``'f1'`` across folds.
    """
    global best_f1

    # (printed label, metrics key) pairs, in reporting order
    report_rows = (
        ("Loss", "loss"),
        ("Accuracy", "accuracy"),
        ("Precision", "precision"),
        ("Recall", "recall"),
        ("F1-Score", "f1"),
    )

    splitter = StratifiedKFold(n_splits=config['num_folds'], shuffle=True, random_state=42)
    fold_metrics = []

    for fold_number, (train_idx, val_idx) in enumerate(splitter.split(texts, labels), start=1):
        print(f"\nFold {fold_number}/{config['num_folds']}")
        print(f"Train size: {len(train_idx)}, Validation size: {len(val_idx)}")
        clear_memory()

        # Every fold starts again from the pretrained checkpoint
        model = AutoModelForSequenceClassification.from_pretrained(
            config["model_name"],
            config=config_model
        ).to(device)

        # Materialise the fold's split and train on it
        metrics = train_and_evaluate(
            model,
            [texts[i] for i in train_idx],
            [labels[i] for i in train_idx],
            [texts[i] for i in val_idx],
            [labels[i] for i in val_idx],
            fold_number,
        )

        record = {'fold': fold_number}
        for _, key in report_rows:
            record[key] = metrics[key]
        fold_metrics.append(record)

        print(f"\nFold {fold_number} Results:")
        for title, key in report_rows:
            print(f"{title}: {metrics[key]:.4f}")

        del model
        clear_memory()

    # Mean of each metric over the folds
    avg_metrics = {
        key: np.mean([record[key] for record in fold_metrics])
        for _, key in report_rows
    }

    print('\nAverage metrics across folds:')
    for title, key in report_rows:
        print(f"{title}: {avg_metrics[key]:.4f}")

    # Persist the per-fold table
    pd.DataFrame(fold_metrics).to_csv('fold_results.csv', index=False)
    print("\nSaved detailed fold results to 'fold_results.csv'")

    best_f1 = avg_metrics['f1']

    return avg_metrics

# --- Generate Overall Results ---

In [11]:
def generate_overall_results(fold_metrics_dir, output_filename="overall_performance_metrics.csv"):
    """Average the per-fold metric CSVs into one per-class table and save it.

    Reads every ``performance_metrics_fold_*.csv`` in ``fold_metrics_dir``,
    drops each file's "Average" row, averages the remaining rows per class,
    replaces the class label with the number it contains, sorts by that
    number and appends a new "Average" row holding the column means.

    Args:
        fold_metrics_dir: Directory holding the per-fold CSV files.
        output_filename: Path the aggregated table is written to.

    Returns:
        The aggregated DataFrame, including the trailing "Average" row.

    Raises:
        FileNotFoundError: If no matching CSV file exists in the directory.
    """
    csv_paths = glob.glob(os.path.join(fold_metrics_dir, "performance_metrics_fold_*.csv"))
    if len(csv_paths) == 0:
        raise FileNotFoundError("No fold-level performance metrics files found in the directory.")

    # Stack all folds into one frame
    stacked = pd.concat([pd.read_csv(path) for path in csv_paths])

    # Per-class means, ignoring the per-fold "Average" rows
    is_average_row = stacked["Class"].str.contains("Average")
    per_class = stacked[~is_average_row].groupby("Class").mean().reset_index()

    # Order numerically by the class number embedded in the label
    per_class["Class"] = per_class["Class"].str.extract(r'(\d+)', expand=False).astype(int)
    per_class = per_class.sort_values(by="Class").reset_index(drop=True)

    # Trailing summary row: mean of each metric column over the classes
    summary = {"Class": ["Average"]}
    for column in ("Precision", "Recall", "F1-Score"):
        summary[column] = [per_class[column].mean()]
    aggregated_metrics = pd.concat([per_class, pd.DataFrame(summary)], ignore_index=True)

    aggregated_metrics.to_csv(output_filename, index=False)
    print(f"Overall performance metrics saved to '{output_filename}'.")

    return aggregated_metrics


In [12]:

def display_as_dataframe(aggregated_metrics):
    """
    Render the overall performance table as a styled, captioned DataFrame.

    The "Class" column header is blanked and "F1-Score" becomes "F-Score";
    header and body cells are centred and headers are bold.

    Args:
        aggregated_metrics (pd.DataFrame): DataFrame containing overall performance metrics.
    """
    header_style = {'selector': 'th', 'props': [('text-align', 'center'), ('font-weight', 'bold')]}
    cell_style = {'selector': 'td', 'props': [('text-align', 'center')]}

    # Column headers for presentation
    column_titles = {"Class": " ", "Precision": "Precision", "Recall": "Recall", "F1-Score": "F-Score"}
    renamed = aggregated_metrics.rename(columns=column_titles)

    styled = renamed.style.set_table_styles([header_style, cell_style])
    styled = styled.set_caption("Overall Performance Metrics")
    display(styled)


# --- Execute Training ---

In [13]:
# --- Execute Cross-Validation ---
# No hyperparameter search happens here: cross_validate runs a single stratified
# k-fold pass with the fixed `config`. Despite its name, `best_metrics` receives
# the dict of fold-averaged metrics that cross_validate returns.
best_metrics = cross_validate(texts, labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Fold 1/5
Train size: 1200, Validation size: 300

Training Fold 1
Total steps: 4500, Warmup steps: 900

Epoch 1/15
------------------------------


Training: 100%|██████████| 300/300 [02:43<00:00,  1.83batch/s, loss=3.2635, lr=1.00e-05, wd=1.00e-02]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Training Metrics:
Loss: 3.4715
Precision: 0.0442
Recall: 0.0425
F1-Score: 0.0315
Learning Rate: 1.00e-05
Weight Decay: 1.00e-02

Validation Metrics:
Loss: 3.3519
Accuracy: 0.1233
Precision: 0.1031
Recall: 0.1233
F1-Score: 0.0825

Saved best model for fold 1 with F1: 0.0825

Epoch 2/15
------------------------------


Training: 100%|██████████| 300/300 [02:41<00:00,  1.85batch/s, loss=2.8745, lr=2.00e-05, wd=9.40e-03]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Training Metrics:
Loss: 3.1233
Precision: 0.1710
Recall: 0.1500
F1-Score: 0.1402
Learning Rate: 2.00e-05
Weight Decay: 9.40e-03

Validation Metrics:
Loss: 2.4431
Accuracy: 0.3967
Precision: 0.4638
Recall: 0.3967
F1-Score: 0.3537

Saved best model for fold 1 with F1: 0.3537

Epoch 3/15
------------------------------


Training: 100%|██████████| 300/300 [02:43<00:00,  1.84batch/s, loss=1.7473, lr=3.00e-05, wd=8.80e-03]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Training Metrics:
Loss: 2.0597
Precision: 0.5544
Recall: 0.5200
F1-Score: 0.5110
Learning Rate: 3.00e-05
Weight Decay: 8.80e-03

Validation Metrics:
Loss: 1.5787
Accuracy: 0.7133
Precision: 0.7116
Recall: 0.7133
F1-Score: 0.6782

Saved best model for fold 1 with F1: 0.6782

Epoch 4/15
------------------------------


Training: 100%|██████████| 300/300 [02:41<00:00,  1.85batch/s, loss=1.2476, lr=2.95e-05, wd=8.20e-03]



Training Metrics:
Loss: 1.2769
Precision: 0.7877
Recall: 0.7942
F1-Score: 0.7873
Learning Rate: 2.95e-05
Weight Decay: 8.20e-03

Validation Metrics:
Loss: 1.0771
Accuracy: 0.8500
Precision: 0.8520
Recall: 0.8500
F1-Score: 0.8406

Saved best model for fold 1 with F1: 0.8406

Epoch 5/15
------------------------------


Training: 100%|██████████| 300/300 [02:41<00:00,  1.85batch/s, loss=0.8099, lr=2.80e-05, wd=7.60e-03]



Training Metrics:
Loss: 0.9037
Precision: 0.9052
Recall: 0.9058
F1-Score: 0.9046
Learning Rate: 2.80e-05
Weight Decay: 7.60e-03

Validation Metrics:
Loss: 0.9722
Accuracy: 0.8567
Precision: 0.8910
Recall: 0.8567
F1-Score: 0.8456

Saved best model for fold 1 with F1: 0.8456

Epoch 6/15
------------------------------


Training: 100%|██████████| 300/300 [02:42<00:00,  1.85batch/s, loss=0.6566, lr=2.56e-05, wd=7.00e-03]



Training Metrics:
Loss: 0.7368
Precision: 0.9785
Recall: 0.9783
F1-Score: 0.9782
Learning Rate: 2.56e-05
Weight Decay: 7.00e-03

Validation Metrics:
Loss: 0.9720
Accuracy: 0.9000
Precision: 0.9098
Recall: 0.9000
F1-Score: 0.8987

Saved best model for fold 1 with F1: 0.8987

Epoch 7/15
------------------------------


Training: 100%|██████████| 300/300 [02:41<00:00,  1.85batch/s, loss=0.6586, lr=2.25e-05, wd=6.40e-03]



Training Metrics:
Loss: 0.6888
Precision: 0.9901
Recall: 0.9900
F1-Score: 0.9900
Learning Rate: 2.25e-05
Weight Decay: 6.40e-03

Validation Metrics:
Loss: 0.9071
Accuracy: 0.9267
Precision: 0.9319
Recall: 0.9267
F1-Score: 0.9258

Saved best model for fold 1 with F1: 0.9258

Epoch 8/15
------------------------------


Training:  44%|████▍     | 133/300 [01:12<01:30,  1.84batch/s, loss=0.6558, lr=2.09e-05, wd=5.80e-03]


KeyboardInterrupt: 

In [None]:
# --- Save Overall Results ---
# `best_metrics` is the dict returned by cross_validate in the cell above;
# it is written out as a one-row table with capitalised column names.
results_df = pd.DataFrame(
    [
        {
            column: best_metrics[key]
            for column, key in (
                ('Loss', 'loss'),
                ('Accuracy', 'accuracy'),
                ('Precision', 'precision'),
                ('Recall', 'recall'),
                ('F1-Score', 'f1'),
            )
        }
    ]
)
results_df.to_csv("best_metrics.csv", index=False)
print("Results saved to 'best_metrics.csv'")

In [None]:
# Generate and display the final performance table as required.
# The metrics directory is derived from the configured model name, the same way
# save_fold_metrics builds it, so this cell keeps working if the model changes.
overall_results = generate_overall_results(
    os.path.join("fold_metrics", config["model_name"].replace("/", "_")),
    output_filename="overall_performance_metrics.csv",
)
display_as_dataframe(overall_results)
