In [1]:
import os
import glob
import math
from transformers import get_cosine_schedule_with_warmup
import torch
from collections import Counter
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
)
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def clear_memory():
    """Drop unreachable Python objects, then release cached accelerator memory.

    Called between folds/epochs to keep the accelerator allocator from
    holding on to blocks that are no longer needed.

    Returns:
        None
    """
    # Collect first: tensors that are only kept alive by reference cycles must
    # be destroyed before their memory can be handed back by empty_cache().
    gc.collect()
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [3]:
# --- Configuration ---
# Every tunable value used by the training cells lives in this dict.
config = {
    "data_dir": "data",
    "model_name": "dbmdz/bert-base-turkish-cased",
    "batch_size": 4,
    "num_epochs": 10,
    "max_length": 512,
    "num_folds": 5,
    "early_stopping_patience": 3,
    "output_dir": "dbmdz/bert-base-turkish-cased-finetuned",
    # Learning rate parameters
    "initial_learning_rate": 3e-5,
    # NOTE: not read by any code visible in this notebook; the cosine schedule
    # used for training decays the learning rate to 0.
    "min_learning_rate": 1e-5,
    "warmup_ratio": 0.2,
    # Weight decay parameters
    "initial_weight_decay": 0.01,
    "final_weight_decay": 0.001,
    "weight_decay_schedule": "linear",  # Use linear decay
    # Seed for the NumPy / PyTorch random number generators
    "seed": 42,
}

# Loss used by both the training and the evaluation loops.
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

# Create necessary directories
model_name = config["model_name"].replace("/", "_")
os.makedirs(os.path.join("fold_metrics", model_name), exist_ok=True)
os.makedirs(os.path.join("plots", model_name), exist_ok=True)

# --- Device Setup ---
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- Reproducibility ---
# Seed once, up front, so that classifier-head initialisation, DataLoader
# shuffling and dropout are repeatable under Restart & Run All.
np.random.seed(config["seed"])
torch.manual_seed(config["seed"])

# Initialize best metrics
best_params = None
best_f1 = 0.0

Using device: mps


# --- Load and Preprocess Dataset ---

In [4]:
def load_dataset(data_dir, encoding='ISO-8859-1'):
    """Read a corpus laid out as ``data_dir/<author>/<file>.txt``.

    Each non-hidden sub-directory of ``data_dir`` is one class. Class ids are
    assigned in sorted directory-name order, and files inside each directory
    are read in sorted order, so sample order is stable across runs and
    file systems.

    Args:
        data_dir: Root directory containing one sub-directory per author.
        encoding: Text encoding used to decode the files. The default is the
            value this notebook has always used. ISO-8859-1 decodes any byte
            sequence without raising, so a wrong choice shows up as garbled
            characters rather than as an error.
            NOTE(review): Turkish text is often stored as cp1254 /
            ISO-8859-9 or UTF-8 -- confirm against the actual files.

    Returns:
        A ``(texts, labels, author_to_label)`` tuple: the file contents, the
        matching integer class ids, and the author-name -> class-id mapping.
    """
    # Only real directories become classes. Enumerating stray files as well
    # would leave gaps in the label ids (a class with no samples).
    authors = sorted(
        name for name in os.listdir(data_dir)
        if not name.startswith('.') and os.path.isdir(os.path.join(data_dir, name))
    )
    author_to_label = {author: idx for idx, author in enumerate(authors)}

    texts, labels = [], []
    for author, label in author_to_label.items():
        author_dir = os.path.join(data_dir, author)
        # os.listdir order is arbitrary; sort so fold indices map to the same files every run.
        for file_name in sorted(os.listdir(author_dir)):
            if not file_name.endswith('.txt'):
                continue
            file_path = os.path.join(author_dir, file_name)
            try:
                with open(file_path, 'r', encoding=encoding) as file:
                    text = file.read()
            except (OSError, UnicodeError) as e:
                # Best effort: report the unreadable file and keep loading the rest.
                print(f"Error reading {file_path}: {e}")
                continue
            texts.append(text)
            labels.append(label)
    return texts, labels, author_to_label

# Load dataset
# `texts`, `labels` and `author_to_label` are module-level names that the
# later cells (tokenization, cross-validation) read directly.
texts, labels, author_to_label = load_dataset(config["data_dir"])
print(f"Loaded {len(texts)} samples from {config['data_dir']}.")
print(f"Authors: {author_to_label}")
# Per-class sample counts, as a quick check of class balance.
print(Counter(labels))


Loaded 1500 samples from data.
Authors: {'AHMET ÇAKAR': 0, 'ALİ SİRMEN': 1, 'ATAOL BEHRAMOĞLU': 2, 'ATİLLA DORSAY': 3, 'AYKAN SEVER': 4, 'AZİZ ÜSTEL': 5, 'CAN ATAKLI': 6, 'DENİZ GÖKÇE': 7, 'EMRE KONGAR': 8, 'GÖZDE BEDELOĞLU': 9, 'HASAN PULUR': 10, 'HİKMET ÇETİNKAYA': 11, 'MEHMET ALİ BİRAND': 12, 'MEHMET DEMİRKOL': 13, 'MELTEM GÜRLE': 14, 'MERYEM KORAY': 15, 'MÜMTAZ SOYSAL': 16, 'NAZAN BEKİROĞLU': 17, 'NAZIM ALPMAN': 18, 'NEDİM HAZAR': 19, 'NEŞE YAŞIN': 20, 'OKAY KARACAN': 21, 'ÖZGE BAŞAK TANELİ': 22, 'REHA MUHTAR': 23, 'RIDVAN DİLMEN': 24, 'RUHAT MENGİ': 25, 'SELİM İLERİ': 26, 'TARHAN ERDEM': 27, 'UFUK BOZKIR': 28, 'YAŞAR SEYMAN': 29}
Counter({0: 50, 1: 50, 2: 50, 3: 50, 4: 50, 5: 50, 6: 50, 7: 50, 8: 50, 9: 50, 10: 50, 11: 50, 12: 50, 13: 50, 14: 50, 15: 50, 16: 50, 17: 50, 18: 50, 19: 50, 20: 50, 21: 50, 22: 50, 23: 50, 24: 50, 25: 50, 26: 50, 27: 50, 28: 50, 29: 50})


# --- Data Augmentation (none applied in this notebook) ---

# --- Tokenization ---

In [5]:
# Tokenizer shared by every TextDataset built in this notebook.
tokenizer = AutoTokenizer.from_pretrained(config["model_name"])

# NOTE: cross_validate() loads a fresh model for every fold; this
# module-level instance is not referenced by the training code visible in
# this notebook.
model = AutoModelForSequenceClassification.from_pretrained(
    config["model_name"],
    # One output per author directory found by load_dataset().
    num_labels=len(author_to_label),
    problem_type="single_label_classification"
)

class TextDataset(Dataset):
    """Map-style dataset that tokenizes one raw text per item.

    Args:
        texts: Sequence of raw strings.
        labels: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            truncation=True, padding="max_length", return_tensors="pt")``
            that returns a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors carrying a leading batch axis of 1.
        max_length: Length every sequence is truncated / padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of tensors.

        Tensors are returned on the device the tokenizer produced them on;
        the training and evaluation loops move each batch to the target
        device themselves. Keeping device placement out of the dataset also
        keeps it usable from DataLoader worker processes.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        return {
            # squeeze(0) removes only the batch axis; a bare squeeze() would
            # also collapse the sequence axis when max_length == 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def save_fold_metrics(true_labels, predictions, num_classes, fold):
    """Compute per-class precision/recall/F1 for one fold and write them to CSV.

    NOTE: a function with this same name is defined again in a later cell of
    this notebook; when cells are run in order, that later definition is the
    one in effect.

    Args:
        true_labels: Ground-truth class ids.
        predictions: Predicted class ids, aligned with ``true_labels``.
        num_classes: Total number of classes; ids are taken to be
            ``0 .. num_classes - 1``.
        fold: Fold identifier, used in the output file name.

    Returns:
        ``(precision_avg, recall_avg, f1_avg)``: unweighted means of the
        per-class scores.
    """
    # Model-specific output directory (created here if it does not exist yet)
    model_name = config["model_name"].replace("/", "_")
    metrics_dir = os.path.join("fold_metrics", model_name)
    os.makedirs(metrics_dir, exist_ok=True)

    # Pin the label set so that exactly `num_classes` scores come back even
    # when a class appears in neither the targets nor the predictions of this
    # fold; without it the arrays can be shorter than the 'Class' column and
    # the DataFrame construction fails.
    score_kwargs = dict(labels=np.arange(num_classes), average=None, zero_division=0)
    precision = precision_score(true_labels, predictions, **score_kwargs)
    recall = recall_score(true_labels, predictions, **score_kwargs)
    f1 = f1_score(true_labels, predictions, **score_kwargs)

    # Calculate averages
    precision_avg = precision.mean()
    recall_avg = recall.mean()
    f1_avg = f1.mean()

    # One row per class plus a trailing 'Average' row
    metrics_df = pd.DataFrame({
        'Class': [f'Class {i+1}' for i in range(num_classes)] + ['Average'],
        'Precision': list(precision) + [precision_avg],
        'Recall': list(recall) + [recall_avg],
        'F1-Score': list(f1) + [f1_avg]
    })

    # Save to CSV
    output_path = os.path.join(metrics_dir, f"performance_metrics_fold_{fold}.csv")
    metrics_df.to_csv(output_path, index=False)
    print(f"Metrics for fold {fold} saved to {output_path}")

    return precision_avg, recall_avg, f1_avg

In [7]:
def generate_combined_plots(train_metrics, val_metrics, all_labels, all_preds, num_classes, fold):
    """Render a four-panel training summary for one fold and save it as PNG.

    Panels: loss curves, precision/recall/F1 curves, per-class scores, and
    the confusion matrix.

    Args:
        train_metrics: One dict per epoch with keys ``'loss'``,
            ``'precision'``, ``'recall'`` and ``'f1'``.
        val_metrics: Same structure, for the validation set.
        all_labels: Ground-truth class ids used for the per-class panels.
        all_preds: Predicted class ids, aligned with ``all_labels``.
        num_classes: Total number of classes; ids are taken to be
            ``0 .. num_classes - 1``.
        fold: Fold identifier, used in the output file name.

    Returns:
        None. The figure is written to
        ``plots/<model_name>/combined_metrics_fold_<fold>.png``.
    """
    # Plots directory (created here if it does not exist yet)
    model_name = config["model_name"].replace("/", "_")
    plots_dir = os.path.join("plots", model_name)
    os.makedirs(plots_dir, exist_ok=True)
    plot_path = os.path.join(plots_dir, f'combined_metrics_fold_{fold}.png')

    def history(metrics, key):
        """Per-epoch values of one metric."""
        return [m[key] for m in metrics]

    epochs = range(1, len(train_metrics) + 1)
    class_ids = np.arange(num_classes)
    class_names = [f'Class {i+1}' for i in range(num_classes)]

    # Create figure with subplots
    fig = plt.figure(figsize=(20, 15))
    gs = fig.add_gridspec(3, 2)

    # 1. Training and Validation Loss
    ax1 = fig.add_subplot(gs[0, 0])
    ax1.plot(epochs, history(train_metrics, 'loss'), 'b-', label='Training Loss', marker='o')
    ax1.plot(epochs, history(val_metrics, 'loss'), 'r-', label='Validation Loss', marker='o')
    ax1.set_title('Loss Progress')
    ax1.set_xlabel('Epochs')
    ax1.set_ylabel('Loss')
    ax1.legend()
    ax1.grid(True)

    # 2. All Metrics Progress
    ax2 = fig.add_subplot(gs[0, 1])
    ax2.plot(epochs, history(train_metrics, 'precision'), 'b-', label='Train Precision', marker='o')
    ax2.plot(epochs, history(train_metrics, 'recall'), 'g-', label='Train Recall', marker='o')
    ax2.plot(epochs, history(train_metrics, 'f1'), 'r-', label='Train F1', marker='o')
    ax2.plot(epochs, history(val_metrics, 'precision'), 'b--', label='Val Precision', marker='s')
    ax2.plot(epochs, history(val_metrics, 'recall'), 'g--', label='Val Recall', marker='s')
    ax2.plot(epochs, history(val_metrics, 'f1'), 'r--', label='Val F1', marker='s')
    ax2.set_title('Metrics Progress')
    ax2.set_xlabel('Epochs')
    ax2.set_ylabel('Score')
    ax2.legend()
    ax2.grid(True)

    # 3. Per-Class Performance
    # labels= pins the score arrays to num_classes entries so they line up
    # with the bar positions even if a class is missing from this fold.
    class_precision = precision_score(all_labels, all_preds, labels=class_ids, average=None, zero_division=0)
    class_recall = recall_score(all_labels, all_preds, labels=class_ids, average=None, zero_division=0)
    class_f1 = f1_score(all_labels, all_preds, labels=class_ids, average=None, zero_division=0)

    width = 0.25

    ax3 = fig.add_subplot(gs[1, :])
    ax3.bar(class_ids - width, class_precision, width, label='Precision', color='blue', alpha=0.7)
    ax3.bar(class_ids, class_recall, width, label='Recall', color='green', alpha=0.7)
    ax3.bar(class_ids + width, class_f1, width, label='F1-score', color='red', alpha=0.7)

    ax3.set_ylabel('Scores')
    ax3.set_title('Per-Class Performance')
    ax3.set_xticks(class_ids)
    ax3.set_xticklabels(class_names, rotation=45)
    ax3.legend()
    ax3.grid(True)

    # 4. Confusion Matrix
    ax4 = fig.add_subplot(gs[2, :])
    cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
    print(f"Confusion Matrix Shape: {cm.shape}")
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(ax=ax4, cmap='Blues', xticks_rotation=45)
    ax4.set_title('Confusion Matrix')

    # Adjust layout and save
    fig.tight_layout()
    fig.savefig(plot_path, dpi=300, bbox_inches='tight')
    # Close this specific figure so repeated calls (one per fold) do not accumulate open figures.
    plt.close(fig)
    print(f"Plots saved to {plot_path}")


In [8]:
def get_scheduler(optimizer, num_training_steps):
    """Build a warmup-then-cosine learning-rate schedule for ``optimizer``.

    The warmup length is ``config["warmup_ratio"]`` of ``num_training_steps``,
    truncated to an integer. The resolved step counts and the configured
    initial learning rate are printed for debugging.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        num_training_steps: Total number of scheduler steps.

    Returns:
        Whatever ``get_cosine_schedule_with_warmup`` returns.
    """
    warmup_steps = int(num_training_steps * config["warmup_ratio"])

    # Debug output
    for line in (
        f"Total training steps: {num_training_steps}",
        f"Warmup steps: {warmup_steps}",
        f"Initial lr: {config['initial_learning_rate']}",
    ):
        print(line)

    schedule_kwargs = {
        "num_warmup_steps": warmup_steps,
        "num_training_steps": num_training_steps,
        "num_cycles": 0.5,
    }
    return get_cosine_schedule_with_warmup(optimizer, **schedule_kwargs)

def get_weight_decay(epoch, num_epochs):
    """Interpolate the weight decay between its configured initial and final values.

    ``progress = epoch / num_epochs`` is mapped to a blend factor in [0, 1]:

    * ``"linear"`` schedule: the factor is ``progress`` itself.
    * any other value (treated as cosine): the factor is
      ``(1 - cos(pi * progress)) / 2``, a half-cosine ramp from 0 to 1.

    In both cases progress 0 yields ``config["initial_weight_decay"]`` and
    progress 1 yields ``config["final_weight_decay"]``.

    Note: the training loop in this notebook passes a 0-based epoch index, so
    the largest progress reached is ``(num_epochs - 1) / num_epochs`` and the
    final value is approached but not hit exactly.

    Args:
        epoch: Current epoch index.
        num_epochs: Total number of epochs (must be non-zero).

    Returns:
        The weight decay to use for ``epoch``.
    """
    initial = config["initial_weight_decay"]
    final = config["final_weight_decay"]
    progress = epoch / num_epochs

    if config["weight_decay_schedule"] == "linear":
        blend = progress
    else:  # cosine
        # (1 - cos) rises from 0 to 1. The previous (1 + cos) form ran the
        # schedule backwards, starting at the final value.
        blend = (1 - math.cos(math.pi * progress)) / 2
    return initial + (final - initial) * blend
    
def calculate_training_steps(train_dataloader, num_epochs):
    """Return the number of optimizer steps for a full training run.

    Computed as ``len(train_dataloader) * num_epochs`` floor-divided by the
    gradient-accumulation factor.

    Args:
        train_dataloader: Any sized object; its length is the number of
            batches per epoch.
        num_epochs: Number of epochs.

    Returns:
        Total step count as an int.
    """
    # The config dict in this notebook does not define
    # "gradient_accumulation_steps"; default to 1 (step on every batch)
    # instead of raising KeyError.
    accumulation_steps = config.get("gradient_accumulation_steps", 1)
    return len(train_dataloader) * num_epochs // accumulation_steps

In [9]:
def save_fold_metrics(true_labels, predictions, num_classes, fold):
    """Compute per-class precision/recall/F1 for one fold and write them to CSV.

    NOTE: this re-defines a function of the same name from an earlier cell of
    this notebook; when cells are run in order, this definition is the one in
    effect.

    Args:
        true_labels: Ground-truth class ids.
        predictions: Predicted class ids, aligned with ``true_labels``.
        num_classes: Total number of classes; ids are taken to be
            ``0 .. num_classes - 1``.
        fold: Fold identifier, used in the output file name.

    Returns:
        ``(precision_avg, recall_avg, f1_avg)``: unweighted means of the
        per-class scores.
    """
    # Model-specific output directory (created here if it does not exist yet)
    model_name = config["model_name"].replace("/", "_")
    metrics_dir = os.path.join("fold_metrics", model_name)
    os.makedirs(metrics_dir, exist_ok=True)

    # Pin the label set so that exactly `num_classes` scores come back even
    # when a class appears in neither the targets nor the predictions of this
    # fold; without it the arrays can be shorter than the 'Class' column and
    # the DataFrame construction fails.
    score_kwargs = dict(labels=np.arange(num_classes), average=None, zero_division=0)
    precision = precision_score(true_labels, predictions, **score_kwargs)
    recall = recall_score(true_labels, predictions, **score_kwargs)
    f1 = f1_score(true_labels, predictions, **score_kwargs)

    # Calculate averages
    precision_avg = precision.mean()
    recall_avg = recall.mean()
    f1_avg = f1.mean()

    # One row per class plus a trailing 'Average' row
    metrics_df = pd.DataFrame({
        'Class': [f'Class {i+1}' for i in range(num_classes)] + ['Average'],
        'Precision': list(precision) + [precision_avg],
        'Recall': list(recall) + [recall_avg],
        'F1-Score': list(f1) + [f1_avg]
    })

    # Save to CSV
    output_path = os.path.join(metrics_dir, f"performance_metrics_fold_{fold}.csv")
    metrics_df.to_csv(output_path, index=False)
    print(f"Metrics for fold {fold} saved to {output_path}")

    return precision_avg, recall_avg, f1_avg

# --- Training Loop ---

In [10]:
# Model configuration used when cross_validate() re-creates the model for each fold.
# Read the checkpoint name from `config` so that changing config["model_name"]
# cannot leave this pointing at a different architecture.
config_model = AutoConfig.from_pretrained(config["model_name"])
# One output per author directory found by load_dataset().
config_model.num_labels = len(author_to_label)
# Dropout probabilities set explicitly for fine-tuning (hidden states and attention weights).
config_model.hidden_dropout_prob = 0.2
config_model.attention_probs_dropout_prob = 0.2


def train_epoch(model, dataloader, optimizer, scheduler, device, epoch_num, num_epochs):
    """Run one training epoch and return its aggregate metrics.

    The weight decay of every optimizer parameter group is set once, at the
    start of the epoch, from ``get_weight_decay(epoch_num, num_epochs)``. The
    scheduler is stepped after every optimizer step (i.e. per batch).

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        dataloader: Yields dicts with ``"input_ids"``, ``"attention_mask"``
            and ``"labels"`` tensors.
        optimizer: Optimizer to step.
        scheduler: Learning-rate scheduler to step after each batch.
        device: Device each batch is moved to.
        epoch_num: Current epoch index, forwarded to ``get_weight_decay``.
        num_epochs: Total number of epochs, forwarded to ``get_weight_decay``.

    Returns:
        Dict with keys ``'loss'`` (mean per-batch loss), ``'precision'``,
        ``'recall'``, ``'f1'`` (macro-averaged over the epoch's predictions),
        ``'learning_rate'`` (last scheduler value) and ``'weight_decay'``.
    """
    model.train()
    total_loss = 0
    all_preds, all_labels = [], []
    progress_bar = tqdm(dataloader, desc="Training", unit="batch")

    # Update weight decay for this epoch
    current_weight_decay = get_weight_decay(epoch_num, num_epochs)
    for param_group in optimizer.param_groups:
        param_group['weight_decay'] = current_weight_decay

    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Labels are deliberately not passed to the model: the loss comes from
        # the module-level `criterion`, so a loss computed inside the model
        # would be thrown away.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = criterion(logits, labels)
        loss.backward()

        # Gradient clipping for stability
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()  # Update learning rate

        batch_loss = loss.item()
        total_loss += batch_loss

        # Collect predictions for metrics
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        # Update progress bar with current learning rate and weight decay
        current_lr = scheduler.get_last_lr()[0]
        progress_bar.set_postfix({
            'loss': f'{batch_loss:.4f}',
            'lr': f'{current_lr:.2e}',
            'wd': f'{current_weight_decay:.2e}'
        })

    # Calculate training metrics.
    # zero_division=0 yields the same scores as the default but without the
    # UndefinedMetricWarning emitted while some classes are never predicted.
    precision = precision_score(all_labels, all_preds, average='macro', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='macro', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)

    return {
        'loss': total_loss / len(dataloader),
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'learning_rate': scheduler.get_last_lr()[0],
        'weight_decay': current_weight_decay
    }

def evaluate_epoch(model, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` without gradient tracking.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        dataloader: Yields dicts with ``"input_ids"``, ``"attention_mask"``
            and ``"labels"`` tensors.
        device: Device each batch is moved to.

    Returns:
        Dict with keys ``'loss'`` (mean per-batch loss), ``'accuracy'``,
        ``'precision'``, ``'recall'``, ``'f1'`` (macro-averaged),
        ``'predictions'`` and ``'true_labels'`` (lists of class ids in
        dataloader order).
    """
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Labels are deliberately not passed to the model: the loss comes
            # from the module-level `criterion`, so a loss computed inside the
            # model would be thrown away.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)
            total_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics including accuracy.
    # zero_division=0 yields the same scores as the default but without the
    # UndefinedMetricWarning emitted while some classes are never predicted.
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='macro', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='macro', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)

    return {
        'loss': total_loss / len(dataloader),
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'predictions': all_preds,
        'true_labels': all_labels
    }

def train_and_evaluate(model, train_data, train_labels, val_data, val_labels, fold):
    """Fine-tune ``model`` on one fold and return the best epoch's validation metrics.

    Trains for up to ``config['num_epochs']`` epochs with AdamW and a
    warmup + cosine learning-rate schedule. After each epoch the model is
    evaluated; whenever the validation macro-F1 improves, a checkpoint is
    written to ``config['output_dir']``. Training stops early after
    ``config['early_stopping_patience']`` consecutive epochs without
    improvement. Plots and the per-class metrics CSV are produced from the
    best epoch's predictions.

    Args:
        model: Sequence-classification model exposing ``config.num_labels``.
        train_data: Training texts.
        train_labels: Training class ids, aligned with ``train_data``.
        val_data: Validation texts.
        val_labels: Validation class ids, aligned with ``val_data``.
        fold: Fold identifier, used in log lines and output file names.

    Returns:
        The dict returned by ``evaluate_epoch`` for the epoch with the highest
        validation F1.

    Raises:
        RuntimeError: If no epoch was run (``config['num_epochs']`` < 1).
    """
    # Create datasets and dataloaders
    model = model.to(device)
    clear_memory()
    train_dataset = TextDataset(train_data, train_labels, tokenizer, config["max_length"])
    val_dataset = TextDataset(val_data, val_labels, tokenizer, config["max_length"])

    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config['batch_size'])

    # Calculate total steps and warmup steps (the scheduler is stepped once per batch)
    num_training_steps = len(train_loader) * config['num_epochs']
    num_warmup_steps = int(num_training_steps * config['warmup_ratio'])

    # Initialize optimizer with initial learning rate and weight decay
    optimizer = optim.AdamW(
        model.parameters(),
        lr=config['initial_learning_rate'],
        weight_decay=config['initial_weight_decay'],
        betas=(0.9, 0.999),
        eps=1e-8
    )

    # Initialize learning rate scheduler with warmup and cosine decay
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    # Taken from the model rather than hard-coded so plots/CSVs always match the classifier head.
    num_classes = model.config.num_labels

    os.makedirs(config['output_dir'], exist_ok=True)
    model_save_path = os.path.join(config['output_dir'], f'best_model_fold_{fold}.pt')

    # Start below any attainable F1 so the first epoch always becomes the
    # current best. Starting at 0 with a strict '>' left `best_metrics`
    # unbound whenever validation F1 stayed at 0.
    best_val_f1 = float("-inf")
    best_metrics = None
    early_stopping_counter = 0
    train_metrics_history = []
    val_metrics_history = []

    print(f"\n{'='*50}")
    print(f"Training Fold {fold}")
    print(f"{'='*50}")
    print(f"Total steps: {num_training_steps}, Warmup steps: {num_warmup_steps}")

    for epoch in range(config['num_epochs']):
        print(f"\nEpoch {epoch + 1}/{config['num_epochs']}")
        print("-" * 30)

        # Training
        train_metrics = train_epoch(
            model, train_loader, optimizer, scheduler, device,
            epoch, config['num_epochs']
        )
        train_metrics_history.append(train_metrics)
        clear_memory()

        # Validation
        val_metrics = evaluate_epoch(model, val_loader, device)
        val_metrics_history.append(val_metrics)
        clear_memory()

        # Print metrics
        print("\nTraining Metrics:")
        print(f"Loss: {train_metrics['loss']:.4f}")
        print(f"Precision: {train_metrics['precision']:.4f}")
        print(f"Recall: {train_metrics['recall']:.4f}")
        print(f"F1-Score: {train_metrics['f1']:.4f}")
        print(f"Learning Rate: {train_metrics['learning_rate']:.2e}")
        print(f"Weight Decay: {train_metrics['weight_decay']:.2e}")

        print("\nValidation Metrics:")
        print(f"Loss: {val_metrics['loss']:.4f}")
        print(f"Accuracy: {val_metrics['accuracy']:.4f}")
        print(f"Precision: {val_metrics['precision']:.4f}")
        print(f"Recall: {val_metrics['recall']:.4f}")
        print(f"F1-Score: {val_metrics['f1']:.4f}")

        # Early stopping check
        if val_metrics['f1'] > best_val_f1:
            best_val_f1 = val_metrics['f1']
            early_stopping_counter = 0
            best_metrics = val_metrics

            # Save the best model for this fold
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'best_f1': best_val_f1,
                'config': config
            }, model_save_path)
            print(f"\nSaved best model for fold {fold} with F1: {best_val_f1:.4f}")
        else:
            early_stopping_counter += 1

        if early_stopping_counter >= config['early_stopping_patience']:
            print("\nEarly stopping triggered")
            break

    if best_metrics is None:
        raise RuntimeError(
            f"No epochs were run for fold {fold}; check config['num_epochs']."
        )

    # Generate visualizations
    generate_combined_plots(
        train_metrics_history,
        val_metrics_history,
        best_metrics['true_labels'],
        best_metrics['predictions'],
        num_classes,
        fold
    )

    # Save fold metrics (the returned averages are not needed here)
    save_fold_metrics(
        best_metrics['true_labels'],
        best_metrics['predictions'],
        num_classes,
        fold
    )

    return best_metrics

def cross_validate(texts, labels):
    """Run stratified k-fold training and report fold-averaged validation metrics.

    For every fold a fresh model is loaded from ``config["model_name"]`` with
    ``config_model``, trained via ``train_and_evaluate``, and its best-epoch
    metrics are recorded. Per-fold results are written to
    ``fold_results.csv`` and the module-level ``best_f1`` is set to the mean
    F1 across folds.

    Args:
        texts: All sample texts.
        labels: Class ids aligned with ``texts`` (used for stratification).

    Returns:
        Dict with the mean ``'loss'``, ``'accuracy'``, ``'precision'``,
        ``'recall'`` and ``'f1'`` across folds.
    """
    global best_f1

    # (printed caption, metrics-dict key), in reporting order
    metric_fields = (
        ("Loss", "loss"),
        ("Accuracy", "accuracy"),
        ("Precision", "precision"),
        ("Recall", "recall"),
        ("F1-Score", "f1"),
    )

    def report(heading, values):
        """Print a heading followed by one 4-decimal line per metric."""
        print(heading)
        for caption, key in metric_fields:
            print(f"{caption}: {values[key]:.4f}")

    splitter = StratifiedKFold(n_splits=config['num_folds'], shuffle=True, random_state=42)
    fold_metrics = []

    for fold_no, (train_idx, val_idx) in enumerate(splitter.split(texts, labels), start=1):
        print(f"\nFold {fold_no}/{config['num_folds']}")
        print(f"Train size: {len(train_idx)}, Validation size: {len(val_idx)}")
        clear_memory()

        # Fresh weights for every fold
        model = AutoModelForSequenceClassification.from_pretrained(
            config["model_name"],
            config=config_model
        ).to(device)

        # Materialise the fold's train / validation subsets
        X_train, y_train = [], []
        for i in train_idx:
            X_train.append(texts[i])
            y_train.append(labels[i])
        X_val, y_val = [], []
        for i in val_idx:
            X_val.append(texts[i])
            y_val.append(labels[i])

        metrics = train_and_evaluate(model, X_train, y_train, X_val, y_val, fold_no)

        row = {'fold': fold_no}
        for _, key in metric_fields:
            row[key] = metrics[key]
        fold_metrics.append(row)

        report(f"\nFold {fold_no} Results:", metrics)

        del model
        clear_memory()

    # Mean of each metric over all folds
    avg_metrics = {
        key: np.mean([row[key] for row in fold_metrics])
        for _, key in metric_fields
    }

    report('\nAverage metrics across folds:', avg_metrics)

    # Persist the per-fold table
    pd.DataFrame(fold_metrics).to_csv('fold_results.csv', index=False)
    print("\nSaved detailed fold results to 'fold_results.csv'")

    best_f1 = avg_metrics['f1']

    return avg_metrics

# --- Generate Overall Results ---

In [11]:
def generate_overall_results(fold_metrics_dir, output_filename="overall_performance_metrics.csv"):
    """Average the per-fold, per-class metric CSVs into one summary table.

    Reads every ``performance_metrics_fold_*.csv`` in ``fold_metrics_dir``,
    drops their ``Average`` rows, averages each class over the folds, orders
    the classes by their numeric id, appends a new ``Average`` row (mean over
    classes) and writes the result to ``output_filename``.

    Args:
        fold_metrics_dir: Directory containing the fold-level CSV files.
        output_filename: Path of the CSV to write.

    Returns:
        The aggregated DataFrame. Its ``Class`` column holds the integer
        class number for class rows and the string ``"Average"`` for the
        last row.

    Raises:
        FileNotFoundError: If no fold-level CSV exists in the directory.
    """
    pattern = os.path.join(fold_metrics_dir, "performance_metrics_fold_*.csv")
    fold_files = glob.glob(pattern)
    if len(fold_files) == 0:
        raise FileNotFoundError("No fold-level performance metrics files found in the directory.")

    # Stack every fold's table
    combined_df = pd.concat([pd.read_csv(path) for path in fold_files])

    # Per-class mean over folds, ignoring each file's own "Average" row
    is_average_row = combined_df["Class"].str.contains("Average")
    per_class = combined_df[~is_average_row].groupby("Class").mean().reset_index()

    # "Class 12" -> 12, so that sorting is numeric rather than lexicographic
    per_class["Class"] = per_class["Class"].str.extract(r'(\d+)').astype(int)
    per_class = per_class.sort_values(by="Class").reset_index(drop=True)

    # Overall row: mean of the per-class means
    summary = {"Class": ["Average"]}
    for column in ("Precision", "Recall", "F1-Score"):
        summary[column] = [per_class[column].mean()]
    aggregated_metrics = pd.concat([per_class, pd.DataFrame(summary)], ignore_index=True)

    aggregated_metrics.to_csv(output_filename, index=False)
    print(f"Overall performance metrics saved to '{output_filename}'.")

    return aggregated_metrics


In [12]:

def display_as_dataframe(aggregated_metrics):
    """
    Render the overall performance metrics as a styled, captioned table.

    The ``Class`` column header is blanked (a single space) and ``F1-Score``
    is shown as ``F-Score``; header cells are centred and bold, body cells
    centred. The caller's DataFrame is not modified.

    Args:
        aggregated_metrics (pd.DataFrame): DataFrame containing overall performance metrics.
    """
    # Only the headers that actually change need to be listed.
    header_names = {"Class": " ", "F1-Score": "F-Score"}
    table = aggregated_metrics.rename(columns=header_names)

    centred = ('text-align', 'center')
    table_styles = [
        {'selector': 'th', 'props': [centred, ('font-weight', 'bold')]},
        {'selector': 'td', 'props': [centred]},
    ]

    styled = table.style.set_table_styles(table_styles)
    styled = styled.set_caption("Overall Performance Metrics")
    display(styled)


# --- Execute Training ---

In [13]:
# --- Execute Cross-Validation ---
# No hyperparameter search is performed here: a single configuration (`config`)
# is trained on every fold. cross_validate() returns the fold-averaged metrics,
# so `best_metrics` holds averages rather than the best single fold.
best_metrics = cross_validate(texts, labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Fold 1/5
Train size: 1200, Validation size: 300

Training Fold 1
Total steps: 3000, Warmup steps: 600

Epoch 1/10
------------------------------


Training: 100%|██████████| 300/300 [03:15<00:00,  1.53batch/s, loss=3.1676, lr=1.50e-05, wd=1.00e-02]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Training Metrics:
Loss: 3.4507
Precision: 0.0296
Recall: 0.0383
F1-Score: 0.0292
Learning Rate: 1.50e-05
Weight Decay: 1.00e-02

Validation Metrics:
Loss: 3.2467
Accuracy: 0.1300
Precision: 0.0611
Recall: 0.1300
F1-Score: 0.0643

Saved best model for fold 1 with F1: 0.0643

Epoch 2/10
------------------------------


Training: 100%|██████████| 300/300 [03:12<00:00,  1.56batch/s, loss=2.9511, lr=3.00e-05, wd=9.10e-03]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Training Metrics:
Loss: 2.8286
Precision: 0.2630
Recall: 0.2567
F1-Score: 0.2354
Learning Rate: 3.00e-05
Weight Decay: 9.10e-03

Validation Metrics:
Loss: 2.0417
Accuracy: 0.5167
Precision: 0.5008
Recall: 0.5167
F1-Score: 0.4495

Saved best model for fold 1 with F1: 0.4495

Epoch 3/10
------------------------------


Training: 100%|██████████| 300/300 [03:14<00:00,  1.54batch/s, loss=1.2142, lr=2.89e-05, wd=8.20e-03]



Training Metrics:
Loss: 1.6600
Precision: 0.6902
Recall: 0.6817
F1-Score: 0.6733
Learning Rate: 2.89e-05
Weight Decay: 8.20e-03

Validation Metrics:
Loss: 1.2702
Accuracy: 0.7900
Precision: 0.8232
Recall: 0.7900
F1-Score: 0.7696

Saved best model for fold 1 with F1: 0.7696

Epoch 4/10
------------------------------


Training: 100%|██████████| 300/300 [03:12<00:00,  1.56batch/s, loss=0.8007, lr=2.56e-05, wd=7.30e-03]



Training Metrics:
Loss: 1.0612
Precision: 0.8604
Recall: 0.8625
F1-Score: 0.8594
Learning Rate: 2.56e-05
Weight Decay: 7.30e-03

Validation Metrics:
Loss: 1.0495
Accuracy: 0.8467
Precision: 0.8659
Recall: 0.8467
F1-Score: 0.8418

Saved best model for fold 1 with F1: 0.8418

Epoch 5/10
------------------------------


Training: 100%|██████████| 300/300 [03:08<00:00,  1.59batch/s, loss=0.6609, lr=2.07e-05, wd=6.40e-03]



Training Metrics:
Loss: 0.8158
Precision: 0.9493
Recall: 0.9492
F1-Score: 0.9487
Learning Rate: 2.07e-05
Weight Decay: 6.40e-03

Validation Metrics:
Loss: 0.9092
Accuracy: 0.9033
Precision: 0.9187
Recall: 0.9033
F1-Score: 0.9049

Saved best model for fold 1 with F1: 0.9049

Epoch 6/10
------------------------------


Training: 100%|██████████| 300/300 [03:12<00:00,  1.56batch/s, loss=0.6516, lr=1.50e-05, wd=5.50e-03]



Training Metrics:
Loss: 0.7083
Precision: 0.9827
Recall: 0.9825
F1-Score: 0.9825
Learning Rate: 1.50e-05
Weight Decay: 5.50e-03

Validation Metrics:
Loss: 0.9405
Accuracy: 0.9033
Precision: 0.9237
Recall: 0.9033
F1-Score: 0.8991

Epoch 7/10
------------------------------


Training: 100%|██████████| 300/300 [03:13<00:00,  1.55batch/s, loss=0.6579, lr=9.26e-06, wd=4.60e-03]



Training Metrics:
Loss: 0.6671
Precision: 0.9967
Recall: 0.9967
F1-Score: 0.9967
Learning Rate: 9.26e-06
Weight Decay: 4.60e-03

Validation Metrics:
Loss: 0.8519
Accuracy: 0.9267
Precision: 0.9360
Recall: 0.9267
F1-Score: 0.9271

Saved best model for fold 1 with F1: 0.9271

Epoch 8/10
------------------------------


Training: 100%|██████████| 300/300 [03:12<00:00,  1.56batch/s, loss=0.6487, lr=4.39e-06, wd=3.70e-03]



Training Metrics:
Loss: 0.6591
Precision: 0.9984
Recall: 0.9983
F1-Score: 0.9983
Learning Rate: 4.39e-06
Weight Decay: 3.70e-03

Validation Metrics:
Loss: 0.8292
Accuracy: 0.9367
Precision: 0.9401
Recall: 0.9367
F1-Score: 0.9362

Saved best model for fold 1 with F1: 0.9362

Epoch 9/10
------------------------------


Training: 100%|██████████| 300/300 [03:15<00:00,  1.53batch/s, loss=0.6562, lr=1.14e-06, wd=2.80e-03]



Training Metrics:
Loss: 0.6564
Precision: 0.9984
Recall: 0.9983
F1-Score: 0.9983
Learning Rate: 1.14e-06
Weight Decay: 2.80e-03

Validation Metrics:
Loss: 0.8367
Accuracy: 0.9333
Precision: 0.9398
Recall: 0.9333
F1-Score: 0.9337

Epoch 10/10
------------------------------


Training: 100%|██████████| 300/300 [03:14<00:00,  1.54batch/s, loss=0.6509, lr=0.00e+00, wd=1.90e-03]



Training Metrics:
Loss: 0.6555
Precision: 0.9992
Recall: 0.9992
F1-Score: 0.9992
Learning Rate: 0.00e+00
Weight Decay: 1.90e-03

Validation Metrics:
Loss: 0.8402
Accuracy: 0.9300
Precision: 0.9384
Recall: 0.9300
F1-Score: 0.9305
Confusion Matrix Shape: (30, 30)
Plots saved to plots/dbmdz_bert-base-turkish-cased/combined_metrics_fold_1.png
Metrics for fold 1 saved to fold_metrics/dbmdz_bert-base-turkish-cased/performance_metrics_fold_1.csv

Fold 1 Results:
Loss: 0.8292
Accuracy: 0.9367
Precision: 0.9401
Recall: 0.9367
F1-Score: 0.9362


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Fold 2/5
Train size: 1200, Validation size: 300

Training Fold 2
Total steps: 3000, Warmup steps: 600

Epoch 1/10
------------------------------


Training: 100%|██████████| 300/300 [03:15<00:00,  1.54batch/s, loss=3.1956, lr=1.50e-05, wd=1.00e-02]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Training Metrics:
Loss: 3.4176
Precision: 0.0463
Recall: 0.0450
F1-Score: 0.0389
Learning Rate: 1.50e-05
Weight Decay: 1.00e-02

Validation Metrics:
Loss: 3.1758
Accuracy: 0.1500
Precision: 0.1162
Recall: 0.1500
F1-Score: 0.0789

Saved best model for fold 2 with F1: 0.0789

Epoch 2/10
------------------------------


Training: 100%|██████████| 300/300 [03:11<00:00,  1.56batch/s, loss=1.7823, lr=3.00e-05, wd=9.10e-03]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Training Metrics:
Loss: 2.7445
Precision: 0.2868
Recall: 0.2933
F1-Score: 0.2745
Learning Rate: 3.00e-05
Weight Decay: 9.10e-03

Validation Metrics:
Loss: 1.9456
Accuracy: 0.5600
Precision: 0.5678
Recall: 0.5600
F1-Score: 0.5093

Saved best model for fold 2 with F1: 0.5093

Epoch 3/10
------------------------------


Training: 100%|██████████| 300/300 [03:13<00:00,  1.55batch/s, loss=0.9900, lr=2.89e-05, wd=8.20e-03]



Training Metrics:
Loss: 1.6017
Precision: 0.6957
Recall: 0.6933
F1-Score: 0.6812
Learning Rate: 2.89e-05
Weight Decay: 8.20e-03

Validation Metrics:
Loss: 1.3342
Accuracy: 0.7467
Precision: 0.7727
Recall: 0.7467
F1-Score: 0.7334

Saved best model for fold 2 with F1: 0.7334

Epoch 4/10
------------------------------


Training: 100%|██████████| 300/300 [03:12<00:00,  1.56batch/s, loss=1.2859, lr=2.56e-05, wd=7.30e-03]



Training Metrics:
Loss: 1.0549
Precision: 0.8653
Recall: 0.8658
F1-Score: 0.8637
Learning Rate: 2.56e-05
Weight Decay: 7.30e-03

Validation Metrics:
Loss: 1.1452
Accuracy: 0.8267
Precision: 0.8815
Recall: 0.8267
F1-Score: 0.8196

Saved best model for fold 2 with F1: 0.8196

Epoch 5/10
------------------------------


Training: 100%|██████████| 300/300 [03:12<00:00,  1.56batch/s, loss=1.4795, lr=2.07e-05, wd=6.40e-03]



Training Metrics:
Loss: 0.8014
Precision: 0.9571
Recall: 0.9558
F1-Score: 0.9555
Learning Rate: 2.07e-05
Weight Decay: 6.40e-03

Validation Metrics:
Loss: 1.0435
Accuracy: 0.8500
Precision: 0.8707
Recall: 0.8500
F1-Score: 0.8470

Saved best model for fold 2 with F1: 0.8470

Epoch 6/10
------------------------------


Training: 100%|██████████| 300/300 [03:12<00:00,  1.56batch/s, loss=0.6541, lr=1.50e-05, wd=5.50e-03]



Training Metrics:
Loss: 0.6977
Precision: 0.9878
Recall: 0.9875
F1-Score: 0.9875
Learning Rate: 1.50e-05
Weight Decay: 5.50e-03

Validation Metrics:
Loss: 1.0166
Accuracy: 0.8800
Precision: 0.8966
Recall: 0.8800
F1-Score: 0.8760

Saved best model for fold 2 with F1: 0.8760

Epoch 7/10
------------------------------


Training: 100%|██████████| 300/300 [03:12<00:00,  1.56batch/s, loss=0.7063, lr=9.26e-06, wd=4.60e-03]



Training Metrics:
Loss: 0.6715
Precision: 0.9935
Recall: 0.9933
F1-Score: 0.9933
Learning Rate: 9.26e-06
Weight Decay: 4.60e-03

Validation Metrics:
Loss: 0.9852
Accuracy: 0.8900
Precision: 0.9074
Recall: 0.8900
F1-Score: 0.8875

Saved best model for fold 2 with F1: 0.8875

Epoch 8/10
------------------------------


Training: 100%|██████████| 300/300 [03:11<00:00,  1.57batch/s, loss=0.7393, lr=4.39e-06, wd=3.70e-03]



Training Metrics:
Loss: 0.6595
Precision: 0.9984
Recall: 0.9983
F1-Score: 0.9983
Learning Rate: 4.39e-06
Weight Decay: 3.70e-03

Validation Metrics:
Loss: 1.0057
Accuracy: 0.8667
Precision: 0.8852
Recall: 0.8667
F1-Score: 0.8651

Epoch 9/10
------------------------------


Training: 100%|██████████| 300/300 [03:13<00:00,  1.55batch/s, loss=0.6572, lr=1.14e-06, wd=2.80e-03]



Training Metrics:
Loss: 0.6570
Precision: 0.9984
Recall: 0.9983
F1-Score: 0.9983
Learning Rate: 1.14e-06
Weight Decay: 2.80e-03

Validation Metrics:
Loss: 0.9973
Accuracy: 0.8800
Precision: 0.8987
Recall: 0.8800
F1-Score: 0.8788

Epoch 10/10
------------------------------


Training: 100%|██████████| 300/300 [03:12<00:00,  1.56batch/s, loss=0.6518, lr=0.00e+00, wd=1.90e-03]



Training Metrics:
Loss: 0.6542
Precision: 0.9992
Recall: 0.9992
F1-Score: 0.9992
Learning Rate: 0.00e+00
Weight Decay: 1.90e-03

Validation Metrics:
Loss: 1.0130
Accuracy: 0.8733
Precision: 0.8939
Recall: 0.8733
F1-Score: 0.8721

Early stopping triggered
Confusion Matrix Shape: (30, 30)
Plots saved to plots/dbmdz_bert-base-turkish-cased/combined_metrics_fold_2.png
Metrics for fold 2 saved to fold_metrics/dbmdz_bert-base-turkish-cased/performance_metrics_fold_2.csv

Fold 2 Results:
Loss: 0.9852
Accuracy: 0.8900
Precision: 0.9074
Recall: 0.8900
F1-Score: 0.8875


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Fold 3/5
Train size: 1200, Validation size: 300

Training Fold 3
Total steps: 3000, Warmup steps: 600

Epoch 1/10
------------------------------


Training: 100%|██████████| 300/300 [03:11<00:00,  1.56batch/s, loss=3.4857, lr=1.50e-05, wd=1.00e-02]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Training Metrics:
Loss: 3.4452
Precision: 0.0628
Recall: 0.0475
F1-Score: 0.0344
Learning Rate: 1.50e-05
Weight Decay: 1.00e-02

Validation Metrics:
Loss: 3.3365
Accuracy: 0.1333
Precision: 0.0446
Recall: 0.1333
F1-Score: 0.0602

Saved best model for fold 3 with F1: 0.0602

Epoch 2/10
------------------------------


Training: 100%|██████████| 300/300 [03:10<00:00,  1.58batch/s, loss=1.9697, lr=3.00e-05, wd=9.10e-03]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Training Metrics:
Loss: 2.8184
Precision: 0.2852
Recall: 0.2683
F1-Score: 0.2600
Learning Rate: 3.00e-05
Weight Decay: 9.10e-03

Validation Metrics:
Loss: 1.9860
Accuracy: 0.5667
Precision: 0.5794
Recall: 0.5667
F1-Score: 0.5166

Saved best model for fold 3 with F1: 0.5166

Epoch 3/10
------------------------------


Training: 100%|██████████| 300/300 [03:09<00:00,  1.58batch/s, loss=1.1848, lr=2.89e-05, wd=8.20e-03]



Training Metrics:
Loss: 1.6521
Precision: 0.6637
Recall: 0.6667
F1-Score: 0.6550
Learning Rate: 2.89e-05
Weight Decay: 8.20e-03

Validation Metrics:
Loss: 1.1419
Accuracy: 0.8567
Precision: 0.8873
Recall: 0.8567
F1-Score: 0.8478

Saved best model for fold 3 with F1: 0.8478

Epoch 4/10
------------------------------


Training: 100%|██████████| 300/300 [03:09<00:00,  1.59batch/s, loss=1.5226, lr=2.56e-05, wd=7.30e-03]



Training Metrics:
Loss: 1.0300
Precision: 0.8867
Recall: 0.8858
F1-Score: 0.8841
Learning Rate: 2.56e-05
Weight Decay: 7.30e-03

Validation Metrics:
Loss: 0.9521
Accuracy: 0.8733
Precision: 0.8946
Recall: 0.8733
F1-Score: 0.8736

Saved best model for fold 3 with F1: 0.8736

Epoch 5/10
------------------------------


Training: 100%|██████████| 300/300 [03:09<00:00,  1.58batch/s, loss=0.6742, lr=2.07e-05, wd=6.40e-03]



Training Metrics:
Loss: 0.7995
Precision: 0.9490
Recall: 0.9483
F1-Score: 0.9480
Learning Rate: 2.07e-05
Weight Decay: 6.40e-03

Validation Metrics:
Loss: 0.9588
Accuracy: 0.8867
Precision: 0.9090
Recall: 0.8867
F1-Score: 0.8800

Saved best model for fold 3 with F1: 0.8800

Epoch 6/10
------------------------------


Training: 100%|██████████| 300/300 [03:09<00:00,  1.58batch/s, loss=0.6528, lr=1.50e-05, wd=5.50e-03]



Training Metrics:
Loss: 0.6997
Precision: 0.9845
Recall: 0.9842
F1-Score: 0.9841
Learning Rate: 1.50e-05
Weight Decay: 5.50e-03

Validation Metrics:
Loss: 0.9091
Accuracy: 0.9067
Precision: 0.9180
Recall: 0.9067
F1-Score: 0.9045

Saved best model for fold 3 with F1: 0.9045

Epoch 7/10
------------------------------


Training: 100%|██████████| 300/300 [03:10<00:00,  1.58batch/s, loss=0.6524, lr=9.26e-06, wd=4.60e-03]



Training Metrics:
Loss: 0.6688
Precision: 0.9960
Recall: 0.9958
F1-Score: 0.9958
Learning Rate: 9.26e-06
Weight Decay: 4.60e-03

Validation Metrics:
Loss: 0.9062
Accuracy: 0.9167
Precision: 0.9262
Recall: 0.9167
F1-Score: 0.9140

Saved best model for fold 3 with F1: 0.9140

Epoch 8/10
------------------------------


Training: 100%|██████████| 300/300 [03:09<00:00,  1.58batch/s, loss=0.6514, lr=4.39e-06, wd=3.70e-03]



Training Metrics:
Loss: 0.6582
Precision: 0.9992
Recall: 0.9992
F1-Score: 0.9992
Learning Rate: 4.39e-06
Weight Decay: 3.70e-03

Validation Metrics:
Loss: 0.9057
Accuracy: 0.9067
Precision: 0.9212
Recall: 0.9067
F1-Score: 0.9034

Epoch 9/10
------------------------------


Training: 100%|██████████| 300/300 [03:12<00:00,  1.56batch/s, loss=0.6506, lr=1.14e-06, wd=2.80e-03]



Training Metrics:
Loss: 0.6546
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
Learning Rate: 1.14e-06
Weight Decay: 2.80e-03

Validation Metrics:
Loss: 0.8821
Accuracy: 0.9067
Precision: 0.9176
Recall: 0.9067
F1-Score: 0.9042

Epoch 10/10
------------------------------


Training: 100%|██████████| 300/300 [03:09<00:00,  1.58batch/s, loss=0.6535, lr=0.00e+00, wd=1.90e-03]



Training Metrics:
Loss: 0.6543
Precision: 0.9992
Recall: 0.9992
F1-Score: 0.9992
Learning Rate: 0.00e+00
Weight Decay: 1.90e-03

Validation Metrics:
Loss: 0.8830
Accuracy: 0.9100
Precision: 0.9218
Recall: 0.9100
F1-Score: 0.9075

Early stopping triggered
Confusion Matrix Shape: (30, 30)
Plots saved to plots/dbmdz_bert-base-turkish-cased/combined_metrics_fold_3.png
Metrics for fold 3 saved to fold_metrics/dbmdz_bert-base-turkish-cased/performance_metrics_fold_3.csv

Fold 3 Results:
Loss: 0.9062
Accuracy: 0.9167
Precision: 0.9262
Recall: 0.9167
F1-Score: 0.9140


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Fold 4/5
Train size: 1200, Validation size: 300

Training Fold 4
Total steps: 3000, Warmup steps: 600

Epoch 1/10
------------------------------


Training: 100%|██████████| 300/300 [03:12<00:00,  1.56batch/s, loss=3.1255, lr=1.50e-05, wd=1.00e-02]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Training Metrics:
Loss: 3.4169
Precision: 0.0837
Recall: 0.0575
F1-Score: 0.0451
Learning Rate: 1.50e-05
Weight Decay: 1.00e-02

Validation Metrics:
Loss: 3.1952
Accuracy: 0.1700
Precision: 0.1940
Recall: 0.1700
F1-Score: 0.1309

Saved best model for fold 4 with F1: 0.1309

Epoch 2/10
------------------------------


Training: 100%|██████████| 300/300 [03:10<00:00,  1.57batch/s, loss=2.6366, lr=3.00e-05, wd=9.10e-03]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Training Metrics:
Loss: 2.6970
Precision: 0.2874
Recall: 0.2950
F1-Score: 0.2727
Learning Rate: 3.00e-05
Weight Decay: 9.10e-03

Validation Metrics:
Loss: 1.9219
Accuracy: 0.5867
Precision: 0.6467
Recall: 0.5867
F1-Score: 0.5282

Saved best model for fold 4 with F1: 0.5282

Epoch 3/10
------------------------------


Training: 100%|██████████| 300/300 [03:10<00:00,  1.57batch/s, loss=1.0758, lr=2.89e-05, wd=8.20e-03]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Training Metrics:
Loss: 1.5812
Precision: 0.7079
Recall: 0.7117
F1-Score: 0.7045
Learning Rate: 2.89e-05
Weight Decay: 8.20e-03

Validation Metrics:
Loss: 1.2756
Accuracy: 0.7700
Precision: 0.8212
Recall: 0.7700
F1-Score: 0.7622

Saved best model for fold 4 with F1: 0.7622

Epoch 4/10
------------------------------


Training: 100%|██████████| 300/300 [03:10<00:00,  1.57batch/s, loss=0.7996, lr=2.56e-05, wd=7.30e-03]



Training Metrics:
Loss: 1.0016
Precision: 0.8899
Recall: 0.8908
F1-Score: 0.8892
Learning Rate: 2.56e-05
Weight Decay: 7.30e-03

Validation Metrics:
Loss: 1.0341
Accuracy: 0.8733
Precision: 0.8809
Recall: 0.8733
F1-Score: 0.8707

Saved best model for fold 4 with F1: 0.8707

Epoch 5/10
------------------------------


Training: 100%|██████████| 300/300 [03:09<00:00,  1.59batch/s, loss=0.6777, lr=2.07e-05, wd=6.40e-03]



Training Metrics:
Loss: 0.7842
Precision: 0.9595
Recall: 0.9592
F1-Score: 0.9590
Learning Rate: 2.07e-05
Weight Decay: 6.40e-03

Validation Metrics:
Loss: 1.0484
Accuracy: 0.8733
Precision: 0.8864
Recall: 0.8733
F1-Score: 0.8688

Epoch 6/10
------------------------------


Training: 100%|██████████| 300/300 [03:10<00:00,  1.58batch/s, loss=0.6572, lr=1.50e-05, wd=5.50e-03]



Training Metrics:
Loss: 0.6941
Precision: 0.9879
Recall: 0.9875
F1-Score: 0.9875
Learning Rate: 1.50e-05
Weight Decay: 5.50e-03

Validation Metrics:
Loss: 1.0046
Accuracy: 0.8833
Precision: 0.8949
Recall: 0.8833
F1-Score: 0.8798

Saved best model for fold 4 with F1: 0.8798

Epoch 7/10
------------------------------


Training: 100%|██████████| 300/300 [03:09<00:00,  1.58batch/s, loss=0.6506, lr=9.26e-06, wd=4.60e-03]



Training Metrics:
Loss: 0.6721
Precision: 0.9942
Recall: 0.9942
F1-Score: 0.9941
Learning Rate: 9.26e-06
Weight Decay: 4.60e-03

Validation Metrics:
Loss: 0.9655
Accuracy: 0.9000
Precision: 0.9144
Recall: 0.9000
F1-Score: 0.8975

Saved best model for fold 4 with F1: 0.8975

Epoch 8/10
------------------------------


Training: 100%|██████████| 300/300 [03:08<00:00,  1.59batch/s, loss=0.6509, lr=4.39e-06, wd=3.70e-03]



Training Metrics:
Loss: 0.6596
Precision: 0.9984
Recall: 0.9983
F1-Score: 0.9983
Learning Rate: 4.39e-06
Weight Decay: 3.70e-03

Validation Metrics:
Loss: 0.9396
Accuracy: 0.9067
Precision: 0.9192
Recall: 0.9067
F1-Score: 0.9052

Saved best model for fold 4 with F1: 0.9052

Epoch 9/10
------------------------------


Training: 100%|██████████| 300/300 [03:09<00:00,  1.58batch/s, loss=0.6534, lr=1.14e-06, wd=2.80e-03]



Training Metrics:
Loss: 0.6558
Precision: 0.9992
Recall: 0.9992
F1-Score: 0.9992
Learning Rate: 1.14e-06
Weight Decay: 2.80e-03

Validation Metrics:
Loss: 0.9043
Accuracy: 0.9233
Precision: 0.9293
Recall: 0.9233
F1-Score: 0.9227

Saved best model for fold 4 with F1: 0.9227

Epoch 10/10
------------------------------


Training: 100%|██████████| 300/300 [03:10<00:00,  1.58batch/s, loss=0.6504, lr=0.00e+00, wd=1.90e-03]



Training Metrics:
Loss: 0.6557
Precision: 0.9984
Recall: 0.9983
F1-Score: 0.9983
Learning Rate: 0.00e+00
Weight Decay: 1.90e-03

Validation Metrics:
Loss: 0.9071
Accuracy: 0.9200
Precision: 0.9277
Recall: 0.9200
F1-Score: 0.9194
Confusion Matrix Shape: (30, 30)
Plots saved to plots/dbmdz_bert-base-turkish-cased/combined_metrics_fold_4.png
Metrics for fold 4 saved to fold_metrics/dbmdz_bert-base-turkish-cased/performance_metrics_fold_4.csv

Fold 4 Results:
Loss: 0.9043
Accuracy: 0.9233
Precision: 0.9293
Recall: 0.9233
F1-Score: 0.9227


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Fold 5/5
Train size: 1200, Validation size: 300

Training Fold 5
Total steps: 3000, Warmup steps: 600

Epoch 1/10
------------------------------


Training: 100%|██████████| 300/300 [03:10<00:00,  1.57batch/s, loss=3.2787, lr=1.50e-05, wd=1.00e-02]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Training Metrics:
Loss: 3.4687
Precision: 0.0202
Recall: 0.0325
F1-Score: 0.0209
Learning Rate: 1.50e-05
Weight Decay: 1.00e-02

Validation Metrics:
Loss: 3.3496
Accuracy: 0.0833
Precision: 0.0354
Recall: 0.0833
F1-Score: 0.0411

Saved best model for fold 5 with F1: 0.0411

Epoch 2/10
------------------------------


Training: 100%|██████████| 300/300 [03:10<00:00,  1.57batch/s, loss=1.6988, lr=3.00e-05, wd=9.10e-03]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Training Metrics:
Loss: 2.8808
Precision: 0.2833
Recall: 0.2625
F1-Score: 0.2526
Learning Rate: 3.00e-05
Weight Decay: 9.10e-03

Validation Metrics:
Loss: 2.1217
Accuracy: 0.5033
Precision: 0.6007
Recall: 0.5033
F1-Score: 0.4669

Saved best model for fold 5 with F1: 0.4669

Epoch 3/10
------------------------------


Training: 100%|██████████| 300/300 [03:11<00:00,  1.56batch/s, loss=0.9219, lr=2.89e-05, wd=8.20e-03]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Training Metrics:
Loss: 1.6106
Precision: 0.6963
Recall: 0.6958
F1-Score: 0.6880
Learning Rate: 2.89e-05
Weight Decay: 8.20e-03

Validation Metrics:
Loss: 1.1966
Accuracy: 0.8100
Precision: 0.8001
Recall: 0.8100
F1-Score: 0.7835

Saved best model for fold 5 with F1: 0.7835

Epoch 4/10
------------------------------


Training: 100%|██████████| 300/300 [03:11<00:00,  1.57batch/s, loss=0.9533, lr=2.56e-05, wd=7.30e-03]



Training Metrics:
Loss: 1.0083
Precision: 0.8764
Recall: 0.8783
F1-Score: 0.8760
Learning Rate: 2.56e-05
Weight Decay: 7.30e-03

Validation Metrics:
Loss: 0.9181
Accuracy: 0.9033
Precision: 0.9114
Recall: 0.9033
F1-Score: 0.9003

Saved best model for fold 5 with F1: 0.9003

Epoch 5/10
------------------------------


Training: 100%|██████████| 300/300 [03:12<00:00,  1.56batch/s, loss=0.6729, lr=2.07e-05, wd=6.40e-03]



Training Metrics:
Loss: 0.7806
Precision: 0.9596
Recall: 0.9592
F1-Score: 0.9590
Learning Rate: 2.07e-05
Weight Decay: 6.40e-03

Validation Metrics:
Loss: 0.9336
Accuracy: 0.8900
Precision: 0.9051
Recall: 0.8900
F1-Score: 0.8858

Epoch 6/10
------------------------------


Training: 100%|██████████| 300/300 [03:10<00:00,  1.58batch/s, loss=0.6513, lr=1.50e-05, wd=5.50e-03]



Training Metrics:
Loss: 0.6928
Precision: 0.9853
Recall: 0.9850
F1-Score: 0.9850
Learning Rate: 1.50e-05
Weight Decay: 5.50e-03

Validation Metrics:
Loss: 0.8715
Accuracy: 0.9233
Precision: 0.9286
Recall: 0.9233
F1-Score: 0.9202

Saved best model for fold 5 with F1: 0.9202

Epoch 7/10
------------------------------


Training: 100%|██████████| 300/300 [03:08<00:00,  1.59batch/s, loss=0.6555, lr=9.26e-06, wd=4.60e-03]



Training Metrics:
Loss: 0.6622
Precision: 0.9984
Recall: 0.9983
F1-Score: 0.9983
Learning Rate: 9.26e-06
Weight Decay: 4.60e-03

Validation Metrics:
Loss: 0.8598
Accuracy: 0.9300
Precision: 0.9343
Recall: 0.9300
F1-Score: 0.9277

Saved best model for fold 5 with F1: 0.9277

Epoch 8/10
------------------------------


Training: 100%|██████████| 300/300 [03:09<00:00,  1.58batch/s, loss=0.6544, lr=4.39e-06, wd=3.70e-03]



Training Metrics:
Loss: 0.6565
Precision: 0.9984
Recall: 0.9983
F1-Score: 0.9983
Learning Rate: 4.39e-06
Weight Decay: 3.70e-03

Validation Metrics:
Loss: 0.8713
Accuracy: 0.9267
Precision: 0.9316
Recall: 0.9267
F1-Score: 0.9235

Epoch 9/10
------------------------------


Training: 100%|██████████| 300/300 [03:09<00:00,  1.58batch/s, loss=0.6497, lr=1.14e-06, wd=2.80e-03]



Training Metrics:
Loss: 0.6535
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
Learning Rate: 1.14e-06
Weight Decay: 2.80e-03

Validation Metrics:
Loss: 0.8896
Accuracy: 0.9200
Precision: 0.9246
Recall: 0.9200
F1-Score: 0.9150

Epoch 10/10
------------------------------


Training: 100%|██████████| 300/300 [03:08<00:00,  1.59batch/s, loss=0.6570, lr=0.00e+00, wd=1.90e-03]



Training Metrics:
Loss: 0.6528
Precision: 0.9992
Recall: 0.9992
F1-Score: 0.9992
Learning Rate: 0.00e+00
Weight Decay: 1.90e-03

Validation Metrics:
Loss: 0.8884
Accuracy: 0.9167
Precision: 0.9221
Recall: 0.9167
F1-Score: 0.9118

Early stopping triggered
Confusion Matrix Shape: (30, 30)
Plots saved to plots/dbmdz_bert-base-turkish-cased/combined_metrics_fold_5.png
Metrics for fold 5 saved to fold_metrics/dbmdz_bert-base-turkish-cased/performance_metrics_fold_5.csv

Fold 5 Results:
Loss: 0.8598
Accuracy: 0.9300
Precision: 0.9343
Recall: 0.9300
F1-Score: 0.9277

Average metrics across folds:
Loss: 0.8969
Accuracy: 0.9193
Precision: 0.9274
Recall: 0.9193
F1-Score: 0.9176

Saved detailed fold results to 'fold_results.csv'


In [14]:
# --- Save Overall Results ---
# Assemble a single-row table from `best_metrics`; each pair below maps the
# CSV column header to the key it is read from.
results_df = pd.DataFrame([{
    column: best_metrics[key]
    for column, key in (
        ('Loss', 'loss'),
        ('Accuracy', 'accuracy'),
        ('Precision', 'precision'),
        ('Recall', 'recall'),
        ('F1-Score', 'f1'),
    )
}])
# index=False keeps the row index out of the file, leaving only metric columns.
results_df.to_csv("best_metrics.csv", index=False)
print("Results saved to 'best_metrics.csv'")

Results saved to 'best_metrics.csv'


In [15]:
# Generate and display the final performance table as required.
# The per-fold metrics directory is derived from config["model_name"] in the
# same way the setup cell creates it ("/" replaced by "_"), so this cell keeps
# pointing at the right folder when the configured model changes instead of
# relying on a hard-coded directory name.
overall_results = generate_overall_results(
    os.path.join("fold_metrics", config["model_name"].replace("/", "_")),
    output_filename="overall_performance_metrics.csv",
)
display_as_dataframe(overall_results)


Overall performance metrics saved to 'overall_performance_metrics.csv'.


Unnamed: 0,Unnamed: 1,Precision,Recall,F-Score
0,1,0.911111,0.96,0.932057
1,2,0.906667,0.92,0.911292
2,3,0.856429,0.64,0.720224
3,4,0.981818,0.96,0.969424
4,5,0.96,0.88,0.916725
5,6,0.981818,1.0,0.990476
6,7,1.0,1.0,1.0
7,8,1.0,1.0,1.0
8,9,0.905,0.92,0.901098
9,10,0.852525,0.78,0.804637
