In [1]:
import os
import random
import numpy as np
import pandas as pd

from PIL import Image
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from tqdm.auto import tqdm

from helpers import slugify

In [6]:
# Configuration
# CSV of annotations read with pd.read_csv by the data-preparation cell. That cell
# uses the columns 'image_file', 'emotion' and 'split'.
ARTEMIS_PATH = "data/artemis-v2/dataset/combined/train/artemis_preprocessed.csv"
# WIKIART_ROOT = "data/wikiart_extracted/"

# Subset configuration
# The subset path is taken only when USE_SUBSET is True AND 0 < SUBSET_FRACTION < 1;
# any other combination uses the full train/val sets.
USE_SUBSET = False        # Set to True to use a subset, False to use all train/val data
SUBSET_FRACTION = 0.05     # Fraction of the combined train+val data to use (e.g., 0.1 for 10%)

# Training configuration
# BATCH_SIZE and NUM_WORKERS are passed to every DataLoader in this notebook.
BATCH_SIZE = 32
NUM_WORKERS = 4
# Number of full passes over the training loader.
NUM_EPOCHS = 20
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 1e-4
# Seeds torch / numpy / random and is reused as random_state for subset sampling.
RANDOM_SEED = 6552
# CUDA when available, otherwise CPU; the model and every batch are moved here.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Timestamp captured when this cell runs; embedded in the names of saved models,
# figures and CSV files so separate runs do not overwrite each other.
CURRENT_TIME = pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")

In [14]:
import unicodedata

# Seed every random number generator used in this notebook (Python, NumPy, PyTorch).
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

df = pd.read_csv(ARTEMIS_PATH)

print(f"Loaded {len(df)} annotations.")

# Bring every image path to Unicode NFC form.
df['image_file'] = [unicodedata.normalize('NFC', path) for path in df['image_file']]

# Emotion names in sorted order define the class indices used everywhere below.
emotions = sorted(df['emotion'].unique())
num_classes = len(emotions)
idx_to_emotion = dict(enumerate(emotions))
emotion_to_idx = {name: idx for idx, name in idx_to_emotion.items()}
print(f"Found {num_classes} unique emotion classes: {emotions}")

# Integer class index for every annotation row.
df['emotion_idx'] = df['emotion'].map(emotion_to_idx)

# Collapse the annotations of each (image path, split) pair into one multi-hot target.
grouped = df.groupby(['image_file', 'split'])


def _multi_hot(class_indices):
    """Return a float32 vector of length num_classes with 1.0 at each given index."""
    encoded = torch.zeros(num_classes, dtype=torch.float32)
    encoded[class_indices] = 1.0
    return encoded


multi_label_data = [
    {
        'image_file': img_path,
        'split': split,
        # Every distinct emotion index annotated for this image within this split.
        'multi_hot_label': _multi_hot(group['emotion_idx'].unique()),
    }
    for (img_path, split), group in tqdm(grouped, desc="Grouping Images")
]

df_unique_images = pd.DataFrame(multi_label_data)
print(f"Created multi-label data for {len(df_unique_images)} unique images across splits.")

# --- 5. Split Unique Image Data ---
def _images_in_split(split_name):
    """Rows of df_unique_images whose 'split' equals split_name, re-indexed from 0."""
    in_split = df_unique_images['split'] == split_name
    return df_unique_images[in_split].reset_index(drop=True)


train_df_full = _images_in_split('train')
val_df_full = _images_in_split('val')
test_df = _images_in_split('test')  # Kept apart from train/val; never subsampled below

print(f"Unique images per split: Train={len(train_df_full)}, Val={len(val_df_full)}, Test={len(test_df)}")


# --- 6. Subset Selection (Random Sample of Unique Images) ---
if not (USE_SUBSET and 0 < SUBSET_FRACTION < 1):
    print("Using full unique image sets for training and validation.")
    train_df = train_df_full
    val_df = val_df_full

else:
    print(f"Selecting a random {SUBSET_FRACTION:.0%} subset of UNIQUE train/val images...")

    # Pool the unique train and validation images, then draw the random subset.
    # No stratification: multi-hot targets make that non-trivial.
    train_val_unique_full = pd.concat([train_df_full, val_df_full], ignore_index=True)
    subset_df = train_val_unique_full.sample(
        frac=SUBSET_FRACTION,
        random_state=RANDOM_SEED,
    ).reset_index(drop=True)
    print(f"Total unique images in subset: {len(subset_df)}")

    # Re-split the subset using the train share of the original unique-image counts.
    n_train_val_full = len(train_df_full) + len(val_df_full)
    if n_train_val_full > 0:
        original_train_ratio = len(train_df_full) / n_train_val_full
    else:
        original_train_ratio = 1.0  # Nothing to split at all

    if len(subset_df) >= 2 and 0 < original_train_ratio < 1:
        # train_test_split is used purely to partition the row index (unstratified).
        train_indices, val_indices = train_test_split(
            subset_df.index,
            train_size=original_train_ratio,
            random_state=RANDOM_SEED,
        )
        train_df = subset_df.loc[train_indices].reset_index(drop=True)
        val_df = subset_df.loc[val_indices].reset_index(drop=True)
    else:
        print("Warning: Subset too small or original ratio invalid for train/val split. Using entire subset for training.")
        train_df = subset_df
        val_df = pd.DataFrame(columns=subset_df.columns)  # Empty frame, same columns

    print(f"Unique images in subset splits: Train={len(train_df)}, Val={len(val_df)}")

Loaded 475996 annotations.
Found 9 unique emotion classes: ['amusement', 'anger', 'awe', 'contentment', 'disgust', 'excitement', 'fear', 'sadness', 'something else']


Grouping Images:   0%|          | 0/80317 [00:00<?, ?it/s]

Created multi-label data for 80317 unique images across splits.
Unique images per split: Train=69529, Val=3598, Test=7190
Using full unique image sets for training and validation.


In [10]:
# Example with an absolute path (replace with your actual path)
# NOTE: despite the variable name and the messages below, this value is relative
# to the current working directory.
absolute_image_path = "data/wikiart_extracted/Naive_Art_Primitivism/grã©goire-michonze_figures-in-the-village(1).jpg"

# Check for the file first so a missing image prints a message instead of raising.
if os.path.exists(absolute_image_path):
    img = Image.open(absolute_image_path)
    print("Opened with absolute path")
else:
    print(f"Absolute path not found: {absolute_image_path}")

Absolute path not found: data/wikiart_extracted/Naive_Art_Primitivism/grã©goire-michonze_figures-in-the-village(1).jpg


In [15]:
class MultiLabelImageDataset(Dataset):
    """Dataset that yields (image, multi-hot label) pairs from a DataFrame.

    Each row supplies an image path ('image_file') and a pre-computed label
    ('multi_hot_label'). Samples whose image cannot be loaded are returned as
    (None, None) so that a collate function can drop them.
    """

    def __init__(self, dataframe, transform=None):
        """
        Args:
            dataframe (pd.DataFrame): DataFrame with unique images.
                                      Must have 'image_file' and 'multi_hot_label'.
            transform (callable, optional): Transform to apply to the image.

        Raises:
            ValueError: If either required column is missing.
        """
        if 'image_file' not in dataframe.columns or 'multi_hot_label' not in dataframe.columns:
            raise ValueError("DataFrame must contain 'image_file' and 'multi_hot_label'")
        self.df = dataframe
        self.transform = transform

    def __len__(self):
        """Number of rows (images) in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return (image, label) for the row at position idx, or (None, None) on load failure."""
        # idx is a position in [0, len(self)), so index positionally. Label-based
        # .loc would fail or pick the wrong row when the index is not 0..n-1
        # (for example a filtered or sampled frame that was not reset).
        img_path = self.df['image_file'].iloc[idx]
        # The label is stored pre-computed in the frame and returned as-is.
        label = self.df['multi_hot_label'].iloc[idx]

        try:
            # convert() returns a new, fully loaded image, so the file handle
            # can be released as soon as the with-block exits.
            with Image.open(img_path) as source:
                image = source.convert('RGB')
        except FileNotFoundError:
            print(f"ERROR: Image file not found: {img_path}")
            return None, None  # Indicate error
        except Exception as e:
            # Deliberately broad: one unreadable image should not abort an epoch.
            print(f"ERROR: Could not load/process image {img_path}: {e}")
            return None, None  # Indicate error

        if self.transform:
            image = self.transform(image)

        return image, label

# --- 8. Transforms (Same as before) ---
# Per-channel statistics shared by both pipelines (the usual values for
# ImageNet-pretrained torchvision models such as the ResNet-50 used below).
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]


def _tensor_and_normalize():
    """Final two steps common to both pipelines: PIL image -> tensor -> normalised tensor."""
    return [
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]


# Training: random crop, flip and colour jitter for augmentation.
train_tf = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    *_tensor_and_normalize(),
])

# Validation / test: deterministic resize followed by a centre crop.
val_tf = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    *_tensor_and_normalize(),
])

# --- 9. DataLoaders (Same collate_fn) ---
print("Creating DataLoaders...")
train_ds = MultiLabelImageDataset(train_df, transform=train_tf)
# A validation dataset exists only when there are validation rows; otherwise it is None.
val_ds = MultiLabelImageDataset(val_df, transform=val_tf) if not val_df.empty else None


def collate_fn_skip_error(batch):
    """Collate a batch while dropping samples that failed to load.

    A sample is dropped when it is None or when either of its first two
    elements is None. Returns (None, None) if nothing usable remains;
    otherwise the result of PyTorch's default collate on the kept samples.
    """
    usable = [
        sample
        for sample in batch
        if sample is not None and sample[0] is not None and sample[1] is not None
    ]
    if len(usable) == 0:
        return None, None
    return torch.utils.data.dataloader.default_collate(usable)

# Settings shared by the training and validation loaders. Worker processes are
# kept alive between epochs whenever worker processes are used at all.
_loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=False,  # pin_memory=False for CPU usually
    collate_fn=collate_fn_skip_error,
    persistent_workers=NUM_WORKERS > 0,
)

train_loader = DataLoader(train_ds, shuffle=True, **_loader_kwargs)

if val_ds:
    val_loader = DataLoader(val_ds, shuffle=False, **_loader_kwargs)
else:
    val_loader = None
    print("Validation set is empty, skipping validation loader creation.")


# --- 10. Model Definition (Output layer unchanged, interpretation changes) ---
print("Setting up model...")
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
# Swap the classification head for one with a single output (logit) per emotion class.
in_features = model.fc.in_features
model.fc = nn.Linear(in_features=in_features, out_features=num_classes)
# Place the model on DEVICE (CUDA when available, otherwise CPU).
model.to(DEVICE)

# --- 11. Loss Function and Optimizer (CHANGED FOR MULTI-LABEL) ---
# BCEWithLogitsLoss takes raw logits and multi-hot float targets.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# --- 12. Training Loop (Adapted for Multi-Label) ---
def _run_epoch(model, loader, criterion, device, desc, optimizer=None):
    """Run one pass over `loader` and return (mean loss per sample, samples processed).

    With an optimizer the model is put in train mode and updated after every
    batch; without one it is put in eval mode and gradients are disabled.
    Batches delivered as (None, None) by the collate function are skipped.
    The mean loss is 0 when no sample was processed.
    """
    is_training = optimizer is not None
    model.train(is_training)

    loss_sum = 0.0
    n_samples = 0
    progress = tqdm(loader, desc=desc, leave=False)

    with torch.set_grad_enabled(is_training):
        for imgs, labels in progress:
            if imgs is None or labels is None:
                continue  # Every sample in this batch failed to load

            # Labels are multi-hot float tensors produced by the dataset.
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)  # Raw logits
            loss = criterion(outputs, labels)

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Weight by batch size so the epoch figure is a per-sample mean.
            loss_sum += loss.item() * imgs.size(0)
            n_samples += imgs.size(0)
            progress.set_postfix(loss=loss.item())

    mean_loss = loss_sum / n_samples if n_samples > 0 else 0
    return mean_loss, n_samples


print(f"\nStarting training for {NUM_EPOCHS} epochs...")

for epoch in range(NUM_EPOCHS):
    # --- Training Phase ---
    # Only the loss is tracked; accuracy is not well defined for multi-label targets.
    avg_train_loss, processed_samples_train = _run_epoch(
        model, train_loader, criterion, DEVICE,
        desc=f"Epoch {epoch+1}/{NUM_EPOCHS} [Train]",
        optimizer=optimizer,
    )

    # --- Validation Phase ---
    avg_val_loss = float('nan')
    processed_samples_val = 0
    if val_loader:
        avg_val_loss, processed_samples_val = _run_epoch(
            model, val_loader, criterion, DEVICE,
            desc=f"Epoch {epoch+1}/{NUM_EPOCHS} [Val]",
        )

    # --- Log Epoch Results (Focus on Loss) ---
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} - "
          f"Train Loss: {avg_train_loss:.4f} - "
          f"Val Loss: {avg_val_loss:.4f}")

    # Checkpoint after every epoch.
    torch.save(model.state_dict(), f'wikiart_emotion_ml_epoch_{epoch+1}.pth')


print("\nTraining finished.")

# Save final model
torch.save(model.state_dict(), f'wikiart_emotion_ml_{CURRENT_TIME}.pth')

Creating DataLoaders...
Setting up model...

Starting training for 20 epochs...


Epoch 1/20 [Train]:   0%|          | 0/2173 [00:00<?, ?it/s]



Epoch 1/20 [Val]:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 1/20 - Train Loss: 0.5866 - Val Loss: 0.6100


Epoch 2/20 [Train]:   0%|          | 0/2173 [00:00<?, ?it/s]



Epoch 2/20 [Val]:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 2/20 - Train Loss: 0.5808 - Val Loss: 0.5989


Epoch 3/20 [Train]:   0%|          | 0/2173 [00:00<?, ?it/s]



Epoch 3/20 [Val]:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 3/20 - Train Loss: 0.5781 - Val Loss: 0.6065


Epoch 4/20 [Train]:   0%|          | 0/2173 [00:00<?, ?it/s]



Epoch 4/20 [Val]:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 4/20 - Train Loss: 0.5769 - Val Loss: 0.5979


Epoch 5/20 [Train]:   0%|          | 0/2173 [00:00<?, ?it/s]

Epoch 5/20 [Val]:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 5/20 - Train Loss: 0.5747 - Val Loss: 0.5984


Epoch 6/20 [Train]:   0%|          | 0/2173 [00:00<?, ?it/s]

Epoch 6/20 [Val]:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 6/20 - Train Loss: 0.5731 - Val Loss: 0.5939


Epoch 7/20 [Train]:   0%|          | 0/2173 [00:00<?, ?it/s]



Epoch 7/20 [Val]:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 7/20 - Train Loss: 0.5717 - Val Loss: 0.5940


Epoch 8/20 [Train]:   0%|          | 0/2173 [00:00<?, ?it/s]

Epoch 8/20 [Val]:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 8/20 - Train Loss: 0.5703 - Val Loss: 0.5940


Epoch 9/20 [Train]:   0%|          | 0/2173 [00:00<?, ?it/s]

Epoch 9/20 [Val]:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 9/20 - Train Loss: 0.5689 - Val Loss: 0.5960


Epoch 10/20 [Train]:   0%|          | 0/2173 [00:00<?, ?it/s]

Epoch 10/20 [Val]:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 10/20 - Train Loss: 0.5672 - Val Loss: 0.5885


Epoch 11/20 [Train]:   0%|          | 0/2173 [00:00<?, ?it/s]

Epoch 11/20 [Val]:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 11/20 - Train Loss: 0.5661 - Val Loss: 0.5891


Epoch 12/20 [Train]:   0%|          | 0/2173 [00:00<?, ?it/s]

Epoch 12/20 [Val]:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 12/20 - Train Loss: 0.5646 - Val Loss: 0.5952


Epoch 13/20 [Train]:   0%|          | 0/2173 [00:00<?, ?it/s]

Epoch 13/20 [Val]:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 13/20 - Train Loss: 0.5634 - Val Loss: 0.5963


Epoch 14/20 [Train]:   0%|          | 0/2173 [00:00<?, ?it/s]

Epoch 14/20 [Val]:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 14/20 - Train Loss: 0.5616 - Val Loss: 0.5932


Epoch 15/20 [Train]:   0%|          | 0/2173 [00:00<?, ?it/s]

Epoch 15/20 [Val]:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 15/20 - Train Loss: 0.5603 - Val Loss: 0.5900


Epoch 16/20 [Train]:   0%|          | 0/2173 [00:00<?, ?it/s]

Epoch 16/20 [Val]:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 16/20 - Train Loss: 0.5588 - Val Loss: 0.5996


Epoch 17/20 [Train]:   0%|          | 0/2173 [00:00<?, ?it/s]

Epoch 17/20 [Val]:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 17/20 - Train Loss: 0.5571 - Val Loss: 0.5931


Epoch 18/20 [Train]:   0%|          | 0/2173 [00:00<?, ?it/s]

Epoch 18/20 [Val]:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 18/20 - Train Loss: 0.5557 - Val Loss: 0.5993


Epoch 19/20 [Train]:   0%|          | 0/2173 [00:00<?, ?it/s]

Epoch 19/20 [Val]:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 19/20 - Train Loss: 0.5537 - Val Loss: 0.5943


Epoch 20/20 [Train]:   0%|          | 0/2173 [00:00<?, ?it/s]

Epoch 20/20 [Val]:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 20/20 - Train Loss: 0.5525 - Val Loss: 0.5983

Training finished.


In [16]:
# --- Evaluation Loop with Multi-Label Metrics ---
from sklearn.metrics import hamming_loss, f1_score, precision_score, recall_score, accuracy_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_model(model, data_loader, threshold=0.5, device=DEVICE, class_names=None):
    """
    Evaluate a trained multi-label classification model.

    Args:
        model: Trained PyTorch model; its outputs are treated as per-class logits.
        data_loader: DataLoader yielding (images, labels) batches. Batches in which
            either element is None are skipped.
        threshold: Probability threshold for a positive prediction (default: 0.5).
        device: Device to run evaluation on.
        class_names: Class names, in label-column order, used to build the
            per-class metric keys. Defaults to the notebook-level `emotions` list.

    Returns:
        Tuple (metrics, y_true, y_pred, y_prob):
            metrics: dict of aggregate metrics plus 'precision_<name>',
                'recall_<name>', 'f1_<name>' and 'auc_<name>' for every class.
            y_true: stacked ground-truth labels, one row per evaluated sample.
            y_pred: thresholded 0/1 predictions, same shape as y_true.
            y_prob: sigmoid probabilities, same shape as y_true.

    Raises:
        ValueError: If data_loader is None, if no sample could be evaluated, or
            if class_names does not match the number of label columns.
    """
    if data_loader is None:
        raise ValueError("data_loader is None; nothing to evaluate.")
    if class_names is None:
        class_names = emotions

    model.eval()
    all_labels = []
    all_predictions = []
    all_probs = []

    with torch.no_grad():
        for images, labels in tqdm(data_loader, desc="Evaluating"):
            if images is None or labels is None:
                continue

            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            probs = torch.sigmoid(outputs)
            preds = (probs >= threshold).float()

            all_labels.append(labels.cpu().numpy())
            all_predictions.append(preds.cpu().numpy())
            all_probs.append(probs.cpu().numpy())

    # np.vstack fails with an unhelpful message on an empty list; say what happened.
    if not all_labels:
        raise ValueError(
            "No samples were evaluated: the data loader is empty or every batch failed to load."
        )

    # Concatenate batch results
    y_true = np.vstack(all_labels)
    y_pred = np.vstack(all_predictions)
    y_prob = np.vstack(all_probs)

    if len(class_names) != y_true.shape[1]:
        raise ValueError(
            f"Got {len(class_names)} class names for {y_true.shape[1]} label columns."
        )

    # Aggregate metrics. zero_division=0 is passed everywhere so classes or samples
    # with no positives score 0 without emitting warnings.
    metrics = {
        'hamming_loss': hamming_loss(y_true, y_pred),
        'exact_match': accuracy_score(y_true, y_pred),  # All labels must match
        'f1_micro': f1_score(y_true, y_pred, average='micro', zero_division=0),
        'f1_macro': f1_score(y_true, y_pred, average='macro', zero_division=0),
        'f1_weighted': f1_score(y_true, y_pred, average='weighted', zero_division=0),
        'f1_samples': f1_score(y_true, y_pred, average='samples', zero_division=0),
        'precision_micro': precision_score(y_true, y_pred, average='micro', zero_division=0),
        'recall_micro': recall_score(y_true, y_pred, average='micro', zero_division=0),
        'precision_macro': precision_score(y_true, y_pred, average='macro', zero_division=0),
        'recall_macro': recall_score(y_true, y_pred, average='macro', zero_division=0),
    }

    # Per-class metrics
    for i, name in enumerate(class_names):
        metrics[f'precision_{name}'] = precision_score(y_true[:, i], y_pred[:, i], zero_division=0)
        metrics[f'recall_{name}'] = recall_score(y_true[:, i], y_pred[:, i], zero_division=0)
        metrics[f'f1_{name}'] = f1_score(y_true[:, i], y_pred[:, i], zero_division=0)

        # ROC AUC is only computed when both classes occur in the ground truth.
        if len(np.unique(y_true[:, i])) > 1:
            metrics[f'auc_{name}'] = roc_auc_score(y_true[:, i], y_prob[:, i])
        else:
            metrics[f'auc_{name}'] = float('nan')

    return metrics, y_true, y_pred, y_prob

# --- Create test loader if you haven't already ---
# Evaluate on the test split when one exists and is non-empty; otherwise reuse
# the validation loader.
_has_test_split = 'test_df' in locals() and len(test_df) > 0

if not _has_test_split:
    print("No test set available. Using validation set for evaluation.")
    test_loader = val_loader
else:
    # Same deterministic preprocessing as validation, original row order kept.
    test_ds = MultiLabelImageDataset(test_df, transform=val_tf)
    test_loader = DataLoader(
        test_ds,
        shuffle=False,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        pin_memory=False,
        collate_fn=collate_fn_skip_error,
    )
    print(f"Created test loader with {len(test_ds)} images")

# --- Run evaluation ---
print("\nEvaluating model on test set...")
metrics, y_true, y_pred, y_prob = evaluate_model(model, test_loader)

# --- Print metrics ---
print("\n===== Model Evaluation Results =====")
print(f"Hamming Loss: {metrics['hamming_loss']:.4f} (lower is better)")
# The remaining headline metrics share one "<title>: <value>" layout.
for _title, _key in (
    ("Exact Match Accuracy", 'exact_match'),
    ("Micro-F1 Score", 'f1_micro'),
    ("Macro-F1 Score", 'f1_macro'),
    ("Micro-Precision", 'precision_micro'),
    ("Micro-Recall", 'recall_micro'),
):
    print(f"{_title}: {metrics[_key]:.4f}")

# --- Plot confusion matrix for each emotion ---
# Size the grid from the number of classes so every class gets its own panel
# (three columns; nine classes give the 3x3, 15x12-inch layout).
_n_cols = 3
_n_rows = max(1, -(-len(emotions) // _n_cols))  # ceiling division
plt.figure(figsize=(15, 4 * _n_rows))
for i, emotion in enumerate(emotions):
    plt.subplot(_n_rows, _n_cols, i + 1)
    # labels=[0, 1] forces a 2x2 matrix even when a class has only one value in
    # this column, so the matrix always matches the two tick labels below.
    cm = confusion_matrix(
        y_true[:, i].astype(int),
        y_pred[:, i].astype(int),
        labels=[0, 1],
    )
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Not '+emotion, emotion],
                yticklabels=['Not '+emotion, emotion])
    # AUC is NaN when it could not be computed; show it as N/A.
    _auc = metrics.get(f'auc_{emotion}', float('nan'))
    _auc_text = 'N/A' if np.isnan(_auc) else f"{_auc:.2f}"
    plt.title(f"{emotion}: F1={metrics[f'f1_{emotion}']:.2f}, AUC={_auc_text}")
    plt.xlabel('Predicted')
    plt.ylabel('True')
plt.tight_layout()
plt.savefig(f'emotion_confusion_matrices_{CURRENT_TIME}.png', dpi=300)
plt.show()

# --- Plot the distribution of predicted emotions ---
# Side-by-side bars per emotion: number of samples carrying the label in the
# ground truth versus in the thresholded predictions.
fig, ax = plt.subplots(figsize=(12, 6))
indices = np.arange(len(emotions))
width = 0.35
emotion_counts_true = y_true.sum(axis=0)
emotion_counts_pred = y_pred.sum(axis=0)

ax.bar(indices - width/2, emotion_counts_true, width, label='True')
ax.bar(indices + width/2, emotion_counts_pred, width, label='Predicted')
ax.set_xticks(indices)
ax.set_xticklabels(emotions, rotation=45, ha='right')
ax.set_xlabel('Emotion')
ax.set_ylabel('Count')
ax.set_title('Distribution of True vs Predicted Emotions')
ax.legend()
fig.tight_layout()
fig.savefig(f'emotion_distribution_{CURRENT_TIME}.png', dpi=300)
plt.show()

# --- Save evaluation results ---
results_df = pd.DataFrame({
    'Metric': list(metrics.keys()),
    'Value': list(metrics.values())
})
results_df.to_csv(f'evaluation_results_{CURRENT_TIME}.csv', index=False)
print(f"Evaluation results saved to evaluation_results_{CURRENT_TIME}.csv")

# --- Save predictions for further analysis ---
pred_df = pd.DataFrame(y_prob, columns=emotions)
if 'test_df' in locals() and len(test_df) > 0:
    # Images that fail to load are dropped by the collate function, so y_prob can
    # have fewer rows than test_df. Paths are attached only when the row counts
    # match: the loader is unshuffled, so equal counts mean nothing was dropped
    # and the rows are in test_df order.
    if len(pred_df) == len(test_df):
        pred_df['image_file'] = test_df['image_file'].values
    else:
        print(f"Warning: {len(test_df) - len(pred_df)} test images were skipped during evaluation; "
              "saving predictions without image paths because rows cannot be aligned.")

pred_df.to_csv(f'predictions_{CURRENT_TIME}.csv', index=False)
print(f"Predictions saved to predictions_{CURRENT_TIME}.csv")

Created test loader with 7190 images

Evaluating model on test set...


Evaluating:   0%|          | 0/225 [00:00<?, ?it/s]


===== Model Evaluation Results =====
Hamming Loss: 0.3121 (lower is better)
Exact Match Accuracy: 0.0441
Micro-F1 Score: 0.4879
Macro-F1 Score: 0.3650
Micro-Precision: 0.6482
Micro-Recall: 0.3912


  plt.show()


Evaluation results saved to evaluation_results_2025-05-15_14-06-46.csv
Predictions saved to predictions_2025-05-15_14-06-46.csv


  plt.show()
