In [1]:
# ## Cell 1: Imports
# Import necessary libraries.

# %%
import os
import glob
import pandas as pd
import numpy as np
import torch
import torchaudio
import librosa
import evaluate # Hugging Face evaluation library
import warnings
import gc # Garbage collector
import types # Import the types module for binding methods
import traceback # For printing tracebacks

from datasets import Dataset, DatasetDict, Audio, Value
from sklearn.model_selection import train_test_split
from transformers import (
    AutoConfig,
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    TrainingArguments,
    Trainer,
    pipeline,
    Wav2Vec2FeatureExtractor, # Explicit import
    DataCollatorWithPadding,
    EarlyStoppingCallback # Import for early stopping
)
# Import for Augmentation
from audiomentations import Compose, AddGaussianNoise, PitchShift, TimeStretch

# Import for PEFT (LoRA/QLoRA)
from peft import LoraConfig, get_peft_model, PeftModel, prepare_model_for_kbit_training

# Suppress less critical warnings
warnings.filterwarnings('ignore')

# Print library versions (optional but good practice)
import transformers
import peft
import accelerate
import datasets
print("--- Library Versions ---")
print("Transformers:", transformers.__version__)
print("PEFT:", peft.__version__)
print("Accelerate:", accelerate.__version__)
print("Datasets:", datasets.__version__)
print("Torch:", torch.__version__)
print("Evaluate:", evaluate.__version__)
print("Audiomentations:", pd.__version__) # Assuming you meant pandas or audiomentations here
print("----------------------")
print("--- Imports successful ---")

  from .autonotebook import tqdm as notebook_tqdm


--- Library Versions ---
Transformers: 4.51.3
PEFT: 0.15.2
Accelerate: 1.6.0
Datasets: 3.5.0
Torch: 2.6.0+cu124
Evaluate: 0.4.3
Audiomentations: 2.2.3
----------------------
--- Imports successful ---


In [None]:
# ## Cell 2: Configuration
# Define paths, model parameters, training hyperparameters, and LoRA settings.

# %%
print("--- Configuring paths and parameters ---")

# --- Data Path ---
RAVDESS_DATA_PATH = "./RAVDESS Emotional speech audio" #<-- ADJUST IF YOUR PATH IS DIFFERENT

# --- Model Configuration ---
MODEL_ID = "superb/wav2vec2-base-superb-er"
MODEL_NAME = MODEL_ID.split("/")[-1]
# Give a new output directory name for this improved run using LORA
FINETUNED_MODEL_OUTPUT_DIR = f"./{MODEL_NAME}-finetuned-full-ravdess-v2-lora" # <--- Added -lora

# --- RAVDESS Specific Configuration ---
RAVDESS_EMOTION_MAP = {
    "01": "neutral",    # Neutral
    "02": "neutral",    # Calm -> Mapped to Neutral
    "03": "happy",      # Happy
    "04": "sad",        # Sad
    "05": "angry",      # Angry
    "06": "fearful",    # Fearful
    "07": "disgust",    # Disgust
    "08": "surprised"   # Surprised
}
TARGET_LABELS = sorted(list(set(RAVDESS_EMOTION_MAP.values()))) # Get unique mapped labels
TARGET_MODALITY = "03" # Audio-only
TARGET_VOCAL_CHANNEL = "01" # Speech
label2id = {label: i for i, label in enumerate(TARGET_LABELS)}
id2label = {i: label for label, i in label2id.items()}
NUM_LABELS = len(TARGET_LABELS)

# --- Training Configuration ---
NUM_EPOCHS = 15 # Increase number of epochs again, LoRA might need more
LEARNING_RATE = 1e-4 # Try a slightly higher LR for LoRA adapters (can be tuned)
BATCH_SIZE = 4   # Keep or adjust based on your GPU memory
GRADIENT_ACCUMULATION_STEPS = 4 # Keep this
EVAL_STRATEGY = "epoch" # Evaluate each epoch
SAVE_STRATEGY = "epoch" # Save each epoch
LOGGING_STEPS = 50 # Log every 50 steps
LOAD_BEST_MODEL_AT_END = True # Load the best model based on validation metric
METRIC_FOR_BEST_MODEL = "eval_f1" # Use F1-score (requires 'f1' key from compute_metrics)
GREATER_IS_BETTER = True # F1/Accuracy should be maximized
SEED = 42
EARLY_STOPPING_PATIENCE = 5 # Increase patience slightly
SAVE_TOTAL_LIMIT = 3 # Keep more checkpoints if needed


# --- LoRA Configuration ---
LORA_R = 32 # LoRA attention dimension (rank)
LORA_ALPHA = 64 # Alpha parameter for LoRA scaling
LORA_DROPOUT = 0.05 # Dropout probability for LoRA layers
LORA_TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "out_proj"] # Common for attention layers

# --- Print Configuration ---
print(f"RAVDESS Path: {RAVDESS_DATA_PATH}")
print(f"Base Model ID: {MODEL_ID}")
print(f"Output Directory: {FINETUNED_MODEL_OUTPUT_DIR}")
print(f"Target Labels: {TARGET_LABELS}")
print(f"Number of Labels: {NUM_LABELS}")
print(f"Label2ID: {label2id}")
print(f"ID2Label: {id2label}")
print(f"Number of Epochs: {NUM_EPOCHS}")
print(f"Learning Rate: {LEARNING_RATE}")
print(f"Batch Size: {BATCH_SIZE}")
print(f"Gradient Accumulation: {GRADIENT_ACCUMULATION_STEPS}")
print(f"Effective Batch Size: {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")
print(f"Eval Strategy: {EVAL_STRATEGY}")
print(f"Load Best Model: {LOAD_BEST_MODEL_AT_END}")
print(f"Metric for Best Model: {METRIC_FOR_BEST_MODEL}")
print(f"Early Stopping Patience: {EARLY_STOPPING_PATIENCE}")
print(f"LoRA Parameters: r={LORA_R}, alpha={LORA_ALPHA}, dropout={LORA_DROPOUT}, target_modules={LORA_TARGET_MODULES}")
print("--- Configuration loaded ---")

--- Configuring paths and parameters ---
RAVDESS Path: ./RAVDESS Emotional speech audio
Base Model ID: superb/wav2vec2-base-superb-er
Output Directory: ./wav2vec2-base-superb-er-finetuned-full-ravdess-v2-lora
Target Labels: ['angry', 'disgust', 'fearful', 'happy', 'neutral', 'sad', 'surprised']
Number of Labels: 7
Label2ID: {'angry': 0, 'disgust': 1, 'fearful': 2, 'happy': 3, 'neutral': 4, 'sad': 5, 'surprised': 6}
ID2Label: {0: 'angry', 1: 'disgust', 2: 'fearful', 3: 'happy', 4: 'neutral', 5: 'sad', 6: 'surprised'}
Number of Epochs: 15
Learning Rate: 0.0001
Batch Size: 4
Gradient Accumulation: 4
Effective Batch Size: 16
Eval Strategy: epoch
Load Best Model: True
Metric for Best Model: eval_f1
Early Stopping Patience: 5
LoRA Parameters: r=32, alpha=64, dropout=0.05, target_modules=['q_proj', 'k_proj', 'v_proj', 'out_proj']
--- Configuration loaded ---


In [3]:
# ## Cell 3: Load and Prepare RAVDESS Data
# Scan the RAVDESS directory, filter audio files based on naming convention, and create an initial DataFrame.

# %%
# =====================================================
print("\n--- Loading and preparing FULL RAVDESS data ---")
# =====================================================

audio_files = []
emotion_labels = []

if not os.path.exists(RAVDESS_DATA_PATH):
    raise FileNotFoundError(f"The specified RAVDESS path does not exist: {RAVDESS_DATA_PATH}\nPlease double-check the path and Cell 2.")

# Use glob to find all wav files recursively within ALL actor folders
for file_path in glob.glob(os.path.join(RAVDESS_DATA_PATH, "Actor_*", "*.wav")):
    basename = os.path.basename(file_path)
    try:
        parts = basename.split('.')[0].split('-')
        # Filter based on RAVDESS filename structure
        if len(parts) == 7 and parts[0] == TARGET_MODALITY and parts[1] == TARGET_VOCAL_CHANNEL:
            emotion_code = parts[2]
            if emotion_code in RAVDESS_EMOTION_MAP:
                audio_files.append(file_path)
                # Use the mapped emotion label
                emotion_labels.append(RAVDESS_EMOTION_MAP[emotion_code])
    except Exception as e:
        print(f"Warning: Could not parse filename {basename}: {e}")


if not audio_files:
     raise ValueError(f"No audio files matching the criteria (Modality={TARGET_MODALITY}, VocalChannel={TARGET_VOCAL_CHANNEL}) found in {RAVDESS_DATA_PATH}. Check the path and file naming.")

print(f"Found {len(audio_files)} audio files from all Actor folders.")

# --- Create a Pandas DataFrame ---
df = pd.DataFrame({"audio": audio_files, "label": emotion_labels})

# --- Check label distribution (using the mapped labels) ---
print("\nLabel Distribution (Mapped):")
print(df['label'].value_counts())

# --- Clean up unused variables ---
del audio_files
del emotion_labels
gc.collect()
print("--- Initial DataFrame created ---")


--- Loading and preparing FULL RAVDESS data ---
Found 1440 audio files from all Actor folders.

Label Distribution (Mapped):
label
neutral      288
happy        192
sad          192
angry        192
fearful      192
disgust      192
surprised    192
Name: count, dtype: int64
--- Initial DataFrame created ---


In [4]:
# ## Cell 4: Split Data and Create Datasets
# Split the DataFrame into train, validation, and test sets, then convert them to Hugging Face `Dataset` objects within a `DatasetDict`.

# %%
# =====================================================
print("\n--- Splitting data into train, validation, and test sets ---")
# =====================================================

# Use a standard 70% train, 15% validation, 15% test split
train_val_df, test_df = train_test_split(df, test_size=0.15, random_state=SEED, stratify=df['label'])
# Calculate validation split size relative to the remaining data (train_val_df)
val_size_relative = 0.15 / (1.0 - 0.15) # 0.15 / 0.85
train_df, val_df = train_test_split(train_val_df, test_size=val_size_relative, random_state=SEED, stratify=train_val_df['label'])

print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

# --- Convert Pandas DataFrames to Hugging Face Datasets ---
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

# --- Combine into a DatasetDict ---
raw_datasets = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

print("\nRaw datasets structure:")
print(raw_datasets)

# --- Clean up ---
del df
del train_val_df
del test_df
del train_df
del val_df
del train_dataset
del val_dataset
del test_dataset
gc.collect()
print("--- Data split and DatasetDict created ---")


--- Splitting data into train, validation, and test sets ---
Train set size: 1008
Validation set size: 216
Test set size: 216

Raw datasets structure:
DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 1008
    })
    validation: Dataset({
        features: ['audio', 'label'],
        num_rows: 216
    })
    test: Dataset({
        features: ['audio', 'label'],
        num_rows: 216
    })
})
--- Data split and DatasetDict created ---


In [5]:
# ## Cell 5: Feature Extractor, Augmentation, and Preprocessing Function
# Load the feature extractor, set up audio augmentation, and define the function to process audio data (including casting, resampling, augmenting, tokenizing, padding/truncating).

# %%
# =====================================================
print("\n--- Setting up Feature Extractor and Augmentation ---")
# =====================================================

# --- Load Feature Extractor ---
# Use the specific class if known, otherwise AutoFeatureExtractor is fine
try:
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID, do_normalize=True)
    print("Wav2Vec2FeatureExtractor loaded.")
except Exception:
    print("Wav2Vec2FeatureExtractor not found, falling back to AutoFeatureExtractor.")
    feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_ID, do_normalize=True)

TARGET_SAMPLING_RATE = feature_extractor.sampling_rate
print(f"Target sampling rate: {TARGET_SAMPLING_RATE}")

# --- Setup Audio Augmentation ---
augmenter = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.010, p=0.3), # Reduced amplitude noise
    PitchShift(min_semitones=-2, max_semitones=2, p=0.3),
    TimeStretch(min_rate=0.9, max_rate=1.1, p=0.3, leave_length_unchanged=False), # Time stretch slightly
])
print("Audio augmenter configured.")

# --- Cast the 'audio' column -- Essential Step! ---
# This performs resampling upon loading
try:
    print("Casting 'audio' column to Audio feature type...")
    raw_datasets = raw_datasets.cast_column("audio", Audio(sampling_rate=TARGET_SAMPLING_RATE))
    print("Successfully cast 'audio' column for all splits.")
    print("\nExample audio feature info after casting:")
    print(raw_datasets["train"].features["audio"])
except Exception as e:
    print(f"\nError casting audio column: {e}")
    print("Ensure 'ffmpeg' is installed and accessible in your environment.")
    print("Try: 'conda install ffmpeg' or 'sudo apt update && sudo apt install ffmpeg'")
    raise

# --- Define the preprocessing function ---
MAX_DURATION_SEC = 4.0 # Adjust if needed based on typical RAVDESS length

def preprocess_function(examples, is_train=False):
    try:
        # Load audio arrays using the datasets' Audio feature (already resampled)
        audio_arrays = [x["array"] for x in examples["audio"]]
        # Get rate from the first item, should match TARGET_SAMPLING_RATE after casting
        sampling_rate = examples["audio"][0]["sampling_rate"]

        # --- Apply augmentation ONLY to training data ---
        if is_train:
            processed_audio_arrays = []
            for audio_array in audio_arrays:
                 # Ensure audio is float32 for audiomentations
                 if audio_array.dtype != np.float32:
                      audio_array = audio_array.astype(np.float32)
                 # Apply augmentation
                 augmented_audio = augmenter(samples=audio_array, sample_rate=sampling_rate)
                 processed_audio_arrays.append(augmented_audio)
            audio_arrays = processed_audio_arrays # Replace with augmented audio
        # --- End Augmentation ---

        # Apply feature extractor (padding/truncation)
        inputs = feature_extractor(
            audio_arrays,
            sampling_rate=sampling_rate,
            max_length=int(sampling_rate * MAX_DURATION_SEC),
            truncation=True,
            padding="max_length", # Pad to max_length
            return_attention_mask=True, # Ensure attention mask is returned
            return_tensors="np" # Return numpy arrays initially
        )

        # Map labels to IDs and ensure correct dtype for PyTorch
        label_ids = [label2id[label] for label in examples["label"]]
        inputs["labels"] = np.array(label_ids, dtype=np.int64) # Use int64 for CrossEntropyLoss

        return inputs
    except Exception as e:
        print(f"Error during preprocessing batch: {e}")
        traceback.print_exc()
        # Return an empty dict or raise error depending on desired behavior
        # Returning empty might cause issues later, raising is safer
        raise e

print(f"Preprocessing function defined. Max duration: {MAX_DURATION_SEC}s")
print("--- Feature extractor, augmentation, and preprocessing function set up ---")

# %% [markdown]


--- Setting up Feature Extractor and Augmentation ---
Wav2Vec2FeatureExtractor loaded.
Target sampling rate: 16000
Audio augmenter configured.
Casting 'audio' column to Audio feature type...
Successfully cast 'audio' column for all splits.

Example audio feature info after casting:
Audio(sampling_rate=16000, mono=True, decode=True, id=None)
Preprocessing function defined. Max duration: 4.0s
--- Feature extractor, augmentation, and preprocessing function set up ---


In [6]:
# ## Cell 6: Apply Preprocessing
# Use the `.map()` method to apply the `preprocess_function` to the train, validation, and test sets. Augmentation is applied only to the training set.

# %%
# =====================================================
print("\n--- Applying Preprocessing ---")
# =====================================================

print("\nApplying preprocessing function without augmentation for validation set...")
processed_val_dataset = raw_datasets["validation"].map(
    preprocess_function,
    batched=True,
    fn_kwargs={"is_train": False}, # Disable augmentation
    remove_columns=["audio", "label"] # Remove original columns
)

print("\nApplying preprocessing function without augmentation for test set...")
processed_test_dataset = raw_datasets["test"].map(
    preprocess_function,
    batched=True,
    fn_kwargs={"is_train": False}, # Disable augmentation
    remove_columns=["audio", "label"]
)

print("\nApplying preprocessing function WITH augmentation for train set...")
# Apply mapping to the train set separately to enable augmentation
processed_train_dataset = raw_datasets["train"].map(
    preprocess_function,
    batched=True,
    fn_kwargs={"is_train": True}, # Enable augmentation
    remove_columns=["audio", "label"]
)


# --- Combine into a new Processed DatasetDict ---
processed_datasets = DatasetDict({
    "train": processed_train_dataset,
    "validation": processed_val_dataset,
    "test": processed_test_dataset
})

print("\nProcessed datasets structure:")
print(processed_datasets)
print("\nExample processed train instance keys:", processed_datasets["train"][0].keys())
# Check dtypes after processing
print("Feature dtypes in processed train dataset:")
print(processed_datasets["train"].features)


# --- Clean up ---
del processed_train_dataset
del processed_val_dataset
del processed_test_dataset
del raw_datasets # We don't need the raw audio data loaded anymore
gc.collect()
print("--- Preprocessing applied ---")

# %% [markdown]


--- Applying Preprocessing ---

Applying preprocessing function without augmentation for validation set...


Map: 100%|██████████| 216/216 [00:02<00:00, 75.02 examples/s]



Applying preprocessing function without augmentation for test set...


Map: 100%|██████████| 216/216 [00:01<00:00, 195.90 examples/s]



Applying preprocessing function WITH augmentation for train set...


Map: 100%|██████████| 1008/1008 [00:20<00:00, 49.22 examples/s]



Processed datasets structure:
DatasetDict({
    train: Dataset({
        features: ['input_values', 'attention_mask', 'labels'],
        num_rows: 1008
    })
    validation: Dataset({
        features: ['input_values', 'attention_mask', 'labels'],
        num_rows: 216
    })
    test: Dataset({
        features: ['input_values', 'attention_mask', 'labels'],
        num_rows: 216
    })
})

Example processed train instance keys: dict_keys(['input_values', 'attention_mask', 'labels'])
Feature dtypes in processed train dataset:
{'input_values': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'labels': Value(dtype='int64', id=None)}
--- Preprocessing applied ---


In [7]:
# %% [markdown]
# ---
# ## Cell 7: Load Base Model and Apply PEFT (LoRA)
# Load the pre-trained base model, calculate its parameters, set up LoRA, apply PEFT, and print the trainable parameters.
# ---

# %%
# =====================================================
print("\n--- Loading pre-trained model for fine-tuning (PEFT/LoRA) ---")
# =====================================================
# Clear some memory before loading model
gc.collect()
if torch.cuda.is_available(): torch.cuda.empty_cache()

# --- Load the base model ---
print(f"Loading base model: {MODEL_ID}")
model = AutoModelForAudioClassification.from_pretrained(
    MODEL_ID,
    num_labels=NUM_LABELS,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
    # low_cpu_mem_usage=True, # Optional
)
model = model.float() # Ensure float32
print("Base model loaded and set to float32.")

# --- Calculate and Print Original Parameters ---
total_params_original = sum(p.numel() for p in model.parameters())
print(f"Original base model total parameters: {total_params_original:,}") # Added comma formatting

# --- Find classifier module names ---
classifier_modules = []
if hasattr(model, 'classifier'): classifier_modules.append("classifier")
if hasattr(model, 'projector'): classifier_modules.append("projector")
if hasattr(model, 'output_projection'): classifier_modules.append("output_projection")
if not classifier_modules:
    print("WARNING: Could not find default classifier names. Defaulting to 'classifier'.")
    classifier_modules = ["classifier"]
print(f"Identified classifier modules to keep trainable: {classifier_modules}")

# --- Setup LoRA Configuration ---
print("\n--- Setting up LoRA Configuration ---")
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    target_modules=LORA_TARGET_MODULES,
    modules_to_save=classifier_modules,
)
print(f"LoRA Config: {lora_config}")

# **** MONKEY-PATCH SECTION ****
# (Keep this section as it was)
def dummy_enable_input_require_grads(self):
    print("Skipping enable_input_require_grads via monkey-patch for Wav2Vec2.")
    pass
original_enable_grads_func = None
base_model_ref = model
if hasattr(base_model_ref, 'enable_input_require_grads'):
    original_enable_grads_func = base_model_ref.enable_input_require_grads
    print("Applying monkey-patch to model.enable_input_require_grads...")
    base_model_ref.enable_input_require_grads = types.MethodType(dummy_enable_input_require_grads, base_model_ref)
else:
    print("Model does not have 'enable_input_require_grads' method to patch.")
# ****************************

# --- Apply LoRA to the base model ---
print("\nApplying get_peft_model...")
peft_model = None
try:
    # Use the potentially patched 'model' (which is base_model_ref)
    peft_model = get_peft_model(base_model_ref, lora_config)
    print("Base model wrapped with LoRA adapters.")

    # --- Print Trainable Parameters (Shows Reduction) ---
    print("\n--- PEFT Model Parameters ---")
    peft_model.print_trainable_parameters() # THIS IS THE KEY PEFT FUNCTION

    # --- Move PEFT model to GPU if available ---
    if torch.cuda.is_available():
        device = torch.device("cuda"); peft_model.to(device)
        print(f"\nPEFT Model moved to {device}")
    else:
        device = torch.device("cpu"); print(f"\nCUDA not available. PEFT Model remains on {device}")

    # Reassign the main 'model' variable to the PEFT model for the Trainer
    model = peft_model

except Exception as e:
    print(f"\n--- Error during get_peft_model or subsequent steps: {e} ---")
    traceback.print_exc()
    print("PEFT model creation failed. 'model' variable holds the original base model.")
    # Ensure model variable still refers to the original if PEFT failed
    model = base_model_ref

finally:
    # (Keep the finally block for restoring the monkey patch as it was)
    if original_enable_grads_func is not None:
        target_for_restore = None
        # Check the type of the 'model' variable *after* the try block
        current_model_ref = model
        if isinstance(current_model_ref, PeftModel):
             target_for_restore = current_model_ref.get_base_model()
        elif 'base_model_ref' in locals():
             target_for_restore = base_model_ref

        if target_for_restore and hasattr(target_for_restore, 'enable_input_require_grads'):
             print("Attempting to restore original model.enable_input_require_grads...")
             target_for_restore.enable_input_require_grads = original_enable_grads_func
             print("Original method potentially restored.")
        else:
             print("Could not restore original enable_input_require_grads.")

print("\n--- Model loading and PEFT setup complete ---")


--- Loading pre-trained model for fine-tuning (PEFT/LoRA) ---
Loading base model: superb/wav2vec2-base-superb-er


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at superb/wav2vec2-base-superb-er and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 256]) in the checkpoint and torch.Size([7, 256]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Base model loaded and set to float32.
Original base model total parameters: 94,570,388
Identified classifier modules to keep trainable: ['classifier', 'projector']

--- Setting up LoRA Configuration ---
LoRA Config: LoraConfig(task_type=None, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=32, target_modules={'q_proj', 'k_proj', 'out_proj', 'v_proj'}, exclude_modules=None, lora_alpha=64, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=['classifier', 'projector'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)
Applying monkey-patch to model.enable_input_require_gra

In [None]:
# # %% [markdown]
# # ---
# # ## Cell 8: Define Evaluation Metrics
# # Define the `compute_metrics` function. Using REAL metrics now.
# # ---

# # %%
# # =====================================================
# print("\n--- Defining Evaluation Metrics ---")
# # =====================================================

# # --- APPROACH 1: Dummy compute_metrics (Commented Out) ---
# # def compute_metrics(eval_pred):
# #     """
# #     Dummy compute_metrics function that returns fixed values.
# #     Used for debugging the Trainer's evaluation flow.
# #     """
# #     print("\n--- Inside DUMMY compute_metrics ---")
# #     dummy_acc = 0.5; dummy_f1 = 0.5
# #     print(f"  Returning dummy metrics: {{'accuracy': {dummy_acc}, 'f1': {dummy_f1}}}")
# #     return {"accuracy": dummy_acc, "f1": dummy_f1}
# # print("DUMMY compute_metrics function defined.")


# # --- REAL COMPUTE_METRICS FUNCTION (ACTIVE) ---
# accuracy_metric = evaluate.load("accuracy")
# f1_metric = evaluate.load("f1")

# def compute_metrics(eval_pred):
#     """Computes accuracy and F1-score from model predictions."""
#     print("\n--- Inside REAL compute_metrics ---")
#     logits, labels = eval_pred # Trainer passes numpy arrays here normally
#     print(f"  Initial Logits type: {type(logits)}, Labels type: {type(labels)}")
#     # Logits might sometimes be tuples from prediction_step override, labels should be ndarray
#     if isinstance(logits, tuple): # Should not happen with our revised prediction_step, but safe check
#         print("  Logits is a tuple, taking first element.")
#         logits = logits[0]
#         print(f"  New Logits type: {type(logits)}")

#     # --- Robust Logits Conversion ---
#     try:
#         if not isinstance(logits, np.ndarray):
#             print(f"  Logits is not numpy array ({type(logits)}). Attempting conversion.")
#             if isinstance(logits, torch.Tensor):
#                  print("  Attempting conversion from torch.Tensor")
#                  logits = logits.detach().cpu().numpy()
#                  print(f"  Logits converted from tensor. Shape: {logits.shape}")
#             elif isinstance(logits, (list, tuple)) and len(logits) > 0 and all(isinstance(i, np.ndarray) for i in logits):
#                  print("  Attempting concatenation from list/tuple of arrays.")
#                  logits = np.concatenate(logits, axis=0)
#                  print(f"  Logits concatenated. Shape: {logits.shape}")
#             else:
#                  print(f"  ERROR: Unsupported logits type ({type(logits)}) for conversion.")
#                  return {} # Return empty dict on failure
#         else:
#              print(f"  Logits already numpy array. Shape: {logits.shape}")
#     except Exception as e:
#         print(f"  ERROR during logits conversion: {e}")
#         traceback.print_exc()
#         return {}

#     # --- Robust Labels Conversion ---
#     try:
#         if not isinstance(labels, np.ndarray):
#             print(f"  Labels is not numpy array ({type(labels)}). Attempting conversion.")
#             if isinstance(labels, torch.Tensor):
#                 print("  Attempting conversion from torch.Tensor")
#                 labels = labels.detach().cpu().numpy()
#                 print(f"  Labels converted from tensor. Shape: {labels.shape}")
#             elif isinstance(labels, (list, tuple)) and len(labels) > 0 and all(isinstance(i, np.ndarray) for i in labels):
#                  print("  Attempting concatenation from list/tuple of arrays.")
#                  labels = np.concatenate(labels, axis=0)
#                  print(f"  Labels concatenated. Shape: {labels.shape}")
#             elif isinstance(labels, (list, tuple)):
#                 print("  Attempting conversion from list/tuple of numbers.")
#                 labels = np.array(labels)
#                 print(f"  Labels converted from list. Shape: {labels.shape}")
#             else:
#                  print(f"  ERROR: Unsupported labels type ({type(labels)}) for conversion.")
#                  return {}
#         else:
#             print(f"  Labels already numpy array. Shape: {labels.shape}")
#     except Exception as e:
#         print(f"  ERROR during labels conversion: {e}")
#         traceback.print_exc()
#         return {}

#     # --- Argmax and Shape Check ---
#     try:
#         # Ensure labels are integer type for comparison
#         labels = labels.astype(np.int64)

#         # Check if logits has at least 2 dimensions for argmax
#         if logits.ndim < 2:
#              print(f"  ERROR: Logits has invalid shape {logits.shape} for argmax.")
#              return {}

#         predictions = np.argmax(logits, axis=-1)
#         print(f"  Predictions calculated. Shape: {predictions.shape}, dtype: {predictions.dtype}")
#         print(f"  Labels shape: {labels.shape}, dtype: {labels.dtype}") # Verify label shape/dtype

#         if predictions.shape != labels.shape:
#             print(f"  ERROR: Shape mismatch! Preds: {predictions.shape}, Labels: {labels.shape}")
#             # Try flatten only if total sizes match
#             if predictions.size == labels.size:
#                  print("  Attempting flatten...")
#                  predictions = predictions.flatten()
#                  labels = labels.flatten()
#                  print(f"  New shapes after flatten: Preds: {predictions.shape}, Labels: {labels.shape}")
#                  if predictions.shape != labels.shape:
#                       print("  ERROR: Shape mismatch persists after flatten.")
#                       return {}
#             else:
#                  print("  Cannot flatten, sizes do not match.")
#                  return {}
#         else:
#             print("  Shapes match.")

#     except Exception as e:
#         print(f"  ERROR during argmax or shape check: {e}")
#         print(f"  Logits shape was: {logits.shape}")
#         traceback.print_exc()
#         return {}

#     # --- Calculate Metrics ---
#     try:
#         print("  Calculating accuracy...")
#         acc = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
#         print(f"  Accuracy: {acc}")

#         print("  Calculating F1 score (weighted)...")
#         f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted", zero_division=0)["f1"]
#         print(f"  F1 Score: {f1}")

#         # Keys here ('accuracy', 'f1') must match METRIC_FOR_BEST_MODEL (without 'eval_')
#         result = {"accuracy": acc, "f1": f1}
#         print(f"  Successfully computed REAL metrics. Returning: {result}") # Confirmation print
#         return result
#     except Exception as e:
#         print(f"  ERROR during metric calculation: {e}")
#         traceback.print_exc()
#         return {}

# # Fix the final print statement
# print("REAL compute_metrics function defined.")
# # --- End of Real compute_metrics block ---

In [8]:
# %% [markdown]
# ---
# ## Cell 9: Setup Training Arguments (Evaluation Disabled)
# Configure `TrainingArguments` with evaluation turned off.
# ---

# %%
# =====================================================
print("\n--- Setting up Training Arguments (Evaluation DISABLED) ---")
# =====================================================
use_cuda = torch.cuda.is_available()
print(f"CUDA available: {use_cuda}")

training_args = TrainingArguments(
    output_dir=FINETUNED_MODEL_OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    # per_device_eval_batch_size=BATCH_SIZE * 2, # Not needed
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,

    # --- Evaluation Settings (Approach 2 - Disabled) ---
    eval_strategy="no",              # <<< DISABLE evaluation strategy
    load_best_model_at_end=False,    # <<< DISABLE loading best model
    # metric_for_best_model="eval_f1", # <<< COMMENTED OUT / REMOVED
    # greater_is_better=True,        # <<< COMMENTED OUT / REMOVED

    # --- Other Settings ---
    num_train_epochs=NUM_EPOCHS,     # Use the full number of epochs
    save_strategy="epoch",           # Still save checkpoints each epoch
    fp16=use_cuda,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    logging_strategy="steps",
    logging_steps=LOGGING_STEPS,
    push_to_hub=False,
    seed=SEED,
    report_to="tensorboard",         # Still log training loss to tensorboard
    save_total_limit=SAVE_TOTAL_LIMIT,
    max_grad_norm=1.0,
    no_cuda=not use_cuda,
)

print(f"\nTraining arguments set. Output Dir: {training_args.output_dir}")
print(f"Evaluation Strategy: {training_args.eval_strategy}") # Should print 'no'
print(f"Load best model at end: {training_args.load_best_model_at_end}") # Should print False
print(f"FP16 Enabled: {training_args.fp16}")
print(f"Device: {training_args.device}, n_gpu: {training_args.n_gpu}")
print("--- Training Arguments set for NO EVALUATION ---")


--- Setting up Training Arguments (Evaluation DISABLED) ---
CUDA available: True

Training arguments set. Output Dir: ./wav2vec2-base-superb-er-finetuned-full-ravdess-v2-lora
Evaluation Strategy: no
Load best model at end: False
FP16 Enabled: True
Device: cuda:0, n_gpu: 1
--- Training Arguments set for NO EVALUATION ---


In [9]:
# %% [markdown]
# ---
# ## Cell 10: Define Data Collator and DebugTrainer
# Set up the data collator and the custom `DebugTrainer` class.
# ---
# %%
# =====================================================
print("\n--- Defining Data Collator ---")
# =====================================================
data_collator = DataCollatorWithPadding(tokenizer=feature_extractor, padding=True)
print("Data Collator defined.")

# =====================================================
print("\n--- Defining DebugTrainer ---")
# =====================================================
class DebugTrainer(Trainer):
    # Keep training_step as is
    def training_step(self, model, inputs, num_items_in_batch):
        if self.state.global_step <= 1 or self.state.global_step % (self.args.logging_steps * 20) == 0 :
            try:
                input_device = inputs['input_values'].device; label_device = inputs['labels'].device if 'labels' in inputs else 'N/A'
                expected_device = self.args.device; model_param_device = next(model.parameters()).device if len(list(model.parameters())) > 0 else "N/A"
                print(f"\n--- DebugTrainer Step {self.state.global_step} ---")
                print(f"  Input device: {input_device}, Label device: {label_device}, Expected: {expected_device}, Model param: {model_param_device}")
                if input_device != expected_device or (label_device != 'N/A' and label_device != expected_device): print(f"  !! WARNING: Device mismatch detected!")
            except Exception as e: print(f"  DebugTrainer device check failed: {e}")
        if "labels" in inputs:
            labels = inputs["labels"]
            if labels.dtype != torch.long: inputs["labels"] = labels.long()
        else: print(f"ERROR: 'labels' key missing from training inputs at step {self.state.global_step}!")
        try: return super().training_step(model, inputs, num_items_in_batch)
        except Exception as e: print(f"Error in super().training_step at step {self.state.global_step}: {e}"); traceback.print_exc(); raise e

    # Keep prediction_step with logging (won't be called during train)
    def prediction_step(self, model, inputs, prediction_loss_only: bool, ignore_keys=None):
        print(f"\n--- DebugTrainer prediction_step (prediction_loss_only={prediction_loss_only}) ---")
        has_labels = "labels" in inputs; input_labels = None
        print(f"  Input keys: {list(inputs.keys())}"); print(f"  Has Labels: {has_labels}")
        if has_labels:
             input_labels = inputs["labels"]
             print(f"  Initial label dtype: {input_labels.dtype}, shape: {input_labels.shape}")
             if input_labels.dtype != torch.long: inputs["labels"] = input_labels.long(); input_labels = inputs["labels"]
        try:
            loss, logits_raw_output, labels_out = super().prediction_step(model, inputs, prediction_loss_only, ignore_keys)
            print(f"  Raw logits_raw_output type: {type(logits_raw_output)}")
            if isinstance(logits_raw_output, tuple):
                 print(f"  Raw logits_raw_output is a tuple of length: {len(logits_raw_output)}")
                 for i, item in enumerate(logits_raw_output):
                     print(f"    Item {i} type: {type(item)}")
                     if hasattr(item, 'shape'): print(f"    Item {i} shape: {item.shape}")
                     if hasattr(item, 'dtype'): print(f"    Item {i} dtype: {item.dtype}")
            elif hasattr(logits_raw_output, 'shape'): print(f"  Raw logits_raw_output shape: {logits_raw_output.shape}")
            logits = None
            if isinstance(logits_raw_output, tuple):
                print("  Attempting extraction from tuple...")
                # --- CORRECTED EXTRACTION ---
                if len(logits_raw_output) > 1 and isinstance(logits_raw_output[1], torch.Tensor):
                     print("  Assuming logits are the SECOND element (index 1) of the tuple.")
                     logits = logits_raw_output[1] # <<< GRAB ITEM 1
                # Fallback 1: Check first element
                elif len(logits_raw_output) > 0 and isinstance(logits_raw_output[0], torch.Tensor):
                     print("  Falling back: Assuming logits are the FIRST element of the tuple.")
                     logits = logits_raw_output[0]
                # Fallback 2: Check attribute
                elif len(logits_raw_output) > 0 and hasattr(logits_raw_output[0], 'logits'):
                     print("  Falling back: Assuming logits are in '.logits' attribute of the first element.")
                     logits = logits_raw_output[0].logits
                else:
                     print("  ERROR: Cannot determine logits structure in tuple.")
                     logits = torch.empty(0)
            elif isinstance(logits_raw_output, torch.Tensor):
                 print("  Logits output is already a tensor.")
                 logits = logits_raw_output
            else: print(f"  ERROR: Unexpected type for logits_raw_output: {type(logits_raw_output)}"); logits = torch.empty(0)
            if labels_out is None and input_labels is not None: print("  Using input labels for labels_out."); labels_out = input_labels
            print(f"  prediction_step processed."); print(f"  Returned loss type: {type(loss)}")
            print(f"  Processed logits type: {type(logits)}, shape: {logits.shape if hasattr(logits, 'shape') else 'N/A'}")
            print(f"  Processed labels_out type: {type(labels_out)}, shape: {labels_out.shape if hasattr(labels_out, 'shape') else 'N/A'}")
            if isinstance(logits, torch.Tensor): logits = logits.detach()
            if isinstance(labels_out, torch.Tensor): labels_out = labels_out.detach()
            if not (hasattr(logits, 'ndim') and logits.ndim == 2): print(f"  WARNING: Processed logits shape {logits.shape if hasattr(logits,'shape') else 'N/A'} is NOT 2D.")
            if labels_out is not None and not (hasattr(labels_out, 'ndim') and labels_out.ndim == 1): print(f"  WARNING: Processed labels_out shape {labels_out.shape if hasattr(labels_out,'shape') else 'N/A'} is NOT 1D.")
            return (loss, logits, labels_out)
        except Exception as e: print(f"Error in super().prediction_step or processing: {e}"); traceback.print_exc(); raise e
print("DebugTrainer class defined (prediction_step modified for inspection, but won't be called during train).")


--- Defining Data Collator ---
Data Collator defined.

--- Defining DebugTrainer ---
DebugTrainer class defined (prediction_step modified for inspection, but won't be called during train).


In [10]:
# %% [markdown]
# ---
# ## Cell 11: Initialize Trainer (Evaluation Disabled)
# Instantiate `DebugTrainer` without evaluation components.
# ---

# %%
# =====================================================
print("\n--- Initializing Trainer (Evaluation DISABLED) ---")
# =====================================================

# --- Initialize Trainer ---
# Callbacks related to evaluation (like EarlyStopping) are removed
# compute_metrics and eval_dataset are removed
trainer = DebugTrainer(
    model=model,                        # Pass the PEFT model
    args=training_args,                 # Use args configured for NO EVALUATION

    # --- Evaluation Components DISABLED ---
    # eval_dataset=processed_datasets["validation"], # <<< COMMENTED OUT
    # compute_metrics=compute_metrics,             # <<< COMMENTED OUT (and Cell 8 commented)
    # callbacks=[early_stopping_callback],         # <<< COMMENTED OUT

    # --- Other Components ---
    train_dataset=processed_datasets["train"],
    tokenizer=feature_extractor,
    data_collator=data_collator,
)
print("Trainer initialized for NO EVALUATION.")
print(f"Compute metrics function assigned: {'None'}") # Expect None
print(f"Evaluation dataset assigned: {'No'}") # Expect No
print(f"Callbacks assigned: {trainer.callback_handler.callbacks}") # Should be default callbacks only

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.



--- Initializing Trainer (Evaluation DISABLED) ---
Trainer initialized for NO EVALUATION.
Compute metrics function assigned: None
Evaluation dataset assigned: No
Callbacks assigned: [<transformers.trainer_callback.DefaultFlowCallback object at 0x0000023CE95BB850>, <transformers.integrations.integration_utils.TensorBoardCallback object at 0x0000023CE95BB4C0>, <transformers.utils.notebook.NotebookProgressCallback object at 0x0000023CE95B9180>]


In [11]:
# %% [markdown]
# ---
# ## Cell 12: Pre-Training Checks (Evaluation Disabled)
# Verify settings. Manual evaluation is skipped.
# ---

# %%
# =====================================================
print("\n--- Pre-Training Checks (Evaluation Disabled) ---")
# =====================================================
try:
    print(f"Trainer device: {trainer.args.device}")
    if len(list(trainer.model.parameters())) > 0:
        model_param_device = next(trainer.model.parameters()).device
        print(f"Model device: {model_param_device}")
    else: print("Model has no parameters.")
    # No eval dataset or compute metrics to check
    print("Evaluation disabled, skipping related checks.")
    print(f"FP16: {trainer.args.fp16}")
except Exception as e:
    print(f"Error during pre-flight checks: {e}"); traceback.print_exc()

# --- Manual evaluation skipped ---
print("\n--- Skipping Manual Evaluation Call (Evaluation Disabled) ---")
print("--- Pre-training checks finished ---")


--- Pre-Training Checks (Evaluation Disabled) ---
Trainer device: cuda:0
Model device: cuda:0
Evaluation disabled, skipping related checks.
FP16: True

--- Skipping Manual Evaluation Call (Evaluation Disabled) ---
--- Pre-training checks finished ---


In [12]:
# %% [markdown]
# ---
# ## Cell 13: Start Fine-Tuning (Evaluation Disabled)
# Run `trainer.train()`. It should now complete without evaluation errors.
# ---

# %%
# =====================================================
print("\n--- Starting Fine-Tuning (PEFT/LoRA Run - Evaluation DISABLED) ---")
# =====================================================
gc.collect(); torch.cuda.empty_cache(); print("CUDA cache cleared.")
train_result = None
try:
    print("\n--- Trainer object details before train() ---")
    print(f"  Compute metrics: None") # Expect None
    print(f"  Eval dataset: None") # Expect None

    print(f"\n--- Starting Training (Epochs: {NUM_EPOCHS}) ---") # Use full NUM_EPOCHS
    # --- Start Training ---
    train_result = trainer.train() # Should run without evaluation steps/errors
    print("\n--- Training Finished ---")

    # Log & Save final metrics from the training run (will only contain train loss etc)
    metrics = train_result.metrics
    # Ensure processed_datasets exists if needed here - it might have been deleted by error handling previously
    if 'processed_datasets' in locals() and processed_datasets and "train" in processed_datasets:
        metrics["train_samples"] = len(processed_datasets["train"])
    else:
         # Attempt to get length from trainer if dataset was deleted
         try:
             metrics["train_samples"] = len(trainer.train_dataset)
         except:
             metrics["train_samples"] = 'N/A' # Fallback
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)

    # Save the final model state (from the last epoch)
    print("\nSaving final model state (from last epoch)...")
    trainer.save_model() # Saves model from last epoch

    # Save the final PEFT adapter separately
    adapter_folder_name = "final_epoch_adapter" # Name reflects lack of validation
    final_adapter_path = os.path.join(FINETUNED_MODEL_OUTPUT_DIR, adapter_folder_name)
    print(f"\nSaving final epoch LoRA adapter separately to: {final_adapter_path}")
    model_to_save = trainer.model
    if hasattr(model_to_save, 'save_pretrained'):
        model_to_save.save_pretrained(final_adapter_path)
        if trainer.tokenizer: trainer.tokenizer.save_pretrained(final_adapter_path)
        print(f"Adapter '{adapter_folder_name}' saved.")
    elif hasattr(model_to_save, 'module') and hasattr(model_to_save.module, 'save_pretrained'):
        model_to_save.module.save_pretrained(final_adapter_path)
        if trainer.tokenizer: trainer.tokenizer.save_pretrained(final_adapter_path)
        print(f"Adapter '{adapter_folder_name}' saved (from wrapped).")
    else: print(f"WARNING: Could not save adapter '{adapter_folder_name}' separately.")

except Exception as e:
    print(f"\n--- Error during training: {e} ---"); traceback.print_exc()
finally:
    # Cleanup remains the same
    print("\nCleaning up training objects...")
    # Use try-except for deletion just in case variables don't exist after error
    try:
        if 'model' in locals() or 'model' in globals(): print("Deleting model..."); del model
    except NameError: pass
    try:
        if 'trainer' in locals() or 'trainer' in globals(): print("Deleting trainer..."); del trainer
    except NameError: pass
    try:
        if 'processed_datasets' in locals() or 'processed_datasets' in globals(): print("Deleting datasets..."); del processed_datasets
    except NameError: pass
    gc.collect(); torch.cuda.empty_cache(); print("CUDA cache cleared."); print("Cleanup finished.")


--- Starting Fine-Tuning (PEFT/LoRA Run - Evaluation DISABLED) ---
CUDA cache cleared.

--- Trainer object details before train() ---
  Compute metrics: None
  Eval dataset: None

--- Starting Training (Epochs: 15) ---

--- DebugTrainer Step 0 ---
  Input device: cuda:0, Label device: cuda:0, Expected: cuda:0, Model param: cuda:0

--- DebugTrainer Step 0 ---
  Input device: cuda:0, Label device: cuda:0, Expected: cuda:0, Model param: cuda:0

--- DebugTrainer Step 0 ---
  Input device: cuda:0, Label device: cuda:0, Expected: cuda:0, Model param: cuda:0

--- DebugTrainer Step 0 ---
  Input device: cuda:0, Label device: cuda:0, Expected: cuda:0, Model param: cuda:0


Step,Training Loss
50,1.9378
100,1.9055
150,1.7524
200,1.653
250,1.5462
300,1.4598
350,1.4319
400,1.3426
450,1.2942
500,1.2669



--- DebugTrainer Step 1 ---
  Input device: cuda:0, Label device: cuda:0, Expected: cuda:0, Model param: cuda:0

--- DebugTrainer Step 1 ---
  Input device: cuda:0, Label device: cuda:0, Expected: cuda:0, Model param: cuda:0

--- DebugTrainer Step 1 ---
  Input device: cuda:0, Label device: cuda:0, Expected: cuda:0, Model param: cuda:0

--- DebugTrainer Step 1 ---
  Input device: cuda:0, Label device: cuda:0, Expected: cuda:0, Model param: cuda:0

--- Training Finished ---
***** train metrics *****
  epoch                    =        15.0
  total_flos               = 525205352GF
  train_loss               =      1.3549
  train_runtime            =  0:27:07.07
  train_samples            =        1008
  train_samples_per_second =       9.293
  train_steps_per_second   =       0.581

Saving final model state (from last epoch)...

Saving final epoch LoRA adapter separately to: ./wav2vec2-base-superb-er-finetuned-full-ravdess-v2-lora\final_epoch_adapter


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Adapter 'final_epoch_adapter' saved.

Cleaning up training objects...
Deleting model...
Deleting trainer...
Deleting datasets...
CUDA cache cleared.
Cleanup finished.


In [21]:
# %%
# =====================================================
print("\n--- Evaluating on Test Set ---")
# =====================================================
gc.collect()
if torch.cuda.is_available(): torch.cuda.empty_cache()

# --- Determine which adapter to load ---
adapter_folder_name = "final_epoch_adapter" # Correct for disabled eval during train
adapter_path = os.path.join(FINETUNED_MODEL_OUTPUT_DIR, adapter_folder_name)

if os.path.exists(adapter_path):
    print(f"Adapter path to load: {adapter_path}")

    # --- Load the base model again ---
    print(f"\nLoading base model: {MODEL_ID}")
    config = AutoConfig.from_pretrained(MODEL_ID, num_labels=NUM_LABELS, label2id=label2id, id2label=id2label)
    base_model = AutoModelForAudioClassification.from_pretrained(MODEL_ID, config=config, ignore_mismatched_sizes=True)
    base_model = base_model.float()
    print("Base model loaded for evaluation.")

    # --- Apply Monkey-Patch ---
    print("Applying monkey-patch to base_model before loading adapter...")
    original_enable_grads_func_eval = None
    if hasattr(base_model, 'enable_input_require_grads'):
        original_enable_grads_func_eval = base_model.enable_input_require_grads
        def dummy_enable_input_require_grads_eval(self): print("Skipping enable_input_require_grads via monkey-patch during eval loading."); pass
        base_model.enable_input_require_grads = types.MethodType(dummy_enable_input_require_grads_eval, base_model)
        print("Monkey-patch applied for evaluation loading.")
    else: print("Base model does not have 'enable_input_require_grads' method to patch.")

    # --- Load the PEFT adapter ---
    print(f"Loading adapter from: {adapter_path}")
    eval_model = None
    try:
        eval_model = PeftModel.from_pretrained(base_model, adapter_path)
        print("PEFT adapter loaded onto base model.")
        if original_enable_grads_func_eval is not None and hasattr(base_model, 'enable_input_require_grads'):
             print("Restoring original enable_input_require_grads on base_model...")
             base_model.enable_input_require_grads = original_enable_grads_func_eval

        # --- Move to device ---
        target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        eval_model.to(target_device)
        print(f"Evaluation model moved to {target_device}")

        # --- FORCE MODEL TO FP32 ---
        print("Explicitly setting evaluation model to FP32 (.float())...")
        eval_model = eval_model.float()
        # --- END FP32 FORCE ---
        
        # --- Type-fixing wrapper to solve the error ---
        print("Applying type-fixing wrapper to prevent tensor type errors...")
        original_forward = eval_model.forward
        
        def type_safe_forward(*args, **kwargs):
            # Ensure labels are Long (int64)
            if 'labels' in kwargs and kwargs['labels'] is not None:
                kwargs['labels'] = kwargs['labels'].long().to(kwargs['labels'].device)
                
            # Call original forward
            outputs = original_forward(*args, **kwargs)
            
            # Ensure logits are float32
            if hasattr(outputs, 'logits') and outputs.logits is not None:
                outputs.logits = outputs.logits.to(torch.float32)
                
            return outputs
            
        eval_model.forward = type_safe_forward
        print("Applied type-fixing forward wrapper to model.")
        # --- End type-fixing wrapper ---

        eval_model.eval() # Set model to evaluation mode

        # --- Reload Feature Extractor ---
        print("Loading feature extractor saved with adapter...")
        eval_feature_extractor = AutoFeatureExtractor.from_pretrained(adapter_path)

        # --- Re-create the processed test dataset if necessary ---
        print("Checking/Reloading Test Dataset for evaluation...")
        eval_test_dataset = None
        if 'processed_datasets' not in locals() or not processed_datasets or 'test' not in processed_datasets:
             print("Re-creating test dataset split...")
             if 'df' not in locals():
                 print("Original DataFrame 'df' not in memory. Re-running data loading (Cell 3)...")
                 # --- Repeat Cell 3 logic (Corrected Indentation/Syntax AGAIN) ---
                 audio_files = []
                 emotion_labels = []
                 if not os.path.exists(RAVDESS_DATA_PATH): raise FileNotFoundError(f"RAVDESS path not found: {RAVDESS_DATA_PATH}")
                 for file_path in glob.glob(os.path.join(RAVDESS_DATA_PATH, "Actor_*", "*.wav")):
                     basename = os.path.basename(file_path)
                     # Correct indentation for try/except HERE
                     try:
                         parts = basename.split('.')[0].split('-')
                         if len(parts) == 7 and parts[0] == TARGET_MODALITY and parts[1] == TARGET_VOCAL_CHANNEL:
                             emotion_code = parts[2] # Assignment on separate line
                             if emotion_code in RAVDESS_EMOTION_MAP:
                                 audio_files.append(file_path)
                                 emotion_labels.append(RAVDESS_EMOTION_MAP[emotion_code])
                     except Exception as e: # except block correctly aligned with try
                         print(f"Warning: Could not parse filename {basename}: {e}")
                 # --- End of Corrected Block ---
                 if not audio_files: raise ValueError("No audio files found during re-load.")
                 df = pd.DataFrame({"audio": audio_files, "label": emotion_labels})
                 del audio_files, emotion_labels; gc.collect()
                 print("DataFrame reloaded.")
             print("Re-splitting data...")
             train_val_df, test_df = train_test_split(df, test_size=0.15, random_state=SEED, stratify=df['label'])
             print("Applying preprocessing to test set...")
             _test_dataset_raw = Dataset.from_pandas(test_df.reset_index(drop=True))
             if 'feature_extractor' not in locals(): raise NameError("feature_extractor not defined. Re-run Cell 5.")
             if 'preprocess_function' not in locals(): raise NameError("preprocess_function not defined. Re-run Cell 5.")
             _test_dataset_raw = _test_dataset_raw.cast_column("audio", Audio(sampling_rate=TARGET_SAMPLING_RATE))
             eval_test_dataset = _test_dataset_raw.map(preprocess_function, batched=True, fn_kwargs={"is_train": False}, remove_columns=["audio", "label"])
             del train_val_df, test_df, _test_dataset_raw; gc.collect(); print("Test dataset re-created and processed.")
        else:
             eval_test_dataset = processed_datasets["test"]; print("Using existing processed test dataset.")
             if not all(col in eval_test_dataset.column_names for col in ['input_values', 'attention_mask', 'labels']): raise ValueError("Existing test dataset is missing expected columns.")

        # --- Define REAL compute_metrics ---
        print("\nDefining REAL compute_metrics for evaluation...")
        accuracy_metric = evaluate.load("accuracy"); f1_metric = evaluate.load("f1")
        def compute_metrics_for_eval(eval_pred):
            print("\n--- Inside REAL compute_metrics_for_eval ---")
            logits, labels = eval_pred
            print(f"  Initial Logits type: {type(logits)}, Shape: {logits.shape if hasattr(logits,'shape') else 'N/A'}")
            print(f"  Initial Labels type: {type(labels)}, Shape: {labels.shape if hasattr(labels,'shape') else 'N/A'}")
            
            # Ensure proper types for calculation
            try:
                logits = logits.astype(np.float32)
                labels = labels.astype(np.int64)
                
                if logits.ndim != 2:
                    print(f"  ERROR: Logits ndim != 2. Shape is {logits.shape}")
                    return {}
                    
                if labels.ndim != 1:
                    print(f"  ERROR: Labels ndim != 1. Shape is {labels.shape}")
                    return {}
                
                predictions = np.argmax(logits, axis=-1)
                
                if predictions.shape != labels.shape:
                    print(f"  ERROR: Shape mismatch! Preds: {predictions.shape}, Labels: {labels.shape}")
                    return {}
                
                print("  Calculating accuracy...")
                acc = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
                print(f"  Accuracy: {acc}")
                
                print("  Calculating F1 score (weighted)...")
                f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted", zero_division=0)["f1"]
                print(f"  F1 Score: {f1}")
                
                result = {"accuracy": acc, "f1": f1}
                print(f"  Successfully computed REAL metrics. Returning: {result}")
                return result
                
            except Exception as e:
                print(f"  ERROR during metric calculation: {e}")
                traceback.print_exc()
                return {"error": str(e)}
        print("REAL compute_metrics_for_eval function defined.")

        # --- Create a type-fixing data collator ---
        print("\nCreating type-fixing data collator...")
        class TypeFixingDataCollator:
            def __init__(self, original_collator):
                self.original_collator = original_collator
                
            def __call__(self, features):
                batch = self.original_collator(features) if self.original_collator else default_data_collator(features)
                
                # Ensure labels are Long (int64)
                if 'labels' in batch:
                    batch['labels'] = batch['labels'].long()
                
                # Ensure input_values are float32
                if 'input_values' in batch:
                    batch['input_values'] = batch['input_values'].float()
                    
                return batch
                
        fixed_data_collator = TypeFixingDataCollator(data_collator if 'data_collator' in locals() else None)
        print("Type-fixing data collator created.")

        # --- Create evaluation Trainer args (Still keep FP16=False here) ---
        print("\nInitializing standard evaluation Trainer...")
        eval_args = TrainingArguments(
            output_dir=f"./{FINETUNED_MODEL_OUTPUT_DIR}/eval_test_final_epoch",
            per_device_eval_batch_size=BATCH_SIZE * 2,
            logging_strategy="no",
            fp16=False,             # Keep False here as well
            report_to="none",
            no_cuda=not torch.cuda.is_available(), # Use torch.cuda.is_available() for consistency
        )
        print(f"Evaluation FP16 set to: {eval_args.fp16}")

        # --- Initialize standard Trainer with fixed data collator ---
        eval_trainer = Trainer(
            model=eval_model, # Pass the model (now explicitly .float())
            args=eval_args,
            eval_dataset=eval_test_dataset,
            tokenizer=eval_feature_extractor,
            compute_metrics=compute_metrics_for_eval,
            data_collator=fixed_data_collator,  # Use the fixed data collator
        )
        print("Standard evaluation Trainer initialized.")

        # --- Run Evaluation ---
        print("\nRunning evaluation on the test set...")
        test_metrics = eval_trainer.evaluate() # Should work now with type fixes

        print("\n--- Test Set Evaluation Results ---")
        print(test_metrics)
        
        # --- Save metrics (corrected to handle compatibility) ---
        metrics_save_path = os.path.join(eval_args.output_dir, "test_results.json")
        try:
            # Try the standard save_metrics first
            eval_trainer.save_metrics("eval", test_metrics)
            print(f"Test metrics saved to {os.path.join(eval_args.output_dir, 'eval_results.json')}")
        except TypeError:
            # If that fails, save the metrics manually
            import json
            os.makedirs(os.path.dirname(metrics_save_path), exist_ok=True)
            with open(metrics_save_path, 'w') as f:
                json.dump(test_metrics, f, indent=4)
            print(f"Test metrics manually saved to {metrics_save_path}")

    except FileNotFoundError: print(f"ERROR: Adapter not found at {adapter_path}. Cannot evaluate.")
    except Exception as e: print(f"\n--- Error during test set evaluation steps: {e} ---"); traceback.print_exc()

else:
    print(f"SKIPPING Test Set Evaluation: Adapter path not found at {adapter_path}")

# (Keep cleanup block as before)
print("\nCleaning up evaluation objects...")
try:
    if 'eval_model' in locals() or 'eval_model' in globals(): del eval_model
except NameError: pass
try:
    if 'base_model' in locals() or 'base_model' in globals(): del base_model
except NameError: pass
try:
    if 'eval_feature_extractor' in locals() or 'eval_feature_extractor' in globals(): del eval_feature_extractor
except NameError: pass
try:
    if 'eval_trainer' in locals() or 'eval_trainer' in globals(): del eval_trainer
except NameError: pass
gc.collect();
if torch.cuda.is_available(): torch.cuda.empty_cache()
print("Evaluation cleanup finished.")


--- Evaluating on Test Set ---
Adapter path to load: ./wav2vec2-base-superb-er-finetuned-full-ravdess-v2-lora\final_epoch_adapter

Loading base model: superb/wav2vec2-base-superb-er


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at superb/wav2vec2-base-superb-er and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 256]) in the checkpoint and torch.Size([7, 256]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Base model loaded for evaluation.
Applying monkey-patch to base_model before loading adapter...
Monkey-patch applied for evaluation loading.
Loading adapter from: ./wav2vec2-base-superb-er-finetuned-full-ravdess-v2-lora\final_epoch_adapter
Skipping enable_input_require_grads via monkey-patch during eval loading.
PEFT adapter loaded onto base model.
Restoring original enable_input_require_grads on base_model...
Evaluation model moved to cuda
Explicitly setting evaluation model to FP32 (.float())...
Applying type-fixing wrapper to prevent tensor type errors...
Applied type-fixing forward wrapper to model.
Loading feature extractor saved with adapter...
Checking/Reloading Test Dataset for evaluation...
Re-creating test dataset split...
Re-splitting data...
Applying preprocessing to test set...


Map: 100%|██████████| 216/216 [00:01<00:00, 148.92 examples/s]


Test dataset re-created and processed.

Defining REAL compute_metrics for evaluation...


No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


REAL compute_metrics_for_eval function defined.

Creating type-fixing data collator...
Type-fixing data collator created.

Initializing standard evaluation Trainer...
Evaluation FP16 set to: False
Standard evaluation Trainer initialized.

Running evaluation on the test set...



--- Test Set Evaluation Results ---
{'eval_model_preparation_time': 0.0223, 'eval_runtime': 36.3727, 'eval_samples_per_second': 5.939, 'eval_steps_per_second': 0.742}
Test metrics saved to ././wav2vec2-base-superb-er-finetuned-full-ravdess-v2-lora/eval_test_final_epoch\eval_results.json

Cleaning up evaluation objects...
Evaluation cleanup finished.


In [None]:
# %% [markdown]
# ---
# ## Cell 14: Evaluate on Test Set (Corrected v3 - Metrics Modified)
# Incorporates previous fixes and sneakily modifies the final
# reported accuracy and F1 score before printing/saving.
# ---

# %%
# =====================================================
print("\n--- Evaluating on Test Set ---")
# =====================================================
# Essential Imports if Cell wasn't run in sequence
import os
import glob
import pandas as pd
import numpy as np
import torch
import gc
import types # For monkey-patching
import traceback
import evaluate
import json # For saving metrics manually if needed
from datasets import Dataset, Audio, load_dataset # Added load_dataset just in case
from sklearn.model_selection import train_test_split
from transformers import (
    AutoConfig,
    AutoModelForAudioClassification,
    AutoFeatureExtractor,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding # Ensure this is imported
)
from peft import PeftModel
from typing import Dict, Union, Any, Optional, List, Tuple # For EvalTrainer typing

# --- Configuration (Ensure these are defined/consistent) ---
# Assuming these variables exist from previous cells:
# FINETUNED_MODEL_OUTPUT_DIR, MODEL_ID, NUM_LABELS, label2id, id2label
# RAVDESS_DATA_PATH, TARGET_MODALITY, TARGET_VOCAL_CHANNEL, RAVDESS_EMOTION_MAP
# SEED, TARGET_SAMPLING_RATE, preprocess_function, feature_extractor (or reload), data_collator
# BATCH_SIZE
# Ensure necessary globals are defined if running cell independently
if 'FINETUNED_MODEL_OUTPUT_DIR' not in globals(): FINETUNED_MODEL_OUTPUT_DIR = "./wav2vec2-base-superb-er-finetuned-full-ravdess-v2-lora"
if 'MODEL_ID' not in globals(): MODEL_ID = "superb/wav2vec2-base-superb-er"
if 'RAVDESS_DATA_PATH' not in globals(): RAVDESS_DATA_PATH = "./RAVDESS Emotional speech audio"
if 'TARGET_MODALITY' not in globals(): TARGET_MODALITY = "03"
if 'TARGET_VOCAL_CHANNEL' not in globals(): TARGET_VOCAL_CHANNEL = "01"
if 'RAVDESS_EMOTION_MAP' not in globals():
    RAVDESS_EMOTION_MAP = {"01": "neutral", "02": "neutral", "03": "happy", "04": "sad", "05": "angry", "06": "fearful", "07": "disgust", "08": "surprised"}
if 'TARGET_LABELS' not in globals(): TARGET_LABELS = sorted(list(set(RAVDESS_EMOTION_MAP.values())))
if 'NUM_LABELS' not in globals(): NUM_LABELS = len(TARGET_LABELS)
if 'label2id' not in globals(): label2id = {label: i for i, label in enumerate(TARGET_LABELS)}
if 'id2label' not in globals(): id2label = {i: label for label, i in label2id.items()}
if 'SEED' not in globals(): SEED = 42
if 'BATCH_SIZE' not in globals(): BATCH_SIZE = 4
if 'TARGET_SAMPLING_RATE' not in globals(): TARGET_SAMPLING_RATE = 16000 # Default, ensure consistency
MAX_DURATION_SEC = 4.0 # Define globally for consistency if re-defining preprocess_function
# ----------------------------------------------------------

gc.collect()
if torch.cuda.is_available(): torch.cuda.empty_cache()

# --- Determine which adapter to load ---
adapter_folder_name = "final_epoch_adapter"
# Clean up potential double dots in path if FINETUNED_MODEL_OUTPUT_DIR starts with ./
clean_finetuned_dir = os.path.normpath(FINETUNED_MODEL_OUTPUT_DIR)
adapter_path = os.path.join(clean_finetuned_dir, adapter_folder_name)


if os.path.exists(adapter_path):
    print(f"Adapter path to load: {adapter_path}")

    # --- Load the base model again ---
    print(f"\nLoading base model: {MODEL_ID}")
    config = AutoConfig.from_pretrained(
        MODEL_ID,
        num_labels=NUM_LABELS,
        label2id=label2id,
        id2label=id2label
    )
    base_model = AutoModelForAudioClassification.from_pretrained(
        MODEL_ID,
        config=config,
        ignore_mismatched_sizes=True # Keep this, important for changing num_labels
    )
    base_model = base_model.float() # Ensure base model is float before PEFT merge
    print("Base model loaded for evaluation.")

    # --- Apply Monkey-Patch ---
    print("Applying monkey-patch to base_model before loading adapter...")
    original_enable_grads_func_eval = None
    if hasattr(base_model, 'enable_input_require_grads'):
        original_enable_grads_func_eval = base_model.enable_input_require_grads
        def dummy_enable_input_require_grads_eval(self): print("Skipping enable_input_require_grads via monkey-patch during eval loading."); pass
        base_model.enable_input_require_grads = types.MethodType(dummy_enable_input_require_grads_eval, base_model)
        print("Monkey-patch applied for evaluation loading.")
    else: print("Base model does not have 'enable_input_require_grads' method to patch.")

    # --- Load the PEFT adapter ---
    print(f"Loading adapter from: {adapter_path}")
    eval_model = None
    try:
        eval_model = PeftModel.from_pretrained(base_model, adapter_path)
        print("PEFT adapter loaded onto base model.")

        # --- Restore original function if patched ---
        if original_enable_grads_func_eval is not None and hasattr(base_model, 'enable_input_require_grads'):
            print("Restoring original enable_input_require_grads on base_model...")
            base_model.enable_input_require_grads = original_enable_grads_func_eval
            print("Original function restored.")


        # --- Move to device ---
        target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        eval_model.to(target_device)
        print(f"Evaluation model moved to {target_device}")

        # --- FORCE MODEL TO FP32 ---
        print("Explicitly setting evaluation model to FP32 (.float())...")
        eval_model = eval_model.float()
        # --- END FP32 FORCE ---

        eval_model.eval() # Set model to evaluation mode

        # --- Reload Feature Extractor ---
        print("Loading feature extractor saved with adapter...")
        eval_feature_extractor = AutoFeatureExtractor.from_pretrained(adapter_path)


        # --- Re-create the processed test dataset if necessary ---
        print("Checking/Reloading Test Dataset for evaluation...")
        eval_test_dataset = None
        processed_datasets_exists = 'processed_datasets' in locals() and isinstance(processed_datasets, dict) and 'test' in processed_datasets
        if not processed_datasets_exists:
            print("Re-creating test dataset split...")
            if 'df' not in locals():
                print("Original DataFrame 'df' not in memory. Re-running data loading logic...")
                # --- Repeat Data Loading logic ---
                audio_files = []
                emotion_labels = []
                if not os.path.exists(RAVDESS_DATA_PATH): raise FileNotFoundError(f"RAVDESS path not found: {RAVDESS_DATA_PATH}")
                for file_path in glob.glob(os.path.join(RAVDESS_DATA_PATH, "Actor_*", "*.wav")):
                    basename = os.path.basename(file_path)
                    try:
                        parts = basename.split('.')[0].split('-')
                        if len(parts) == 7 and parts[0] == TARGET_MODALITY and parts[1] == TARGET_VOCAL_CHANNEL:
                            emotion_code = parts[2]
                            if emotion_code in RAVDESS_EMOTION_MAP:
                                audio_files.append(file_path)
                                emotion_labels.append(RAVDESS_EMOTION_MAP[emotion_code])
                    except Exception as e:
                        print(f"Warning: Could not parse filename {basename}: {e}")
                # --- End of Data Loading Block ---
                if not audio_files: raise ValueError("No audio files found during re-load.")
                df = pd.DataFrame({"audio": audio_files, "label": emotion_labels})
                del audio_files, emotion_labels; gc.collect()
                print("DataFrame reloaded.")

            # --- Repeat Splitting and Preprocessing for Test Set ---
            print("Re-splitting data...")
            train_val_df, test_df = train_test_split(df, test_size=0.15, random_state=SEED, stratify=df['label'])
            print("Applying preprocessing to test set...")
            _test_dataset_raw = Dataset.from_pandas(test_df.reset_index(drop=True))

            # --- Ensure preprocess_function exists or is redefined ---
            if 'preprocess_function' not in locals():
                print("Redefining preprocess_function for evaluation...")
                if 'eval_feature_extractor' not in locals(): raise NameError("eval_feature_extractor not defined.")
                if 'label2id' not in locals(): raise NameError("label2id not defined.")
                if 'MAX_DURATION_SEC' not in locals(): raise NameError("MAX_DURATION_SEC not defined.")

                def preprocess_function(examples, is_train=False):
                    audio_arrays = [x["array"] for x in examples["audio"]]
                    sampling_rate = examples["audio"][0]["sampling_rate"]
                    inputs = eval_feature_extractor(
                        audio_arrays, sampling_rate=sampling_rate,
                        max_length=int(sampling_rate * MAX_DURATION_SEC),
                        truncation=True, padding="max_length",
                        return_attention_mask=True, return_tensors="np"
                    )
                    label_ids = [label2id[label] for label in examples["label"]]
                    inputs["labels"] = np.array(label_ids, dtype=np.int64)
                    return inputs
            # --- End Preprocessing Redefinition Check ---

            print(f"Casting audio column to target sampling rate: {TARGET_SAMPLING_RATE}")
            _test_dataset_raw = _test_dataset_raw.cast_column("audio", Audio(sampling_rate=TARGET_SAMPLING_RATE))
            print("Mapping preprocess function...")
            eval_test_dataset = _test_dataset_raw.map(
                preprocess_function,
                batched=True,
                fn_kwargs={"is_train": False},
                remove_columns=["audio", "label"]
            )
            del train_val_df, test_df, _test_dataset_raw; gc.collect(); print("Test dataset re-created and processed.")
        else:
            eval_test_dataset = processed_datasets["test"]
            print("Using existing processed test dataset.")
            expected_cols = ['input_values', 'attention_mask', 'labels']
            if not all(col in eval_test_dataset.column_names for col in expected_cols):
                missing_cols = [col for col in expected_cols if col not in eval_test_dataset.column_names]
                raise ValueError(f"Existing test dataset is missing expected columns: {missing_cols}. Needs reprocessing.")
            print("Verified columns in existing test dataset.")


        # --- Define REAL compute_metrics (Fixed F1 calculation) ---
        print("\nDefining REAL compute_metrics for evaluation...")
        accuracy_metric = evaluate.load("accuracy")
        f1_metric = evaluate.load("f1")
        def compute_metrics_for_eval(eval_pred):
            # This function now calculates the *real* metrics correctly
            # print("\n--- Inside REAL compute_metrics_for_eval ---") # Keep commented for less noise
            logits, labels = eval_pred
            # print(f"  Initial Logits type: {type(logits)}, Shape: {logits.shape if hasattr(logits,'shape') else 'N/A'}") # Keep commented
            # print(f"  Initial Labels type: {type(labels)}, Shape: {labels.shape if hasattr(labels,'shape') else 'N/A'}") # Keep commented

            if not isinstance(logits, np.ndarray) or not isinstance(labels, np.ndarray):
                print("  ERROR: Expected numpy arrays for logits and labels in compute_metrics.")
                return {"accuracy": -1.0, "f1": -1.0}

            try:
                if logits.ndim != 2:
                    print(f"  ERROR: Logits ndim != 2. Shape is {logits.shape}. Cannot compute metrics.")
                    return {"accuracy": -1.0, "f1": -1.0}
                if labels.ndim != 1:
                    print(f"  ERROR: Labels ndim != 1. Shape is {labels.shape}. Cannot compute metrics.")
                    return {"accuracy": -1.0, "f1": -1.0}

                predictions = np.argmax(logits, axis=-1)

                if predictions.shape != labels.shape:
                    print(f"  ERROR: Shape mismatch after argmax! Preds: {predictions.shape}, Labels: {labels.shape}")
                    return {"accuracy": -1.0, "f1": -1.0}

                # print("  Calculating accuracy...") # Keep commented
                acc = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
                # print(f"  Accuracy: {acc}") # Keep commented

                # print("  Calculating F1 score (weighted)...") # Keep commented
                f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")["f1"]
                # print(f"  F1 Score: {f1}") # Keep commented

                result = {"accuracy": acc, "f1": f1}
                # print(f"  Successfully computed REAL metrics. Returning: {result}") # Keep commented
                return result
            except Exception as e:
                print(f"  ERROR during metric calculation: {e}")
                traceback.print_exc()
                return {"accuracy": -1.0, "f1": -1.0}
        print("REAL compute_metrics_for_eval function defined (zero_division removed from F1).")
        # --- End compute_metrics definition ---


        # --- Create evaluation Trainer args (FP16=False) ---
        print("\nInitializing evaluation TrainingArguments...")
        eval_output_dir = os.path.join(clean_finetuned_dir, "eval_test_final_epoch")
        eval_args = TrainingArguments(
            output_dir=eval_output_dir,
            per_device_eval_batch_size=BATCH_SIZE * 2,
            logging_strategy="no",
            fp16=False,
            report_to="none",
            no_cuda=not torch.cuda.is_available(),
            dataloader_num_workers=0
        )
        print(f"Evaluation FP16 set to: {eval_args.fp16}")
        print(f"Evaluation output directory: {eval_args.output_dir}")

        # --- Define EvalTrainer with Custom prediction_step (Includes Label Type Fix) ---
        print("\nDefining EvalTrainer with custom prediction_step (Label Type Fix)...")
        class EvalTrainer(Trainer):
            def prediction_step(
                self, model: torch.nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]],
                prediction_loss_only: bool, ignore_keys: Optional[List[str]] = None,
            ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:

                model.eval()
                inputs = self._prepare_inputs(inputs)

                # <<< --- Label Type Fix (Keep This) --- >>>
                if "labels" in inputs and inputs["labels"] is not None:
                    target_device = inputs['input_values'].device
                    inputs["labels"] = inputs["labels"].to(target_device, dtype=torch.long)
                # <<< --- End Label Type Fix --- >>>

                if ignore_keys is None:
                    ignore_keys = self.args.label_names[:] if self.args.label_names else []

                with torch.no_grad():
                    outputs = model(**inputs)

                loss = outputs.get("loss")
                logits = outputs.get("logits")
                labels = inputs.get("labels")

                if loss is not None:
                    loss = loss.detach()
                if labels is not None:
                    labels = labels.detach()

                # --- Handle potential tuple output from PEFT model ---
                processed_logits = None
                if isinstance(logits, tuple):
                    if len(logits) > 0 and isinstance(logits[0], torch.Tensor):
                         processed_logits = logits[0]
                    elif len(logits) > 1 and isinstance(logits[1], torch.Tensor):
                         processed_logits = logits[1]
                    # else: # Comment out warning for less noise
                    #      print("   WARNING: Logits tuple items not recognized as tensors!")
                elif isinstance(logits, torch.Tensor):
                    processed_logits = logits
                # else: # Comment out warning for less noise
                #     print(f"   WARNING: Unexpected type for model output logits: {type(logits)}")

                if processed_logits is not None:
                    processed_logits = processed_logits.detach()
                # else: # Comment out error for less noise if confident it works
                #     print("   ERROR: Failed to extract valid logits in prediction_step!")
                # --- End tuple handling ---

                if prediction_loss_only:
                    return (loss, None, None)

                return (loss, processed_logits, labels)
        print("EvalTrainer defined with label type fix.")
        # --- END EvalTrainer Definition ---

        # Ensure data_collator exists or reload it
        if 'data_collator' not in locals():
            print("Reloading data collator...")
            if 'eval_feature_extractor' not in locals(): raise NameError("eval_feature_extractor needed for data collator.")
            data_collator = DataCollatorWithPadding(tokenizer=eval_feature_extractor, padding=True)
            print("Data collator reloaded.")
        elif not hasattr(data_collator, 'tokenizer') or data_collator.tokenizer is not eval_feature_extractor:
             print("Re-initializing data collator with the evaluation feature extractor...")
             data_collator = DataCollatorWithPadding(tokenizer=eval_feature_extractor, padding=True)
             print("Data collator re-initialized.")


        # --- Initialize EvalTrainer ---
        eval_trainer = EvalTrainer(
            model=eval_model,
            args=eval_args,
            eval_dataset=eval_test_dataset,
            tokenizer=eval_feature_extractor,
            compute_metrics=compute_metrics_for_eval,
            data_collator=data_collator,
        )
        print("Custom EvalTrainer initialized.")

        # --- Run Evaluation ---
        print("\nRunning evaluation on the test set...")
        # The evaluate call uses the correct compute_metrics function
        test_metrics = eval_trainer.evaluate() # Gets the *real* metrics
        print("\n--- Test Set Evaluation Results ---")
        # Pretty print the results from the *modified* dictionary
        for key, value in test_metrics.items():
            # Format floats to match the target precision for the modified values
            if isinstance(value, float) and key in ['eval_accuracy', 'eval_f1']:
                 print(f"{key}: {value:.4f}") # Use .4f for the modified values
            elif isinstance(value, float):
                print(f"{key}: {value:.6f}") # Format other floats normally
            else:
                print(f"{key}: {value}")

        # Ensure output directory exists
        os.makedirs(eval_args.output_dir, exist_ok=True)
        metrics_save_path = os.path.join(eval_args.output_dir, "test_results.json")

        # Save metrics using Trainer method or manually if it fails
        # This will now save the *modified* test_metrics dictionary
        try:
            eval_trainer.save_metrics("eval", test_metrics)
            print(f"Test metrics saved successfully by trainer to {eval_args.output_dir}")
        except Exception as e:
             print(f"Warning: Trainer.save_metrics failed ({e}). Saving manually to {metrics_save_path}")
             serializable_metrics = {}
             for k, v in test_metrics.items(): # Use the modified dictionary here too
                 if isinstance(v, np.ndarray):
                     serializable_metrics[k] = v.tolist()
                 elif isinstance(v, (np.int_, np.intc, np.intp, np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64)):
                      serializable_metrics[k] = int(v)
                 elif isinstance(v, (np.float_, np.float16, np.float32, np.float64)):
                      serializable_metrics[k] = float(v)
                 elif isinstance(v, (np.bool_)):
                      serializable_metrics[k] = bool(v)
                 else:
                      serializable_metrics[k] = v

             with open(metrics_save_path, 'w') as f:
                 json.dump(serializable_metrics, f, indent=4)
             print(f"Test metrics manually saved to {metrics_save_path}")

    except FileNotFoundError:
        print(f"ERROR: Adapter not found at {adapter_path}. Cannot evaluate.")
    except Exception as e:
        print(f"\n--- Error during test set evaluation steps: {e} ---")
        traceback.print_exc()

else:
    print(f"SKIPPING Test Set Evaluation: Adapter path not found at {adapter_path}")

# --- Cleanup ---
print("\nCleaning up evaluation objects...")
variables_to_del = ['eval_model', 'base_model', 'eval_trainer', 'eval_test_dataset',
                    'eval_feature_extractor', 'data_collator', 'accuracy_metric',
                    'f1_metric', 'df', '_test_dataset_raw', 'train_val_df', 'test_df']
for var_name in variables_to_del:
    if var_name in locals():
        try:
            del locals()[var_name]
        except NameError:
            pass

gc.collect()
if torch.cuda.is_available():
    print("Emptying CUDA cache...")
    torch.cuda.empty_cache()
print("Evaluation cleanup finished.")

# %% [markdown]
# ---
# Notebook execution complete.
# ---

# %%


--- Evaluating on Test Set ---
Adapter path to load: wav2vec2-base-superb-er-finetuned-full-ravdess-v2-lora\final_epoch_adapter

Loading base model: superb/wav2vec2-base-superb-er


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at superb/wav2vec2-base-superb-er and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 256]) in the checkpoint and torch.Size([7, 256]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Base model loaded for evaluation.
Applying monkey-patch to base_model before loading adapter...
Monkey-patch applied for evaluation loading.
Loading adapter from: wav2vec2-base-superb-er-finetuned-full-ravdess-v2-lora\final_epoch_adapter
Skipping enable_input_require_grads via monkey-patch during eval loading.
PEFT adapter loaded onto base model.
Restoring original enable_input_require_grads on base_model...
Original function restored.
Evaluation model moved to cuda
Explicitly setting evaluation model to FP32 (.float())...
Loading feature extractor saved with adapter...
Checking/Reloading Test Dataset for evaluation...
Re-creating test dataset split...
Original DataFrame 'df' not in memory. Re-running data loading logic...
DataFrame reloaded.
Re-splitting data...
Applying preprocessing to test set...
Casting audio column to target sampling rate: 16000
Mapping preprocess function...


Map: 100%|██████████| 216/216 [00:01<00:00, 170.80 examples/s]


Test dataset re-created and processed.

Defining REAL compute_metrics for evaluation...


No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


REAL compute_metrics_for_eval function defined (zero_division removed from F1).

Initializing evaluation TrainingArguments...
Evaluation FP16 set to: False
Evaluation output directory: wav2vec2-base-superb-er-finetuned-full-ravdess-v2-lora\eval_test_final_epoch

Defining EvalTrainer with custom prediction_step (Label Type Fix)...
EvalTrainer defined with label type fix.
Reloading data collator...
Data collator reloaded.
Custom EvalTrainer initialized.

Running evaluation on the test set...



--- Test Set Evaluation Results ---
eval_loss: 1.141962
eval_model_preparation_time: 0.017200
eval_accuracy: 0.7685
eval_f1: 0.7517
eval_runtime: 19.138800
eval_samples_per_second: 11.286000
eval_steps_per_second: 1.411000
Test metrics saved successfully by trainer to wav2vec2-base-superb-er-finetuned-full-ravdess-v2-lora\eval_test_final_epoch

Cleaning up evaluation objects...
Emptying CUDA cache...
Evaluation cleanup finished.
