<a href="https://colab.research.google.com/github/DGautam11/Audio-Emotion-Recognition/blob/main/notebooks/02_wav2vec_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Emotion Recognition using Wav2Vec2
**Author:** Deepan Gautam (@Dpngtm)
**Model:** [Hugging Face Link](https://huggingface.co/Dpngtm/wav2vec2-emotion-recognition)
**Demo:** [Hugging Face Space](https://huggingface.co/spaces/Dpngtm/Audio-Emotion-Recognition)

## Description
This notebook fine-tunes Facebook's Wav2Vec2 model on 4 combined datasets (TESS, CREMA-D, SAVEE, RAVDESS) to recognize 7 emotions.
**Accuracy Achieved:** ~80%

# Wav2Vec2 Fine-Tuning Workflow
*End-to-end MLOps pipeline: Loading pre-processed features, fine-tuning the model, and deploying to Hugging Face Hub.*

## ENVIRONMENT CONFIGURATION

In [None]:
%%capture
#  INSTALL TRAINING DEPENDENCIES
!pip install datasets huggingface_hub transformers evaluate accelerate

In [None]:
import os
import json
import torch
import numpy as np
import evaluate
from datasets import load_from_disk
from transformers import (
    AutoModelForAudioClassification,
    TrainingArguments,
    Trainer
)
from google.colab import drive
from huggingface_hub import notebook_login
from sklearn.metrics import accuracy_score, f1_score

In [None]:

# Choose storage locations based on whether google.colab can be imported.
try:
    from google.colab import drive

    drive.mount('/content/drive')
except ImportError:
    # Not on Colab: use folders relative to this notebook.
    IS_COLAB = False
    print("Detected Local Environment. Using local storage.")

    # Assumes the processed data sits in a folder next to the notebooks.
    INPUT_PATH, MODEL_OUTPUT_DIR, FINAL_MODEL_PATH = (
        "../wav2vec2-processed-data/",
        "../checkpoints/",
        "../final_model/",
    )
else:
    # On Colab: everything is stored on the mounted Google Drive.
    IS_COLAB = True
    print("Detected Colab Environment. Using Google Drive.")

    # INPUT_PATH must match where the 01_data_preparation notebook saved its output.
    INPUT_PATH, MODEL_OUTPUT_DIR, FINAL_MODEL_PATH = (
        "/content/drive/MyDrive/wav2vec2-processed-data/",
        "/content/drive/MyDrive/wav2vec2-emotion-checkpoints/",
        "/content/drive/MyDrive/wav2vec2-emotion-final/",
    )

# Make sure both output folders exist before anything is written to them.
for output_dir in (MODEL_OUTPUT_DIR, FINAL_MODEL_PATH):
    os.makedirs(output_dir, exist_ok=True)

## 1. DATA LOADING (Ingest)

In [None]:
# Load the train/test splits saved under INPUT_PATH.
print(" Loading pre-tokenized datasets from disk...")
try:
    train_dataset = load_from_disk(os.path.join(INPUT_PATH, "train_dataset"))
    test_dataset = load_from_disk(os.path.join(INPUT_PATH, "test_dataset"))
except FileNotFoundError as exc:
    # Fail fast: continuing would only surface later as a NameError on
    # train_dataset, hiding the real cause.
    raise RuntimeError(
        f"Could not find datasets under {INPUT_PATH!r}. "
        "Run 01_data_preparation notebook and save to the correct path"
    ) from exc

print(" Data Loaded Successfully.")
print(f"   - Training Samples: {len(train_dataset)}")
print(f"   - Test Samples: {len(test_dataset)}")

In [None]:

# Collect the distinct values of the "labels" column to see how many
# emotion classes the training split contains.
unique_labels = train_dataset.unique("labels")
num_labels = len(unique_labels)

# Report the class count and the raw label values.
print(f"Detected {num_labels} emotion classes.")
print(f"   Classes: {unique_labels}")

## 2. MODEL CONFIGURATION
*Initializing pre-trained weights and configuring GPU acceleration.*

### 2.1 Load Label Mappings

In [None]:

# Read the label mapping file stored alongside the processed data
# (expected to be written by the 01_data_preparation notebook).
label_file_path = os.path.join(INPUT_PATH, "label_mapping.json")

try:
    with open(label_file_path, "r") as mapping_file:
        mappings = json.load(mapping_file)
except FileNotFoundError:
    raise RuntimeError("label_mapping.json not found! Please re-run 01_data_preparation notebook")
else:
    # JSON object keys are always strings, so restore the integer class ids.
    id2label = {int(class_id): name for class_id, name in mappings["id2label"].items()}
    label2id = mappings["label2id"]
    # The mapping file determines the class count used from here on.
    num_labels = len(id2label)

    print(f"Loaded Label Mapping. Detected {num_labels} classes.")
    print(f"   Mapping: {id2label}")

### 2.2 Device Configuration

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
cuda_available = torch.cuda.is_available()
device = torch.device('cuda' if cuda_available else 'cpu')

# Show the GPU's name, or 'CPU' when no GPU is present.
device_name = torch.cuda.get_device_name(0) if cuda_available else 'CPU'
print(f" Active Computation Device: {device_name}")

### 2.3 Model Initialization

In [None]:

# Load the facebook/wav2vec2-base-960h checkpoint as an audio classifier,
# configured with the class count and label mappings.
model = AutoModelForAudioClassification.from_pretrained(
    'facebook/wav2vec2-base-960h',
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

# Freeze the feature encoder so it is not updated during fine-tuning.
# freeze_feature_encoder() replaces the deprecated freeze_feature_extractor().
model.freeze_feature_encoder()

# Move the weights to the selected device (GPU when available, otherwise CPU).
model.to(device)
print(f"Model initialized and moved to {device}.")

### 3. TRAINING CONFIGURATION

#### 3.1 Set Format to PyTorch Tensors

In [None]:
# Have both splits return PyTorch tensors (instead of Python lists) for the
# two columns used in training and evaluation.
tensor_columns = ["input_values", "labels"]

for split in (train_dataset, test_dataset):
    split.set_format(type="torch", columns=tensor_columns)

#### 3.2 Define Metrics Function

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for an evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``. The predictions are
            reduced to class ids with ``argmax`` over axis 1.

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}`` where ``f1`` uses
        ``average="weighted"``.
    """
    scores, references = eval_pred
    # Pick the highest-scoring class for every example.
    predicted_ids = np.argmax(scores, axis=1)

    return {
        "accuracy": accuracy_score(references, predicted_ids),
        "f1": f1_score(references, predicted_ids, average="weighted"),
    }

#### 3.3 Setup Training Arguments

In [None]:

training_args = TrainingArguments(
    # --- Output location ---
    output_dir=MODEL_OUTPUT_DIR,
    overwrite_output_dir=True,

    # --- Optimisation schedule ---
    num_train_epochs=10,
    learning_rate=3e-5,             # chosen by the author as a conservative rate
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",

    # --- Batching and precision ---
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 samples per optimizer step (per device)
    gradient_checkpointing=False,
    fp16=False,                     # mixed precision left off (author cites T4 GPU stability)

    # --- Evaluation, checkpointing and best-model selection ---
    eval_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    greater_is_better=True,
    load_best_model_at_end=True,

    # --- Logging ---
    logging_steps=50,
    report_to="none",
)

### 4. MODEL TRAINING

In [None]:
# Bind the model, the training arguments, both dataset splits and the metric
# function into one Trainer. The test split is used for evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Start training
print("Starting training process...")
trainer.train()

# Save the trained model to FINAL_MODEL_PATH. That directory is created during
# setup but was never written to, so the model only existed in the checkpoint
# folder. With load_best_model_at_end=True this is the best checkpoint by accuracy.
trainer.save_model(FINAL_MODEL_PATH)
print(f"Best model saved to {FINAL_MODEL_PATH}")

### 5. PUSH TO HUGGING FACE HUB

In [None]:

from transformers import Wav2Vec2Processor

# 1. Get the token.
#    On Colab it comes from the Secrets tab; elsewhere from the HF_TOKEN
#    environment variable. google.colab is imported defensively so this cell
#    also runs outside Colab.
try:
    from google.colab import userdata
except ImportError:
    userdata = None

if userdata is not None:
    try:
        # REPLACE 'HF_TOKEN' WITH THE EXACT NAME YOU GAVE IT IN THE SECRETS TAB
        my_token = userdata.get('HF_TOKEN')
    except Exception as exc:
        # Stop here: continuing without a token would fail later with a
        # confusing NameError on my_token.
        raise RuntimeError(
            "Error: Could not find the token. Check the name in the Secrets tab (Key icon)."
        ) from exc
    print("Token retrieved from Secrets successfully.")
else:
    my_token = os.environ.get("HF_TOKEN")
    if not my_token:
        raise RuntimeError(
            "Error: Could not find the token. Set the HF_TOKEN environment variable."
        )
    print("Token retrieved from the HF_TOKEN environment variable.")

# 2. Define Repo
repo_id = "Dpngtm/wav2vec2-emotion-recognition"

# 3. Reload the processor of the base checkpoint so it is uploaded with the model.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# 4. Push using the token explicitly
print(f"Pushing to {repo_id}...")

model.push_to_hub(repo_id, token=my_token)
processor.push_to_hub(repo_id, token=my_token)

print("SUCCESS! Upload finished.")