In [1]:
# Install necessary libraries
!pip install transformers datasets torchaudio librosa scikit-learn pandas

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [2]:
import os
import pandas as pd
import numpy as np
import torch
import librosa
from transformers import Wav2Vec2FeatureExtractor, AutoModelForAudioClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict, Features, ClassLabel, Audio
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from zipfile import ZipFile
import shutil



In [3]:
class RAVDESSDatasetLoader:
    """
    Handles downloading, extracting, loading, and preprocessing the RAVDESS dataset.
    """
    def __init__(self, dataset_zip_name='Audio_Speech_Actors_01-24.zip',
                 dataset_folder='RAVDESS'):
        self.dataset_zip_name = dataset_zip_name
        self.dataset_folder = dataset_folder
        self.extract_path = f'./{self.dataset_folder}'
        self.emotion_mapping = {
            '01': 'neutral',
            '02': 'calm',
            '03': 'happy',
            '04': 'sad',
            '05': 'angry',
            '06': 'fearful',
            '07': 'disgust',
            '08': 'surprised'
        }
        self.intensity_mapping = {
            '01': 'normal',
            '02': 'strong'
        }

    def authenticate_and_create_drive(self):
        """
        Authenticates the user and creates a PyDrive GoogleDrive instance.
        """
        # Authenticate and create the PyDrive client.
        auth.authenticate_user()
        gauth = GoogleAuth()
        gauth.credentials = GoogleCredentials.get_application_default()
        drive = GoogleDrive(gauth)
        return drive

    def download_dataset(self, drive, file_id):
        """
        Downloads the RAVDESS dataset zip file from Google Drive using its file ID.

        Args:
            drive (GoogleDrive): Authenticated GoogleDrive instance.
            file_id (str): Google Drive file ID of the dataset zip file.
        """
        print(f"Downloading dataset from Google Drive with file ID: {file_id}")
        downloaded = drive.CreateFile({'id': file_id})
        downloaded.GetContentFile(self.dataset_zip_name)
        print(f"Downloaded dataset as {self.dataset_zip_name}")

    def extract_dataset(self):
        """
        Extracts the downloaded zip file to the specified directory.
        """
        if not os.path.exists(self.extract_path):
            print("Extracting RAVDESS dataset...")
            with ZipFile(self.dataset_zip_name, 'r') as zip_ref:
                zip_ref.extractall(self.extract_path)
            print("Extraction completed.")
        else:
            print("RAVDESS dataset already extracted.")

    def load_data(self):
        """
        Loads the RAVDESS dataset, parses filenames to extract labels, and filters out samples
        with 'unknown' or 'unspecified' labels.

        Returns:
            pd.DataFrame: DataFrame containing file paths and corresponding labels.
        """
        data = []
        for root, dirs, files in os.walk(self.extract_path):
            for file in files:
                if file.endswith('.wav'):
                    filepath = os.path.join(root, file)
                    filename = os.path.basename(file)
                    parts = filename.split('.')[0].split('-')
                    if len(parts) != 7:
                        continue  # Skip files that don't match the naming convention
                    emotion_code = parts[2]
                    intensity_code = parts[3]

                    emotion = self.emotion_mapping.get(emotion_code, 'unknown')
                    intensity = self.intensity_mapping.get(intensity_code, 'unspecified' if emotion != 'neutral' else 'normal')

                    # Skip samples with 'unknown' or 'unspecified' labels
                    if emotion == 'unknown' or intensity == 'unspecified':
                        continue

                    # Combine emotion and intensity into a single label
                    label = f"{emotion}_{intensity}"
                    data.append({'file_path': filepath, 'label': label})

        df = pd.DataFrame(data)
        print(f"Total samples after filtering: {len(df)}")
        return df

    def prepare_datasets(self, df):
        """
        Splits the DataFrame into training and testing sets and converts them into Hugging Face Datasets.

        Args:
            df (pd.DataFrame): DataFrame containing file paths and labels.

        Returns:
            DatasetDict: Hugging Face DatasetDict containing 'train' and 'test' datasets.
            dict: Mapping from labels to IDs.
            dict: Mapping from IDs to labels.
        """
        # Split the data
        train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
        print(f"Training samples: {len(train_df)}")
        print(f"Testing samples: {len(test_df)}")

        # Define label classes
        labels = sorted(df['label'].unique())
        num_labels = len(labels)
        label2id = {label: idx for idx, label in enumerate(labels)}
        id2label = {idx: label for label, idx in label2id.items()}

        # Define features for Hugging Face Dataset
        features = Features({
            'file_path': Audio(sampling_rate=16000),
            'label': ClassLabel(names=labels)
        })

        # Convert pandas DataFrame to Hugging Face Dataset
        train_dataset = Dataset.from_pandas(train_df).remove_columns('__index_level_0__').cast(features)
        test_dataset = Dataset.from_pandas(test_df).remove_columns('__index_level_0__').cast(features)

        dataset = DatasetDict({
            'train': train_dataset,
            'test': test_dataset
        })

        return dataset, label2id, id2label

    def run(self, dataset_file_id):
        """
        Executes the dataset loading and preparation steps.

        Args:
            dataset_file_id (str): Google Drive file ID of the dataset zip file.

        Returns:
            DatasetDict: Prepared dataset for training and testing.
            dict: Mapping from labels to IDs.
            dict: Mapping from IDs to labels.
        """
        drive = self.authenticate_and_create_drive()
        self.download_dataset(drive, dataset_file_id)
        self.extract_dataset()
        df = self.load_data()
        dataset, label2id, id2label = self.prepare_datasets(df)
        return dataset, label2id, id2label

class EmotionIntensityModel:
    """
    Manages the loading, fine-tuning, and evaluation of the DistilHuBERT model for emotion and emotional intensity classification.
    """
    def __init__(self, model_name, num_labels, label2id, id2label):
        """
        Initializes the model with the given configuration.

        Args:
            model_name (str): Pre-trained model name or path.
            num_labels (int): Number of unique labels.
            label2id (dict): Mapping from label names to IDs.
            id2label (dict): Mapping from label IDs to names.
        """
        self.model_name = model_name
        self.num_labels = num_labels
        self.label2id = label2id
        self.id2label = id2label
        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
        self.model = self.initialize_model()

    def initialize_model(self):
        """
        Loads the pre-trained DistilHuBERT model and configures it for classification.

        Returns:
            AutoModelForAudioClassification: Configured model ready for fine-tuning.
        """
        model = AutoModelForAudioClassification.from_pretrained(
            self.model_name,
            num_labels=self.num_labels,
            label2id=self.label2id,
            id2label=self.id2label
        )
        return model

    def preprocess_function(self, examples):
        """
        Preprocesses the dataset by using the audio features provided in the dataset.

        Args:
            examples (dict): A batch of examples from the dataset.

        Returns:
            dict: A dictionary containing input values with padding and truncation.
        """
        audio = examples['file_path']  # audio is already loaded, so no need to load again
        inputs = self.feature_extractor(
            audio['array'],
            sampling_rate=audio['sampling_rate'],
            return_tensors="pt",
            padding="max_length",  # Add padding
            truncation=True,       # Add truncation
            max_length=61395       # You can set a max length based on your observations
        )
        examples["input_values"] = inputs["input_values"][0].numpy()
        return examples

    def prepare_data(self, dataset):
        """
        Applies preprocessing to the dataset.

        Args:
            dataset (DatasetDict): Hugging Face DatasetDict containing 'train' and 'test' datasets.

        Returns:
            DatasetDict: Preprocessed dataset.
        """
        dataset = dataset.map(self.preprocess_function, remove_columns=['file_path'])
        return dataset

    def compute_metrics(self, pred):
        """
        Computes evaluation metrics.

        Args:
            pred (EvalPrediction): Prediction object from Trainer.

        Returns:
            dict: Dictionary containing accuracy, precision, recall, and F1 scores.
        """
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        accuracy = accuracy_score(labels, preds)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division=0)
        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
        }

    def fine_tune(self, dataset, training_args):
        """
        Fine-tunes the model using the Trainer API.

        Args:
            dataset (DatasetDict): Preprocessed dataset.
            training_args (TrainingArguments): Training arguments.

        Returns:
            Trainer: Trained Trainer object.
        """
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["test"],
            tokenizer=None,  # No tokenizer needed
            compute_metrics=self.compute_metrics,
        )

        trainer.train()
        return trainer

    def evaluate(self, trainer, dataset):
        """
        Evaluates the fine-tuned model on the test set.

        Args:
            trainer (Trainer): Trained Trainer object.
            dataset (DatasetDict): Preprocessed dataset.

        Returns:
            dict: Evaluation results.
        """
        eval_results = trainer.evaluate()
        print(f"Evaluation Results: {eval_results}")
        return eval_results

    def generate_reports(self, trainer, dataset, id2label):
        """
        Generates classification reports and confusion matrices.

        Args:
            trainer (Trainer): Trained Trainer object.
            dataset (DatasetDict): Preprocessed dataset.
            id2label (dict): Mapping from label IDs to names.
        """
        # Get predictions
        predictions = trainer.predict(dataset["test"])
        pred_ids = np.argmax(predictions.predictions, axis=1)
        true_ids = predictions.label_ids

        # Classification Report
        print("\nClassification Report:")
        print(classification_report(true_ids, pred_ids, target_names=list(id2label.values()), zero_division=0))

        # Confusion Matrix
        cm = confusion_matrix(true_ids, pred_ids)
        print("Confusion Matrix:")
        print(cm)

    def save_model(self, path="./fine_tuned_model"):
        """
        Saves the fine-tuned model and feature extractor to the specified path.

        Args:
            path (str): Directory path to save the model.
        """
        self.model.save_pretrained(path)
        self.feature_extractor.save_pretrained(path)
        print(f"Model saved to {path}.")

    def load_model(self, path="./fine_tuned_model"):
        """
        Loads a previously fine-tuned model and feature extractor from the specified path.

        Args:
            path (str): Directory path from where to load the model.
        """
        self.model = AutoModelForAudioClassification.from_pretrained(path)
        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(path)
        print(f"Model loaded from {path}.")

    def predict_all_labels(self, audio_file_path):
        """
        Predict all possible emotion and intensity labels with their respective confidence scores.

        Args:
            audio_file_path (str): Path to the audio file to be predicted.

        Returns:
            pd.DataFrame: DataFrame containing audio file name, emotion, intensity level, and confidence scores.
        """
        # Load and preprocess the audio file
        speech_array, sampling_rate = librosa.load(audio_file_path, sr=16000)
        inputs = self.feature_extractor(
            speech_array, return_tensors="pt", sampling_rate=sampling_rate, padding=True
        )

        # Perform prediction
        with torch.no_grad():
            logits = self.model(**inputs).logits
            probabilities = torch.softmax(logits, dim=-1).squeeze().numpy()  # Convert logits to probabilities

        # Get all possible labels
        id2label = self.model.config.id2label

        # Initialize list to hold result rows
        results = []

        # Loop through all labels to get emotion and intensity
        for label_id, confidence in enumerate(probabilities):
            emotion_intensity = id2label[label_id]
            # Handle cases where intensity might not be present
            if '_' in emotion_intensity:
                emotion, intensity = emotion_intensity.split('_')
            else:
                emotion = emotion_intensity
                intensity = 'unspecified'
            results.append({
                'audio_file': os.path.basename(audio_file_path),
                'emotion': emotion.capitalize(),
                'intensity': intensity.capitalize(),
                'confidence': confidence
            })

        # Convert results to a pandas DataFrame for better display
        df = pd.DataFrame(results)
        return df

class EmotionIntensityPipeline:
    """
    Orchestrates the workflow of downloading data, fine-tuning the model, and evaluating its performance.
    """
    def __init__(self, dataset_zip_file_id):
        """
        Initializes the pipeline with the Google Drive file ID of the dataset zip file.

        Args:
            dataset_zip_file_id (str): Google Drive file ID of the RAVDESS dataset zip file.
        """
        self.loader = RAVDESSDatasetLoader()
        self.model = None
        self.dataset_zip_file_id = dataset_zip_file_id

    def run(self):
        """
        Executes the entire pipeline: download, extract, load, preprocess, fine-tune, and evaluate.
        """
        # Step 1: Load and prepare the dataset
        dataset, label2id, id2label = self.loader.run(self.dataset_zip_file_id)

        # Step 2: Initialize the model
        model_name = "ntu-spml/distilhubert"
        self.model = EmotionIntensityModel(model_name=model_name,
                                           num_labels=len(label2id),
                                           label2id=label2id,
                                           id2label=id2label)

        # Step 3: Prepare the data
        dataset = self.model.prepare_data(dataset)

        # Step 4: Define training arguments (using pollner's hyperparameters)
        training_args = TrainingArguments(
            output_dir="./results",
            evaluation_strategy="epoch",
            save_strategy="epoch",
            learning_rate=5e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=10,
            weight_decay=0.01,
            optim="adamw_torch",  # AdamW optimizer
            logging_dir='./logs',
            logging_steps=10,
            lr_scheduler_type="linear",
            warmup_ratio=0.1,
            seed=42,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
        )

        # Step 5: Fine-tune the model
        trainer = self.model.fine_tune(dataset, training_args)

        # Step 6: Evaluate the model
        self.model.evaluate(trainer, dataset)

        # Step 7: Generate detailed reports
        self.model.generate_reports(trainer, dataset, id2label)

        # Step 8: Save the fine-tuned model
        self.model.save_model()

    def authenticate_and_create_drive(self):
        """
        Authenticates the user and creates a PyDrive GoogleDrive instance.

        Returns:
            GoogleDrive: Authenticated GoogleDrive instance.
        """
        # Authenticate and create the PyDrive client.
        auth.authenticate_user()
        gauth = GoogleAuth()
        gauth.credentials = GoogleCredentials.get_application_default()
        drive = GoogleDrive(gauth)
        return drive

    def download_audio_from_drive(self, drive, audio_file_id, destination_path):
        """
        Downloads an audio file from Google Drive using its file ID.

        Args:
            drive (GoogleDrive): Authenticated GoogleDrive instance.
            audio_file_id (str): Google Drive file ID of the audio file.
            destination_path (str): Local path to save the downloaded audio file.
        """
        print(f"Downloading audio file from Google Drive with file ID: {audio_file_id}")
        downloaded = drive.CreateFile({'id': audio_file_id})
        downloaded.GetContentFile(destination_path)
        print(f"Downloaded audio file and saved as {destination_path}")

    def load_and_predict(self, audio_file_ids):
        """
        Downloads the audio files using their file IDs and predicts all possible labels.

        Args:
            audio_file_ids (dict): Dictionary mapping audio file names to their Google Drive file IDs.

        Returns:
            None
        """
        # Authenticate and create a GoogleDrive instance
        drive = self.authenticate_and_create_drive()

        for audio_file_name, audio_file_id in audio_file_ids.items():
            # Define local path to save the audio file
            destination_path = f"./{audio_file_name}"

            # Download the audio file
            self.download_audio_from_drive(drive, audio_file_id, destination_path)

            # Predict all emotion and intensity labels with confidence
            result_df = self.model.predict_all_labels(destination_path)
            print(result_df)

In [4]:
# Main function to execute the pipeline
def main():
    """
    Main function to execute the emotion and intensity classification pipeline.
    """
    # Define the Google Drive file ID for RAVDESS dataset zip file
    # IMPORTANT: Replace 'YOUR_DATASET_ZIP_FILE_ID' with the actual file ID
    dataset_zip_file_id = '1dSXSY-f6ZigkcJWV07v6kc3r_i3pMPvl'

    # Initialize and run the pipeline
    pipeline = EmotionIntensityPipeline(dataset_zip_file_id=dataset_zip_file_id)
    pipeline.run()

    # Define audio files to predict with their Google Drive file IDs
    audio_file_ids = {
        'audio1.mp3': '108kPpEQeA_6RkQXmmLWDJXQzdiISlm0r'
    }

    # Predict emotion and intensity for each audio file in the dictionary
    pipeline.load_and_predict(audio_file_ids)

if __name__ == "__main__":
    main()

Downloading dataset from Google Drive with file ID: 1dSXSY-f6ZigkcJWV07v6kc3r_i3pMPvl
Downloaded dataset as Audio_Speech_Actors_01-24.zip
Extracting RAVDESS dataset...
Extraction completed.
Total samples after filtering: 1440
Training samples: 1152
Testing samples: 288


Casting the dataset:   0%|          | 0/1152 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/288 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/94.0M [00:00<?, ?B/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at ntu-spml/distilhubert and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1152 [00:00<?, ? examples/s]

Map:   0%|          | 0/288 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4873,2.351829,0.229167,0.093424,0.229167,0.125035
2,1.9908,1.979484,0.319444,0.373816,0.319444,0.255872
3,1.6262,1.698708,0.402778,0.437964,0.402778,0.357451
4,1.3689,1.520619,0.496528,0.494886,0.496528,0.463162
5,1.1491,1.245581,0.565972,0.586594,0.565972,0.542768
6,1.0137,1.16198,0.625,0.629063,0.625,0.611793
7,0.6898,1.061697,0.652778,0.655989,0.652778,0.648165
8,0.5291,1.028208,0.642361,0.662253,0.642361,0.63737
9,0.5644,0.957226,0.6875,0.689847,0.6875,0.682009
10,0.4528,0.94352,0.663194,0.667938,0.663194,0.6581


Evaluation Results: {'eval_loss': 0.9572264552116394, 'eval_accuracy': 0.6875, 'eval_precision': 0.6898466480000488, 'eval_recall': 0.6875, 'eval_f1': 0.682008690074048, 'eval_runtime': 14.7513, 'eval_samples_per_second': 19.524, 'eval_steps_per_second': 2.44, 'epoch': 10.0}

Classification Report:
                  precision    recall  f1-score   support

    angry_normal       0.60      0.47      0.53        19
    angry_strong       0.79      1.00      0.88        19
     calm_normal       0.52      0.58      0.55        19
     calm_strong       0.68      0.75      0.71        20
  disgust_normal       0.76      0.84      0.80        19
  disgust_strong       0.68      0.68      0.68        19
  fearful_normal       0.72      0.65      0.68        20
  fearful_strong       0.83      0.79      0.81        19
    happy_normal       0.75      0.47      0.58        19
    happy_strong       0.69      0.47      0.56        19
  neutral_normal       0.58      0.74      0.65        19
   

RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor