In [1]:
!pip install transformers datasets torchaudio librosa pydrive scikit-learn tqdm

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K  

In [2]:
import os
import pandas as pd
import numpy as np
import torch
import librosa
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoModelForAudioClassification, Wav2Vec2FeatureExtractor
from tqdm import tqdm
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from zipfile import ZipFile



In [4]:
class PretrainedEmotionModel:
    """
    Wraps a pre-trained Hugging Face audio-classification checkpoint (e.g. DistilHuBERT)
    and predicts one emotion label per audio file.

    The model is placed on the compute device and switched to eval mode once, at
    construction time, so repeated calls to `predict_label` do no per-call setup.

    Attributes:
        feature_extractor: object returned by `Wav2Vec2FeatureExtractor.from_pretrained`.
        model: object returned by `AutoModelForAudioClassification.from_pretrained`.
        device: `torch.device` the model lives on ("cuda" when available, else "cpu").
    """
    def __init__(self, model_name):
        """
        Load the feature extractor and the classification model.

        Args:
            model_name: identifier passed unchanged to both `from_pretrained` calls.
        """
        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
        self.model = AutoModelForAudioClassification.from_pretrained(model_name)

        # Device selection and model placement are invariant across predictions,
        # so do them once here rather than on every predict_label() call.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        # Inference only: make sure dropout and similar layers are disabled.
        self.model.eval()

    def predict_label(self, audio_file_path):
        """
        Predict the most likely emotion label for the given audio file.

        Args:
            audio_file_path: path of the audio file, loaded with `librosa.load`.

        Returns:
            The entry of `self.model.config.id2label` for the highest-scoring class.
        """
        # Load at the rate the feature extractor declares; fall back to 16 kHz
        # when the extractor does not expose a `sampling_rate` attribute.
        target_rate = getattr(self.feature_extractor, "sampling_rate", 16000)
        speech_array, sampling_rate = librosa.load(audio_file_path, sr=target_rate)
        inputs = self.feature_extractor(
            speech_array, return_tensors="pt", sampling_rate=sampling_rate, padding=True
        )
        # Input tensors must be on the same device as the model.
        inputs = {key: value.to(self.device) for key, value in inputs.items()}

        with torch.no_grad():
            logits = self.model(**inputs).logits
        predicted_id = torch.argmax(logits, dim=-1).item()

        return self.model.config.id2label[predicted_id]


class RAVDESSDatasetLoader:
    """
    Handles downloading, extracting, loading, and preparing the RAVDESS dataset.

    Attributes:
        dataset_zip_name: local filename the downloaded archive is written to.
        dataset_folder: name of the folder the archive is extracted into.
        extract_path: `./<dataset_folder>`, the directory scanned by `load_data`.
        emotion_mapping: two-digit emotion code (third dash-separated filename field)
            mapped to its label string.
    """
    def __init__(self, dataset_zip_name='Audio_Speech_Actors_01-24.zip', dataset_folder='RAVDESS'):
        self.dataset_zip_name = dataset_zip_name
        self.dataset_folder = dataset_folder
        self.extract_path = f'./{self.dataset_folder}'
        self.emotion_mapping = {
            '01': 'neutral',
            '02': 'calm',
            '03': 'happy',
            '04': 'sad',
            '05': 'angry',
            '06': 'fearful',
            '07': 'disgust',
            '08': 'surprised'
        }

    def authenticate_and_create_drive(self):
        """
        Authenticates the user and creates a PyDrive GoogleDrive instance.

        Calls `auth.authenticate_user()` (google.colab), then builds a `GoogleAuth`
        whose credentials are the application-default `GoogleCredentials`.

        Returns:
            A `GoogleDrive` client bound to those credentials.
        """
        auth.authenticate_user()
        gauth = GoogleAuth()
        gauth.credentials = GoogleCredentials.get_application_default()
        drive = GoogleDrive(gauth)
        return drive

    def download_dataset(self, drive, file_id):
        """
        Downloads the RAVDESS dataset zip file from Google Drive using its file ID.

        Args:
            drive: `GoogleDrive` client, e.g. from `authenticate_and_create_drive`.
            file_id: Google Drive id of the archive.

        The content is written to `self.dataset_zip_name` in the working directory.
        """
        print(f"Downloading dataset from Google Drive with file ID: {file_id}")
        downloaded = drive.CreateFile({'id': file_id})
        downloaded.GetContentFile(self.dataset_zip_name)
        print(f"Downloaded dataset as {self.dataset_zip_name}")

    def extract_dataset(self):
        """
        Extracts the downloaded zip file to the specified directory.

        Extraction is skipped whenever `self.extract_path` already exists; the
        existence of that path is the only "already extracted" check performed.
        """
        if not os.path.exists(self.extract_path):
            print("Extracting RAVDESS dataset...")
            with ZipFile(self.dataset_zip_name, 'r') as zip_ref:
                zip_ref.extractall(self.extract_path)
            print("Extraction completed.")
        else:
            print("RAVDESS dataset already extracted.")

    def load_data(self):
        """
        Loads the RAVDESS dataset, parses filenames to extract emotion labels, and returns a DataFrame.

        Only `.wav` files whose stem (text before the first '.') splits into exactly
        seven dash-separated fields are considered; the third field is looked up in
        `self.emotion_mapping` and files with an unmapped code are skipped.

        Returns:
            A DataFrame with columns `file_path` and `label`, sorted by `file_path`.
            The two columns are present even when no file matched.
        """
        records = []
        for root, dirs, files in os.walk(self.extract_path):
            for file in files:
                if not file.endswith('.wav'):
                    continue
                parts = file.split('.')[0].split('-')
                if len(parts) != 7:
                    continue
                emotion = self.emotion_mapping.get(parts[2])
                if emotion is None:
                    continue
                records.append({'file_path': os.path.join(root, file), 'label': emotion})

        # os.walk order depends on the filesystem; sort so that seeded downstream
        # splits select the same files on every machine.
        records.sort(key=lambda record: record['file_path'])
        # Explicit columns keep df['label'] / df['file_path'] valid for an empty result.
        return pd.DataFrame(records, columns=['file_path', 'label'])

    def run(self, dataset_file_id):
        """
        Executes the dataset downloading, extracting, and loading steps.

        Authentication and download are skipped when `self.extract_path` already
        exists, because `extract_dataset` would not read the archive in that case.

        Args:
            dataset_file_id: Google Drive id of the dataset archive.

        Returns:
            The DataFrame produced by `load_data`.
        """
        if not os.path.exists(self.extract_path):
            drive = self.authenticate_and_create_drive()
            self.download_dataset(drive, dataset_file_id)
        self.extract_dataset()
        return self.load_data()


def monte_carlo_cv(df, model, n_splits=10, test_size=0.2):
    """
    Evaluate `model` on repeated stratified random hold-out splits of `df`.

    For each of `n_splits` iterations a stratified test subset is drawn with
    `train_test_split(..., random_state=i)`, the model predicts a label for every
    file in it, and accuracy plus weighted precision/recall/F1 are computed. The
    per-split metrics are then averaged.

    NOTE: the model is never fitted here and the training part of each split is
    discarded. If `model` was trained on files that appear in `df`, the reported
    numbers are in-sample scores, not a generalization estimate.

    Args:
        df: DataFrame with a `file_path` column and a `label` column.
        model: object exposing `predict_label(file_path)`; predictions are assumed
            to be deterministic per file, since each file is predicted only once.
        n_splits: number of random splits; must be at least 1.
        test_size: forwarded to `train_test_split`.

    Returns:
        dict with keys 'accuracy', 'precision', 'recall', 'f1' holding the mean of
        each metric over all splits.

    Raises:
        ValueError: if `n_splits` is smaller than 1 (the means would be NaN).
    """
    if n_splits < 1:
        raise ValueError(f"n_splits must be at least 1, got {n_splits}")

    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []

    # Test sets of different splits overlap heavily; remember each file's prediction
    # so inference runs once per file instead of once per (split, file) pair.
    prediction_cache = {}

    for i in tqdm(range(n_splits)):
        # Only the held-out part is used; nothing is trained on the remainder.
        _, test_df = train_test_split(df, test_size=test_size, stratify=df['label'], random_state=i)

        true_labels = test_df['label'].tolist()
        predicted_labels = []
        for file_path in test_df['file_path']:
            if file_path not in prediction_cache:
                prediction_cache[file_path] = model.predict_label(file_path)
            predicted_labels.append(prediction_cache[file_path])

        accuracy = accuracy_score(true_labels, predicted_labels)
        precision, recall, f1, _ = precision_recall_fscore_support(
            true_labels, predicted_labels, average='weighted', zero_division=0
        )

        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

    return {
        'accuracy': np.mean(accuracies),
        'precision': np.mean(precisions),
        'recall': np.mean(recalls),
        'f1': np.mean(f1_scores)
    }


def main(dataset_file_id='1dSXSY-f6ZigkcJWV07v6kc3r_i3pMPvl',
         model_name="pollner/distilhubert-finetuned-ravdess",
         n_splits=10,
         test_size=0.2):
    """
    Load RAVDESS, evaluate a pre-trained emotion model on random splits, print the averages.

    Called with no arguments it uses the same Drive file, model and split settings
    that were previously hard-coded in the body.

    Args:
        dataset_file_id: Google Drive file ID of the RAVDESS dataset archive.
        model_name: model identifier handed to `PretrainedEmotionModel`.
        n_splits: number of random splits passed to `monte_carlo_cv`.
        test_size: test fraction passed to `monte_carlo_cv`.

    Returns:
        The metrics dict returned by `monte_carlo_cv` (also printed to stdout).
    """
    # Load the RAVDESS dataset
    dataset_loader = RAVDESSDatasetLoader()
    ravdess_df = dataset_loader.run(dataset_file_id)

    # Initialize the pre-trained model
    emotion_model = PretrainedEmotionModel(model_name=model_name)

    # Perform Monte Carlo Cross-Validation
    metrics = monte_carlo_cv(ravdess_df, emotion_model, n_splits=n_splits, test_size=test_size)

    # Print the results
    print("Monte Carlo Cross-Validation Results:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1-Score: {metrics['f1']:.4f}")

    return metrics


# Entry point: run the full evaluation when this cell/script is executed directly
# (the guard is false when the code is imported as a module).
if __name__ == "__main__":
    main()

Downloading dataset from Google Drive with file ID: 1dSXSY-f6ZigkcJWV07v6kc3r_i3pMPvl
Downloaded dataset as Audio_Speech_Actors_01-24.zip
Extracting RAVDESS dataset...
Extraction completed.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/94.8M [00:00<?, ?B/s]

Some weights of the model checkpoint at pollner/distilhubert-finetuned-ravdess were not used when initializing HubertForSequenceClassification: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at pollner/distilhubert-finetuned-ravdess and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_em

Monte Carlo Cross-Validation Results:
Accuracy: 0.9920
Precision: 0.9923
Recall: 0.9920
F1-Score: 0.9920



