In [1]:
# Install necessary libraries
!pip install transformers datasets torchaudio librosa scikit-learn pandas



In [2]:
import os
import pandas as pd
import numpy as np
import torch
import librosa
from transformers import Wav2Vec2FeatureExtractor, AutoModelForAudioClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict, Features, ClassLabel, Audio
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from zipfile import ZipFile
import shutil



In [3]:
class RAVDESSDatasetLoader:
    """
    Handles downloading, extracting, loading, and preprocessing the RAVDESS dataset.

    Audio files are expected to be named with seven dash-separated fields; the third
    field is looked up in ``emotion_mapping`` and the fourth in ``intensity_mapping``.
    Each retained sample is labelled ``"<emotion>_<intensity>"``.
    """
    def __init__(self, dataset_zip_name='Audio_Speech_Actors_01-24.zip', dataset_folder='RAVDESS'):
        """
        Stores the archive name / extraction folder and the filename-code lookup tables.

        Args:
            dataset_zip_name: Local file name the downloaded archive is saved under.
            dataset_folder: Folder (relative to the working directory) the archive is extracted to.
        """
        self.dataset_zip_name = dataset_zip_name
        self.dataset_folder = dataset_folder
        self.extract_path = f'./{self.dataset_folder}'
        self.emotion_mapping = {
            '01': 'neutral',
            '02': 'calm',
            '03': 'happy',
            '04': 'sad',
            '05': 'angry',
            '06': 'fearful',
            '07': 'disgust',
            '08': 'surprised'
        }
        self.intensity_mapping = {
            '01': 'normal',
            '02': 'strong'
        }

    def authenticate_and_create_drive(self):
        """
        Authenticates the user and creates a PyDrive GoogleDrive instance.

        Returns:
            A ``GoogleDrive`` client using the application-default credentials.
        """
        auth.authenticate_user()
        gauth = GoogleAuth()
        gauth.credentials = GoogleCredentials.get_application_default()
        drive = GoogleDrive(gauth)
        return drive

    def download_dataset(self, drive, file_id):
        """
        Downloads the RAVDESS dataset zip file from Google Drive using its file ID.

        Args:
            drive: Drive client as returned by ``authenticate_and_create_drive``.
            file_id: Google Drive ID of the dataset archive.
        """
        print(f"Downloading dataset from Google Drive with file ID: {file_id}")
        downloaded = drive.CreateFile({'id': file_id})
        downloaded.GetContentFile(self.dataset_zip_name)
        print(f"Downloaded dataset as {self.dataset_zip_name}")

    def extract_dataset(self):
        """
        Extracts the downloaded zip file to the specified directory.

        Extraction is skipped only when the target directory already exists and is
        non-empty, so an empty folder is not mistaken for a finished extraction.
        """
        already_extracted = os.path.isdir(self.extract_path) and bool(os.listdir(self.extract_path))
        if already_extracted:
            print("RAVDESS dataset already extracted.")
            return

        print("Extracting RAVDESS dataset...")
        with ZipFile(self.dataset_zip_name, 'r') as zip_ref:
            zip_ref.extractall(self.extract_path)
        print("Extraction completed.")

    def load_data(self):
        """
        Loads the RAVDESS dataset, parses filenames to extract labels, and filters out samples
        with 'unknown' or 'unspecified' labels.

        Returns:
            A DataFrame with columns ``file_path`` and ``label``. The columns are present
            even when no file matched, and rows are in sorted traversal order so the
            result does not depend on filesystem listing order.
        """
        data = []
        for root, dirs, files in os.walk(self.extract_path):
            # Sorting in place fixes the traversal order; os.walk otherwise follows the
            # filesystem's listing order, which would make the seeded split vary by machine.
            dirs.sort()
            for file in sorted(files):
                if not file.lower().endswith('.wav'):
                    continue
                parts = file.split('.')[0].split('-')
                if len(parts) != 7:
                    continue  # Skip files that don't match the naming convention

                emotion = self.emotion_mapping.get(parts[2], 'unknown')
                # A neutral sample with an unrecognised intensity code falls back to 'normal'.
                fallback_intensity = 'normal' if emotion == 'neutral' else 'unspecified'
                intensity = self.intensity_mapping.get(parts[3], fallback_intensity)

                if emotion == 'unknown' or intensity == 'unspecified':
                    continue

                data.append({
                    'file_path': os.path.join(root, file),
                    'label': f"{emotion}_{intensity}"
                })

        # Explicit columns keep the schema stable when nothing was found.
        df = pd.DataFrame(data, columns=['file_path', 'label'])
        print(f"Total samples after filtering: {len(df)}")
        return df

    def prepare_datasets(self, df):
        """
        Splits the DataFrame into training and testing sets and converts them into Hugging Face Datasets.

        Args:
            df: DataFrame with ``file_path`` and ``label`` columns, as produced by ``load_data``.

        Returns:
            Tuple ``(dataset, label2id, id2label)`` where ``dataset`` is a ``DatasetDict``
            with ``train`` and ``test`` splits.

        Raises:
            ValueError: If ``df`` contains no samples.
        """
        if df.empty:
            raise ValueError(
                f"No labelled .wav files found under {self.extract_path}; cannot build datasets."
            )

        train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
        print(f"Training samples: {len(train_df)}")
        print(f"Testing samples: {len(test_df)}")

        labels = sorted(df['label'].unique())
        label2id = {label: idx for idx, label in enumerate(labels)}
        id2label = {idx: label for label, idx in label2id.items()}

        features = Features({
            'file_path': Audio(sampling_rate=16000),
            'label': ClassLabel(names=labels)
        })

        def to_dataset(frame):
            # preserve_index=False drops the pandas index up front instead of relying on
            # an '__index_level_0__' column being present to remove afterwards.
            return Dataset.from_pandas(frame, preserve_index=False).cast(features)

        dataset = DatasetDict({
            'train': to_dataset(train_df),
            'test': to_dataset(test_df)
        })

        return dataset, label2id, id2label

    def run(self, dataset_file_id):
        """
        Executes the dataset loading and preparation steps.

        Args:
            dataset_file_id: Google Drive ID of the dataset archive.

        Returns:
            Tuple ``(dataset, label2id, id2label)`` from ``prepare_datasets``.
        """
        drive = self.authenticate_and_create_drive()
        self.download_dataset(drive, dataset_file_id)
        self.extract_dataset()
        df = self.load_data()
        dataset, label2id, id2label = self.prepare_datasets(df)
        return dataset, label2id, id2label

class EmotionIntensityModel:
    """
    Manages the loading, fine-tuning, and evaluation of the DistilHuBERT model for emotion and emotional intensity classification.
    """

    # Number of audio samples every training example is padded/truncated to
    # (61395 samples is roughly 3.8 s at the 16 kHz rate used by this notebook).
    DEFAULT_MAX_LENGTH = 61395

    def __init__(self, model_name, num_labels, label2id, id2label, max_length=DEFAULT_MAX_LENGTH):
        """
        Loads the feature extractor and the classification model.

        Args:
            model_name: Checkpoint name or path passed to ``from_pretrained``.
            num_labels: Number of target classes.
            label2id: Mapping from label string to class id.
            id2label: Mapping from class id to label string.
            max_length: Fixed number of samples per example used by ``preprocess_function``.
        """
        self.model_name = model_name
        self.num_labels = num_labels
        self.label2id = label2id
        self.id2label = id2label
        self.max_length = max_length
        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
        self.model = self.initialize_model()

    def initialize_model(self):
        """
        Loads the pre-trained DistilHuBERT model and configures it for classification.

        Returns:
            The model returned by ``AutoModelForAudioClassification.from_pretrained``.
        """
        model = AutoModelForAudioClassification.from_pretrained(
            self.model_name,
            num_labels=self.num_labels,
            label2id=self.label2id,
            id2label=self.id2label
        )
        return model

    @staticmethod
    def _split_label(label):
        """
        Splits a ``"<emotion>_<intensity>"`` label into ``(emotion, intensity)``.

        The split happens at the LAST underscore, so an emotion name that itself contains
        an underscore no longer raises. Labels without an underscore get the intensity
        ``'unspecified'``.
        """
        emotion, separator, intensity = label.rpartition('_')
        if not separator:
            return label, 'unspecified'
        return emotion, intensity

    def preprocess_function(self, examples):
        """
        Preprocesses the dataset by using the audio features provided in the dataset.

        Args:
            examples: A single example whose ``file_path`` entry holds the decoded audio
                (``array`` and ``sampling_rate``).

        Returns:
            The same example with an added ``input_values`` array padded/truncated to
            ``self.max_length`` samples.
        """
        audio = examples['file_path']
        inputs = self.feature_extractor(
            audio['array'],
            sampling_rate=audio['sampling_rate'],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length
        )
        # The extractor returns a batch of one; keep only that row.
        examples["input_values"] = inputs["input_values"][0].numpy()
        return examples

    def prepare_data(self, dataset):
        """
        Applies preprocessing to the dataset.

        The raw ``file_path`` audio column is dropped once ``input_values`` is computed.
        """
        dataset = dataset.map(self.preprocess_function, remove_columns=['file_path'])
        return dataset

    def compute_metrics(self, pred):
        """
        Computes evaluation metrics.

        Args:
            pred: Prediction object exposing ``label_ids`` and ``predictions`` (logits).

        Returns:
            Dict with accuracy and weighted precision, recall and F1.
        """
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        accuracy = accuracy_score(labels, preds)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division=0)
        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
        }

    def fine_tune(self, dataset, training_args):
        """
        Fine-tunes the model using the Trainer API.

        Args:
            dataset: Mapping with preprocessed ``"train"`` and ``"test"`` splits.
            training_args: ``TrainingArguments`` controlling the run.

        Returns:
            The ``Trainer`` after training has finished.
        """
        # No tokenizer argument is passed: None is already the default, and the keyword
        # is deprecated in recent transformers releases.
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["test"],
            compute_metrics=self.compute_metrics,
        )

        trainer.train()
        return trainer

    def evaluate(self, trainer, dataset):
        """
        Evaluates the fine-tuned model on the test set.

        Args:
            trainer: Trainer returned by ``fine_tune``; its configured eval dataset is used.
            dataset: Unused; kept so existing callers keep working.

        Returns:
            The metrics dict returned by ``trainer.evaluate()``.
        """
        eval_results = trainer.evaluate()
        print(f"Evaluation Results: {eval_results}")
        return eval_results

    def generate_reports(self, trainer, dataset, id2label):
        """
        Generates classification reports and confusion matrices.

        Args:
            trainer: Trainer used to run prediction.
            dataset: Mapping containing the ``"test"`` split.
            id2label: Mapping from class id to label string.
        """
        predictions = trainer.predict(dataset["test"])
        pred_ids = np.argmax(predictions.predictions, axis=1)
        true_ids = predictions.label_ids

        # Pass the full, ordered id list explicitly: names then line up with ids regardless
        # of dict ordering, and the report still works if a class never occurs in the split.
        label_ids = sorted(id2label)
        target_names = [id2label[label_id] for label_id in label_ids]

        print("\nClassification Report:")
        print(classification_report(
            true_ids, pred_ids, labels=label_ids, target_names=target_names, zero_division=0
        ))

        cm = confusion_matrix(true_ids, pred_ids, labels=label_ids)
        print("Confusion Matrix:")
        print(cm)

    def save_model(self, path="./fine_tuned_model"):
        """
        Saves the fine-tuned model and feature extractor.
        """
        self.model.save_pretrained(path)
        self.feature_extractor.save_pretrained(path)
        print(f"Model saved to {path}.")

    def load_model(self, path="./fine_tuned_model"):
        """
        Loads a previously fine-tuned model and feature extractor.

        The label mappings stored on this object are refreshed from the loaded model's
        config so they cannot go stale relative to the weights.
        """
        self.model = AutoModelForAudioClassification.from_pretrained(path)
        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(path)
        self.id2label = dict(self.model.config.id2label)
        self.label2id = dict(self.model.config.label2id)
        self.num_labels = len(self.id2label)
        print(f"Model loaded from {path}.")

    def predict_all_labels(self, audio_file_path):
        """
        Predict all possible emotion and intensity labels with their respective confidence scores.

        Args:
            audio_file_path: Path of the audio file to classify; it is loaded at 16 kHz.

        Returns:
            DataFrame with one row per class and columns ``audio_file``, ``emotion``,
            ``intensity`` and ``confidence`` (softmax probability).
        """
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        speech_array, sampling_rate = librosa.load(audio_file_path, sr=16000)
        inputs = self.feature_extractor(
            speech_array, return_tensors="pt", sampling_rate=sampling_rate, padding=True
        )

        inputs = {key: value.to(device) for key, value in inputs.items()}
        self.model.to(device)
        # Make sure dropout is disabled even if the model was last left in training mode.
        self.model.eval()

        with torch.no_grad():
            logits = self.model(**inputs).logits
            # Index the single batch row instead of squeeze(), which would also collapse
            # the class axis when there is only one label.
            probabilities = torch.softmax(logits, dim=-1)[0].cpu().numpy()

        id2label = self.model.config.id2label
        audio_name = os.path.basename(audio_file_path)

        results = []
        for label_id, confidence in enumerate(probabilities):
            emotion, intensity = self._split_label(id2label[label_id])
            results.append({
                'audio_file': audio_name,
                'emotion': emotion.capitalize(),
                'intensity': intensity.capitalize(),
                'confidence': confidence
            })

        df = pd.DataFrame(results)
        return df

class EmotionIntensityPipeline:
    """
    Orchestrates the workflow of downloading data, fine-tuning the model, and evaluating its performance.
    """
    def __init__(self, dataset_zip_file_id, model_name="ntu-spml/distilhubert"):
        """
        Args:
            dataset_zip_file_id: Google Drive ID of the dataset archive.
            model_name: Pre-trained checkpoint to fine-tune.
        """
        self.loader = RAVDESSDatasetLoader()
        self.model = None  # set by run()
        self.dataset_zip_file_id = dataset_zip_file_id
        self.model_name = model_name

    def run(self):
        """
        Executes the entire pipeline: download, extract, load, preprocess, fine-tune, and evaluate.

        After this call ``self.model`` holds the fine-tuned model wrapper and the model
        has been saved with ``save_model``'s default path.
        """
        dataset, label2id, id2label = self.loader.run(self.dataset_zip_file_id)

        self.model = EmotionIntensityModel(
            model_name=self.model_name,
            num_labels=len(label2id),
            label2id=label2id,
            id2label=id2label
        )

        dataset = self.model.prepare_data(dataset)

        # Newer transformers releases name this argument `eval_strategy`; older ones only
        # know `evaluation_strategy`. Pick whichever the installed version declares.
        declared_fields = getattr(TrainingArguments, "__dataclass_fields__", {})
        eval_strategy_key = "eval_strategy" if "eval_strategy" in declared_fields else "evaluation_strategy"

        training_args = TrainingArguments(
            output_dir="./results",
            save_strategy="epoch",
            learning_rate=5e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=10,
            weight_decay=0.01,
            optim="adamw_torch",
            logging_dir='./logs',
            logging_steps=10,
            lr_scheduler_type="linear",
            warmup_ratio=0.1,
            seed=42,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            **{eval_strategy_key: "epoch"},
        )

        trainer = self.model.fine_tune(dataset, training_args)
        self.model.evaluate(trainer, dataset)
        self.model.generate_reports(trainer, dataset, id2label)
        self.model.save_model()

    def authenticate_and_create_drive(self):
        """
        Authenticates the user and creates a PyDrive GoogleDrive instance.

        Delegates to the dataset loader so the authentication steps live in one place.
        """
        return self.loader.authenticate_and_create_drive()

    def download_audio_from_drive(self, drive, audio_file_id, destination_path):
        """
        Downloads an audio file from Google Drive using its file ID.

        Args:
            drive: Drive client as returned by ``authenticate_and_create_drive``.
            audio_file_id: Google Drive ID of the audio file.
            destination_path: Local path the file is written to.
        """
        print(f"Downloading audio file from Google Drive with file ID: {audio_file_id}")
        downloaded = drive.CreateFile({'id': audio_file_id})
        downloaded.GetContentFile(destination_path)
        print(f"Downloaded audio file and saved as {destination_path}")

    def load_and_predict(self, audio_file_ids):
        """
        Downloads the audio files using their file IDs and predicts all possible labels.

        Args:
            audio_file_ids: Mapping of local file name to Google Drive file ID.

        Returns:
            Dict mapping each file name to the prediction table returned by the model.

        Raises:
            RuntimeError: If no model is available because ``run()`` has not been called.
        """
        # Fail before authenticating/downloading rather than after, with a clear message.
        if self.model is None:
            raise RuntimeError("No model available: call run() before load_and_predict().")

        drive = self.authenticate_and_create_drive()

        predictions = {}
        for audio_file_name, audio_file_id in audio_file_ids.items():
            destination_path = f"./{audio_file_name}"
            self.download_audio_from_drive(drive, audio_file_id, destination_path)

            result_df = self.model.predict_all_labels(destination_path)
            print(result_df)
            predictions[audio_file_name] = result_df

        return predictions

In [4]:
# Entry point: run the pipeline, then classify the sample audio clips
def main():
    """
    Runs the emotion/intensity pipeline end to end and then predicts labels for sample audio.
    """
    pipeline = EmotionIntensityPipeline(dataset_zip_file_id='1dSXSY-f6ZigkcJWV07v6kc3r_i3pMPvl')
    pipeline.run()

    # Local file name -> Google Drive file ID of each clip to classify.
    sample_clips = dict([('audio1.mp3', '108kPpEQeA_6RkQXmmLWDJXQzdiISlm0r')])
    pipeline.load_and_predict(sample_clips)


if __name__ == "__main__":
    main()

Downloading dataset from Google Drive with file ID: 1dSXSY-f6ZigkcJWV07v6kc3r_i3pMPvl
Downloaded dataset as Audio_Speech_Actors_01-24.zip
Extracting RAVDESS dataset...
Extraction completed.
Total samples after filtering: 1440
Training samples: 1152
Testing samples: 288


Casting the dataset:   0%|          | 0/1152 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/288 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/94.0M [00:00<?, ?B/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at ntu-spml/distilhubert and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1152 [00:00<?, ? examples/s]

Map:   0%|          | 0/288 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4743,2.363582,0.204861,0.132376,0.204861,0.133698
2,1.9648,1.984023,0.315972,0.279152,0.315972,0.255685
3,1.5768,1.656787,0.381944,0.350042,0.381944,0.314527
4,1.3492,1.448461,0.527778,0.556926,0.527778,0.507458
5,1.1464,1.22241,0.621528,0.621772,0.621528,0.609863
6,0.9708,1.113094,0.611111,0.623952,0.611111,0.602214
7,0.6217,1.057811,0.628472,0.633976,0.628472,0.621949
8,0.5028,1.005057,0.645833,0.659049,0.645833,0.638978
9,0.5249,1.000362,0.65625,0.660982,0.65625,0.646349
10,0.4041,0.967738,0.673611,0.687752,0.673611,0.665913


Evaluation Results: {'eval_loss': 0.9677378535270691, 'eval_accuracy': 0.6736111111111112, 'eval_precision': 0.6877521888213346, 'eval_recall': 0.6736111111111112, 'eval_f1': 0.6659125202990847, 'eval_runtime': 13.673, 'eval_samples_per_second': 21.063, 'eval_steps_per_second': 2.633, 'epoch': 10.0}

Classification Report:
                  precision    recall  f1-score   support

    angry_normal       0.62      0.68      0.65        19
    angry_strong       0.76      1.00      0.86        19
     calm_normal       0.56      0.74      0.64        19
     calm_strong       0.76      0.65      0.70        20
  disgust_normal       0.68      0.79      0.73        19
  disgust_strong       0.77      0.53      0.62        19
  fearful_normal       0.74      0.70      0.72        20
  fearful_strong       0.79      0.79      0.79        19
    happy_normal       0.75      0.63      0.69        19
    happy_strong       0.70      0.37      0.48        19
  neutral_normal       0.59      0.8