In [1]:
# Install necessary libraries
!pip install transformers datasets torchaudio librosa scikit-learn pandas



In [2]:
import os
import pandas as pd
import numpy as np
import torch
import librosa
from transformers import Wav2Vec2FeatureExtractor, AutoModelForAudioClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict, Features, ClassLabel, Audio
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from google.colab import drive
from zipfile import ZipFile
import shutil

In [3]:
class RAVDESSDatasetLoader:
    """
    Handles downloading, extracting, loading, and preprocessing the RAVDESS dataset.
    """
    def __init__(self, drive_file_id, drive_folder_path='RAVDESS', local_zip_path='Audio_Speech_Actors_01-24.zip'):
        self.drive_file_id = drive_file_id
        self.drive_folder_path = drive_folder_path
        self.local_zip_path = local_zip_path
        self.extract_path = './RAVDESS'
        self.emotion_mapping = {
            '01': 'neutral',
            '02': 'calm',
            '03': 'happy',
            '04': 'sad',
            '05': 'angry',
            '06': 'fearful',
            '07': 'disgust',
            '08': 'surprised'
        }
        self.intensity_mapping = {
            '01': 'normal',
            '02': 'strong'
        }

    def mount_drive(self):
        """
        Mounts Google Drive to access the dataset.
        """
        print("Mounting Google Drive...")
        drive.mount('/content/drive')
        print("Google Drive mounted.")

    def download_dataset(self):
        """
        Downloads the RAVDESS dataset zip file from Google Drive.
        """
        from google.colab import auth
        from googleapiclient.discovery import build
        from googleapiclient.http import MediaIoBaseDownload
        from oauth2client.client import GoogleCredentials
        import io

        print("Authenticating and building Google Drive service...")
        auth.authenticate_user()
        drive_service = build('drive', 'v3', credentials=GoogleCredentials.get_application_default())
        request = drive_service.files().get_media(fileId=self.drive_file_id)
        fh = io.BytesIO()
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        print("Downloading RAVDESS dataset from Google Drive...")
        while not done:
            status, done = downloader.next_chunk()
            if status:
                print(f"Download {int(status.progress() * 100)}%.")
        fh.seek(0)
        with open(self.local_zip_path, 'wb') as f:
            f.write(fh.read())
        print("Download completed.")

    def extract_dataset(self):
        """
        Extracts the downloaded zip file to the specified directory.
        """
        if not os.path.exists(self.extract_path):
            print("Extracting RAVDESS dataset...")
            with ZipFile(self.local_zip_path, 'r') as zip_ref:
                zip_ref.extractall(self.extract_path)
            print("Extraction completed.")
        else:
            print("RAVDESS dataset already extracted.")

    def load_data(self):
        """
        Loads the RAVDESS dataset, parses filenames to extract labels, and filters out samples
        with 'unknown' or 'unspecified' labels.

        Returns:
            pd.DataFrame: DataFrame containing file paths and corresponding labels.
        """
        data = []
        for root, dirs, files in os.walk(self.extract_path):
            for file in files:
                if file.endswith('.wav'):
                    filepath = os.path.join(root, file)
                    filename = os.path.basename(file)
                    parts = filename.split('.')[0].split('-')
                    if len(parts) != 7:
                        continue  # Skip files that don't match the naming convention
                    emotion_code = parts[2]
                    intensity_code = parts[3]

                    emotion = self.emotion_mapping.get(emotion_code, 'unknown')
                    intensity = self.intensity_mapping.get(intensity_code, 'unspecified' if emotion != 'neutral' else 'normal')

                    # Skip samples with 'unknown' or 'unspecified' labels
                    if emotion == 'unknown' or intensity == 'unspecified':
                        continue

                    # Combine emotion and intensity into a single label
                    label = f"{emotion}_{intensity}"
                    data.append({'file_path': filepath, 'label': label})

        df = pd.DataFrame(data)
        print(f"Total samples after filtering: {len(df)}")
        return df

    def prepare_datasets(self, df):
        """
        Splits the DataFrame into training and testing sets and converts them into Hugging Face Datasets.

        Args:
            df (pd.DataFrame): DataFrame containing file paths and labels.

        Returns:
            DatasetDict: Hugging Face DatasetDict containing 'train' and 'test' datasets.
            dict: Mapping from labels to IDs.
            dict: Mapping from IDs to labels.
        """
        # Split the data
        train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
        print(f"Training samples: {len(train_df)}")
        print(f"Testing samples: {len(test_df)}")

        # Define label classes
        labels = sorted(df['label'].unique())
        num_labels = len(labels)
        label2id = {label: idx for idx, label in enumerate(labels)}
        id2label = {idx: label for label, idx in label2id.items()}

        # Define features for Hugging Face Dataset
        features = Features({
            'file_path': Audio(sampling_rate=16000),
            'label': ClassLabel(names=labels)
        })

        # Convert pandas DataFrame to Hugging Face Dataset
        train_dataset = Dataset.from_pandas(train_df).remove_columns('__index_level_0__').cast(features)
        test_dataset = Dataset.from_pandas(test_df).remove_columns('__index_level_0__').cast(features)

        dataset = DatasetDict({
            'train': train_dataset,
            'test': test_dataset
        })

        return dataset, label2id, id2label

    def run(self):
        """
        Executes the dataset loading and preparation steps.

        Returns:
            DatasetDict: Prepared dataset for training and testing.
            dict: Mapping from labels to IDs.
            dict: Mapping from IDs to labels.
        """
        self.mount_drive()
        self.download_dataset()
        self.extract_dataset()
        df = self.load_data()
        dataset, label2id, id2label = self.prepare_datasets(df)
        return dataset, label2id, id2label

class EmotionIntensityModel:
    """
    Manages the loading, fine-tuning, and evaluation of the DistilHuBERT model for emotion and emotional intensity classification.
    """
    def __init__(self, model_name, num_labels, label2id, id2label):
        """
        Initializes the model with the given configuration.

        Args:
            model_name (str): Pre-trained model name or path.
            num_labels (int): Number of unique labels.
            label2id (dict): Mapping from label names to IDs.
            id2label (dict): Mapping from label IDs to names.
        """
        self.model_name = model_name
        self.num_labels = num_labels
        self.label2id = label2id
        self.id2label = id2label
        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)  # 使用 Wav2Vec2FeatureExtractor 代替 processor
        self.model = self.initialize_model()

    def initialize_model(self):
        """
        Loads the pre-trained DistilHuBERT model and configures it for classification.

        Returns:
            AutoModelForAudioClassification: Configured model ready for fine-tuning.
        """
        model = AutoModelForAudioClassification.from_pretrained(
            self.model_name,
            num_labels=self.num_labels,
            label2id=self.label2id,
            id2label=self.id2label
        )
        return model

    def preprocess_function(self, examples):
        """
        Preprocesses the dataset by using the audio features provided in the dataset.

        Args:
            examples (dict): A batch of examples from the dataset.

        Returns:
            dict: A dictionary containing input values with padding and truncation.
        """
        audio = examples['file_path']  # audio is already loaded, so no need to load again
        inputs = self.feature_extractor(
            audio['array'],
            sampling_rate=audio['sampling_rate'],
            return_tensors="pt",
            padding="max_length",  # Add padding
            truncation=True,       # Add truncation
            max_length=61395       # You can set a max length based on your observations
        )
        examples["input_values"] = inputs["input_values"][0].numpy()
        return examples

    def prepare_data(self, dataset):
        """
        Applies preprocessing to the dataset.

        Args:
            dataset (DatasetDict): Hugging Face DatasetDict containing 'train' and 'test' datasets.

        Returns:
            DatasetDict: Preprocessed dataset.
        """
        dataset = dataset.map(self.preprocess_function, remove_columns=['file_path'])
        return dataset

    def compute_metrics(self, pred):
        """
        Computes evaluation metrics.

        Args:
            pred (EvalPrediction): Prediction object from Trainer.

        Returns:
            dict: Dictionary containing accuracy, precision, recall, and F1 scores.
        """
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        accuracy = accuracy_score(labels, preds)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division=0)
        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
        }

    def fine_tune(self, dataset, training_args):
        """
        Fine-tunes the model using the Trainer API.

        Args:
            dataset (DatasetDict): Preprocessed dataset.
            training_args (TrainingArguments): Training arguments.

        Returns:
            Trainer: Trained Trainer object.
        """
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["test"],
            tokenizer=None,  # No tokenizer needed
            compute_metrics=self.compute_metrics,
        )

        trainer.train()
        return trainer

    def evaluate(self, trainer, dataset):
        """
        Evaluates the fine-tuned model on the test set.

        Args:
            trainer (Trainer): Trained Trainer object.
            dataset (DatasetDict): Preprocessed dataset.

        Returns:
            dict: Evaluation results.
        """
        eval_results = trainer.evaluate()
        print(f"Evaluation Results: {eval_results}")
        return eval_results

    def generate_reports(self, trainer, dataset, id2label):
        """
        Generates classification reports and confusion matrices.

        Args:
            trainer (Trainer): Trained Trainer object.
            dataset (DatasetDict): Preprocessed dataset.
            id2label (dict): Mapping from label IDs to names.
        """
        # Get predictions
        predictions = trainer.predict(dataset["test"])
        pred_ids = np.argmax(predictions.predictions, axis=1)
        true_ids = predictions.label_ids

        # Classification Report
        print("\nClassification Report:")
        print(classification_report(true_ids, pred_ids, target_names=list(id2label.values()), zero_division=0))

        # Confusion Matrix
        cm = confusion_matrix(true_ids, pred_ids)
        print("Confusion Matrix:")
        print(cm)

class EmotionIntensityPipeline:
    """
    Orchestrates the workflow of downloading data, fine-tuning the model, and evaluating its performance.
    """
    def __init__(self, drive_file_id):
        """
        Initializes the pipeline with the Google Drive file ID.

        Args:
            drive_file_id (str): The file ID of the RAVDESS zip file in Google Drive.
        """
        self.loader = RAVDESSDatasetLoader(drive_file_id=drive_file_id)
        self.model = None

    def run(self):
        """
        Executes the entire pipeline: download, extract, load, preprocess, fine-tune, and evaluate.
        """
        # Step 1: Load and prepare the dataset
        dataset, label2id, id2label = self.loader.run()

        # Step 2: Initialize the model
        model_name = "ntu-spml/distilhubert"
        self.model = EmotionIntensityModel(model_name=model_name,
                                           num_labels=len(label2id),
                                           label2id=label2id,
                                           id2label=id2label)

        # Step 3: Prepare the data
        dataset = self.model.prepare_data(dataset)

        # Step 4: Define training arguments
        training_args = TrainingArguments(
            output_dir="./results",
            evaluation_strategy="epoch",
            save_strategy="epoch",
            learning_rate=5e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=10,
            weight_decay=0.01,
            seed=42,
            logging_dir='./logs',
            logging_steps=10,
            optim="adamw_torch",
            lr_scheduler_type="linear",
            warmup_ratio=0.1,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
        )

        # Step 5: Fine-tune the model
        trainer = self.model.fine_tune(dataset, training_args)

        # Step 6: Evaluate the model
        self.model.evaluate(trainer, dataset)

        # Step 7: Generate detailed reports
        self.model.generate_reports(trainer, dataset, id2label)

In [4]:
# Main function to execute the pipeline
def main():
    """
    Main function to execute the emotion and intensity classification pipeline.
    """
    # Define the Google Drive file ID for RAVDESS dataset zip file
    # IMPORTANT: Replace 'YOUR_GOOGLE_DRIVE_FILE_ID' with the actual file ID
    ravdess_drive_file_id = '1dSXSY-f6ZigkcJWV07v6kc3r_i3pMPvl'

    # Initialize and run the pipeline
    pipeline = EmotionIntensityPipeline(drive_file_id=ravdess_drive_file_id)
    pipeline.run()

if __name__ == "__main__":
    main()


Mounting Google Drive...
Mounted at /content/drive
Google Drive mounted.
Authenticating and building Google Drive service...
Downloading RAVDESS dataset from Google Drive...
Download 50%.
Download 100%.
Download completed.
Extracting RAVDESS dataset...
Extraction completed.
Total samples after filtering: 1440
Training samples: 1152
Testing samples: 288


Casting the dataset:   0%|          | 0/1152 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/288 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/94.0M [00:00<?, ?B/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at ntu-spml/distilhubert and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1152 [00:00<?, ? examples/s]

Map:   0%|          | 0/288 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5326,2.433527,0.211806,0.184207,0.211806,0.13391
2,2.2355,2.210274,0.232639,0.257697,0.232639,0.163901
3,2.0753,2.068305,0.3125,0.34282,0.3125,0.238724
4,1.9808,1.973743,0.319444,0.275805,0.319444,0.260803
5,1.9772,1.867912,0.388889,0.43286,0.388889,0.359537
6,1.8259,1.785969,0.402778,0.430359,0.402778,0.369989
7,1.7118,1.721651,0.430556,0.432059,0.430556,0.401202
8,1.611,1.662663,0.482639,0.493578,0.482639,0.456109
9,1.5658,1.645638,0.4375,0.424971,0.4375,0.401928
10,1.5843,1.624366,0.496528,0.509311,0.496528,0.476659


Evaluation Results: {'eval_loss': 1.6243656873703003, 'eval_accuracy': 0.4965277777777778, 'eval_precision': 0.5093109483914913, 'eval_recall': 0.4965277777777778, 'eval_f1': 0.47665880698099056, 'eval_runtime': 17.826, 'eval_samples_per_second': 16.156, 'eval_steps_per_second': 2.02, 'epoch': 10.0}

Classification Report:
                  precision    recall  f1-score   support

    angry_normal       0.54      0.37      0.44        19
    angry_strong       0.54      1.00      0.70        19
     calm_normal       0.60      0.32      0.41        19
     calm_strong       0.48      0.80      0.60        20
  disgust_normal       0.67      0.63      0.65        19
  disgust_strong       0.48      0.58      0.52        19
  fearful_normal       0.45      0.25      0.32        20
  fearful_strong       0.61      0.58      0.59        19
    happy_normal       0.44      0.37      0.40        19
    happy_strong       0.57      0.21      0.31        19
  neutral_normal       0.56      0.7