In [1]:
# Install the notebook's runtime dependencies via shell escapes.
# No versions are pinned; `--upgrade` moves transformers to the newest available release.
!pip install --upgrade transformers
!pip install torchaudio
!pip install librosa
!pip install scikit-learn
# PyDrive supplies the `pydrive` package imported in the next cell.
!pip install PyDrive



In [2]:
import os
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from transformers import pipeline
from google.colab import auth
from googleapiclient.discovery import build
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.credentials import Credentials
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from oauth2client.client import GoogleCredentials
import zipfile



In [3]:
# PredictionResult: value holder for one emotion classification outcome
class PredictionResult:
    """
    Holds the outcome of a single emotion prediction.

    Attributes:
        label: The predicted emotion label.
        confidence: The confidence score attached to that label.
    """
    def __init__(self, label, confidence):
        # Both values are stored exactly as supplied by the caller.
        self.label, self.confidence = label, confidence

    def __repr__(self):
        """
        Return a readable ``PredictionResult(label=..., confidence=...)`` string.
        """
        return "PredictionResult(label={}, confidence={})".format(self.label, self.confidence)

# EmotionClassifier: Uses pretrained model via pipeline
class EmotionClassifier:
    """
    Speech-emotion classifier backed by a Hugging Face ``audio-classification`` pipeline.

    By default it loads ``pollner/distilhubert-finetuned-ravdess`` (DistilHuBERT
    fine-tuned on RAVDESS).

    Attributes:
        classifier: The loaded pipeline callable, or ``None`` when loading failed.
        label_mapping: Maps lower-cased model labels to the canonical emotion
            names reported by :meth:`predict`.
    """

    # Model loaded when the caller does not name one explicitly.
    DEFAULT_MODEL = "pollner/distilhubert-finetuned-ravdess"

    def __init__(self, model_name=DEFAULT_MODEL):
        """
        Load the audio-classification pipeline and set up the label mapping.

        Args:
            model_name: Model identifier passed to ``pipeline``. The default label
                mapping is written for the default model's label set.

        A loading failure is reported on stdout and leaves ``self.classifier``
        set to ``None`` instead of raising, so that :meth:`predict` can report it.
        """
        # Load the model through the pipeline helper.
        try:
            self.classifier = pipeline(task="audio-classification", model=model_name)
            print("Model loaded successfully.")
        except Exception as e:
            print(f"Error loading model: {e}")
            self.classifier = None
        # Raw model label -> canonical emotion name.
        self.label_mapping = {
            'angry': 'angry',
            'calm': 'calm',
            'disgust': 'disgust',
            'fear': 'fearful',  # canonical name differs from the raw label
            'happy': 'happy',
            'neutral': 'neutral',
            'sad': 'sad',
            'surprise': 'surprised'  # canonical name differs from the raw label
        }

    def _canonical_label(self, raw_label):
        """Translate a raw model label into its canonical emotion name, or 'unknown'."""
        key = raw_label.lower()
        # Look up the raw label directly. Renaming it before the lookup (as the
        # previous code did) made 'fear' and 'surprise' miss the mapping and
        # come back as 'unknown'.
        if key in self.label_mapping:
            return self.label_mapping[key]
        # Accept labels that are already canonical (e.g. 'fearful', 'surprised').
        if key in self.label_mapping.values():
            return key
        return 'unknown'

    def predict(self, audio_file_path):
        """
        Predicts emotion for a single audio file.

        Args:
            audio_file_path: Path of the audio file handed to the pipeline.

        Returns:
            ``None`` when the model failed to load; otherwise a PredictionResult.
            If inference raises, the result is ``PredictionResult("unknown", 0.0)``.
        """
        if self.classifier is None:
            print("Classifier not initialized.")
            return None
        try:
            # Run inference through the pipeline.
            predictions = self.classifier(audio_file_path)
            # Pick the highest-scoring candidate explicitly rather than relying
            # on the order of the returned list.
            top_prediction = max(predictions, key=lambda candidate: candidate['score'])
            emotion = self._canonical_label(top_prediction['label'])
            confidence = top_prediction['score']
            # Wrap the outcome in a PredictionResult object.
            pred_result = PredictionResult(emotion, confidence)
            print(f"Processed {audio_file_path}: Predicted emotion={emotion}, confidence={confidence:.4f}")
            return pred_result
        except Exception as e:
            # Best-effort: report the failure and return a sentinel result.
            print(f"Error processing {audio_file_path}: {e}")
            return PredictionResult("unknown", 0.0)

# AudioEmotionDetectionPipeline: Manages the workflow for a single file
class AudioEmotionDetectionPipeline:
    """
    Runs emotion recognition on one audio file and tabulates the outcome.

    Attributes:
        audio_file_path: Path of the audio file to analyse.
        classifier: Object whose ``predict(path)`` returns a result exposing
            ``label`` and ``confidence`` attributes, or ``None``.
    """
    def __init__(self, audio_file_path, classifier=None):
        """
        Args:
            audio_file_path: Path of the single audio file to process.
            classifier: Optional ready-made classifier to reuse. When omitted, a
                new EmotionClassifier is created, which loads the model again.
        """
        self.audio_file_path = audio_file_path  # path of the single audio file
        # Reuse the caller's classifier when given, so that several pipelines can
        # share one loaded model instead of each reloading it.
        self.classifier = classifier if classifier is not None else EmotionClassifier()

    def run(self):
        """
        Run the pipeline.

        Returns:
            A one-row DataFrame with the columns ``audio_file``,
            ``predicted_emotion`` and ``confidence``, or ``None`` when the
            classifier returned no prediction.
        """
        # Predict the emotion for the audio file.
        prediction_result = self.classifier.predict(self.audio_file_path)

        if prediction_result is None:
            print("No prediction result.")
            return None

        # Only the file name (not the full path) is recorded in the table.
        results = [{
            "audio_file": os.path.basename(self.audio_file_path),
            "predicted_emotion": prediction_result.label,
            "confidence": prediction_result.confidence
        }]

        # Convert the record into a DataFrame.
        return pd.DataFrame(results)

# Helper that fetches a single file from Google Drive
def download_audio_from_drive(file_id, destination):
    """
    Download the Google Drive file identified by ``file_id`` and write it to ``destination``.

    Args:
        file_id: Google Drive ID of the file to fetch.
        destination: Local path the file content is written to.
    """
    # Authenticate the Colab user, then hand the application-default
    # credentials to PyDrive.
    auth.authenticate_user()
    drive_auth = GoogleAuth()
    drive_auth.credentials = GoogleCredentials.get_application_default()

    # Build a handle for the remote file and pull its content to disk.
    remote_file = GoogleDrive(drive_auth).CreateFile({'id': file_id})
    print("Downloading file...")
    remote_file.GetContentFile(destination)
    print("Download completed.")

# Entry point that drives the pipeline
def main():
    """
    Run the emotion-recognition pipeline for every configured audio file.

    For each entry the file is downloaded from Google Drive, classified, printed,
    and the one-row result table is written to
    ``<file_name>_emotion_recognition_results.csv``.
    """
    # Local file name -> Google Drive file ID.
    audio_file_ids = {'audio1.mp3': '108kPpEQeA_6RkQXmmLWDJXQzdiISlm0r'}

    for file_name in audio_file_ids:
        # Fetch the audio file under its local name.
        download_audio_from_drive(audio_file_ids[file_name], file_name)

        # Build a pipeline for this file and run it.
        results = AudioEmotionDetectionPipeline(file_name).run()

        if results is None:
            print(f"No results to display for {file_name}.")
            continue

        # Show the result table.
        print(f"\nPrediction Results for {file_name}:")
        print(results.head())

        # Persist the result table as CSV.
        results.to_csv(f'{file_name}_emotion_recognition_results.csv', index=False)

if __name__ == "__main__":
    main()

Downloading file...
Download completed.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/94.8M [00:00<?, ?B/s]

Some weights of the model checkpoint at pollner/distilhubert-finetuned-ravdess were not used when initializing HubertForSequenceClassification: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at pollner/distilhubert-finetuned-ravdess and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_em

preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

Model loaded successfully.
Processed audio1.mp3: Predicted emotion=neutral, confidence=0.4721

Prediction Results for audio1.mp3:
   audio_file predicted_emotion  confidence
0  audio1.mp3           neutral    0.472109
