In [1]:
# Install the third-party packages used by this notebook. `%pip` is used instead of
# `!pip` so the packages are installed into the environment of the running kernel.
%pip install transformers datasets torchaudio librosa pandas

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K  

In [2]:
from transformers import AutoModelForAudioClassification, Wav2Vec2FeatureExtractor
import torch
import os
import librosa
import pandas as pd
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials



In [3]:
class PretrainedEmotionModel:
    """
    Use a pre-trained audio-classification checkpoint from Hugging Face (DistilHuBERT
    in this notebook) for emotion classification.

    The compute device is chosen once at construction time (CUDA when available,
    otherwise CPU) and the model is moved there a single time, so repeated calls to
    `predict_all_labels` do not redo that work.
    """

    # Sampling rate (Hz) that every audio file is resampled to when it is loaded.
    TARGET_SAMPLING_RATE = 16000

    def __init__(self, model_name):
        """
        Load the feature extractor and the classification model.

        Args:
            model_name: Checkpoint identifier passed to `from_pretrained`.
        """
        # Load the pre-trained model and its matching feature extractor
        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
        self.model = AutoModelForAudioClassification.from_pretrained(model_name)

        # Select the device and move the model once, instead of on every prediction
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def predict_all_labels(self, audio_file_path):
        """
        Predict all possible emotion labels with their respective confidence scores.

        Args:
            audio_file_path: Path of the audio file to classify.

        Returns:
            A pandas DataFrame with one row per label and the columns
            'audio_file' (base name of the input path), 'emotion' (capitalized
            label name) and 'confidence' (softmax probability), sorted by
            confidence in descending order with a fresh 0..n-1 index.
        """
        # Load the audio, resampled to the target rate, and turn it into model inputs
        speech_array, sampling_rate = librosa.load(
            audio_file_path, sr=self.TARGET_SAMPLING_RATE
        )
        inputs = self.feature_extractor(
            speech_array, return_tensors="pt", sampling_rate=sampling_rate, padding=True
        )

        # The model already lives on self.device; only the inputs need to be moved
        inputs = {key: value.to(self.device) for key, value in inputs.items()}

        # Make predictions
        with torch.no_grad():
            logits = self.model(**inputs).logits
            # squeeze(0) removes only the batch dimension. A bare squeeze() would also
            # collapse the label dimension when the model has a single label, leaving
            # a 0-d array that cannot be iterated below.
            probabilities = torch.softmax(logits, dim=-1).squeeze(0).cpu().numpy()

        # Map label IDs to emotion labels
        id2label = self.model.config.id2label
        file_name = os.path.basename(audio_file_path)

        # Collect every label together with its confidence score
        results = [
            {
                'audio_file': file_name,
                'emotion': id2label[label_id].capitalize(),
                'confidence': confidence
            }
            for label_id, confidence in enumerate(probabilities)
        ]

        # Sort results by confidence in descending order
        df = pd.DataFrame(results)
        df = df.sort_values(by='confidence', ascending=False).reset_index(drop=True)

        return df


class EmotionPipeline:
    """
    Orchestrates the workflow of downloading audio data from Google Drive and performing emotion classification.
    """
    def __init__(self, model_name="pollner/distilhubert-finetuned-ravdess"):
        """
        Build the pipeline around a pre-trained emotion model.

        Args:
            model_name: Checkpoint identifier forwarded to PretrainedEmotionModel.
        """
        self.model_name = model_name
        self.model = PretrainedEmotionModel(self.model_name)

    def authenticate_and_create_drive(self):
        """
        Authenticates the user and creates a PyDrive GoogleDrive instance.

        Returns:
            A GoogleDrive object whose GoogleAuth carries the application-default
            credentials obtained after `auth.authenticate_user()`.
        """
        auth.authenticate_user()
        gauth = GoogleAuth()
        gauth.credentials = GoogleCredentials.get_application_default()
        drive = GoogleDrive(gauth)
        return drive

    def download_audio_from_drive(self, drive, audio_file_id, destination_path):
        """
        Downloads an audio file from Google Drive using its file ID.

        Args:
            drive: Drive client providing `CreateFile`.
            audio_file_id: Google Drive ID of the file to fetch.
            destination_path: Local path the file content is written to.
        """
        print(f"Downloading audio file from Google Drive with file ID: {audio_file_id}")
        downloaded = drive.CreateFile({'id': audio_file_id})
        downloaded.GetContentFile(destination_path)
        print(f"Downloaded audio file and saved as {destination_path}")

    def load_and_predict(self, audio_file_ids):
        """
        Downloads the audio files using their file IDs and predicts all possible labels.

        Each prediction is printed as before and is additionally collected so the
        caller can keep working with it.

        Args:
            audio_file_ids: Mapping of local file name -> Google Drive file ID.

        Returns:
            A dict mapping each file name to the result of
            `self.model.predict_all_labels` for that file, in input order.
            Empty when `audio_file_ids` is empty.
        """
        # Nothing to download: skip the authentication round-trip entirely
        if not audio_file_ids:
            return {}

        drive = self.authenticate_and_create_drive()

        predictions = {}
        for audio_file_name, audio_file_id in audio_file_ids.items():
            destination_path = f"./{audio_file_name}"
            self.download_audio_from_drive(drive, audio_file_id, destination_path)

            # Perform prediction using the pre-trained model
            result_df = self.model.predict_all_labels(destination_path)
            print(result_df)
            predictions[audio_file_name] = result_df

        return predictions

In [4]:
# Main function to execute the pipeline
def main(audio_file_ids=None, audio_model_name="pollner/distilhubert-finetuned-ravdess"):
    """
    Main function to execute the emotion classification pipeline using a pre-trained model from Hugging Face.

    Args:
        audio_file_ids: Optional mapping of local file name -> Google Drive file ID.
            When omitted, the two sample recordings of this notebook are used.
        audio_model_name: Checkpoint identifier handed to EmotionPipeline.

    Returns:
        Whatever `EmotionPipeline.load_and_predict` returns for the given files.
    """
    # Fall back to the notebook's sample files when the caller supplies none
    if audio_file_ids is None:
        audio_file_ids = {
            'audio1.mp3': '108kPpEQeA_6RkQXmmLWDJXQzdiISlm0r',
            'audio2.mp3': '13O1hKhYl5Uzlb0mIadH5hv5t_zSud664'
        }

    # Initialize the pipeline with the pre-trained model
    emotion_pipeline = EmotionPipeline(model_name=audio_model_name)

    # Download the audio files and perform emotion prediction
    return emotion_pipeline.load_and_predict(audio_file_ids)


# Entry point: run the pipeline when this code executes as the top-level program
# (__name__ is "__main__"); nothing runs if the code is merely imported.
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/94.8M [00:00<?, ?B/s]

Some weights of the model checkpoint at pollner/distilhubert-finetuned-ravdess were not used when initializing HubertForSequenceClassification: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at pollner/distilhubert-finetuned-ravdess and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_em

Downloading audio file from Google Drive with file ID: 108kPpEQeA_6RkQXmmLWDJXQzdiISlm0r
Downloaded audio file and saved as ./audio1.mp3
   audio_file    emotion  confidence
0  audio1.mp3    Neutral    0.606270
1  audio1.mp3        Sad    0.158477
2  audio1.mp3      Happy    0.151101
3  audio1.mp3       Calm    0.070716
4  audio1.mp3    Disgust    0.005959
5  audio1.mp3      Angry    0.003035
6  audio1.mp3  Surprised    0.002574
7  audio1.mp3    Fearful    0.001867
Downloading audio file from Google Drive with file ID: 13O1hKhYl5Uzlb0mIadH5hv5t_zSud664
Downloaded audio file and saved as ./audio2.mp3
   audio_file    emotion  confidence
0  audio2.mp3       Calm    0.949149
1  audio2.mp3        Sad    0.038235
2  audio2.mp3    Neutral    0.007554
3  audio2.mp3    Disgust    0.004204
4  audio2.mp3    Fearful    0.000314
5  audio2.mp3  Surprised    0.000238
6  audio2.mp3      Angry    0.000162
7  audio2.mp3      Happy    0.000145
