In [9]:
!pip install opensmile
!pip install --upgrade pandas
!apt-get install git-lfs
!git lfs install
!git clone https://github.com/CheyneyComputerScience/CREMA-D.git

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
Git LFS initialized.
fatal: destination path 'CREMA-D' already exists and is not an empty directory.


In [10]:
import os
import pandas as pd
import numpy as np
import opensmile
import audiofile
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [11]:
# AudioProcessor: Handles loading of audio files
class AudioProcessor:
    """
    AudioProcessor handles loading of audio files.
    It extracts audio signals and sampling rates from audio files.
    """

    def __init__(self, file_paths):
        self.file_paths = file_paths  # List of audio file paths

    def load_audio(self, path):
        """
        Loads an audio file and returns the signal and sampling rate.
        """
        try:
            signal, sampling_rate = audiofile.read(path, always_2d=True)
        except Exception as e:
            print(f"Error loading {path}: {str(e)}")
            return None, None
        return signal, sampling_rate

    def batch_load(self):
        """
        Loads all audio files in batch.
        Returns a list of tuples containing the audio signals and sampling rates.
        """
        signals = []
        for path in self.file_paths:
            signal, sampling_rate = self.load_audio(path)
            if signal is not None:
                signals.append((signal, sampling_rate))
        return signals

In [12]:
# FeatureExtractor: Uses OpenSmile to extract features from audio
class FeatureExtractor:
    """
    Extracts features from audio files using OpenSmile.
    """

    def __init__(self):
        self.smile = opensmile.Smile(
            feature_set=opensmile.FeatureSet.eGeMAPSv02,
            feature_level=opensmile.FeatureLevel.Functionals
        )

    def extract_features(self, signal, sampling_rate):
        """
        Extracts features from a single audio signal using OpenSmile.
        """
        features = self.smile.process_signal(signal, sampling_rate)
        return features

    def extract_batch_from_paths(self, paths):
        """
        Extracts features from a list of audio file paths.
        """
        all_features = []
        for path in paths:
            signal, sampling_rate = audiofile.read(path, always_2d=True)
            if signal is not None:
                features = self.extract_features(signal, sampling_rate)
                all_features.append(features)
        return pd.concat(all_features, ignore_index=True)

In [13]:
# EmotionClassifier: Random Forest classifier for emotion classification
class EmotionClassifier:
    """
    A classifier for predicting emotions using a Random Forest model.
    """

    def __init__(self):
        self.model = RandomForestClassifier(random_state=42)
        self.label_encoder = LabelEncoder()
        self.scaler = StandardScaler()

    def train(self, X_train, y_train):
        """
        Trains the emotion classifier using scaled features and encoded labels.
        """
        y_train_encoded = self.label_encoder.fit_transform(y_train)
        X_train_scaled = self.scaler.fit_transform(X_train)
        self.model.fit(X_train_scaled, y_train_encoded)

    def predict(self, X):
        """
        Predicts emotions on new data and returns a list of PredictionResult objects.
        """
        X_scaled = self.scaler.transform(X)
        y_pred_encoded = self.model.predict(X_scaled)
        y_pred = self.label_encoder.inverse_transform(y_pred_encoded)
        y_proba = self.model.predict_proba(X_scaled)
        confidence_levels = y_proba.max(axis=1)
        return [PredictionResult(label, confidence) for label, confidence in zip(y_pred, confidence_levels)]

In [14]:
# PredictionResult: Stores emotion classification results
class PredictionResult:
    """
    Stores the result of an emotion prediction.
    """

    def __init__(self, label, confidence):
        self.label = label  # Predicted emotion label
        self.confidence = round(confidence, 2)  # Confidence score

    def __repr__(self):
        """
        String representation of the prediction result.
        """
        return f"PredictionResult(label={self.label}, confidence={self.confidence:.2f})"

In [15]:
class AudioEmotionDetectionPipeline:
    """
    Manages the workflow:
    - Extracts features using OpenSmile.
    - Trains a model using CREMA-D AudioMP3 files.
    - Predicts emotions on new audio files using the trained model.
    """

    def __init__(self, file_ids):
        self.file_ids = file_ids  # Google Drive audio file IDs
        self.processor = None  # To handle audio file processing
        self.extractor = FeatureExtractor()  # To extract features from audio
        self.classifier = EmotionClassifier()  # Emotion classifier

    def load_crema_d_data(self):
        """
        Loads CREMA-D AudioMP3 dataset, extracting file paths and emotion labels from filenames.
        Returns a DataFrame with file paths and labels.
        """
        audio_dir = './CREMA-D/AudioMP3'
        audio_files = [f for f in os.listdir(audio_dir) if f.endswith('.mp3')]

        # Extract emotion labels from filenames
        emotions = {
            'ANG': 'Anger',
            'DIS': 'Disgust',
            'FEA': 'Fear',
            'HAP': 'Happiness',
            'NEU': 'Neutral',
            'SAD': 'Sadness'
        }

        file_paths = []
        labels = []

        for file in audio_files:
            emotion_code = file.split('_')[2]  # The third part of the filename contains the emotion code
            if emotion_code in emotions:
                file_paths.append(os.path.join(audio_dir, file))
                labels.append(emotions[emotion_code])

        return pd.DataFrame({'Path': file_paths, 'Label': labels})

    def download_and_extract_features(self):
        """
        Downloads audio files from Google Drive and extracts features.
        Returns a DataFrame with extracted features.
        """
        file_paths = self.download_files_from_drive(self.file_ids)
        self.processor = AudioProcessor(file_paths)
        return self.extractor.extract_batch_from_paths(file_paths)

    def download_files_from_drive(self, file_ids):
        """
        Downloads files from Google Drive using file IDs.
        Returns a list of file paths.
        """
        auth.authenticate_user()
        gauth = GoogleAuth()
        gauth.credentials = GoogleCredentials.get_application_default()
        drive = GoogleDrive(gauth)

        file_paths = []
        for filename, file_id in file_ids.items():
            downloaded = drive.CreateFile({'id': file_id})
            downloaded.GetContentFile(filename)
            file_paths.append(filename)
            print(f"{filename} downloaded")
        return file_paths

    def train_classifier(self):
        """
        Trains the emotion classifier using CREMA-D dataset.
        """
        crema_d_data = self.load_crema_d_data()
        X_train, X_test, y_train, y_test = train_test_split(
            crema_d_data['Path'], crema_d_data['Label'], test_size=0.2, random_state=42)

        # Extract features for training and testing
        X_train_features = self.extractor.extract_batch_from_paths(X_train)
        X_test_features = self.extractor.extract_batch_from_paths(X_test)

        print(f"Shape of training features: {X_train_features.shape}")
        print(f"Shape of testing features: {X_test_features.shape}")

        self.classifier.train(X_train_features, y_train)

        # Evaluate model performance
        y_test_pred = self.classifier.predict(X_test_features)
        print("Model evaluation on test set:")
        print(classification_report(y_test, [result.label for result in y_test_pred]))

        cm = confusion_matrix(y_test, [result.label for result in y_test_pred])
        print("Confusion Matrix:")
        print(cm)

    def run(self):
        """
        Runs the entire pipeline and returns predictions.
        """
        # Train classifier and predict on new audio files
        self.train_classifier()
        audio_features = self.download_and_extract_features()

        # Predict on new audio files
        predictions = self.classifier.predict(audio_features)

        # Convert predictions to DataFrame
        results_df = pd.DataFrame([{
            "label": pred.label,
            "confidence": pred.confidence
        } for pred in predictions])

        return results_df

In [16]:
# Main function to run the pipeline
def main():
    """
    Main function that runs the entire emotion recognition pipeline.
    """
    # Define Google Drive file IDs (replace with actual file IDs)
    file_ids = {
        'audio1.mp3': '108kPpEQeA_6RkQXmmLWDJXQzdiISlm0r',
        'audio2.mp3': '13O1hKhYl5Uzlb0mIadH5hv5t_zSud664'
    }

    # Create and run the AudioEmotionDetectionPipeline
    pipeline = AudioEmotionDetectionPipeline(file_ids)
    results_df = pipeline.run()

    # Output the results
    print(results_df)


if __name__ == "__main__":
    main()

Shape of training features: (5953, 88)
Shape of testing features: (1489, 88)
Model evaluation on test set:
              precision    recall  f1-score   support

       Anger       0.63      0.72      0.67       239
     Disgust       0.43      0.30      0.35       297
        Fear       0.54      0.38      0.45       238
   Happiness       0.48      0.46      0.47       254
     Neutral       0.43      0.61      0.50       201
     Sadness       0.52      0.63      0.57       260

    accuracy                           0.51      1489
   macro avg       0.51      0.52      0.50      1489
weighted avg       0.50      0.51      0.50      1489

Confusion Matrix:
[[172  15   8  38   5   1]
 [ 29  89  12  37  66  64]
 [ 17  19  90  37  24  51]
 [ 51  26  22 118  29   8]
 [  0  30  12  12 122  25]
 [  3  29  22   4  39 163]]
audio1.mp3 downloaded
audio2.mp3 downloaded
   label  confidence
0  Anger        0.56
1  Anger        0.76
