In [None]:
# Environment setup: install the Python packages imported by the cells below
# (opensmile for feature extraction, xgboost for the classifier) and upgrade pandas.
!pip install opensmile
!pip install --upgrade pandas
!pip install xgboost
# Install and enable Git LFS, then clone the CREMA-D dataset repository into ./CREMA-D.
# NOTE(review): presumably the audio files are stored via Git LFS, hence the LFS setup
# before cloning — confirm against the repository.
!apt-get install git-lfs
!git lfs install
!git clone https://github.com/CheyneyComputerScience/CREMA-D.git

Collecting opensmile
  Downloading opensmile-2.5.0-py3-none-manylinux_2_17_x86_64.whl.metadata (15 kB)
Collecting audobject>=0.6.1 (from opensmile)
  Downloading audobject-0.7.11-py3-none-any.whl.metadata (2.6 kB)
Collecting audinterface>=0.7.0 (from opensmile)
  Downloading audinterface-1.2.2-py3-none-any.whl.metadata (4.1 kB)
Collecting audeer>=1.18.0 (from audinterface>=0.7.0->opensmile)
  Downloading audeer-2.2.0-py3-none-any.whl.metadata (4.1 kB)
Collecting audformat<2.0.0,>=1.0.1 (from audinterface>=0.7.0->opensmile)
  Downloading audformat-1.3.1-py3-none-any.whl.metadata (4.6 kB)
Collecting audiofile>=1.3.0 (from audinterface>=0.7.0->opensmile)
  Downloading audiofile-1.5.0-py3-none-any.whl.metadata (4.9 kB)
Collecting audmath>=1.4.1 (from audinterface>=0.7.0->opensmile)
  Downloading audmath-1.4.1-py3-none-any.whl.metadata (3.6 kB)
Collecting audresample<2.0.0,>=1.1.0 (from audinterface>=0.7.0->opensmile)
  Downloading audresample-1.3.3-py3-none-manylinux_2_17_x86_64.whl.metada

In [None]:
import os
import pandas as pd
import numpy as np
import opensmile
import audiofile
import joblib
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.exceptions import NotFittedError

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials



In [None]:
# AudioProcessor: Handles loading of audio files
class AudioProcessor:
  """
  Reads audio files from disk.

  Holds a collection of file paths and turns each readable file into a
  (signal, sampling_rate) pair.
  """

  def __init__(self, file_paths):
    # Paths of the audio files this processor is responsible for.
    self.file_paths = file_paths

  def load_audio(self, path):
    """
    Read one audio file.

    Returns (signal, sampling_rate) on success. If reading fails for any
    reason, the error is printed and (None, None) is returned instead.
    """
    try:
      waveform, rate = audiofile.read(path, always_2d=True)
    except Exception as err:
      print(f"Error loading {path}: {str(err)}")
      return None, None
    else:
      return waveform, rate

  def batch_load(self):
    """
    Read every file in self.file_paths, in order.

    Returns a list of (signal, sampling_rate) tuples; files that could not
    be read are left out of the result.
    """
    # Lazy generator keeps the read (and any error message) order identical
    # to iterating over the paths one by one.
    attempts = (self.load_audio(candidate) for candidate in self.file_paths)
    return [(waveform, rate) for waveform, rate in attempts if waveform is not None]


# FeatureExtractor: Uses OpenSmile to extract features from audio
class FeatureExtractor:
  """
  Turns audio signals into OpenSmile feature rows.
  """

  def __init__(self):
    # eGeMAPSv02 feature set computed at the Functionals level.
    feature_set = opensmile.FeatureSet.eGeMAPSv02
    feature_level = opensmile.FeatureLevel.Functionals
    self.smile = opensmile.Smile(
        feature_set=feature_set,
        feature_level=feature_level
    )

  def extract_features(self, signal, sampling_rate):
    """
    Run OpenSmile on one in-memory signal and return its result unchanged.
    """
    return self.smile.process_signal(signal, sampling_rate)

  def extract_batch_from_paths(self, paths):
    """
    Read each path, extract its features, and stack the results.

    Returns one concatenated DataFrame with a fresh 0..n-1 index.
    Read errors are not caught here and propagate to the caller.
    """
    frames = []
    for audio_path in paths:
      waveform, rate = audiofile.read(audio_path, always_2d=True)
      if waveform is None:
        continue
      frames.append(self.extract_features(waveform, rate))
    return pd.concat(frames, ignore_index=True)


# EmotionClassifier: XGBoost classifier with RandomizedSearchCV for hyperparameter tuning
class EmotionClassifier:
  """
  XGBoost-based emotion classifier.

  Bundles the model with the StandardScaler, LabelEncoder and training
  feature names so that prediction and persistence always use the same
  preprocessing as training. Hyperparameters are chosen with
  RandomizedSearchCV.
  """

  def __init__(self):
    self.model = XGBClassifier(random_state=42)
    self.label_encoder = LabelEncoder()
    self.scaler = StandardScaler()
    # Set by train()/load_model(); defined here so the attribute always exists.
    self.feature_names = None
    self.is_fitted = False

  def _check_is_fitted(self):
    """
    Raises NotFittedError / ValueError when the classifier cannot be used yet.
    """
    if not self.is_fitted:
      raise NotFittedError("This EmotionClassifier instance is not fitted yet.")

    if not hasattr(self.label_encoder, 'classes_'):
      raise ValueError("LabelEncoder is not fitted yet.")

  def _scale_features(self, X):
    """
    Selects the training columns (in training order) from X and scales them.
    """
    return self.scaler.transform(X[self.feature_names])

  def train(self, X_train, y_train):
    """
    Fits the label encoder, scaler and model on the training data.

    X_train is a DataFrame of features (its columns are remembered as
    feature_names); y_train holds the string labels. Hyperparameters are
    selected with a seeded RandomizedSearchCV, then 3-fold cross-validation
    scores of the selected model are printed.
    """
    y_train_encoded = self.label_encoder.fit_transform(y_train)
    self.feature_names = X_train.columns
    X_train_scaled = self.scaler.fit_transform(X_train)

    # Candidate hyperparameter values (2 * 2 * 3 = 12 combinations)
    param_distributions = {
        'n_estimators': [100, 200],
        'max_depth': [3, 5],
        'learning_rate': [0.01, 0.05, 0.1]
    }

    # Sample 5 of the 12 combinations; random_state makes the sampled
    # candidates (and therefore the chosen model) reproducible between runs.
    randomized_search = RandomizedSearchCV(
        estimator=self.model,
        param_distributions=param_distributions,
        n_iter=5,
        cv=3,
        scoring='accuracy',
        verbose=2,
        n_jobs=-1,
        random_state=42
    )
    randomized_search.fit(X_train_scaled, y_train_encoded)

    # Use the best model from RandomizedSearchCV
    self.model = randomized_search.best_estimator_
    self.is_fitted = True
    print(f"Best parameters found: {randomized_search.best_params_}")

    # Report cross-validation scores of the selected configuration.
    # These are computed on the same data used for selection, so treat them
    # as an optimistic estimate.
    cv_scores = cross_val_score(self.model, X_train_scaled, y_train_encoded, cv=3, scoring='accuracy')
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Mean cross-validation score: {np.mean(cv_scores)}")

  def predict(self, X):
    """
    Predicts class probabilities for each row of X.

    Returns a list with one entry per sample; each entry is a list of
    PredictionResult objects covering every known label, ordered from the
    most to the least probable.

    Raises NotFittedError if the classifier has not been trained or loaded,
    and ValueError if the label encoder has no classes.
    """
    self._check_is_fitted()

    y_proba = self.model.predict_proba(self._scale_features(X))
    y_classes = self.label_encoder.classes_

    all_predictions = []
    for sample_proba in y_proba:
      sorted_indices = np.argsort(-sample_proba)  # Descending probability
      all_predictions.append(
          [PredictionResult(y_classes[idx], sample_proba[idx]) for idx in sorted_indices]
      )

    return all_predictions

  def predict_top_label(self, X):
    """
    Predicts the single most likely label for each row of X.

    Returns the labels decoded back to their original string form.
    Raises NotFittedError / ValueError under the same conditions as predict().
    """
    self._check_is_fitted()

    y_pred_encoded = self.model.predict(self._scale_features(X))
    return self.label_encoder.inverse_transform(y_pred_encoded)

  def save_model(self, filename):
    """
    Saves the model, scaler, label encoder and feature names to a file.

    Raises NotFittedError when there is nothing trained to save, instead of
    writing an unusable file.
    """
    self._check_is_fitted()

    model_data = {
        'model': self.model,
        'scaler': self.scaler,
        'label_encoder': self.label_encoder,
        'feature_names': self.feature_names
    }
    joblib.dump(model_data, filename)
    print(f"Model, scaler, label encoder, and feature names saved to {filename}")

  def load_model(self, filename):
    """
    Loads a previously saved classifier from a file if the file exists.

    When the file is missing, a message is printed and the instance is left
    unchanged (is_fitted stays as it was).
    """
    if os.path.exists(filename):
      # SECURITY: joblib files are pickle-based; loading one executes code
      # embedded in it. Only load files produced by a trusted source.
      model_data = joblib.load(filename)
      self.model = model_data['model']
      self.scaler = model_data['scaler']
      self.label_encoder = model_data['label_encoder']
      self.feature_names = model_data['feature_names']
      self.is_fitted = True
      print("Model, scaler, label encoder, and feature names loaded successfully.")
    else:
      print("Model file not found. Training a new model.")

# PredictionResult: Stores emotion classification results
class PredictionResult:
  """
  One (label, confidence) pair produced by an emotion prediction.
  """

  def __init__(self, label, confidence):
    # label: the predicted emotion label; confidence: its score.
    self.label, self.confidence = label, confidence

  def __repr__(self):
    """
    Readable form showing both the label and the confidence.
    """
    label, confidence = self.label, self.confidence
    return f"PredictionResult(label={label}, confidence={confidence})"


# AudioEmotionDetectionPipeline: Get results
class AudioEmotionDetectionPipeline:
  """
  Manages the workflow:
  - Extracts features using OpenSmile.
  - Trains a model using CREMA-D AudioMP3 files (or reuses a saved model).
  - Predicts emotions on new audio files downloaded from a Google Drive folder.
  """

  # Default location of the cloned CREMA-D audio files.
  CREMA_D_AUDIO_DIR = './CREMA-D/AudioMP3'
  # File used to cache the trained classifier between runs.
  MODEL_FILENAME = 'audio_emotion_classifier.joblib'

  def __init__(self, folder_id):
    self.folder_id = folder_id  # Google Drive folder ID
    self.processor = None  # To handle audio file processing
    self.extractor = FeatureExtractor()  # To extract features from audio
    self.classifier = EmotionClassifier()  # Emotion classifier

  def load_crema_d_data(self, audio_dir=None):
    """
    Builds the training table from the CREMA-D file names.

    File names are split on '_'; the third field is the emotion code and the
    fourth the intensity level code. Files with fewer than four fields or
    with unknown codes are ignored.

    audio_dir: directory containing the .mp3 files; defaults to
    CREMA_D_AUDIO_DIR.

    Returns a DataFrame with columns 'Path' and 'Label', where Label is
    "<Emotion>_<Level>", ordered by file name.
    """
    if audio_dir is None:
      audio_dir = self.CREMA_D_AUDIO_DIR

    # os.listdir returns entries in arbitrary order; sorting keeps the seeded
    # train/test split identical from one machine or run to the next.
    audio_files = sorted(f for f in os.listdir(audio_dir) if f.endswith('.mp3'))

    # Define emotion and level mappings
    emotions = {
        'ANG': 'Anger',
        'DIS': 'Disgust',
        'FEA': 'Fear',
        'HAP': 'Happiness',
        'NEU': 'Neutral',
        'SAD': 'Sadness'
    }

    levels = {
        'LO': 'Low',
        'MD': 'Medium',
        'HI': 'High',
        'XX': 'Unspecified'
    }

    file_paths = []
    labels = []

    for file in audio_files:
      # Drop the extension before splitting so the last field is a clean code.
      parts = os.path.splitext(file)[0].split('_')
      if len(parts) < 4:
        continue

      emotion_code, level_code = parts[2], parts[3]
      if emotion_code in emotions and level_code in levels:
        file_paths.append(os.path.join(audio_dir, file))
        labels.append(f"{emotions[emotion_code]}_{levels[level_code]}")

    print(f"Loaded {len(labels)} labels from the files.")

    return pd.DataFrame({'Path': file_paths, 'Label': labels})


  def download_files_from_drive(self, folder_id):
    """
    Downloads all audio files from a Google Drive folder using the folder ID.

    Files are written to the current working directory.
    Returns the list of local file paths (empty when the folder has no audio).
    """
    # Authenticate and create the PyDrive client
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    drive = GoogleDrive(gauth)

    file_paths = []

    # Query to list all audio files in the specified folder
    query = f"'{folder_id}' in parents and trashed=false and mimeType contains 'audio/'"
    file_list = drive.ListFile({'q': query}).GetList()

    if not file_list:
      print("No audio files found in the folder.")
      return file_paths

    for file in file_list:
      # Drive titles are user-controlled and may contain path separators;
      # keep only the final component so downloads stay in the working
      # directory, falling back to the file ID if nothing is left.
      filename = os.path.basename(file['title']) or file['id']
      downloaded = drive.CreateFile({'id': file['id']})
      downloaded.GetContentFile(filename)
      file_paths.append(filename)
      print(f"{filename} downloaded")
    return file_paths

  def download_and_extract_features(self):
    """
    Downloads audio files from the Google Drive folder and extracts features.

    Returns (features, file_paths), or (None, None) when nothing was downloaded.
    """
    file_paths = self.download_files_from_drive(self.folder_id)
    if not file_paths:
      print("No files to process.")
      return None, None  # Return None if no files are downloaded
    self.processor = AudioProcessor(file_paths)
    features = self.extractor.extract_batch_from_paths(file_paths)
    return features, file_paths

  def train_classifier(self):
    """
    Makes sure self.classifier is ready for prediction.

    A saved model is reused when MODEL_FILENAME exists; the CREMA-D dataset
    is only read when training is actually required. After training, the
    model is saved and evaluated on a held-out 20% split.
    """
    # Try the cached model first so the dataset is not needed when a model
    # has already been trained.
    self.classifier.load_model(self.MODEL_FILENAME)
    if self.classifier.is_fitted:
      return

    crema_d_data = self.load_crema_d_data()

    # Check the size of the dataset before splitting
    print(f"Dataset size before splitting: {crema_d_data.shape}")

    if crema_d_data.empty:
      print("Error: The dataset is empty!")
      return

    X_train, X_test, y_train, y_test = train_test_split(
        crema_d_data['Path'], crema_d_data['Label'], test_size=0.2, random_state=42)

    X_train_features = self.extractor.extract_batch_from_paths(X_train)
    X_test_features = self.extractor.extract_batch_from_paths(X_test)

    self.classifier.train(X_train_features, y_train)

    # Save the trained model
    self.classifier.save_model(self.MODEL_FILENAME)

    # Evaluate model performance
    y_test_pred = self.classifier.predict_top_label(X_test_features)
    print("Model evaluation on test set:")
    print(classification_report(y_test, y_test_pred))

    cm = confusion_matrix(y_test, y_test_pred)
    print("Confusion Matrix:")
    print(cm)


  def run(self):
    """
    Runs the entire pipeline and returns predictions for multiple audio files.

    Returns a DataFrame with one row per (audio file, label) pair and the
    columns audio_file, emotion, level and confidence. An empty DataFrame is
    returned when no model is available or no audio could be downloaded.
    """
    self.train_classifier()

    if not self.classifier.is_fitted:
      # train_classifier() could neither load nor train a model.
      print("No trained model available. Exiting.")
      return pd.DataFrame()

    audio_features, file_paths = self.download_and_extract_features()

    if audio_features is None or file_paths is None:
      print("No audio features to process. Exiting.")
      return pd.DataFrame()  # Return empty DataFrame if no features

    # Predict on new audio files
    all_predictions = self.classifier.predict(audio_features)

    # One output row per candidate label of every audio file
    results = []
    for audio_path, sample_predictions in zip(file_paths, all_predictions):
      audio_file = os.path.basename(audio_path)
      for pred in sample_predictions:
        # Labels have the form "<Emotion>_<Level>"
        emotion, level = pred.label.split('_', 1)
        results.append({
            "audio_file": audio_file,
            "emotion": emotion,
            "level": level,
            "confidence": pred.confidence
        })
    return pd.DataFrame(results)


In [None]:
# Main function to run the pipeline
def main(folder_id='1DR1Br06XTjOyn-n4ka_00RLT3HNqYtUi'):
  """
  Runs the entire emotion recognition pipeline and prints the predictions.

  folder_id: ID of the Google Drive folder holding the audio files to
  classify.
  """
  # Create and run the AudioEmotionDetectionPipeline
  audio_pipeline = AudioEmotionDetectionPipeline(folder_id)
  audio_results_df = audio_pipeline.run()

  if audio_results_df.empty:
    print("No results to display.")
    return

  # Show every row and column for this printout only; option_context restores
  # the previous display settings so later cells are not affected.
  with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(audio_results_df)


# Entry point: call main() when this code runs as the top-level module
# (__name__ == "__main__"), but not when it is imported.
if __name__ == "__main__":
  main()

Loaded 7442 labels from the files.
Dataset size before splitting: (7442, 2)
Model, scaler, label encoder, and feature names loaded successfully.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


03-01-08-02-02-02-01.wav downloaded
03-01-06-02-02-02-01.wav downloaded
03-01-06-02-01-02-01.wav downloaded
03-01-08-01-01-02-01.wav downloaded
03-01-08-02-02-01-01.wav downloaded
03-01-06-01-02-02-01.wav downloaded
03-01-07-01-01-02-01.wav downloaded
03-01-06-02-01-01-01.wav downloaded
03-01-08-01-02-01-01.wav downloaded
03-01-08-02-01-01-01.wav downloaded
03-01-07-02-01-02-01.wav downloaded
03-01-05-02-02-01-01.wav downloaded
03-01-07-02-02-01-01.wav downloaded
03-01-06-01-02-01-01.wav downloaded
03-01-05-01-01-02-01.wav downloaded
03-01-05-02-01-02-01.wav downloaded
03-01-07-01-02-02-01.wav downloaded
03-01-08-01-02-02-01.wav downloaded
03-01-06-01-01-01-01.wav downloaded
03-01-08-02-01-02-01.wav downloaded
03-01-06-02-02-01-01.wav downloaded
03-01-05-01-02-02-01.wav downloaded
03-01-07-01-02-01-01.wav downloaded
03-01-05-02-02-02-01.wav downloaded
03-01-07-02-01-01-01.wav downloaded
03-01-07-01-01-01-01.wav downloaded
03-01-06-01-01-02-01.wav downloaded
03-01-07-02-02-02-01.wav dow