In [1]:
import opendatasets as od
import pandas as pd
import numpy as np
import os
import librosa
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
import opendatasets as od
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [2]:
print(os.path.exists(os.path.expanduser('~/.kaggle/kaggle.json')))

True


In [3]:
# --- Download Datasets (only needs to be run once) ---
# 1. TESS: The clean speech dataset
od.download("https://www.kaggle.com/datasets/ejlok1/toronto-emotional-speech-set-tess")

# 2. ESC-50: The background noise dataset
od.download("https://www.kaggle.com/datasets/mmoreaux/environmental-sound-classification-50")

# --- Define Paths ---                                                                                                                                      
TESS_PATH = './toronto-emotional-speech-set-tess/TESS Toronto emotional speech set data/'
ESC50_PATH = './environmental-sound-classification-50/audio/audio/'

Skipping, found downloaded files in "./toronto-emotional-speech-set-tess" (use force=True to force download)
Skipping, found downloaded files in "./environmental-sound-classification-50" (use force=True to force download)


In [4]:
import os
import pandas as pd

# --- Define the CORRECTED Path ---
# We start one level higher, so os.walk can explore BOTH subdirectories it found.
TESS_PATH = './toronto-emotional-speech-set-tess/'

# --- Process Clean Speech (TESS) ---
tess_data = []
# This loop will now automatically search inside both 'TESS...' and 'tess...' folders
for dirpath, _, filenames in os.walk(TESS_PATH):
    for filename in filenames:
        if filename.endswith('.wav'):
            # The rest of your logic for parsing the filename is perfect
            try:
                emotion = filename.split('_')[2].split('.')[0].lower()
                if emotion == 'ps':
                    emotion = 'surprised'
                
                # We filter to keep a balanced set
                if emotion in ['angry', 'calm', 'happy', 'sad', 'neutral', 'surprised']:
                     tess_data.append({
                        "path": os.path.join(dirpath, filename),
                        "emotion": emotion
                    })
            except IndexError:
                # This will safely skip any .wav files that don't match the expected naming format
                print(f"Skipping file with unexpected format: {filename}")


tess_df = pd.DataFrame(tess_data)
print(f"Successfully found and processed {len(tess_df)} audio files.\n")

print("Clean Speech DataFrame:")
# We use .value_counts() which is the correct pandas function
print(tess_df['emotion'].value_counts())
print("\n")

Successfully found and processed 4000 audio files.

Clean Speech DataFrame:
emotion
surprised    800
sad          800
angry        800
happy        800
neutral      800
Name: count, dtype: int64




In [5]:
noise_paths = [
    os.path.join(ESC50_PATH, f)
    for f in os.listdir(ESC50_PATH)
    if f.endswith('.wav')
]

print(f"Found {len(noise_paths)} noise files.")

Found 2000 noise files.


In [6]:
import re
import pandas as pd

noise_files = [
    f for f in os.listdir(ESC50_PATH) if f.endswith('.wav')
]

data = []
for f in noise_files:
    class_id = int(re.findall(r'-(\d+)\.wav$', f)[0])
    data.append({"filename": f, "class_id": class_id})

esc50_meta = pd.DataFrame(data)
print(esc50_meta)

               filename  class_id
0      1-208757-B-2.wav         2
1      1-50623-A-15.wav        15
2     3-157615-A-10.wav        10
3      5-172299-A-5.wav         5
4      5-194932-A-7.wav         7
...                 ...       ...
1995  5-189237-A-12.wav        12
1996   2-108763-A-9.wav         9
1997   1-59324-A-21.wav        21
1998   1-46040-A-14.wav        14
1999    1-15689-B-4.wav         4

[2000 rows x 2 columns]


In [7]:
# ... (code to create tess_df) ...
tess_df = pd.DataFrame(tess_data)

# --- ADD THIS DIAGNOSTIC LINE ---
print(f"DataFrame created. Number of audio files found: {len(tess_df)}")

print("Clean Speech DataFrame:")
print(tess_df['emotion'].value_counts())
print("\n")

DataFrame created. Number of audio files found: 4000
Clean Speech DataFrame:
emotion
surprised    800
sad          800
angry        800
happy        800
neutral      800
Name: count, dtype: int64




In [8]:
import os

# Let's list the contents of the main downloaded folder
downloaded_folder = './toronto-emotional-speech-set-tess/'
try:
    print(f"Contents of '{downloaded_folder}':")
    print(os.listdir(downloaded_folder))
except FileNotFoundError:
    print(f"Error: The directory '{downloaded_folder}' does not exist. Please check the download step.")

Contents of './toronto-emotional-speech-set-tess/':
['TESS Toronto emotional speech set data', 'tess toronto emotional speech set data']


In [9]:
def extract_features(file_path, n_mfcc=13):
    """Extracts MFCCs from an audio file."""
    try:
        audio, sample_rate = librosa.load(file_path, res_type='kaiser_fast') 
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
        mfccs_processed = np.mean(mfccs.T, axis=0) # Average across the time dimension
        return mfccs_processed
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Extract features for all clean audio files
tess_df['features'] = [extract_features(path) for path in tqdm(tess_df['path'], desc="Extracting Clean Features")]
tess_df.dropna(inplace=True)
print("Feature extraction complete!")


Extracting Clean Features: 100%|██████████| 4000/4000 [01:15<00:00, 53.14it/s]

Feature extraction complete!





In [10]:
# --- Prepare data for training ---
# We need to make sure there are no missing feature rows
tess_df.dropna(subset=['features'], inplace=True)

X = np.array(tess_df['features'].tolist())
y = np.array(tess_df['emotion'].tolist())

In [11]:
# Encode the string labels (e.g., 'happy', 'sad') into numbers (e.g., 0, 1)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [12]:
# Split the data into 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y)

In [13]:
# Scale the features. This standardizes them (mean=0, variance=1) which helps the model train better.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
# --- Train the Model ---
# We're using a Multi-layer Perceptron (a type of neural network)
model = MLPClassifier(hidden_layer_sizes=(256, 128), 
                      activation='relu', 
                      solver='adam', 
                      max_iter=500, 
                      random_state=42,
                      early_stopping=True,
                      verbose=True) # Set verbose=True to see the training progress

In [15]:
print("\n--- Training Baseline Model on CLEAN Data ---")
model.fit(X_train_scaled, y_train)


--- Training Baseline Model on CLEAN Data ---
Iteration 1, loss = 1.16111686
Validation score: 0.740625
Iteration 2, loss = 0.56170342
Validation score: 0.912500
Iteration 3, loss = 0.33949250
Validation score: 0.921875
Iteration 4, loss = 0.24417481
Validation score: 0.934375
Iteration 5, loss = 0.19357104
Validation score: 0.940625
Iteration 6, loss = 0.16628918
Validation score: 0.940625
Iteration 7, loss = 0.13886411
Validation score: 0.953125
Iteration 8, loss = 0.12303048
Validation score: 0.956250
Iteration 9, loss = 0.10992617
Validation score: 0.965625
Iteration 10, loss = 0.09883202
Validation score: 0.968750
Iteration 11, loss = 0.09332716
Validation score: 0.978125
Iteration 12, loss = 0.08936299
Validation score: 0.975000
Iteration 13, loss = 0.08038318
Validation score: 0.984375
Iteration 14, loss = 0.07418014
Validation score: 0.981250
Iteration 15, loss = 0.06825302
Validation score: 0.981250
Iteration 16, loss = 0.06381636
Validation score: 0.981250
Iteration 17, loss

0,1,2
,hidden_layer_sizes,"(256, ...)"
,activation,'relu'
,solver,'adam'
,alpha,0.0001
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,500
,shuffle,True


In [16]:
# --- Evaluate on CLEAN Data ---
print("\n--- Evaluating Baseline Model on CLEAN Test Data ---")
y_pred_clean = model.predict(X_test_scaled)

print("\nClassification Report (Clean Data):")
print(classification_report(y_test, y_pred_clean, target_names=le.classes_))

# You can also print a confusion matrix to see which emotions are getting mixed up
print("\nConfusion Matrix (Clean Data):")
print(confusion_matrix(y_test, y_pred_clean))


--- Evaluating Baseline Model on CLEAN Test Data ---

Classification Report (Clean Data):
              precision    recall  f1-score   support

       angry       0.99      0.99      0.99       160
       happy       0.94      0.99      0.97       160
     neutral       0.97      1.00      0.98       160
         sad       1.00      0.98      0.99       160
   surprised       1.00      0.93      0.96       160

    accuracy                           0.98       800
   macro avg       0.98      0.98      0.98       800
weighted avg       0.98      0.98      0.98       800


Confusion Matrix (Clean Data):
[[159   1   0   0   0]
 [  1 159   0   0   0]
 [  0   0 160   0   0]
 [  0   0   3 157   0]
 [  0   9   2   0 149]]


In [21]:
import numpy as np
import librosa
from tqdm import tqdm
from sklearn.metrics import classification_report

# --- Robust add_noise function ---
def add_noise(speech_signal, noise_signal, snr_db=5):
    """Mixes a speech signal with a noise signal at a specific SNR."""
    speech_power = np.sum(speech_signal ** 2) / len(speech_signal)
    noise_power = np.sum(noise_signal ** 2) / len(noise_signal)
    
    if noise_power == 0:
        return speech_signal
        
    required_noise_power = speech_power / (10 ** (snr_db / 10))
    scaled_noise = noise_signal * np.sqrt(required_noise_power / (noise_power + 1e-10))
    
    return speech_signal + scaled_noise

# ---  Feature extractor for arrays ---
def extract_features_from_array(audio_array, sample_rate, n_mfcc=13):
    """Extracts MFCCs from an audio data array."""
    try:
        mfccs = librosa.feature.mfcc(y=audio_array, sr=sample_rate, n_mfcc=n_mfcc)
        mfccs_processed = np.mean(mfccs.T, axis=0)
        return mfccs_processed
    except Exception as e:
        print(f"Error processing an audio array: {e}")
        return None

# ---  Retrieve your BEST performing model ---
try:
    best_model = models[best_model_name]
    print(f"🏆 Proceeding with the best model: {best_model_name}\n")
except NameError:
    best_model = model
    best_model_name = "MLP Classifier"
    print("Proceeding with the single MLP Classifier model.\n")


# --- Run the corrected and robust "Chaos" loop ---
X_test_noisy_features = []
y_test_corresponding = []

_, X_test_paths, _, y_test_original = train_test_split(tess_df['path'], y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

for speech_path, true_label in tqdm(zip(X_test_paths, y_test_original), total=len(X_test_paths), desc="Creating Noisy Test Set"):
    speech_audio, sr = librosa.load(speech_path, sr=None)
    noise_path = np.random.choice(noise_paths)
    noise_audio, _ = librosa.load(noise_path, sr=sr)
    
    if len(noise_audio) < len(speech_audio):
        noise_audio = np.tile(noise_audio, int(np.ceil(len(speech_audio)/len(noise_audio))))
    noise_audio = noise_audio[:len(speech_audio)]
    
    noisy_audio = add_noise(speech_audio, noise_audio, snr_db=5)
    
    features = extract_features_from_array(noisy_audio, sr, n_mfcc=13)
    
    if features is not None:
        X_test_noisy_features.append(features)
        y_test_corresponding.append(true_label)

# --- Scale and Evaluate ---
X_test_noisy_scaled = scaler.transform(np.array(X_test_noisy_features))
y_test_final = np.array(y_test_corresponding)

print(f"\n--- Evaluating '{best_model_name}' Model on NOISY Test Data ---")
y_pred_noisy = best_model.predict(X_test_noisy_scaled)

print(f"\nSuccessfully processed {len(y_test_final)} noisy samples out of {len(y_test_original)}.")
print("\nClassification Report (Noisy Data):")
print(classification_report(y_test_final, y_pred_noisy, target_names=le.classes_))

Proceeding with the single MLP Classifier model.



Creating Noisy Test Set: 100%|██████████| 800/800 [00:10<00:00, 77.38it/s] 



--- Evaluating 'MLP Classifier' Model on NOISY Test Data ---

Successfully processed 800 noisy samples out of 800.

Classification Report (Noisy Data):
              precision    recall  f1-score   support

       angry       0.56      0.09      0.15       160
       happy       0.56      0.17      0.27       160
     neutral       0.00      0.00      0.00       160
         sad       0.30      0.63      0.41       160
   surprised       0.28      0.68      0.39       160

    accuracy                           0.31       800
   macro avg       0.34      0.31      0.24       800
weighted avg       0.34      0.31      0.24       800



In [None]:
# The Challenge of MSP-Podcast
# First, understand why this dataset is challenging and therefore impressive to work on:

# Unstructured Audio: It contains pauses, "ums" and "ahs," laughter, music, ads, and multiple people speaking over each other. Your model can't just analyze the raw file; it needs to find the actual speech.

# Variable Quality: Recordings range from professional studio microphones to low-quality laptop mics over a Skype call.

# Continuous Labels: This is the biggest change. Instead of discrete classes like happy or sad, you get three continuous numbers for each segment:

# Valence: How positive or negative the emotion is (sad vs. happy).

# Arousal: The energy level of the emotion (calm vs. excited).

# Dominance: How in-control or submissive the speaker sounds.

In [22]:
# --- Quick Start on MSP-Podcast ---

# Assume you have downloaded the dataset and have the labels.csv file

# 1. Load the labels
labels_df = pd.read_csv('./MSP-Podcast-1.9/labels.csv') # Adjust path if needed

# 2. Let's work with just the first 100 files to build our pipeline
subset_df = labels_df.head(100)

# 3. For now, let's skip VAD and just extract features from the whole file
def extract_features_msp(file_path, n_mfcc=20): # Using more MFCCs is common for complex data
    """Loads and extracts features for MSP-Podcast files."""
    try:
        # Load the audio file
        audio, sr = librosa.load(file_path, sr=16000, duration=5) # Resample to 16kHz and take first 5s
        
        # Extract MFCCs
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
        mfccs_mean = np.mean(mfccs.T, axis=0)
        
        # You can also add more features!
        # chroma = np.mean(librosa.feature.chroma_stft(y=audio, sr=sr).T, axis=0)
        # mel = np.mean(librosa.feature.melspectrogram(y=audio, sr=sr).T, axis=0)
        # You would concatenate these features together
        
        return mfccs_mean
    
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Get the full paths for our subset
subset_df['path'] = subset_df['FileName'].apply(lambda x: os.path.join('./MSP-Podcast-1.9/Audio', x))

# Extract features for the subset
subset_df['features'] = [extract_features_msp(path) for path in tqdm(subset_df['path'], desc="Processing Subset")]
subset_df.dropna(inplace=True)

# 4. Prepare data for a simple regression model
X_subset = np.array(subset_df['features'].tolist())
y_arousal = np.array(subset_df['Label_Arousal'].tolist()) # Let's try to predict Arousal first

# 5. Train a simple regressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train_sub, X_test_sub, y_train_sub, y_test_sub = train_test_split(X_subset, y_arousal, test_size=0.2, random_state=42)

reg_model = RandomForestRegressor(random_state=42)
reg_model.fit(X_train_sub, y_train_sub)
y_pred_sub = reg_model.predict(X_test_sub)

mse = mean_squared_error(y_test_sub, y_pred_sub)
print(f"\nSimple model on subset of MSP-Podcast:")
print(f"Root Mean Squared Error for Arousal: {np.sqrt(mse):.4f}")

FileNotFoundError: [Errno 2] No such file or directory: './MSP-Podcast-1.9/labels.csv'