In [1]:
import os

upload_dir = "emovdb_subset"
os.makedirs(upload_dir, exist_ok=True)

# Step 2: List all the .wav files we’ve uploaded
for file in os.listdir(upload_dir):
    if file.endswith(".wav"):
        print(file)



anger_1-28_0012.wav
amused_141_168_0152.wav
anger_29-56_0029.wav
anger_1-28_0011.wav
anger_29-56_0046.wav
anger_1-28_0023.wav
anger_29-56_0037.wav
anger_29-56_0034.wav
amused_1-15_0013.wav
anger_29-56_0040.wav
disgust_141-168_0158.wav
anger_1-28_0014.wav
anger_29-56_0047.wav
anger_29-56_0045.wav
anger_29-56_0039.wav
anger_1-28_0025.wav
anger_29-56_0035.wav
anger_1-28_0017.wav
sleepiness_57-84_0061.wav
amused_29-45_0044.wav
anger_1-28_0027.wav
anger_1-28_0013.wav
disgust_533-560_0540.wav
anger_29-56_0048.wav
anger_1-28_0028.wav
anger_1-28_0022.wav
anger_367-392_0386.wav
anger_29-56_0036.wav
anger_1-28_0020.wav
anger_29-56_0043.wav
anger_29-56_0031.wav
anger_169-196_0195.wav
anger_29-56_0033.wav
disgust_253-280_0256.wav
anger_57-84_0078.wav
anger_1-28_0024.wav
anger_1-28_0010.wav
amused_57-84_0077.wav
anger_29-56_0038.wav
sleepiness_1-28_0014.wav
anger_29-56_0042.wav
anger_1-28_0018.wav
anger_29-56_0044.wav
anger_29-56_0030.wav
anger_1-28_0015.wav
anger_1-28_0009.wav
anger_29-56_0041.wav

In [2]:
#Convert all uploaded audio to 22kHz mono for training
import librosa
import soundfile as sf

# Output directory to save cleaned audio
processed_dir = "emovdb_processed"
os.makedirs(processed_dir, exist_ok=True)

# Loop through and convert each .wav file
for file in os.listdir(upload_dir):
    if file.endswith(".wav"):
        # Load audio at 22,050 Hz
        y, sr = librosa.load(os.path.join(upload_dir, file), sr=22050)

        # Save the processed audio
        output_path = os.path.join(processed_dir, file)
        sf.write(output_path, y, 22050)

print("Audio preprocessing complete. Processed files saved in:", processed_dir)


Audio preprocessing complete. Processed files saved in: emovdb_processed


In [3]:
# Step 4: Create a metadata file for training
metadata_path = "emovdb_metadata.csv"

# Used a dummy text for now (then update later with real transcript)
default_text = "This is an emotional voice sample."

# Open file for writing
with open(metadata_path, "w") as f:
    for file in os.listdir(processed_dir):
        if file.endswith(".wav"):
            # Format: filename|text
            f.write(f"{file}|{default_text}")

print("Metadata file created at:", metadata_path)


Metadata file created at: emovdb_metadata.csv


In [5]:
# Step 5: Install Coqui TTS and espeak-ng
!pip install TTS
!apt-get install espeak-ng
!pip install scipy numpy==1.24.4


Collecting TTS
  Downloading TTS-0.22.0-cp311-cp311-manylinux1_x86_64.whl.metadata (21 kB)
Collecting anyascii>=0.3.0 (from TTS)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pysbd>=0.3.4 (from TTS)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting pandas<2.0,>=1.4 (from TTS)
  Downloading pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting trainer>=0.0.32 (from TTS)
  Downloading trainer-0.0.36-py3-none-any.whl.metadata (8.1 kB)
Collecting coqpit>=0.0.16 (from TTS)
  Downloading coqpit-0.0.17-py3-none-any.whl.metadata (11 kB)
Collecting pypinyin (from TTS)
  Downloading pypinyin-0.54.0-py2.py3-none-any.whl.metadata (12 kB)
Collecting hangul-romanize (from TTS)
  Downloading hangul_romanize-0.1.0-py3-none-any.whl.metadata (1.2 kB)
Collecting gruut==2.2.3 (from gruut[de,es,fr]==2.2.3->TTS)
  Downloading gruut-2.2.3.tar.gz (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0
The following NEW packages will be installed:
  espeak-ng espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0
0 upgraded, 5 newly installed, 0 to remove and 35 not upgraded.
Need to get 4,526 kB of archives.
After this operation, 11.9 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libpcaudio0 amd64 1.1-6build2 [8,956 B]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 libsonic0 amd64 0.2.0-11build1 [10.3 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 espeak-ng-data amd64 1.50+dfsg-10ubuntu0.1 [3,956 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libespeak-ng1 amd64 1.50+dfsg-10ubuntu0.1 [207 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 espeak-ng amd64 1.50+dfsg-1

In [4]:
#Use a pre-trained expressive TTS model
from TTS.api import TTS

# Load a pre-trained expressive English voice model
# progress_bar and gpu are fine as they are
tts = TTS(model_name="tts_models/en/vctk/vits", progress_bar=False, gpu=False)

# Text you want to convert to audio
demo_text = "I'm really excited to demonstrate my project!"
output_path = "demo_output.wav"

# Generate the speech and save it
# Added the 'speaker' argument with a valid VCTK speaker ID=p280'
tts.tts_to_file(text=demo_text, file_path=output_path, speaker="p280")

print("Synthesis complete. Audio saved to:", output_path)

from IPython.display import Audio
Audio("demo_output.wav")

 > Downloading model to /root/.local/share/tts/tts_models--en--vctk--vits
 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > initialization of speaker-embedding layers.
 > Text s

In [5]:

import os
import librosa
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import joblib

# Step 4: Create a metadata file for training
metadata_path = "emovdb_metadata.csv"

# Used a dummy text for now (then update later with real transcript)
default_text = "This is an emotional voice sample."

# Open file for writing
with open(metadata_path, "w") as f:
    # Write the header row
    f.write("file|text") # Added header and newline

    for file in os.listdir(processed_dir):
        if file.endswith(".wav"):
            # Format: filename|text
            # Added newline character at the end of each data line
            f.write(f"{file}|{default_text}")

print("Metadata file created at:", metadata_path)

Metadata file created at: emovdb_metadata.csv


In [6]:
# Define folder containing your .wav files
audio_folder = "emovdb_subset"

# Rebuild metadata from filenames
metadata = []
for file in os.listdir(audio_folder):
    if file.endswith(".wav"):
        emotion = file.split("_")[0]
        metadata.append([file, emotion])

metadata_df = pd.DataFrame(metadata, columns=["file", "emotion"])
metadata_df.to_csv("fixed_metadata.csv", index=False)

print("Metadata created:")
print(metadata_df.head())





Metadata created:
                      file emotion
0      anger_1-28_0012.wav   anger
1  amused_141_168_0152.wav  amused
2     anger_29-56_0029.wav   anger
3      anger_1-28_0011.wav   anger
4     anger_29-56_0046.wav   anger


In [10]:
def extract_features(audio_path, n_mfcc=13):
    try:
        y, sr = librosa.load(audio_path, sr=None)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        return np.mean(mfccs.T, axis=0)
    except Exception as e:
        print(f"Error loading or processing {audio_path}: {e}")
        return None

# Define folder containing your .wav files
audio_folder = "emovdb_subset"

metadata_df = pd.read_csv("fixed_metadata.csv")
failed_files = []
data = []

for index, row in metadata_df.iterrows():
    file_name = row['file']
    emotion = row['emotion']
    path = os.path.join(audio_folder, file_name)

    if not os.path.exists(path):
        print(f"File not found: {path}")
        failed_files.append(file_name)
        continue

    try:
        features = extract_features(path)
        if features.shape[0] != 13:  # ensure 13 MFCCs returned
            print(f"Skipping {file_name} due to bad MFCC shape: {features.shape}")
            failed_files.append(file_name)
            continue
        data.append([*features, emotion])
    except Exception as e:
        print(f"Error processing {file_name}: {e}")
        failed_files.append(file_name)

# Build DataFrame again
feature_names = [f'mfcc_{i+1}' for i in range(13)]
df = pd.DataFrame(data, columns=feature_names + ['emotion'])

print(f"Final usable samples: {len(df)}")
df.head()


Final usable samples: 53


Unnamed: 0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,emotion
0,-364.279083,67.15461,-19.59071,10.659934,-5.175036,-0.524392,-2.882263,0.102238,-1.297441,-6.315584,-1.381441,-0.587236,-7.845033,anger
1,-318.937134,80.976425,-6.455454,1.438914,-1.305544,-1.114751,2.763088,4.461967,-4.518803,-6.504177,7.891774,-5.261915,-7.389937,amused
2,-289.230988,95.504608,-14.785392,-4.249517,-2.013148,0.292316,0.913888,-6.069655,-6.947056,-11.075929,0.988916,-2.16588,-12.349177,anger
3,-400.8992,97.871979,-22.759382,-5.800097,1.117949,-3.989106,-5.514225,-1.153582,-0.317833,-9.353425,-0.530906,1.036237,-7.428967,anger
4,-331.234344,65.648224,-8.97658,5.962227,4.254489,-2.02821,3.072378,1.15575,-4.926527,-11.725599,6.167563,-0.103482,-9.114613,anger


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import joblib

# Split features and target
X = df.drop('emotion', axis=1)
y = df['emotion']

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Save label encoder
joblib.dump(le, 'label_encoder.pkl')

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42,)

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)
print("Random Forest Report:")
print(classification_report(y_test, rf_preds, zero_division=0))

# Train Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)
print("Logistic Regression Report:")
print(classification_report(y_test, lr_preds, zero_division=0))



Random Forest Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.91      1.00      0.95        10

    accuracy                           0.91        11
   macro avg       0.45      0.50      0.48        11
weighted avg       0.83      0.91      0.87        11

Logistic Regression Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00        10

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11



In [12]:

# Save the better-performing model: Logistic Regression
joblib.dump(lr_model, 'emotion_model.pkl')
print("Logistic Regression model saved as emotion_model.pkl")

Logistic Regression model saved as emotion_model.pkl


In [13]:
import os

# See if audio files exist
print("Files in folder:", os.listdir("emovdb_subset")[:5])
print("Metadata preview:")
print(metadata_df.head())


Files in folder: ['anger_1-28_0012.wav', 'amused_141_168_0152.wav', 'anger_29-56_0029.wav', 'anger_1-28_0011.wav', 'anger_29-56_0046.wav']
Metadata preview:
                      file emotion
0      anger_1-28_0012.wav   anger
1  amused_141_168_0152.wav  amused
2     anger_29-56_0029.wav   anger
3      anger_1-28_0011.wav   anger
4     anger_29-56_0046.wav   anger
