In [2]:
import os
import glob
import librosa
import numpy as np
import soundfile
import tkinter as tk
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, LSTM
from keras.layers import TimeDistributed 
from tkinter import filedialog
from pydub import AudioSegment
import sounddevice as sd
from PIL import Image, ImageTk 



In [3]:
# Emotions in the RAVDESS dataset
emotions = {
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}

# Emotions to observe
observed_emotions=['calm', 'happy', 'fearful', 'disgust']

In [4]:
# Extract features (mfcc, chroma, mel) from a sound file
def extract_feature(file_name, mfcc=True, chroma=True, mel=True):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype='float32')
        sample_rate = sound_file.samplerate
        stft = np.abs(librosa.stft(X)) if chroma or mel else None
        result = np.array([])

        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))

        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, chroma))

        if mel:
            mel_spec = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
            result = np.hstack((result, mel_spec))

    return result


In [5]:
# Load the data and extract features for each sound file
def load_data(test_size=0.2):
    x, y = [], []
    for file in glob.glob('project/Speech Emotion Recognition Ravdess Data/Actor_*/*.wav'):
        print(f"Processing file: {file}")
        file_name = os.path.basename(file)
        emotion = emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        feature = extract_feature(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)

    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

In [6]:
data = load_data(test_size=0.25)  # Load the data

# Check if the data returned has the expected length
if len(data) == 4:
    x_train, x_test, y_train, y_test = data
else:
    raise ValueError("load_data function must return four values: x_train, x_test, y_train, y_test.")

# Proceed with your code using x_train, x_test, y_train, y_test

Processing file: project/Speech Emotion Recognition Ravdess Data\Actor_01\03-01-01-01-01-01-01.wav
Processing file: project/Speech Emotion Recognition Ravdess Data\Actor_01\03-01-01-01-01-02-01.wav
Processing file: project/Speech Emotion Recognition Ravdess Data\Actor_01\03-01-01-01-02-01-01.wav
Processing file: project/Speech Emotion Recognition Ravdess Data\Actor_01\03-01-01-01-02-02-01.wav
Processing file: project/Speech Emotion Recognition Ravdess Data\Actor_01\03-01-02-01-01-01-01.wav
Processing file: project/Speech Emotion Recognition Ravdess Data\Actor_01\03-01-02-01-01-02-01.wav
Processing file: project/Speech Emotion Recognition Ravdess Data\Actor_01\03-01-02-01-02-01-01.wav
Processing file: project/Speech Emotion Recognition Ravdess Data\Actor_01\03-01-02-01-02-02-01.wav
Processing file: project/Speech Emotion Recognition Ravdess Data\Actor_01\03-01-02-02-01-01-01.wav
Processing file: project/Speech Emotion Recognition Ravdess Data\Actor_01\03-01-02-02-01-02-01.wav
Processing

In [7]:
# Get the shape of the training and testing datasets
print((x_train.shape[0], x_test.shape[0]))

(576, 192)


In [8]:
# Get the number of features extracted
print(f'Features extracted: {x_train.shape[1]}')

Features extracted: 180


In [9]:
# Initialize the Multi Layer Perceptron Classifier
model=MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500)

In [10]:
# Train the model
model.fit(x_train,y_train)

In [11]:
# Predict for the test set
y_pred=model.predict(x_test)

In [12]:
# Calculate the accuracy of our model
accuracy=accuracy_score(y_true=y_test, y_pred=y_pred)
# Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 61.98%


In [13]:
def predict_emotion(file):
    features = extract_feature(file)
    features = features.reshape(1, -1)
    print("Features shape:", features.shape)
    print("Features:", features)

    predicted_probabilities = model.predict(features)
    return predicted_probabilities[0]
    print("Predicted probabilities shape:", predicted_probabilities.shape)
    print("Predicted probabilities:", predicted_probabilities)

    predicted_label_index = np.argmax(predicted_probabilities)
    print("Predicted label index:", predicted_label_index)

    predicted_emotion = label_encoder.classes_[predicted_label_index]
    print("Predicted emotion:", predicted_emotion)


    # Emotion mapping for TESS dataset
    emotion_mapping = {
        'YAF_angry': 'ANGRY',
        'YAF_disgust': 'DISGUST',
        'YAF_fear': 'FEAR',
        'YAF_happy': 'HAPPY',
        'YAF_neutral': 'NEUTRAL',
        'YAF_pleasant_surprised': 'SURPRISED',
        'YAF_sad': 'SAD',
        'OAF_angry': 'ANGRY',
        'OAF_disgust': 'DISGUST',
        'OAF_Fear': 'FEAR',
        'OAF_happy': 'HAPPY',
        'OAF_neutral': 'NEUTRAL',
        'OAF_Pleasant_surprised': 'SURPRISED',
        'OAF_Sad': 'SAD',
    }


    recognizable_emotion = emotion_mapping.get(predicted_emotion)
    return recognizable_emotion

In [None]:
class EmotionApp:
    def __init__(self, root):
        self.root = root
        self.root.title("Emotion Prediction App")
        self.root.configure(bg='yellow')
        
        self.prediction_history = [] 
        self.show_home_page()
        
    def show_home_page(self):
        self.clear_window()
        
        label = tk.Label(self.root, text="Welcome to Emotion Prediction App", font=('Helvetica bold', 16))
        label.pack(pady=20)
        
        button = tk.Button(self.root, text="Audio Prediction", command=self.show_audio_page, bg='orange')
        button.pack()
        
        button_history = tk.Button(self.root, text="Prediction History", command=self.show_history_page, bg='lightgreen')
        button_history.pack(pady=10)
        
        about_button = tk.Button(self.root, text="About The App", command=self.show_about_page, bg='lightblue')
        about_button.pack(pady=10)
        
    def show_audio_page(self):
        self.clear_window()
        
        canvas = tk.Canvas(self.root, width=500, height=500, bg='skyblue')
        canvas.pack()
        
        label1 = tk.Label(self.root, text='SPEECH EMOTION', font=('Helvetica bold', 26))
        canvas.create_window(250, 50, window=label1)
        
        def upload_audio():
            file = filedialog.askopenfilename(filetypes=[("Audio Files", "*.wav")])
            if file:
                predicted_emotion = predict_emotion(file)
                label2.config(text=predicted_emotion)
                
                self.prediction_history.append((os.path.basename(file), predicted_emotion))
                
        button1 = tk.Button(self.root, text='Upload Audio', command=upload_audio, bg='orange')
        canvas.create_window(250, 150, window=button1)
        
        label2 = tk.Label(self.root, text='Predicted Emotion Will Be Displayed Here')
        canvas.create_window(250, 200, window=label2)
        
        back_button = tk.Button(self.root, text="Back to Home", command=self.show_home_page)
        canvas.create_window(250, 400, window=back_button)
        
    def show_history_page(self):
        self.clear_window()
        
        canvas = tk.Canvas(self.root, width=500, height=500, bg='lightgreen')
        canvas.pack()
        
        label = tk.Label(self.root, text="Prediction History", font=('Helvetica bold', 16))
        canvas.create_window(250, 50, window=label)
        
        if self.prediction_history:
            for index, (file_name, predicted_emotion) in enumerate(self.prediction_history, start=1):
                history_text = f"{index}. File: {file_name}, Emotion: {predicted_emotion}"
                history_label = tk.Label(self.root, text=history_text)
                canvas.create_window(250, 100 + index * 30, window=history_label)
        else:
            no_history_label = tk.Label(self.root, text="No prediction history available.")
            canvas.create_window(250, 150, window=no_history_label)
        
        back_button = tk.Button(self.root, text="Back to Home", command=self.show_home_page)
        canvas.create_window(250, 450, window=back_button)
        
    def show_about_page(self):
        self.clear_window()
        
        canvas = tk.Canvas(self.root, width=500, height=500, bg='skyblue')
        canvas.pack()
        
        label = tk.Label(self.root, text="About The Software", font=('Helvetica bold', 16))
        canvas.create_window(250, 50, window=label)
        
        about_text = ("Hello Everyone !! "
                      " Speech Emotion Recognition is a software that recognizes the emotion of the user."
                      " All of the audio files in this software should be inputted with '.wav' extension."
                      " A special thanks to the University of Toronto for the TESS data set and to all of my guiders"
                      " at clevered that guided me throughout the journey of making this software.")
        
        about_label = tk.Label(self.root, text=about_text, wraplength=400)
        canvas.create_window(250, 150, window=about_label)
        
        back_button = tk.Button(self.root, text="Back to Home", command=self.show_home_page)
        canvas.create_window(250, 400, window=back_button)
        
    def clear_window(self):
        for widget in self.root.winfo_children():
            widget.destroy()

if __name__ == "__main__":
    root = tk.Tk()
    app = EmotionApp(root)
    root.mainloop()

Features shape: (1, 180)
Features: [[-6.27027527e+02  5.58267860e+01 -3.05190110e+00  1.13762960e+01
   9.62898195e-01 -4.01901150e+00 -9.94970322e+00 -1.36324577e+01
  -9.36559677e+00  3.61703724e-01 -8.85112286e+00 -2.03845263e+00
  -8.59228325e+00  8.24374676e-01 -7.33057213e+00 -4.08857775e+00
  -4.89521933e+00 -1.83429301e+00 -6.02740574e+00  1.79887697e-01
  -6.15797281e+00 -3.08045745e+00 -2.10184073e+00 -3.36759830e+00
  -2.32434177e+00 -1.58505774e+00 -1.46545660e+00 -3.52840573e-02
  -1.01484740e+00 -5.80244899e-01 -1.76666105e+00 -9.89610255e-01
  -1.93926501e+00  5.76166660e-02 -2.25089240e+00 -2.52078533e+00
  -2.80945373e+00 -1.09878504e+00  3.81202787e-01 -5.64754009e-01
   5.81303596e-01  5.71341515e-01  5.97026646e-01  6.14089787e-01
   6.15995884e-01  6.60925984e-01  6.65077686e-01  6.69091105e-01
   6.68007135e-01  6.74369931e-01  6.74773932e-01  6.53977811e-01
   1.22266397e-06  1.73821245e-05  1.41880431e-04  3.63113685e-03
   3.06619685e-02  4.20224704e-02  3.4840