In [1]:
import librosa
import soundfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [8]:
import librosa
import soundfile

def extract_feature(file_name, mfcc=True, chroma=True, mel=True):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        
        # Initialize an empty list to store features
        features = []
        
        # Compute MFCCs
        if mfcc:
            mfccs = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40)
            mfccs_mean = np.mean(mfccs, axis=1)  # Take mean along the time axis
            features.append(mfccs_mean)
        
        # Compute chroma
        if chroma:
            chroma = librosa.feature.chroma_stft(y=X, sr=sample_rate)
            chroma_mean = np.mean(chroma, axis=1)  # Take mean along the time axis
            features.append(chroma_mean)
        
        # Compute Mel spectrogram
        if mel:
            mel_spec = librosa.feature.melspectrogram(y=X, sr=sample_rate, n_mels=128)
            mel_spec_mean = np.mean(mel_spec, axis=1)  # Take mean along the time axis
            features.append(mel_spec_mean)
        
        # Concatenate features
        combined_features = np.concatenate(features)
        
        return combined_features


In [9]:
# Emotions in the RAVDESS dataset
emotions=[
  'angry', 'disgust', 'fear', 'happy', 'neutral', 'ps', 'sad'
    ]

#Emotions to observe
observed_emotions=['angry', 'disgust', 'fear', 'happy', 'neutral', 'ps', 'sad']

In [10]:
file_paths = glob.glob("D:/ML/speech-emotion-recognition-ravdess-data/Actor_*/*.wav")
print("Number of files found:", len(file_paths))

# Iterate through each file and print the basename
for file_path in file_paths:    file_name = os.path.basename(file_path)
    print("File basename:", file_name)

Number of files found: 1440
File basename: 03-01-01-01-01-01-01.wav
File basename: 03-01-01-01-01-02-01.wav
File basename: 03-01-01-01-02-01-01.wav
File basename: 03-01-01-01-02-02-01.wav
File basename: 03-01-02-01-01-01-01.wav
File basename: 03-01-02-01-01-02-01.wav
File basename: 03-01-02-01-02-01-01.wav
File basename: 03-01-02-01-02-02-01.wav
File basename: 03-01-02-02-01-01-01.wav
File basename: 03-01-02-02-01-02-01.wav
File basename: 03-01-02-02-02-01-01.wav
File basename: 03-01-02-02-02-02-01.wav
File basename: 03-01-03-01-01-01-01.wav
File basename: 03-01-03-01-01-02-01.wav
File basename: 03-01-03-01-02-01-01.wav
File basename: 03-01-03-01-02-02-01.wav
File basename: 03-01-03-02-01-01-01.wav
File basename: 03-01-03-02-01-02-01.wav
File basename: 03-01-03-02-02-01-01.wav
File basename: 03-01-03-02-02-02-01.wav
File basename: 03-01-04-01-01-01-01.wav
File basename: 03-01-04-01-01-02-01.wav
File basename: 03-01-04-01-02-01-01.wav
File basename: 03-01-04-01-02-02-01.wav
File basenam

In [28]:
directory_path = 'C:/Users/anilk/jup/speech/input' 

def load_data(test_size=0.2):
    x,y=[],[]
    for dirname, _, filenames in os.walk(directory_path):
        for filename in filenames:
            paths.append(os.path.join(dirname, filename))
            label = filename.split('_')[-1]
            label = label.split('.')[0]
            labels.append(label.lower())
            paths=extract_feature(file, mfcc=True, chroma=True, mel=True)
            x.append(paths)
            y.append(label)
        return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

In [29]:
#Split the dataset
x_train,x_test,y_train,y_test=load_data(test_size=0.25)

ValueError: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [13]:
x_train

array([[-5.2206189e+02,  3.5066891e+01,  3.7534292e+00, ...,
         1.6524314e-04,  1.0432161e-04,  6.5557157e-05],
       [-6.4122772e+02,  4.4948776e+01, -1.8517413e+00, ...,
         3.8926191e-05,  3.0525534e-05,  2.9416666e-05],
       [-6.5070575e+02,  5.3021164e+01, -4.9204044e+00, ...,
         4.7521684e-05,  3.4663255e-05,  1.6284444e-05],
       ...,
       [-5.5009619e+02,  1.7029768e+01, -1.1457564e+01, ...,
         1.5176463e-04,  1.1682853e-04,  8.4747931e-05],
       [-5.5535760e+02,  4.7156971e+01,  1.1075074e+01, ...,
         1.6108646e-04,  1.0496246e-04,  6.5281172e-05],
       [-5.0481635e+02,  3.5361866e+01, -1.4349578e+01, ...,
         6.0815155e-04,  5.5526977e-04,  4.4778222e-04]], dtype=float32)

In [14]:
print((x_train.shape[0], x_test.shape[0]))

(576, 192)


In [15]:
print(f'Features extracted: {x_train.shape[1]}')

Features extracted: 180


In [16]:
model=MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500)

In [17]:
model.fit(x_train,y_train)

In [18]:
y_pred=model.predict(x_test)

In [19]:
y_pred

array(['happy', 'calm', 'happy', 'happy', 'disgust', 'calm', 'calm',
       'disgust', 'calm', 'happy', 'happy', 'calm', 'fearful', 'happy',
       'disgust', 'happy', 'calm', 'disgust', 'disgust', 'calm',
       'disgust', 'disgust', 'disgust', 'calm', 'happy', 'happy', 'calm',
       'happy', 'calm', 'calm', 'happy', 'disgust', 'happy', 'calm',
       'happy', 'calm', 'calm', 'fearful', 'calm', 'disgust', 'happy',
       'calm', 'calm', 'calm', 'calm', 'calm', 'disgust', 'calm', 'calm',
       'happy', 'fearful', 'fearful', 'calm', 'happy', 'happy', 'calm',
       'calm', 'happy', 'calm', 'calm', 'disgust', 'calm', 'happy',
       'happy', 'happy', 'calm', 'calm', 'disgust', 'disgust', 'happy',
       'happy', 'fearful', 'fearful', 'fearful', 'fearful', 'disgust',
       'happy', 'happy', 'calm', 'fearful', 'calm', 'calm', 'fearful',
       'calm', 'disgust', 'calm', 'disgust', 'fearful', 'happy', 'happy',
       'disgust', 'calm', 'calm', 'happy', 'disgust', 'disgust', 'calm',
     

In [20]:
#Calculate the accuracy of our model
accuracy=accuracy_score(y_true=y_test, y_pred=y_pred)

#Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 66.67%


In [21]:
from sklearn.metrics import accuracy_score, f1_score

In [22]:
import pandas as pd
df=pd.DataFrame({'Actual': y_test, 'Predicted':y_pred})
df.head(20)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,Actual,Predicted
0,happy,happy
1,calm,calm
2,happy,happy
3,happy,happy
4,disgust,disgust
5,calm,calm
6,happy,calm
7,happy,disgust
8,disgust,calm
9,happy,happy


In [24]:
import pickle
# Writing different model files to file
with open( 'modelForPrediction1.sav', 'wb') as f:
    pickle.dump(model,f)

In [27]:
filename = 'modelForPrediction1.sav'
loaded_model = pickle.load(open(filename, 'rb')) # loading the model file from the storage

feature=extract_feature('D:/ML/speech-emotion-recognition-ravdess-data/Actor_01/03-01-01-01-01-01-01.wav', mfcc=True, chroma=True, mel=True)

feature=feature.reshape(1,-1)

prediction=loaded_model.predict(feature)
prediction

array(['calm'], dtype='<U7')