In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Import All Important Libraries
import librosa
import soundfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler 
from sklearn.feature_selection import RFE
from sklearn.neural_network import MLPClassifier

In [3]:
# function for extracting mfcc, pitch, and rmse features from sound file
def extract_feature(file_name, mfcc, pitch, rmse):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        result = []
        if mfcc:
            mfccs = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40)
            if mfccs.size > 0:
                mfccs = np.mean(mfccs.T, axis=0)
                mfccs = np.pad(mfccs, (0, 40 - len(mfccs)), mode='constant')
                result.append(mfccs)
        if pitch:
            f0, voiced_flag, voiced_probs = librosa.pyin(y=X, sr=sample_rate, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
            if np.any(voiced_flag):
                pitch_mean = np.mean(f0[voiced_flag])
                pitch_std = np.std(f0[voiced_flag])
                if np.isnan(pitch_mean) or np.isnan(pitch_std):
                    return None
                pitch = np.array([pitch_mean, pitch_std])
                pitch = np.pad(pitch, (0, 2 - len(pitch)), mode='constant')
                result.append(pitch)
       
        if rmse:
            rmse = librosa.feature.rms(y=X)
            if rmse.size > 0:
                rmse = np.mean(rmse.T, axis=0)
                rmse = np.pad(rmse, (0, 1 - len(rmse)), mode='constant')
                result.append(rmse)
        
    return np.concatenate(result)


In [4]:
# Define the motions dictionary
emotions = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}

# Emotions we want to observe
observed_emotions = ['sad', 'angry', 'fearful', 'surprised']

# Load the data and extract features for each sound file
def load_data(test_size=0.1):
    x, y = [], []
#      for folder in glob.glob('C:\\Users\\user\\Documents\\6th_sem_project\\speech-emotion-recognition-ravdess-data\\Actor_' + '*'):
#      for folder in glob.glob('C:\\Users\\user\\Documents\\6th_sem_project\\Reduced dataset\\Actor_' + '*'):
#      for folder in glob.glob('/content/drive/MyDrive/speech/Actor_' + '*'):
    for folder in glob.glob('/content/drive/MyDrive/Dataset_Speech_Emotion_Recognition/speech-emotion-recognition-ravdess-data/Actor_' + '*'):
        print(folder)
        for file in glob.glob(folder + '/*.wav'):
            file_name = os.path.basename(file)
            emotion = emotions[file_name.split('-')[2]]
            if emotion not in observed_emotions:
                continue
            feature = extract_feature(file, mfcc=True, pitch=True, rmse=True)
            if feature is not None:
                x.append(feature)
                y.append(emotion)
    
    max_len = max(len(l) for l in x)
    x = [np.pad(l, pad_width=(0, max_len - len(l)), mode='constant') for l in x]
    
    # convert x to a 2D array of numeric values
    x = np.vstack(x)
    
    return train_test_split(np.array(x, dtype=object), np.array(y, dtype=object), test_size=test_size, random_state=9)


# Load the data and split into train and test sets
x_train, x_test, y_train, y_test = load_data(test_size=0.1)

# Print shape of train and test set and number of features extracted
print((x_train.shape[0], x_test.shape[0]))
print(f'Features extracted: {x_train[0].shape[0]}')


/content/drive/MyDrive/Dataset_Speech_Emotion_Recognition/speech-emotion-recognition-ravdess-data/Actor_18
/content/drive/MyDrive/Dataset_Speech_Emotion_Recognition/speech-emotion-recognition-ravdess-data/Actor_19
/content/drive/MyDrive/Dataset_Speech_Emotion_Recognition/speech-emotion-recognition-ravdess-data/Actor_16
/content/drive/MyDrive/Dataset_Speech_Emotion_Recognition/speech-emotion-recognition-ravdess-data/Actor_23
/content/drive/MyDrive/Dataset_Speech_Emotion_Recognition/speech-emotion-recognition-ravdess-data/Actor_21
/content/drive/MyDrive/Dataset_Speech_Emotion_Recognition/speech-emotion-recognition-ravdess-data/Actor_20
/content/drive/MyDrive/Dataset_Speech_Emotion_Recognition/speech-emotion-recognition-ravdess-data/Actor_17
/content/drive/MyDrive/Dataset_Speech_Emotion_Recognition/speech-emotion-recognition-ravdess-data/Actor_22
/content/drive/MyDrive/Dataset_Speech_Emotion_Recognition/speech-emotion-recognition-ravdess-data/Actor_24
/content/drive/MyDrive/Dataset_Speech

In [5]:
# create the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', RFE(estimator=RandomForestClassifier(), n_features_to_select=37)),
     ('classifier', VotingClassifier( estimators=[
        ('rf', RandomForestClassifier(n_estimators=500, random_state=9)),
        ('svc', SVC(kernel='linear', probability=True, random_state=9)),
        ('knn', KNeighborsClassifier(n_neighbors=6)),
        ('mlp', MLPClassifier(alpha=0.01, batch_size=256, hidden_layer_sizes=(700,), max_iter=500, learning_rate='adaptive', random_state=9))
    ], voting='soft'))
])


# Fit the model to the training data
pipeline.fit(x_train, y_train)

# Predict for the test set
y_pred = pipeline.predict(x_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred) 
print(f"Accuracy score: {accuracy}")
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy score: 0.8311688311688312
Accuracy: 83.12%


In [6]:
filename = "ensemble_model"
pickle.dump(pipeline, open(filename, 'wb'))

In [8]:
 path = '/content/drive/MyDrive/Dataset_Speech_Emotion_Recognition/ensemble_model'
loaded_model = pickle.load(open(path, 'rb'))
test = loaded_model.predict(x_test)

In [9]:
# Calculate accuracy
accuracy = accuracy_score(y_test, test) 
print(f"Accuracy score: {accuracy}")
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy score: 0.8311688311688312
Accuracy: 83.12%
