In [69]:
import librosa
import numpy as np

def extract_features(file_path):
    audio, sample = librosa.load(file_path, res_type='kaiser_fast')
    mfccs = np.mean(librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(y=audio, sr=sample_rate).T, axis=0)
    spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=audio, sr=sample_rate).T, axis=0)

    combined_features = np.hstack([mfccs, chroma, spectral_contrast])

    return combined_features

In [70]:
import os
import pandas as pd

data = {}
paras = []
labels = []
base_path = "data/voices"

for label in os.listdir(base_path):
    folder_path = os.path.join(base_path, label)
    for f in os.listdir(folder_path):
        file_path = os.path.join(folder_path, f)
        try:
            features = extract_features(file_path)
            paras.append(features)
            labels.append(label)
        except Exception as e:
            print(f"Error processing {file_path}: {e}")

print(paras[0])
data.update({'para': paras})
data.update({'label': labels})

df = pd.DataFrame(data)
print(df)
df.to_csv("emotion_dataset.csv", index=False)


[-5.99260315e+02  5.30330315e+01 -9.83530140e+00  9.40046406e+00
 -4.98440742e+00  2.40553474e+00 -1.55293922e+01 -5.46014071e+00
 -1.43017521e+01 -5.93659353e+00 -3.53338552e+00 -4.39077425e+00
 -4.36546564e+00 -1.53836858e+00 -8.80753422e+00  1.02889347e+00
 -9.06821346e+00 -1.52784061e+00 -4.09466505e+00 -3.85637617e+00
 -6.21353006e+00 -2.29107928e+00 -4.52339602e+00 -4.06523848e+00
 -3.57221842e+00 -1.47317529e+00 -1.96464956e+00  2.54054379e+00
  1.14935911e+00  7.08324015e-01 -1.81262136e+00 -1.92672014e+00
 -1.13827324e+00  8.19765508e-01  1.83780348e+00  4.72514439e+00
  3.00497079e+00  5.00106327e-02 -6.81706488e-01 -1.36187136e+00
  4.50462878e-01  5.22485614e-01  5.46331227e-01  4.47947323e-01
  5.14102042e-01  5.55319548e-01  5.54396212e-01  5.23587465e-01
  4.41024244e-01  4.27330971e-01  5.33335984e-01  4.99350727e-01
  2.02411040e+01  1.37914517e+01  1.61740143e+01  1.59664895e+01
  1.69325662e+01  1.66128180e+01  3.90445843e+01]
                                        

In [129]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

df = pd.read_csv("emotion_dataset.csv")

df_temp = []
for i in df['para']:
    frame = [ float(num) for num in (i.strip('[]').split())]
    df_temp.append(frame)
df['para'] = df_temp

label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

x = np.vstack(df['para']) #features
y = df['label'].values #labels
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

smote = SMOTE(sampling_strategy='auto', k_neighbors=1)
x_res, y_res = smote.fit_resample(x_train, y_train)


{np.int64(0): np.int64(5), np.int64(1): np.int64(4), np.int64(2): np.int64(17), np.int64(3): np.int64(2), np.int64(4): np.int64(7), np.int64(5): np.int64(3)}


In [144]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

file_name = "knn_emotion_recognition.pkl"
if os.path.isfile(file_name):
    knn_model = joblib.load("knn_emotion_recognition.pkl")
    print(f"Model loaded from {file_name}")
else:
    knn_model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2, weights='distance')

knn_model.fit(x_res, y_res)

y_pred = knn_model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100: .2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=1))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Model loaded from knn_emotion_recognition.pkl
Accuracy:  30.00%
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.00      0.00         1
           1       0.00      1.00      0.00         0
           2       1.00      0.29      0.44         7
           3       1.00      1.00      1.00         1
           4       0.00      0.00      0.00         1
           5       0.00      1.00      0.00         0

    accuracy                           0.30        10
   macro avg       0.50      0.55      0.24        10
weighted avg       0.90      0.30      0.41        10

Confusion Matrix:
 [[0 0 0 0 1 0]
 [0 0 0 0 0 0]
 [0 1 2 0 1 3]
 [0 0 0 1 0 0]
 [0 0 0 0 0 1]
 [0 0 0 0 0 0]]


In [141]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_neighbors': [3, 5, 7, 5],
    'metric': ['euclidean', 'manhattan', 'chebyshev']
}

grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
best_knn_model = grid_search.best_estimator_

Best Parameters: {'metric': 'manhattan', 'n_neighbors': 5}




In [142]:
import joblib

joblib.dump(best_knn_model, "knn_emotion_recognition.pkl")

['knn_emotion_recognition.pkl']

In [None]:
loaded_model = joblib.load("knn_emotion_recognition.pkl")

In [None]:
file_path = ""
features = extract_features(file_path).reshape(1, -1)

predicted_label = loaded_model.predict(features)
emotion = label_encoder.inverse_transform(predicted_label)
print(f"Predicted Result: {emotion}")