# Projet SER

In [None]:
!pip install numpy pandas scikit-learn
!pip install librosa
!pip install mrmr-selection
!pip install soundfile
!pip install librosa
!pip install kaggle

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, confusion_matrix, classification_report, precision_score, recall_score
#from pymrmr import mRMR
import librosa
#from sklearn.preprocessing import MinMaxScaler
from scipy.signal import find_peaks
import soundfile as sf
import seaborn as sns
import matplotlib.pyplot as plt
#from sklearn.decomposition import PCA

## Data Collection

### From local path

In [None]:
# Disabled alternative loader for a local copy of the dataset. It is wrapped in
# a string literal so that it is never executed; the cell only evaluates (and
# displays) the string. The emotion label is taken from the third "-"-separated
# field of each ".wav" file name.
# NOTE(review): the snippet relies on DataFrame.append, which newer pandas
# releases no longer provide - collect the rows in a list and build the
# DataFrame once if this loader is ever re-enabled.
"""
def collect_data_ravdess(audio_path):
    data = pd.DataFrame()

    for file in os.listdir(audio_path):
        if file.endswith(".wav"):
            filepath = os.path.join(audio_path, file)
            label = file.split("-")[2]  # Extracting emotion label from filename

            # Append the filepath and label to the DataFrame
            data = data.append({"filepath": filepath, "label": label}, ignore_index=True)

    return data

"""

### From kaggle

In [None]:
import json
import os
from getpass import getpass
from pathlib import Path

# SECURITY: credentials must never be hard-coded in a notebook. They are read
# from the KAGGLE_USERNAME / KAGGLE_KEY environment variables and, when those
# are not set, requested interactively (the key is not echoed).
# NOTE(review): an API key was previously committed in this cell; it must be
# considered leaked and should be revoked from the Kaggle account settings.
api_token = {
    "username": os.environ.get("KAGGLE_USERNAME") or input("Kaggle username: "),
    "key": os.environ.get("KAGGLE_KEY") or getpass("Kaggle API key: "),
}

# Resolve the configuration file relative to the current user's home directory
# instead of mixing "~" with a hard-coded "/root".
kaggle_config_path = Path.home() / ".kaggle" / "kaggle.json"

# exist_ok makes the cell safe to re-run (a plain `mkdir` fails the second time).
kaggle_config_path.parent.mkdir(parents=True, exist_ok=True)

with open(kaggle_config_path, 'w') as file:
    json.dump(api_token, file)

# Restrict the file to its owner, equivalent to `chmod 600`.
kaggle_config_path.chmod(0o600)

In [None]:
# Search Kaggle for datasets matching "RAVDESS" (uses the Kaggle command-line client)
!kaggle datasets list -s RAVDESS

In [None]:
# Authenticate against Kaggle, then download the RAVDESS speech dataset and
# unzip it into the current working directory (path='.').

import kaggle

kaggle.api.authenticate()

kaggle.api.dataset_download_files('uwrfkaggler/ravdess-emotional-speech-audio', path='.', unzip=True)


## Data Processing
- Audio Normalization
- Silence Removal

In [None]:
def normalize(X):
    """Peak-normalize a signal so that its largest absolute sample is 1.

    Note: this definition shadows ``sklearn.preprocessing.normalize``, which is
    imported at the top of the notebook.

    Parameters
    ----------
    X : array-like
        Signal samples.

    Returns
    -------
    numpy.ndarray
        ``X`` divided by ``max(|X|)``. An all-zero (silent) or empty signal is
        returned unscaled instead of producing NaNs or raising.
    """
    X = np.asarray(X)

    # np.max raises on empty input, so treat "no samples" like "no peak".
    max_X = np.max(np.abs(X)) if X.size else 0

    # A silent signal has no peak to scale to; dividing by 1 keeps the values
    # (all zeros) while still returning a new floating-point array.
    divisor = 1 if max_X == 0 else max_X

    return X / divisor

In [None]:
def remove_silence(X, factor):
    """Trim the leading and trailing low-amplitude parts of a signal.

    The decision threshold is ``max(|X|) / factor``. The signal is cut from the
    first to the last *local peak* of ``|X|`` whose height reaches that
    threshold (both included); samples between those two peaks are kept as is.

    Parameters
    ----------
    X : array-like
        One-dimensional signal.
    factor : float
        Divisor applied to the peak amplitude to obtain the threshold; a larger
        factor gives a lower threshold and therefore less trimming.

    Returns
    -------
    numpy.ndarray
        The trimmed signal. When the signal is empty or contains no qualifying
        peak (e.g. it is constant or completely silent), it is returned
        unchanged instead of raising an IndexError.
    """
    X = np.asarray(X)
    if X.size == 0:
        return X

    magnitude = np.abs(X)
    decision_threshold = np.max(magnitude) / factor

    # Indices of the local maxima of |X| that are at least as high as the threshold
    indices_useful_X, _ = find_peaks(magnitude, height=decision_threshold)

    # Nothing to anchor the trimming on: keep the whole signal.
    if indices_useful_X.size == 0:
        return X

    # Keep everything between the first and the last qualifying peak
    return X[indices_useful_X[0]:indices_useful_X[-1] + 1]


In [None]:
def normalise_remove_silence(input_filename, output_directory, factor):
    """Normalize an audio file, trim its silent ends and save the result.

    The output keeps the input's base name and extension, with the suffix
    ``_N_RS<factor>`` inserted before the extension.

    Parameters
    ----------
    input_filename : str
        Path of the audio file to process.
    output_directory : str
        Directory that receives the processed file; created if missing.
    factor : float
        Threshold divisor forwarded to ``remove_silence``.

    Returns
    -------
    str
        Path of the file that was written.
    """
    # sr=None keeps the file's own sampling rate instead of resampling
    x, fs = librosa.load(input_filename, sr=None)

    # Normalize first so the silence threshold is relative to a peak of 1
    z = remove_silence(normalize(x), factor)

    filename, extension = os.path.splitext(os.path.basename(input_filename))

    # Writing fails if the target directory is missing, so make sure it exists
    os.makedirs(output_directory, exist_ok=True)
    output_filename = os.path.join(output_directory, f"{filename}_N_RS{factor}{extension}")

    sf.write(output_filename, z, fs)

    return output_filename

## Audio Feature Extraction


In [None]:
from scipy.stats import mode

def global_feature_computation(feature_matrix, computations):
    """Summarize a feature matrix with one or more statistics along axis 1.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        Two-dimensional array; every statistic is computed along ``axis=1``,
        giving one value per row.
    computations : sequence of str
        Statistics to compute, in output order. Supported names: "mean", "min",
        "max", "std", "range", "mode", "median", "1st_quartile",
        "3rd_quartile".

    Returns
    -------
    numpy.ndarray
        One-dimensional concatenation of the requested statistics, in the order
        given by ``computations``.

    Raises
    ------
    ValueError
        If ``computations`` is empty or contains an unsupported name. Unknown
        names used to be skipped silently, which changed the length of the
        result without any warning.
    """
    reducers = {
        "mean": lambda m: np.mean(m, axis=1),
        "min": lambda m: np.min(m, axis=1),
        "max": lambda m: np.max(m, axis=1),
        "std": lambda m: np.std(m, axis=1),
        "range": lambda m: np.ptp(m, axis=1),
        # scipy.stats.mode returns (mode, count); only the mode values are kept
        "mode": lambda m: np.ravel(mode(m, axis=1).mode),
        "median": lambda m: np.median(m, axis=1),
        "1st_quartile": lambda m: np.percentile(m, 25, axis=1),
        "3rd_quartile": lambda m: np.percentile(m, 75, axis=1),
    }

    computations = list(computations)
    if not computations:
        raise ValueError("At least one computation must be requested.")

    unknown = [name for name in computations if name not in reducers]
    if unknown:
        raise ValueError(
            f"Unsupported computation(s) {unknown}; expected a subset of {sorted(reducers)}."
        )

    return np.concatenate([reducers[name](feature_matrix) for name in computations])

In [None]:
def extract_features(audio_path, features, global_computation):
    """Build one feature vector for an audio file.

    Each requested feature family is computed with librosa, summarized with
    ``global_feature_computation`` and the summaries are concatenated.

    Parameters
    ----------
    audio_path : str
        Path of the audio file to analyse.
    features : sequence of str
        Feature families to include. Supported names: "spectral_flatness",
        "spectral_centroid", "mfcc", "melspectrogram", "chroma_stft", "rms".
    global_computation : sequence of str
        Statistics forwarded to ``global_feature_computation``.

    Returns
    -------
    numpy.ndarray
        One-dimensional feature vector. Families always appear in the fixed
        order listed above, whatever the order of ``features``, so the column
        layout is stable across calls.

    Raises
    ------
    ValueError
        If ``features`` is empty or contains an unsupported name. Requesting
        only a subset used to fail with an IndexError.
    """
    feature_order = (
        "spectral_flatness",
        "spectral_centroid",
        "mfcc",
        "melspectrogram",
        "chroma_stft",
        "rms",
    )

    requested = set(features)
    if not requested:
        raise ValueError("At least one feature must be requested.")

    unknown = sorted(requested.difference(feature_order))
    if unknown:
        raise ValueError(
            f"Unsupported feature(s) {unknown}; expected a subset of {list(feature_order)}."
        )

    # No `sr` is passed, so librosa resamples the file to its default rate.
    audio, sr = librosa.load(audio_path)

    # Deferred computations: only the requested families are evaluated.
    extractors = {
        "spectral_flatness": lambda: librosa.feature.spectral_flatness(y=audio),
        "spectral_centroid": lambda: librosa.feature.spectral_centroid(y=audio, sr=sr),
        "mfcc": lambda: librosa.feature.mfcc(y=audio, sr=sr),
        "melspectrogram": lambda: librosa.feature.melspectrogram(y=audio, sr=sr),
        "chroma_stft": lambda: librosa.feature.chroma_stft(y=audio, sr=sr),
        "rms": lambda: librosa.feature.rms(y=audio),
    }

    feature_list = [
        global_feature_computation(extractors[feature_name](), global_computation)
        for feature_name in feature_order
        if feature_name in requested
    ]

    # Combine the summaries into a single feature vector
    return np.concatenate(feature_list)


In [None]:
def evaluate_model(model, X_test, y_test):
    """Print the accuracy, weighted precision and weighted recall of a model.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_test : features of the evaluation samples
    y_test : true labels of the evaluation samples
    """
    predictions = model.predict(X_test)

    # Precision and recall are averaged over classes, weighted by class support.
    accuracy, precision, recall = (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions, average='weighted'),
        recall_score(y_test, predictions, average='weighted'),
    )

    print(
        f"Accuracy: {accuracy:.2f}",
        f"Precision: {precision:.2f}",
        f"Recall: {recall:.2f}",
        sep="\n",
    )

## Model Training
- SVM, k-nearest neighbours, gradient boosting and a multi-layer perceptron

In [None]:
# Feature families extracted from every audio file
features = ["spectral_flatness", "spectral_centroid", "mfcc", "chroma_stft", "melspectrogram", "rms"]

# Statistics used to summarize each feature over time
global_computation = ["mean"]
#global_computation = ["mean", "min", "max", "std", "range", "mode", "median"]

# Human-readable emotion names, in the order of the RAVDESS emotion codes 01..08
# (the second entry used to be misspelled "Cal")
class_names = ["Neutral", "Calm", "Happiness", "Sadness", "Angry", "Fear", "Disgust", "Surprise"]

# Silence threshold divisor: threshold = peak amplitude / factor
factor = 200

num_selected_features = 5

### Normalizing and removing the silence

In [None]:
# Set the base directory where actor directories are located
base_directory = '/content/audio_speech_actors_01-24'
output_directory = '/content/processed_data'

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Normalize and remove silence for every recording of every actor.
# Directory listings are sorted so that the processing order is deterministic.
for actor_directory in sorted(os.listdir(base_directory)):
    actor_path = os.path.join(base_directory, actor_directory)

    # Ignore anything that is not an actor directory
    if not os.path.isdir(actor_path):
        continue

    for audio_file in sorted(os.listdir(actor_path)):
        # Skip stray non-audio files, which would make the audio loader fail
        if not audio_file.lower().endswith(".wav"):
            continue

        audio_path = os.path.join(actor_path, audio_file)
        normalise_remove_silence(audio_path, output_directory, factor)


In [None]:
# List to store the paths of normalized audio files
# Paths of the processed (normalized, silence-trimmed) audio files.
# os.listdir returns entries in arbitrary order, so the listing is sorted to make
# the row order (and hence the fixed-seed train/test split) reproducible.
# Only ".wav" files are kept, so stray files cannot reach the feature extractor.
normalized_audio_paths = [
    os.path.join(output_directory, audio_file)
    for audio_file in sorted(os.listdir(output_directory))
    if audio_file.lower().endswith(".wav")
]


In [None]:
# Initialize lists for audio files, labels, and features
# One feature vector per processed file. `audio_files` records, row by row,
# which file each vector in `features_list` was computed from.
audio_files, features_list = [], []

for normalized_audio_path in normalized_audio_paths:
    extracted_features = extract_features(
        audio_path=normalized_audio_path,
        features=features,
        global_computation=global_computation,
    )

    audio_files += [normalized_audio_path]
    features_list += [extracted_features]


In [None]:
# Stack the per-file feature vectors into a single NumPy array (one row per file)
features_list = np.array(features_list)

In [None]:
# Sanity check: dimensions of the feature array (one row per processed file)
features_list.shape

### labels
Here are the filename identifiers, as described on the official RAVDESS website:

- Modality (01 = full-AV, 02 = video-only, 03 = audio-only).
- Vocal channel (01 = speech, 02 = song).
- Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).
- Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.
- Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").
- Repetition (01 = 1st repetition, 02 = 2nd repetition).
- Actor (01 to 24. Odd numbered actors are male, even numbered actors are female).


For example, the filename `02-01-06-01-02-01-12.mp4` carries the following metadata:

- Video-only (02)
- Speech (01)
- Fearful (06)
- Normal intensity (01)
- Statement "dogs" (02)
- 1st Repetition (01)
- 12th Actor (12) - Female (as the actor ID number is even)

In [None]:
# Read each label from the processed file that produced the matching feature row.
#
# The labels used to be collected by walking the raw actor directories, while the
# features were extracted from a flat listing of the processed directory. The two
# listings are not in the same order, so rows and labels did not correspond.
# `audio_files` holds, in row order, the path behind every feature vector, and the
# processed file names still start with the original RAVDESS identifier, whose
# third "-"-separated field is the emotion code.
labels = [os.path.basename(audio_file).split("-")[2] for audio_file in audio_files]

assert len(labels) == len(features_list), "every feature row needs exactly one label"

In [None]:
# Convert the emotion codes (strings such as "03") to a 64-bit integer array
labels = np.array(labels).astype(np.int64)

In [None]:
# Sanity check: the number of labels should equal the number of feature rows
labels.shape

In [None]:
# Split the dataset: 80% for training, 20% for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(features_list, labels, test_size=0.2, random_state=42)

In [None]:
# Sanity check: shapes of the training and test partitions
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Initialize and train the SVM model
svc_model = SVC()
svc_model.fit(X_train, y_train)

# Initialize and train the KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Initialize and train the Gradient Boosting model
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)

# Standardized copy of the training data (kept for inspection)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Initialize and train the Neural Network model.
# The network used to be fitted on the *unscaled* X_train even though the data
# had just been standardized for it. Bundling the scaler and the network in one
# pipeline guarantees that the same standardization (learned on the training
# data only) is applied both when fitting and when predicting on raw features.
# NOTE(review): the SVM and KNN models are also scale-sensitive and would
# probably benefit from the same treatment.
nn_model = make_pipeline(
    StandardScaler(),
    MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42),
)
nn_model.fit(X_train, y_train)

In [None]:
# Predict the test set with every trained model (SVM, KNN, gradient boosting, neural network)
y_pred_svc, y_pred_knn, y_pred_gb, y_pred_nn = (
    classifier.predict(X_test)
    for classifier in (svc_model, knn_model, gb_model, nn_model)
)

In [None]:
# Sanity check: every model should return one prediction per test sample
y_pred_svc.shape, y_pred_knn.shape,  y_pred_gb.shape, y_pred_nn.shape

In [None]:
# Test-set accuracy of each model
accuracy_svc, accuracy_knn, accuracy_gb, accuracy_nn = (
    accuracy_score(y_test, predictions)
    for predictions in (y_pred_svc, y_pred_knn, y_pred_gb, y_pred_nn)
)

In [None]:
# Summary table: one row per trained model with its test accuracy
models = [svc_model, knn_model, gb_model, nn_model]
accuracies = [accuracy_svc, accuracy_knn, accuracy_gb, accuracy_nn]

model_accuracy_df = pd.DataFrame(models, columns=['Models'])
model_accuracy_df['Accuracies'] = accuracies

In [None]:
# Display the model / accuracy comparison table
model_accuracy_df

In [None]:
# Accuracy, weighted precision and weighted recall of the SVM on the test set
evaluate_model(svc_model, X_test, y_test)

In [None]:
# Accuracy, weighted precision and weighted recall of the KNN model on the test set
evaluate_model(knn_model, X_test, y_test)

In [None]:
# Accuracy, weighted precision and weighted recall of the gradient boosting model on the test set
evaluate_model(gb_model, X_test, y_test)

In [None]:
# Accuracy, weighted precision and weighted recall of the neural network on the test set
evaluate_model(nn_model, X_test, y_test)

## Training with DNN & CNN

In [None]:
import tensorflow as tf
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.models import load_model, Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization

In [None]:
# Plain-list copy of the integer labels, used as the "labels" column of the dataset.
# The previous loop printed every single label, which flooded the cell output;
# a short preview is shown instead.
Y = list(labels)
Y[:10]

In [None]:
# Rebuild the dataset as a DataFrame (features plus a "labels" column holding
# the emotion codes 01 to 08) and save it to features.csv

display(features_list.shape)
df = pd.DataFrame(features_list)
df['labels'] = Y
df.to_csv('features.csv', index=False)
df.head()

In [None]:
# Feature columns only (the "labels" column is removed before scaling)
df_norm = df.drop("labels", axis=1)
df_norm.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all samples, i.e. before any
# train/test split, so test samples contribute to the scaling statistics.
scaler = StandardScaler().fit(df_norm)
print(scaler.mean_)

# transform() returns a NumPy array; wrap it back into a DataFrame
df_norm = pd.DataFrame(scaler.transform(df_norm))
df_norm.head()

In [None]:
# Split the raw (unscaled) features first, then learn the standardization from
# the training rows only and apply it to both partitions. Fitting the scaler on
# the complete dataset would let statistics of the test rows leak into the
# preprocessing and make the test score optimistic.
# The partition itself is unchanged: same test_size and random_state as before.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop("labels", axis=1), labels, test_size=0.2, random_state=42
)

split_scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(split_scaler.transform(X_train), index=X_train.index)
X_test = pd.DataFrame(split_scaler.transform(X_test), index=X_test.index)

In [None]:
# Check the shapes of the training and test partitions
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# Conv1D expects input of shape (samples, steps, channels): add a trailing
# channel axis of size 1.
# reshape(..., -1, 1) is used instead of np.expand_dims so that re-running this
# cell leaves the arrays unchanged rather than adding yet another axis.
X_train = np.asarray(X_train).reshape(len(X_train), -1, 1)
X_test = np.asarray(X_test).reshape(len(X_test), -1, 1)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# Modeling: two Conv1D / MaxPooling / Dropout stages followed by a small dense head.
# The softmax layer has 9 units; the labels are the integer emotion codes, so
# presumably index 0 is simply never used - TODO confirm.
model = Sequential([
    # Convolutional stage 1 (also declares the input shape: n_features steps, 1 channel)
    Conv1D(128, kernel_size=5, strides=1, padding='same', activation=tf.nn.relu,
           input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=5, strides=2, padding='same'),
    Dropout(0.3),

    # Convolutional stage 2
    Conv1D(128, kernel_size=5, strides=1, padding='same', activation=tf.nn.relu),
    MaxPooling1D(pool_size=5, strides=2, padding='same'),
    Dropout(0.3),

    # Classification head
    Flatten(),
    Dense(units=32, activation=tf.nn.relu),
    Dropout(0.3),
    Dense(units=9, activation=tf.nn.softmax),
])

In [None]:
# Adam optimizer with default settings; the sparse variant of categorical
# cross-entropy is used because the labels are integer codes, not one-hot vectors.
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Stop training when the validation accuracy has not improved for `patience` epochs.
# NOTE(review): patience (30) equals the number of epochs used for training
# (EPOCHS = 30), so early stopping presumably never triggers - confirm the intended value.
es = EarlyStopping(monitor  = "val_accuracy",
                   mode     = 'max',
                   verbose  = 0,
                   patience = 30)
# Save the model to 'model_best.h5' each time the validation accuracy reaches a new maximum.
mc = ModelCheckpoint('model_best.h5',
                     monitor        = 'val_accuracy',
                     mode           = 'max',
                     verbose        = 1,
                     save_best_only = True)

In [None]:
# Print the layer-by-layer architecture and parameter counts
model.summary()

In [None]:
# Training hyper-parameters
BATCH_SIZE = 100
EPOCHS = 30

# Train the CNN; 20% of the training data is held out for validation, and the
# early-stopping / checkpoint callbacks monitor the validation accuracy.
history = model.fit(X_train, y_train, validation_split=0.2, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1, callbacks=[es, mc])

In [None]:
print("Accuracy of our model on test data : " , model.evaluate(X_test,y_test)[1]*100 , "%")

train_acc = history.history['accuracy']
train_loss = history.history['loss']
# These curves come from the validation split carved out of the training data
# by model.fit, not from the test set, so they are labelled "Validation" below.
test_acc = history.history['val_accuracy']
test_loss = history.history['val_loss']

# Use the number of epochs that actually ran: if early stopping interrupts the
# training, the history is shorter than EPOCHS and plotting against
# range(EPOCHS) would fail with mismatched x/y lengths.
epochs = list(range(len(train_loss)))

fig , ax = plt.subplots(1,2)
fig.set_size_inches(20,6)

ax[0].plot(epochs , train_loss , label = 'Training Loss')
ax[0].plot(epochs , test_loss , label = 'Validation Loss')
ax[0].set_title('Training & Validation Loss')
ax[0].legend()
ax[0].set_xlabel("Epochs")

ax[1].plot(epochs , train_acc , label = 'Training Accuracy')
ax[1].plot(epochs , test_acc , label = 'Validation Accuracy')
ax[1].set_title('Training & Validation Accuracy')
ax[1].legend()
ax[1].set_xlabel("Epochs")
plt.show()