## Remove BGM

In [None]:
import os
from spleeter.separator import Separator

# Strip background music: run Spleeter on every .mp3 in `input_directory` and
# keep only the isolated vocal track as `<name>_vocals.wav` in `output_directory`.

# Path to mp3 files
input_directory = "datasets/mp3_withBGM"
output_directory = "datasets/wav_files"

# Initialize the Spleeter separator with the 2stems model (vocals + accompaniment)
# This will automatically download a pretrained model
separator = Separator('spleeter:2stems')

for filename in os.listdir(input_directory):
    if filename.endswith(".mp3"):
        file_path = os.path.join(input_directory, filename)

        # Strip only the trailing extension. str.replace(".mp3", ...) would also
        # rewrite any ".mp3" that happens to appear in the middle of the name.
        stem = os.path.splitext(filename)[0]
        output_path = os.path.join(output_directory, stem + "_vocals.wav")

        # Process the file to separate the vocals and accompaniment
        print(f"Processing {filename}...")
        separator.separate_to_file(file_path, output_directory)

        # After Spleeter processes the file, it creates a folder named after the
        # input file that contains the 'vocals' and 'accompaniment' audio files.
        track_directory = os.path.join(output_directory, stem)
        vocals_file_path = os.path.join(track_directory, "vocals.wav")

        # Move the extracted vocals file to the desired location.
        # os.replace overwrites an existing target on every platform, so the
        # cell can be re-run (os.rename fails on Windows if the target exists).
        os.replace(vocals_file_path, output_path)

        # Remove the accompaniment track; only the vocals are needed.
        accompaniment_file_path = os.path.join(track_directory, "accompaniment.wav")
        if os.path.exists(accompaniment_file_path):
            os.remove(accompaniment_file_path)

        # Drop the per-track folder once nothing is left in it, so the output
        # directory ends up containing only the *_vocals.wav files.
        if os.path.isdir(track_directory) and not os.listdir(track_directory):
            os.rmdir(track_directory)

        print(f"Saved vocals to {output_path}")

print("Processing complete!")


## Capture features

In [None]:
import librosa
import librosa.display
import numpy as np
import os
import pandas as pd
import csv
import sys
import joblib
import pickle

# Extract one fixed-length acoustic feature vector per .wav file and pickle the
# resulting list of [filename, feature_vector, label] entries.
# Reference: https://github.com/Renovamen/Speech-Emotion-Recognition/blob/master/extract_feats/librosa.py

# Directory containing .wav files
audio_dir = 'datasets/wav_files'
# audio_dir = 'datasets/wav_files_EI_smaller'
# audio_dir = 'datasets/wav_files_EI_inbalanced'

# Create a list to hold features for each file
features_list = []

# Loop through each file in the directory
for filename in os.listdir(audio_dir):
    if filename.endswith('.wav'):
        # The label is taken from the file name, which is split on '_'.
        label =  filename.split('_')[1] # [1] => E/I, [2] => S/N, [3] => T/F, [4] => P/J
        file_path = os.path.join(audio_dir, filename)
        
        # Load the audio file at its native sampling rate (sr=None)
        y, sr = librosa.load(file_path, sr=None)
        stft = np.abs(librosa.stft(y))
        
        # Pitch tracking.
        # fmin and fmax correspond to the minimum and maximum fundamental
        # frequency of human speech.
        pitches, magnitudes = librosa.piptrack(y=y, sr=sr, S=stft, fmin=70, fmax=400)
        # For every time frame keep the pitch of the strongest frequency bin
        # *of that frame*. (The previous loop indexed column 1 on every
        # iteration, so the bin chosen for frame 1 was reused for all frames.)
        strongest_bins = magnitudes.argmax(axis=0)
        pitch = pitches[strongest_bins, np.arange(magnitudes.shape[1])]

        # pitch_tuning_offset = librosa.pitch_tuning(pitches)
        pitchmean = np.mean(pitch)
        pitchstd = np.std(pitch)
        pitchmax = np.max(pitch)

        # Spectral centroid, normalised so that the values sum to 1
        cent = librosa.feature.spectral_centroid(y=y, sr=sr)
        cent = cent / np.sum(cent)
        meancent = np.mean(cent)
        stdcent = np.std(cent)
        maxcent = np.max(cent)

        # # Spectral flatness
        # flat = librosa.feature.spectral_flatness(y=y)
        # meanflat = np.mean(flat)
        # stdflat = np.std(flat)

        # Spectral (octave-band) contrast, summarised per band over time
        cont = librosa.feature.spectral_contrast(S=stft, sr=sr)
        meancont = np.mean(cont.T, axis = 0)
        stdcont = np.std(cont.T, axis = 0)
        
        # Spectral roll-off
        rolloff = librosa.feature.spectral_rolloff(S=stft, sr=sr)  #, roll_percent=0.85
        meanrolloff = np.mean(rolloff)
        stdrolloff = np.std(rolloff)

        # Chromagram, summarised per chroma bin over time
        chroma = librosa.feature.chroma_stft(S=stft, sr=sr)
        meanchroma = np.mean(chroma.T, axis=0)
        stdchroma = np.std(chroma.T, axis=0)

        # Mel spectrogram, summarised per mel band over time
        mel = librosa.feature.melspectrogram(y=y, sr=sr)
        meanmel = np.mean(mel.T, axis=0)
        stdmel = np.std(mel.T, axis=0)

        # Zero-crossing rate
        zerocr = librosa.feature.zero_crossing_rate(y=y) 
        meanzerocr = np.mean(zerocr)
        stdzerocr = np.std(zerocr)
     
        # Magnitude statistics of the spectrogram
        S, phase = librosa.magphase(stft)
        meanMagnitude = np.mean(S)
        stdMagnitude = np.std(S)
        maxMagnitude = np.max(S)

        # Root-mean-square energy
        rmse = librosa.feature.rms(S=S)[0]
        meanrms = np.mean(rmse)
        stdrms = np.std(rmse)
        maxrms = np.max(rmse)

        # MFCC features with 50 coefficients. The matrix is computed once and
        # reused for the mean / std / max summaries instead of being
        # recomputed for each of them.
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=50)
        mfccs = np.mean(mfcc.T, axis=0)
        mfccsstd = np.std(mfcc.T, axis=0)
        mfccmax = np.max(mfcc.T, axis=0)
        
        # tempo, beats = librosa.beat.beat_track(y=y, sr=sr)

        # Scalar features first ...
        ext_features = np.array([
            pitchmean, pitchstd, pitchmax,
            meancent, stdcent, maxcent, meanrolloff, stdrolloff, #, meanflat, stdflat
            meanzerocr, stdzerocr, meanMagnitude, stdMagnitude, maxMagnitude,
            meanrms, stdrms, maxrms
        ])
        
        # ... followed by the per-band / per-coefficient summary vectors.
        ext_features = np.concatenate((ext_features, meancont, stdcont, meanchroma, stdchroma, meanmel, stdmel, mfccs, mfccsstd, mfccmax))

        # Append features for this file to the list
        features_list.append([filename, ext_features, label])

# # Print the extracted features for the first file
# print(features_list[0])

# Save features to a pickle file
feature_path = 'datasets/features/features_EI.p'
# feature_path = 'datasets/features/features_SN.p'
# feature_path = 'datasets/features/features_TF.p'
# feature_path = 'datasets/features/features_JP.p'
# feature_path = 'datasets/features/features_EI_smaller.p'
# feature_path = 'datasets/features/features_EI_inbalanced.p'
# Use a context manager so the file handle is flushed and closed deterministically.
with open(feature_path, 'wb') as feature_file:
    pickle.dump(features_list, feature_file)

## Load data

In [None]:
import numpy as np
import os
import pandas as pd
import csv
import sys
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib

# Load the pickled features, split them into train / validation / test
# partitions (60 / 20 / 20) and standardise them.
feature_path = 'datasets/features/features_EI.p'
# feature_path = 'datasets/features/features_SN.p'
# feature_path = 'datasets/features/features_TF.p'
# feature_path = 'datasets/features/features_JP.p'
# feature_path = 'datasets/features/features_EI_smaller.p'
# feature_path = 'datasets/features/features_EI_inbalanced.p'

# NOTE(review): loading a pickle executes arbitrary code embedded in the file;
# only load feature files produced by this notebook.
features = pd.DataFrame(
    data = joblib.load(feature_path),
    columns = ['file_name', 'features', 'label']
)
X_raw = np.array(list(features['features']))
y = np.array(list(features['label']))

# Split BEFORE fitting the scaler so that no statistics of the held-out data
# leak into the preprocessing. With a fixed random_state the partition depends
# only on the number of samples, so the split membership is the same as when
# splitting the already-scaled matrix.
X_train_valid_raw, X_test_raw, y_train_valid, y_test = train_test_split(X_raw, y, test_size=0.2, random_state=42)
X_train_raw, X_valid_raw, y_train, y_valid = train_test_split(X_train_valid_raw, y_train_valid, test_size=0.25, random_state=42)

# Standardization: fit on the training partition only, then apply everywhere.
scaler_path = 'models/SCALER_LIBROSA.m'
scaler = StandardScaler().fit(X_train_raw)
joblib.dump(scaler, scaler_path)

# `X` stays available as the full standardised matrix for later cells.
X = scaler.transform(X_raw)
X_train_valid = scaler.transform(X_train_valid_raw)
X_test = scaler.transform(X_test_raw)
X_train = scaler.transform(X_train_raw)
X_valid = scaler.transform(X_valid_raw)

## SMOTE (imbalanced dataset)

In [None]:
from imblearn.over_sampling import SMOTE

# Balance the classes with SMOTE oversampling (fixed seed for reproducibility).
smote = SMOTE(random_state=42)

# The combined train+validation portion is resampled in place; the test set is
# left untouched.
# NOTE(review): because the train/validation split below happens after
# resampling, X_valid also contains resampled data - confirm this is intended.
X_train_valid, y_train_valid = smote.fit_resample(
    X_train_valid,
    y_train_valid,
)

# Re-derive the train / validation partitions from the resampled data.
(
    X_train,
    X_valid,
    y_train,
    y_valid,
) = train_test_split(
    X_train_valid,
    y_train_valid,
    test_size=0.25,
    random_state=42,
)

## PCA

In [None]:
import numpy as np
from sklearn.decomposition import PCA

# Reduce the feature dimensionality with PCA and rebuild the
# train / validation / test partitions in the reduced space.

# Check the original dimension = 460
print("Dimension of ext_features:", X.shape)

# Split first so that the projection is learned from the training partition
# only; fitting PCA on all samples would leak information from the test set.
X_train_valid_full, X_test_full, y_train_valid, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_train_valid_full, y_train_valid, test_size=0.25, random_state=42)

# Apply PCA
# n_components may not exceed the number of training samples or features, so it
# is capped for small datasets. random_state makes the result reproducible if
# scikit-learn selects a randomized SVD solver.
n_components = min(60, *X_train_full.shape)
pca = PCA(n_components = n_components, random_state=42).fit(X_train_full)
X_pca = pca.transform(X)
print("Dimension of ext_features:", X_pca.shape)

# # Explained variance ratio to understand how much information each principal component captures
# print("Explained Variance Ratio by each component:", pca.explained_variance_ratio_)

X_train_valid = pca.transform(X_train_valid_full)
X_test = pca.transform(X_test_full)
X_train = pca.transform(X_train_full)
X_valid = pca.transform(X_valid_full)

## SVM

In [None]:
import os
from sklearn import svm
import joblib

# Train a linear-kernel SVM on the training partition (fit() returns the
# estimator itself) and persist it to disk.
svm_model = svm.SVC(
    kernel='linear',
    C=1,
    random_state=42,
    probability=True,
).fit(X_train, y_train)
joblib.dump(svm_model, 'models/svm_model.pkl')

# Predict the held-out test set and show the true labels next to the predictions.
y_test_pred = svm_model.predict(X_test)
print(y_test)
print(y_test_pred)

# Mean accuracy on the test set.
test_accuracy = svm_model.score(X_test, y_test)
print("test accuracy", test_accuracy)

In [None]:
import os
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, cross_validate

# Evaluate the SVM: confusion matrix and classification report on the test
# set, then 5-fold cross-validation on the train+validation data.

# Confusion Matrix
# Rows are the true labels and columns the predicted labels, both in sorted
# label order (E / N / F / J first, e.g. for E vs. I):
#   True E   E predicted as I
#   I predicted as E   True I
cm = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:")
print(cm)

# Classification Report (Precision, Recall, F1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

# cross-validation
# A single cross_validate call scores all four metrics on the same fitted
# folds; calling cross_val_score once per metric refits the model for every
# metric (20 fits instead of 5) and yields the same numbers.
cv_results = cross_validate(
    svm_model, X_train_valid, y_train_valid, cv=5,
    scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
)
scores = cv_results['test_accuracy']
precision_scores = cv_results['test_precision_macro']
recall_scores = cv_results['test_recall_macro']
f1_scores = cv_results['test_f1_macro']

# Print the per-fold values followed by their mean and standard deviation.
for metric_name, metric_scores in (
    ("accuracy", scores),
    ("precision", precision_scores),
    ("recall", recall_scores),
    ("f1 score", f1_scores),
):
    print(metric_scores)
    print("%0.2f %s with a standard deviation of %0.2f" % (metric_scores.mean(), metric_name, metric_scores.std()))

## MLP

In [None]:
import os
import pickle
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Train a multi-layer perceptron with a single hidden layer of 100 units on the
# training partition (fit() returns the estimator itself) and persist it.
mlp_model = MLPClassifier(
    hidden_layer_sizes=(100, ),
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)
joblib.dump(mlp_model, 'models/mlp_model.pkl')

# Predict the held-out test set and show the true labels next to the predictions.
y_test_pred = mlp_model.predict(X_test)
print(y_test)
print(y_test_pred)

# Fraction of correctly classified test samples.
test_accuracy = accuracy_score(y_test, y_test_pred)
print("test accuracy", test_accuracy)

In [None]:
import os
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, cross_validate

# Evaluate the MLP: confusion matrix and classification report on the test
# set, then 5-fold cross-validation on the train+validation data.

# Confusion Matrix
# Rows are the true labels and columns the predicted labels, both in sorted
# label order (E / N / F / J first, e.g. for E vs. I):
#   True E   E predicted as I
#   I predicted as E   True I
cm = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:")
print(cm)

# Classification Report (Precision, Recall, F1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

# cross-validation
# A single cross_validate call scores all four metrics on the same fitted
# folds; calling cross_val_score once per metric refits the model for every
# metric (20 fits instead of 5) and yields the same numbers.
cv_results = cross_validate(
    mlp_model, X_train_valid, y_train_valid, cv=5,
    scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
)
scores = cv_results['test_accuracy']
precision_scores = cv_results['test_precision_macro']
recall_scores = cv_results['test_recall_macro']
f1_scores = cv_results['test_f1_macro']

# Print the per-fold values followed by their mean and standard deviation.
for metric_name, metric_scores in (
    ("accuracy", scores),
    ("precision", precision_scores),
    ("recall", recall_scores),
    ("f1 score", f1_scores),
):
    print(metric_scores)
    print("%0.2f %s with a standard deviation of %0.2f" % (metric_scores.mean(), metric_name, metric_scores.std()))

## K-means

In [None]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import silhouette_score
from sklearn.metrics import fowlkes_mallows_score

# Cluster the train+validation data with K-Means (fit() returns the estimator
# itself), persist the model and compare the test-set cluster assignments with
# the true labels.
# n_cluster = 3 for E / I  &  T / F  &  J / P
# n_cluster = 2 for S / N
kmeans_model = KMeans(
    n_clusters=3,
    init='k-means++',
    random_state=42,
).fit(X_train_valid)
joblib.dump(kmeans_model, 'models/kmeans_model.pkl')

# Cluster index assigned to every test sample, shown after the true labels.
y_test_pred = kmeans_model.predict(X_test)
print(y_test)
print(y_test_pred)

# Hand-picked label -> cluster-index mapping; keep exactly one line uncommented.
custom_mapping = {'E': 1, 'I': 0}
# custom_mapping = {'S': 1, 'N': 0}
# custom_mapping = {'T': 0, 'F': 1}
# custom_mapping = {'J': 1, 'P': 2}

# custom_mapping = {'E': 0, 'I': 1} # smaller
# custom_mapping = {'E': 0, 'I': 2} # imbalance without SMOTE

# Translate the true string labels into cluster indices via the mapping.
y_test_custom = [custom_mapping[true_label] for true_label in y_test]

# Accuracy of the cluster indices against the mapped labels.
test_accuracy = accuracy_score(y_test_custom, y_test_pred)
print("test accuracy", test_accuracy)

# Clustering agreement scores between mapped labels and cluster assignments.
ari = adjusted_rand_score(y_test_custom, y_test_pred)
print(f"Adjusted Rand Index: {ari}")

homogeneity = homogeneity_score(y_test_custom, y_test_pred)
completeness = completeness_score(y_test_custom, y_test_pred)
v_measure = v_measure_score(y_test_custom, y_test_pred)
for score_line in (
    f"Homogeneity: {homogeneity}",
    f"Completeness: {completeness}",
    f"V-Measure: {v_measure}",
):
    print(score_line)

# Confusion matrix of mapped true labels (rows) vs. cluster indices (columns):
# True I    Fake E / S
# Fake I    True E / S
cm = confusion_matrix(y_test_custom, y_test_pred)
print("Confusion Matrix:") 
print(cm)

# silhouette = silhouette_score(X_test, y_test_pred)
# print(f"Silhouette Score: {silhouette}")

# fmi = fowlkes_mallows_score(y_test_custom, y_test_pred)
# print(f"Fowlkes-Mallows Index: {fmi}")