In [None]:
import librosa
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import os
import ast


In [None]:
def list_files(directory, extension):
    """Return the paths of the regular files in ``directory`` ending with ``extension``.

    Args:
        directory: Folder to scan (only its direct entries are inspected).
        extension: Suffix matched with ``str.endswith`` (e.g. ``'mp3'``).

    Returns:
        list[str]: ``os.path.join(directory, entry)`` for every matching file,
        ordered by entry name.
    """
    matches = []
    # os.listdir gives entries in arbitrary order; sort so repeated runs see
    # the files (and therefore the feature rows) in the same order.
    for entry in sorted(os.listdir(directory)):
        path = os.path.join(directory, entry)
        # A sub-directory whose name carries the suffix is not an audio file.
        if entry.endswith(extension) and os.path.isfile(path):
            matches.append(path)
    return matches

In [None]:
# Collect the mp3 recordings found in each speaker's "parsed" folder.
lina = list_files(
    directory="/Users/tom/Documents/IA/Voice Data/Lina/Combined/parsed",
    extension='mp3',
)
martin = list_files(
    directory="/Users/tom/Documents/IA/Voice Data/Martin/Combined/parsed",
    extension='mp3',
)
ilyes = list_files(
    directory="/Users/tom/Documents/IA/Voice Data/Ilyes/Combined/parsed",
    extension='mp3',
)

In [None]:
def extract_features(file):
    """Summarise one audio file as a single row of averaged MFCC values.

    Args:
        file: Path of the audio file, passed straight to ``librosa.load``.

    Returns:
        numpy.ndarray: A 2-D array with one row, holding the mean of each of
        the 13 requested MFCC coefficients.
    """
    signal, sample_rate = librosa.load(file)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13)
    # Average along axis 1 (presumably the frame/time axis -- confirm against
    # the librosa docs), then reshape so the result is one row per file.
    return np.mean(coefficients, axis=1).reshape(1, -1)

In [None]:
def extract_features_from_files(files, name):
    """Build a labelled feature table for one speaker.

    Args:
        files: Iterable of audio file paths; each is passed to
            ``extract_features``, which yields one feature row per file.
        name: Label written to the ``'name'`` column of every row.

    Returns:
        pandas.DataFrame: One row per file, integer-named feature columns
        followed by a ``'name'`` column.

    Raises:
        ValueError: If ``files`` is empty. ``np.concatenate`` would raise the
            same exception type, but with a message that does not say which
            speaker had no recordings.
    """
    files = list(files)
    if not files:
        raise ValueError(
            "No audio files to extract features from for {!r}".format(name)
        )

    rows = [extract_features(path) for path in files]
    # Each element is a (1, n_features) row; stack them into one 2-D array.
    table = pd.DataFrame(np.concatenate(rows))
    table['name'] = name
    return table

In [None]:
# Feature table for Lina's training recordings (one row per file).
lina_df = extract_features_from_files(files=lina, name="Lina")

In [None]:
# Feature table for Martin's training recordings (one row per file).
martin_df = extract_features_from_files(files=martin, name="Martin")

In [None]:
# Feature table for Ilyes's training recordings (one row per file).
ilyes_df = extract_features_from_files(files=ilyes, name="Ilyes")

In [None]:
# Stack the three per-speaker tables into one training frame.
# Each input frame carries its own 0..n-1 row labels, so renumber the rows
# (ignore_index=True) instead of keeping repeated labels in the result.
df = pd.concat([lina_df, martin_df, ilyes_df], ignore_index=True)

In [None]:
# Split the frame into the speaker label (y) and the feature columns (X).
y = df['name']
X = df.drop(columns='name')

# Standardise the features. The fitted `scaler` is kept at module level
# because convert_to_mp3_and_make_test_df reuses it for the held-out files.
# NOTE(review): the scaler is fitted on every row before the train/test split
# further down, so the test rows contribute to the scaling statistics.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [None]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split (and therefore the reported score) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# 100-tree random forest. The forest is built with randomness, so pin
# random_state to get the same model on every run.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Train the forest on the training portion of the split.
clf.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted classifier on the held-out portion of the split.
clf.score(X=X_test, y=y_test)

In [None]:
# Predicted speaker label for every held-out row.
clf.predict(X=X_test)

In [370]:
import pydub

def convert_to_mp3_and_make_test_df(directory):
    """Convert a speaker's .ogg recordings to mp3 and return their scaled features.

    Every entry of ``directory`` whose name ends with ``'ogg'`` is decoded with
    pydub and exported next to the original with the trailing ``'ogg'``
    replaced by ``'mp3'``. Features are then extracted from all mp3 files in
    the folder and transformed with the module-level ``scaler`` (which must
    already be fitted).

    Args:
        directory: Folder holding one speaker's recordings. Its last path
            component is used as the speaker label; a trailing slash is
            optional.

    Returns:
        pandas.DataFrame: Scaled feature columns plus a ``'name'`` column
        holding the speaker label.
    """
    for file in os.listdir(directory):
        if file.endswith('ogg'):
            # Join with os.path so the call works with or without a trailing slash.
            source_path = os.path.join(directory, file)
            # Swap only the trailing 'ogg'; str.replace would also rewrite an
            # 'ogg' inside the stem (e.g. 'doggy.ogg' -> 'dmp3y.mp3').
            target_path = os.path.join(directory, file[:-len('ogg')] + 'mp3')
            audio = pydub.AudioSegment.from_file(source_path, format='ogg')
            audio.export(target_path, format='mp3')

    # normpath strips a trailing slash so basename is always the folder name.
    speaker = os.path.basename(os.path.normpath(directory))

    files = list_files(directory, 'mp3')
    test_df = extract_features_from_files(files, speaker)
    X = scaler.transform(test_df.drop('name', axis=1))
    df = pd.DataFrame(X)
    # Assign by position so the labels do not depend on index alignment.
    df['name'] = test_df['name'].to_numpy()
    return df

In [371]:
# Build the scaled feature frames for the separate "Test" recordings.
# NOTE: these assignments reuse the names of the training frames defined earlier.
martin_df = convert_to_mp3_and_make_test_df(
    directory="/Users/tom/Documents/IA/Voice Data/Test/Martin/"
)
lina_df = convert_to_mp3_and_make_test_df(
    directory="/Users/tom/Documents/IA/Voice Data/Test/Lina/"
)
ilyes_df = convert_to_mp3_and_make_test_df(
    directory="/Users/tom/Documents/IA/Voice Data/Test/Ilyes/"
)

In [399]:
# Stack the per-speaker test frames. ignore_index=True renumbers the rows
# 0..n-1, so the true labels line up with the positional predictions without
# resetting the index of a column in place afterwards.
test = pd.concat([martin_df, lina_df, ilyes_df], ignore_index=True)

# clf.predict returns a plain array; wrap it so it carries a 0..n-1 index too.
y_pred = pd.Series(clf.predict(test.drop('name', axis=1)))
y_true = test['name']

In [400]:
from sklearn.metrics import classification_report
# Per-speaker precision / recall / F1 on the separate test recordings.
print(classification_report(y_true=y_true, y_pred=y_pred))

              precision    recall  f1-score   support

       Ilyes       1.00      1.00      1.00         2
        Lina       1.00      1.00      1.00         2
      Martin       1.00      1.00      1.00         1

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5


In [403]:
# One probability column per class, named "<class>_proba".
# Build the column names element by element: `clf.classes_ + '_proba'` only
# works when classes_ happens to be an array whose dtype supports `+` with a
# str, which depends on how the labels were stored and on the numpy version.
df_proba = pd.DataFrame(
    clf.predict_proba(test.drop('name', axis=1)),
    columns=[str(label) + '_proba' for label in clf.classes_],
)

# Attach predicted and true labels by position so the rows stay matched
# regardless of the index each Series carries.
df_proba['predicted'] = y_pred.to_numpy()
df_proba['true'] = y_true.to_numpy()


In [404]:
# Display the per-class probabilities next to the predicted and true labels.
df_proba

Unnamed: 0,Ilyes_proba,Lina_proba,Martin_proba,predicted,true
0,0.49,0.01,0.5,Martin,Martin
1,0.2,0.78,0.02,Lina,Lina
2,0.29,0.61,0.1,Lina,Lina
3,0.99,0.01,0.0,Ilyes,Ilyes
4,1.0,0.0,0.0,Ilyes,Ilyes
