Pre-Processing the Voice Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

from IPython.display import Audio
import librosa

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn import preprocessing
from sklearn import tree
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier


from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import classification_report

from sklearn.decomposition import PCA
import os

# Directory holding the voice-clip CSV files. Set the VOICE_DATA_DIR
# environment variable to run the notebook against another location; the
# fallback is the path this notebook was originally written with.
# os.path.join(..., '') guarantees a trailing separator, so plain string
# concatenation with a file name keeps working whichever form is supplied.
data_dir = os.path.join(
    os.environ.get(
        'VOICE_DATA_DIR',
        '/Users/christian fink/Math485/Math485_2/voice_clip_selected/',
    ),
    '',
)
voice_clip_meta_data = pd.read_csv(data_dir+"voice_clip_meta_data.csv")
voice_feature = pd.read_csv(data_dir+"voice_feature_20241022.csv")

In [2]:
# MFCC feature columns: 'mfcc_00' through 'mfcc_24', zero-padded to two digits.
col_list_mfcc = [f"mfcc_{idx:02d}" for idx in range(25)]

# The 'f0_*' and 'zcr_*' column groups share the same seven summary-statistic
# suffixes, in the same order, so both lists are built from one pattern.
col_list_f0, col_list_zcr = (
    [f"{prefix}_{stat}" for stat in ("mean", "std", "var", "min", "max", "skew", "kurtosis")]
    for prefix in ("f0", "zcr")
)

In [3]:
# Integer codes for the label columns: age brackets map to 1-9 in list order,
# and the two gender labels kept for modelling map to 0 / 1.
# NOTE(review): 'fourties' presumably mirrors the spelling used in the CSV - verify.
age_name = ['teens', 'twenties','thirties', 'fourties', 'fifties', 'sixties', 'seventies', 'eighties','nineties']
age_value = range(1,10)
age_dict = dict(zip(age_name, age_value))
gender_name = ["male_masculine","female_feminine"]
gender_value = [0,1]
gender_dict = dict(zip(gender_name, gender_value))

# Encode only while the columns still hold raw labels. Re-running this cell
# used to push the already-encoded numbers through the dictionaries again,
# turning every value into NaN and letting dropna() empty the whole frame.
if not pd.api.types.is_numeric_dtype(voice_feature['age']):
    voice_feature['age'] = voice_feature['age'].map(age_dict)
if not pd.api.types.is_numeric_dtype(voice_feature['gender']):
    voice_feature['gender'] = voice_feature['gender'].map(gender_dict)

# Labels absent from the dictionaries became NaN above; those rows are dropped
# together with any row that has a missing value in any other column.
# Rebinding (rather than inplace=True) keeps the step safe to re-run.
voice_feature = voice_feature.dropna()

feature_names = col_list_f0 + col_list_mfcc+col_list_zcr
X = voice_feature[feature_names]
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the held-out rows influence the min/max used for scaling. Left unchanged
# here so downstream numbers stay comparable - consider fitting on the
# training rows only.
scaler = preprocessing.MinMaxScaler().fit(X)
X = scaler.transform(X)
y = voice_feature.gender

# 67/33 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.33, random_state=7)

In [4]:
def view_pred_result(model, X_test, y_test, thresh_min=0,thresh_max=0.5, thresh_inc=0.01):
    """Report test-set performance of a fitted binary classifier.

    Prints the accuracy, the ROC AUC score and a classification report for
    ``model.predict(X_test)``, draws the confusion matrix of those
    predictions, and tabulates the confusion-matrix counts obtained when the
    positive-class probability is cut at each threshold in
    ``np.arange(thresh_min, thresh_max, thresh_inc)``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test : feature matrix passed to the model.
    y_test : true labels; the threshold table assumes a 0/1 encoding because
        thresholded predictions are cast to integers 0 and 1.
    thresh_min, thresh_max, thresh_inc : start (inclusive), stop (exclusive)
        and step of the probability thresholds.

    Returns
    -------
    pandas.DataFrame
        One row per threshold with columns ``Threshold``, ``True Negative``,
        ``False Positive``, ``False Negative``, ``True Positive``, ``FPR`` and
        ``TPR``. An empty threshold range yields an empty frame that still
        carries all of these columns.
    """
    y_pred = model.predict(X_test)
    # Positive-class probabilities do not depend on the threshold, so they are
    # computed once and reused for the ROC score and every threshold below.
    pos_proba = model.predict_proba(X_test)[:, 1]

    acc_pred = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, pos_proba)
    print('Accuracy: ' + str(acc_pred))
    print('ROC Score: ' + str(roc_auc))

    count_columns = ["Threshold", "True Negative", "False Positive",
                     "False Negative", "True Positive"]
    rows = []
    for thresh in np.arange(thresh_min,thresh_max,thresh_inc):
        y_pred_thresh = (pos_proba > thresh).astype(int)
        cfm = confusion_matrix(y_test,y_pred_thresh)
        rows.append([thresh, cfm[0,0], cfm[0,1], cfm[1,0], cfm[1,1]])
    # Build the frame in one go; naming the columns explicitly keeps the rate
    # computations below valid even when no threshold was evaluated.
    cfm_df = pd.DataFrame(rows, columns=count_columns)

    cfm_df['FPR'] = cfm_df['False Positive']/(cfm_df['False Positive']+cfm_df['True Negative'])
    cfm_df['TPR'] = cfm_df['True Positive']/(cfm_df['False Negative']+cfm_df['True Positive'])
    print(metrics.classification_report(y_test,y_pred, digits = 4))
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
    return cfm_df

Sorting Voice Features by Accent Type

In [5]:
# Distinct 'accents' entries that mention "United States" anywhere in the string.
voice_feature.loc[voice_feature['accents'].str.contains("United States"), 'accents'].unique()

array(['United States English', 'United States English,Floridian',
       'West Coast,United States English',
       'Canadian English,United States English',
       'United States English,England English',
       'England English,United States English',
       'Latin America,United States English',
       'United States English,Pittsburgh PA',
       'Singaporean English,United States English',
       'United States English,Pacific North West United States',
       'United States English,Canadian English,international',
       'India and South Asia (India, Pakistan, Sri Lanka),United States English,England English',
       'United States English,Australian English,England English,Irish English',
       'United States English,midwest',
       'United States English,North Indiana',
       'United States English,Midwestern',
       'United States English,British English',
       'United States English,Irish English',
       'United States English,Slight Dutch accent',
       'United Stat