# Gender Recognition using Voice Data
Objective: >90% accuracy

Method: Support Vector Machines

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm_notebook as tqdm
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score

## Viewing as Pandas `DataFrame`

In [2]:
voice_df = pd.read_csv("voice.csv")
voice_df

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,0.059781,0.064241,0.032027,0.015071,0.090193,0.075122,12.863462,274.402906,0.893369,0.491918,...,0.059781,0.084279,0.015702,0.275862,0.007812,0.007812,0.007812,0.000000,0.000000,male
1,0.066009,0.067310,0.040229,0.019414,0.092666,0.073252,22.423285,634.613855,0.892193,0.513724,...,0.066009,0.107937,0.015826,0.250000,0.009014,0.007812,0.054688,0.046875,0.052632,male
2,0.077316,0.083829,0.036718,0.008701,0.131908,0.123207,30.757155,1024.927705,0.846389,0.478905,...,0.077316,0.098706,0.015656,0.271186,0.007990,0.007812,0.015625,0.007812,0.046512,male
3,0.151228,0.072111,0.158011,0.096582,0.207955,0.111374,1.232831,4.177296,0.963322,0.727232,...,0.151228,0.088965,0.017798,0.250000,0.201497,0.007812,0.562500,0.554688,0.247119,male
4,0.135120,0.079146,0.124656,0.078720,0.206045,0.127325,1.101174,4.333713,0.971955,0.783568,...,0.135120,0.106398,0.016931,0.266667,0.712812,0.007812,5.484375,5.476562,0.208274,male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3163,0.131884,0.084734,0.153707,0.049285,0.201144,0.151859,1.762129,6.630383,0.962934,0.763182,...,0.131884,0.182790,0.083770,0.262295,0.832899,0.007812,4.210938,4.203125,0.161929,female
3164,0.116221,0.089221,0.076758,0.042718,0.204911,0.162193,0.693730,2.503954,0.960716,0.709570,...,0.116221,0.188980,0.034409,0.275862,0.909856,0.039062,3.679688,3.640625,0.277897,female
3165,0.142056,0.095798,0.183731,0.033424,0.224360,0.190936,1.876502,6.604509,0.946854,0.654196,...,0.142056,0.209918,0.039506,0.275862,0.494271,0.007812,2.937500,2.929688,0.194759,female
3166,0.143659,0.090628,0.184976,0.043508,0.219943,0.176435,1.591065,5.388298,0.950436,0.675470,...,0.143659,0.172375,0.034483,0.250000,0.791360,0.007812,3.593750,3.585938,0.311002,female


In [3]:
label_enc = LabelEncoder()
voice_df["label"] = label_enc.fit_transform(voice_df.label)
voice_df.head()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,0.059781,0.064241,0.032027,0.015071,0.090193,0.075122,12.863462,274.402906,0.893369,0.491918,...,0.059781,0.084279,0.015702,0.275862,0.007812,0.007812,0.007812,0.0,0.0,1
1,0.066009,0.06731,0.040229,0.019414,0.092666,0.073252,22.423285,634.613855,0.892193,0.513724,...,0.066009,0.107937,0.015826,0.25,0.009014,0.007812,0.054688,0.046875,0.052632,1
2,0.077316,0.083829,0.036718,0.008701,0.131908,0.123207,30.757155,1024.927705,0.846389,0.478905,...,0.077316,0.098706,0.015656,0.271186,0.00799,0.007812,0.015625,0.007812,0.046512,1
3,0.151228,0.072111,0.158011,0.096582,0.207955,0.111374,1.232831,4.177296,0.963322,0.727232,...,0.151228,0.088965,0.017798,0.25,0.201497,0.007812,0.5625,0.554688,0.247119,1
4,0.13512,0.079146,0.124656,0.07872,0.206045,0.127325,1.101174,4.333713,0.971955,0.783568,...,0.13512,0.106398,0.016931,0.266667,0.712812,0.007812,5.484375,5.476562,0.208274,1


In [4]:
voice_df.info() #Checking for Null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 21 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   meanfreq  3168 non-null   float64
 1   sd        3168 non-null   float64
 2   median    3168 non-null   float64
 3   Q25       3168 non-null   float64
 4   Q75       3168 non-null   float64
 5   IQR       3168 non-null   float64
 6   skew      3168 non-null   float64
 7   kurt      3168 non-null   float64
 8   sp.ent    3168 non-null   float64
 9   sfm       3168 non-null   float64
 10  mode      3168 non-null   float64
 11  centroid  3168 non-null   float64
 12  meanfun   3168 non-null   float64
 13  minfun    3168 non-null   float64
 14  maxfun    3168 non-null   float64
 15  meandom   3168 non-null   float64
 16  mindom    3168 non-null   float64
 17  maxdom    3168 non-null   float64
 18  dfrange   3168 non-null   float64
 19  modindx   3168 non-null   float64
 20  label     3168 non-null   int6

## Checking correlation values

In [5]:
voice_df.corr().label.sort_values(ascending=False)

label       1.000000
IQR         0.618916
sp.ent      0.490552
sd          0.479539
sfm         0.357499
kurt        0.087195
Q75         0.066906
skew        0.036627
modindx     0.030801
minfun     -0.136692
maxfun     -0.166461
mode       -0.171775
meandom    -0.191067
dfrange    -0.192213
mindom     -0.194974
maxdom     -0.195657
median     -0.283919
centroid   -0.337415
meanfreq   -0.337415
Q25        -0.511455
meanfun    -0.833921
Name: label, dtype: float64

The features: `IQR`, `sd`, `sfm`, `sp.ent`, `Q75`, `maxfun`, `median`, `centroid`, `meanfreq`, `mondindx` are significantly correlated to `label` (with correlation values > 0.10)

## Scaling all column values

In [6]:
X, y = voice_df.iloc[:, :-1], voice_df.iloc[:, -1]

In [7]:
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [8]:
X

array([[-4.04924806,  0.4273553 , -4.22490077, ..., -1.43142165,
        -1.41913712, -1.45477229],
       [-3.84105325,  0.6116695 , -3.99929342, ..., -1.41810716,
        -1.4058184 , -1.01410294],
       [-3.46306647,  1.60384791, -4.09585052, ..., -1.42920257,
        -1.41691733, -1.06534356],
       ...,
       [-1.29877326,  2.32272355, -0.05197279, ..., -0.5992661 ,
        -0.58671739,  0.17588664],
       [-1.2452018 ,  2.012196  , -0.01772849, ..., -0.41286326,
        -0.40025537,  1.14916112],
       [-0.51474626,  2.14765111, -0.07087873, ..., -1.27608595,
        -1.2637521 ,  1.47567886]])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Fitting our SVM
Base model: Linear Kernel with default hyperparameters

In [10]:
svc = SVC(kernel="linear")
svc.fit(X_train, y_train)
y_preds = svc.predict(X_test)
acc_base = accuracy_score(y_test, y_preds)
print(f'The base model accuracy score is {acc_base * 100:.2f}%.')

The base model accuracy score is 97.63%.


Our new objective can be to reach an accuracy of near 100%.

### Tweaking hyperparameter `kernel`

In [20]:
kernels = ["linear", "rbf", "poly", "sigmoid"]
for kernel in kernels:
  svc = SVC(kernel=kernel)
  svc.fit(X_train, y_train)
  y_preds = svc.predict(X_test)
  print(f'The accuracy for {kernel} kernel has an accuracy of {accuracy_score(y_test, y_preds)*100:.4f}%.')

The accuracy for linear kernel has an accuracy of 97.6341%.
The accuracy for rbf kernel has an accuracy of 98.2650%.
The accuracy for poly kernel has an accuracy of 96.8454%.
The accuracy for sigmoid kernel has an accuracy of 82.9653%.


It appears that the `linear` and `rbf` kernels yield the highest accuracies

### Performing K-Fold Cross Validation on Linear and RBF Kernels
Checking if the model accuracy has a large variance

In [12]:
lin_svc = SVC(kernel="linear")
rbf_svc = SVC(kernel="rbf")
lin_acc = cross_val_score(lin_svc, X_train, y_train, cv=8)
rbf_acc = cross_val_score(rbf_svc, X_train, y_train, cv=8)
lin_acc, rbf_acc

(array([0.96214511, 0.97791798, 0.97476341, 0.98107256, 0.97791798,
        0.97476341, 0.9778481 , 0.9778481 ]),
 array([0.97791798, 0.98422713, 0.97160883, 0.99053628, 0.98422713,
        0.96845426, 0.98101266, 0.99050633]))

In [13]:
lin_acc.mean(), rbf_acc.mean()

(0.975534580521503, 0.9810613245218224)

### Obtaining best hyperparameters with GridSearchCV

In [14]:
svc = SVC()
hyperparams = [{"C": [1, 10, 50, 100, 500, 1000], "kernel": ["linear"]},
               {"C": [1, 10, 50, 100, 500, 1000], "gamma": [i/10 for i in range(1, 10)], "kernel": ["rbf"]}]

grid_search = GridSearchCV(svc, param_grid=hyperparams, scoring="accuracy", cv=3)
grid_search.fit(X_train, y_train)

In [15]:
best_score = grid_search.best_score_*100
best_params = grid_search.best_params_

In [16]:
best_score

97.94787664638193

In [17]:
best_params

{'C': 1, 'gamma': 0.2, 'kernel': 'rbf'}

### Best estimator hyperparameters on test data

In [22]:
best_acc = grid_search.score(X_test, y_test)
print(f'The accuracy score is {best_acc*100:.4f}%.')

The accuracy score is 98.2650%.


### Experimenting further
Finding best C (likely lies between 1 to 10) and best gamma

In [24]:
svc = SVC()
params = [{"C": [1, 5, 7, 10], "gamma": [i/20 for i in range(1, 10)], "kernel": ["rbf"]}]
gs = GridSearchCV(svc, param_grid=params, cv=5, scoring="accuracy")
gs.fit(X_train, y_train)
best_score = gs.best_score_
best_params = gs.best_params_
best_acc = gs.score(X_test, y_test)
print(f'The best score is {best_score * 100:.4f}%.')
print(f'The best params are: {best_params}.')
print(f'The best test accuracy is {best_acc *100:.4f}%.')

The best score is 98.2245%.
The best params are: {'C': 1, 'gamma': 0.05, 'kernel': 'rbf'}.
The best test accuracy is 98.2650%.


In [26]:
svc = SVC()
params = [{"C": [i for i in range(1, 8)], "gamma": [i/50 for i in range(1, 20)], "kernel": ["rbf"]}]
gs = GridSearchCV(svc, param_grid=params, cv=5, scoring="accuracy")
gs.fit(X_train, y_train)
best_score = gs.best_score_
best_params = gs.best_params_
best_acc = gs.score(X_test, y_test)
print(f'The best score is {best_score * 100:.4f}%.')
print(f'The best params are: {best_params}.')
print(f'The best test accuracy is {best_acc *100:.4f}%.')

The best score is 98.2639%.
The best params are: {'C': 5, 'gamma': 0.12, 'kernel': 'rbf'}.
The best test accuracy is 98.4227%.


# End of experimenting
We ended with a test accuracy of 98.4227%, improving from default hyperparameter accuracy of 98.2650%!