In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier


# from mpl_toolkits.mplot3d import Axes3D
# from sklearn.preprocessing import scale
# import sklearn.metrics as sm
# from sklearn import datasets
# from sklearn.metrics import confusion_matrix

In [2]:
#load data
df = pd.read_csv('voice_gender_dataset/voice.csv')

In [3]:
df.head()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,0.059781,0.064241,0.032027,0.015071,0.090193,0.075122,12.863462,274.402906,0.893369,0.491918,...,0.059781,0.084279,0.015702,0.275862,0.007812,0.007812,0.007812,0.0,0.0,male
1,0.066009,0.06731,0.040229,0.019414,0.092666,0.073252,22.423285,634.613855,0.892193,0.513724,...,0.066009,0.107937,0.015826,0.25,0.009014,0.007812,0.054688,0.046875,0.052632,male
2,0.077316,0.083829,0.036718,0.008701,0.131908,0.123207,30.757155,1024.927705,0.846389,0.478905,...,0.077316,0.098706,0.015656,0.271186,0.00799,0.007812,0.015625,0.007812,0.046512,male
3,0.151228,0.072111,0.158011,0.096582,0.207955,0.111374,1.232831,4.177296,0.963322,0.727232,...,0.151228,0.088965,0.017798,0.25,0.201497,0.007812,0.5625,0.554688,0.247119,male
4,0.13512,0.079146,0.124656,0.07872,0.206045,0.127325,1.101174,4.333713,0.971955,0.783568,...,0.13512,0.106398,0.016931,0.266667,0.712812,0.007812,5.484375,5.476562,0.208274,male


In [4]:
#check null values
df.isnull().sum()

meanfreq    0
sd          0
median      0
Q25         0
Q75         0
IQR         0
skew        0
kurt        0
sp.ent      0
sfm         0
mode        0
centroid    0
meanfun     0
minfun      0
maxfun      0
meandom     0
mindom      0
maxdom      0
dfrange     0
modindx     0
label       0
dtype: int64

In [5]:
def unique(col):
    return df[col].nunique()

d = {}
for i in df.columns:
    value = unique(i)
    d[i] = value
print(d,sep="\n")

{'meanfreq': 3166, 'sd': 3166, 'median': 3077, 'Q25': 3103, 'Q75': 3034, 'IQR': 3073, 'skew': 3166, 'kurt': 3166, 'sp.ent': 3166, 'sfm': 3166, 'mode': 2825, 'centroid': 3166, 'meanfun': 3166, 'minfun': 913, 'maxfun': 123, 'meandom': 2999, 'mindom': 77, 'maxdom': 1054, 'dfrange': 1091, 'modindx': 3079, 'label': 2}


In [6]:
x = df.iloc[:,:-1]
y = df.label
x.head()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,mode,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx
0,0.059781,0.064241,0.032027,0.015071,0.090193,0.075122,12.863462,274.402906,0.893369,0.491918,0.0,0.059781,0.084279,0.015702,0.275862,0.007812,0.007812,0.007812,0.0,0.0
1,0.066009,0.06731,0.040229,0.019414,0.092666,0.073252,22.423285,634.613855,0.892193,0.513724,0.0,0.066009,0.107937,0.015826,0.25,0.009014,0.007812,0.054688,0.046875,0.052632
2,0.077316,0.083829,0.036718,0.008701,0.131908,0.123207,30.757155,1024.927705,0.846389,0.478905,0.0,0.077316,0.098706,0.015656,0.271186,0.00799,0.007812,0.015625,0.007812,0.046512
3,0.151228,0.072111,0.158011,0.096582,0.207955,0.111374,1.232831,4.177296,0.963322,0.727232,0.083878,0.151228,0.088965,0.017798,0.25,0.201497,0.007812,0.5625,0.554688,0.247119
4,0.13512,0.079146,0.124656,0.07872,0.206045,0.127325,1.101174,4.333713,0.971955,0.783568,0.104261,0.13512,0.106398,0.016931,0.266667,0.712812,0.007812,5.484375,5.476562,0.208274


In [7]:
y.head()

0    male
1    male
2    male
3    male
4    male
Name: label, dtype: object

In [8]:
df['label'].value_counts()

male      1584
female    1584
Name: label, dtype: int64

In [9]:
data = ['Female','Male']
d= [1584,1584]

In [10]:
#split dataset into training and test with test size as 20 percent
x_train, x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

## Artificial Neural Networks

In [None]:
ANNmodel = MLPClassifier(random_state = 1, max_iter = 300)
ANNmodel.fit(x_train,y_train)

In [None]:
y_pred1 = ANNmodel.predict(x_test)
y_pred1[:10]

In [None]:
print('score of ANN model is: ', ANNmodel.score(x_test,y_test))

In [None]:
pd.crosstab(y_pred1,y_test,rownames=['matrix'],colnames=['confusion'],margins = True)

In [None]:
print("\t\t\tANN report:\n",classification_report(y_pred1,y_test))
print("ANN Accuracy score: ",accuracy_score(y_pred1,y_test)*100,"%")

## Support Vector Machines

In [None]:
SVMmodel = SVC(kernel = 'linear')
SVMmodel.fit(x_train,y_train)

In [None]:
y_pred2 = SVMmodel.predict(x_test)
y_pred2[:10]

In [None]:
print('score of SVM model is: ', SVMmodel.score(x_test,y_test))

In [None]:
print(pd.crosstab(y_pred2,y_test,rownames=['matrix'],colnames=['confusion'],margins = True))

In [None]:
print("\t\t\tSVM report:\n",classification_report(y_pred2,y_test))
print("SVM Accuracy score: ",accuracy_score(y_pred2,y_test)*100,"%")

## KNN

In [11]:
KNmodel = KNeighborsClassifier(n_neighbors =15, metric = 'minkowski', p = 1)
KNmodel.fit(x_train, y_train)

KNeighborsClassifier(n_neighbors=15, p=1)

In [27]:
KNmodel.keepdims = "True"
y_pred3 = KNmodel.predict(x_test)
print(y_pred3[:10])
print('score of KNN model is: ', KNmodel.score(x_test,y_test))

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


['male' 'female' 'female' 'female' 'male' 'male' 'male' 'male' 'male'
 'male']
score of KNN model is:  0.7634069400630915


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [13]:
pd.crosstab(y_pred3,y_test,rownames=['matrix'],colnames=['confusion'],margins = True)

confusion,female,male,All
matrix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,235,76,311
male,74,249,323
All,309,325,634


In [14]:
print("\t\t\tKNN report:\n",classification_report(y_pred3,y_test))
print("KNN Accuracy score: ",accuracy_score(y_pred3,y_test)*100,"%")

			KNN report:
               precision    recall  f1-score   support

      female       0.76      0.76      0.76       311
        male       0.77      0.77      0.77       323

    accuracy                           0.76       634
   macro avg       0.76      0.76      0.76       634
weighted avg       0.76      0.76      0.76       634

KNN Accuracy score:  76.34069400630915 %


## Decision Trees

In [15]:
DTmodel = DecisionTreeClassifier(random_state = 1)
DTmodel.fit(x_train,y_train)

DecisionTreeClassifier(random_state=1)

In [16]:
y_pred4 = DTmodel.predict(x_test)
y_pred4 = np.array(y_pred4)
print(y_pred4[:10])
print('score of decision tree model is: ', DTmodel.score(x_test,y_test))

['female' 'male' 'female' 'female' 'male' 'female' 'male' 'male' 'male'
 'male']
score of decision tree model is:  0.9716088328075709


In [None]:
pd.crosstab(y_pred4,y_test,rownames=['matrix'],colnames=['confusion'],margins = True)

In [None]:
print("\t\t\tDecision Tree Class report:\n",classification_report(y_pred4,y_test))
print("Decision Tree Accuracy score: ",accuracy_score(y_pred4,y_test)*100,"%")

## Logistic Regression

In [None]:
LRmodel = LogisticRegression(max_iter=10000)
LRmodel.fit(x_train,y_train)

In [None]:
y_pred5 = LRmodel.predict(x_test)
print(y_pred5[:10])

In [None]:
print('score of Log Reg model is: ',LRmodel.score(x_test,y_test))

In [None]:
pd.crosstab(y_pred5,y_test,rownames=['matrix'],colnames=['confusion'],margins = True)

In [None]:
print("\t\t\tLog Reg Class report:\n",classification_report(y_pred5,y_test))
print("Log Reg Accuracy score: ",accuracy_score(y_pred5,y_test)*100,"%")

## All models

In [None]:
# list1 = [y_pred1,y_pred2,y_pred3,y_pred4,y_pred5]
# d =['DecTree','RandomForest','Log Regression','KNN','SuppVecMachine']
# a={} 
# k=0
# list2 = []
# for i in list1:
#     list2.append(accuracy_score(i,y_test)*100)
# for i in d:
#     a[i] = list2[k]
#     k+=1
# print("List of all model accuracies:\n",a)
# print("the most accurate model is:", max(a, key=a.get))