# 华中科技大学机器学习课程项目
## 语音性别识别
### CS1805 裴涵
#### 所有绘图见同级目录下image文件夹，具体以报告中插入图片为准
#### 本文件为主分析代码，其他对比代码见同目录下.py文件


## 一.导入库文件

In [99]:
import scipy
import pandas as pd
import numpy as np
import pylab as pl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
import sklearn.metrics as metrics


# Suppress all warnings so library deprecation noise does not clutter the output.
# Only the 'ignore' filter is installed: a preceding 'always' filter would be
# shadowed by it and never take effect.
import warnings
warnings.filterwarnings('ignore')

# 绘图缺失的数据
import missingno as msno

# 分类模型
from sklearn.preprocessing import binarize
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from yellowbrick.features import ParallelCoordinates
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import DiscriminationThreshold
from yellowbrick.features import FeatureImportances


# 绘图弹出窗显示，方便保存可视化图片
%matplotlib auto



Using matplotlib backend: Qt5Agg


## 二.EDA过程
### 读取数据集

In [2]:
# Load the voice dataset from the local data directory and preview the first rows.
df = pd.read_csv('./data/voice.csv')
df.head(3)

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,0.059781,0.064241,0.032027,0.015071,0.090193,0.075122,12.863462,274.402906,0.893369,0.491918,...,0.059781,0.084279,0.015702,0.275862,0.007812,0.007812,0.007812,0.0,0.0,male
1,0.066009,0.06731,0.040229,0.019414,0.092666,0.073252,22.423285,634.613855,0.892193,0.513724,...,0.066009,0.107937,0.015826,0.25,0.009014,0.007812,0.054688,0.046875,0.052632,male
2,0.077316,0.083829,0.036718,0.008701,0.131908,0.123207,30.757155,1024.927705,0.846389,0.478905,...,0.077316,0.098706,0.015656,0.271186,0.00799,0.007812,0.015625,0.007812,0.046512,male


### 1.查看数据总体信息

In [3]:
# Print column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 21 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   meanfreq  3168 non-null   float64
 1   sd        3168 non-null   float64
 2   median    3168 non-null   float64
 3   Q25       3168 non-null   float64
 4   Q75       3168 non-null   float64
 5   IQR       3168 non-null   float64
 6   skew      3168 non-null   float64
 7   kurt      3168 non-null   float64
 8   sp.ent    3168 non-null   float64
 9   sfm       3168 non-null   float64
 10  mode      3168 non-null   float64
 11  centroid  3168 non-null   float64
 12  meanfun   3168 non-null   float64
 13  minfun    3168 non-null   float64
 14  maxfun    3168 non-null   float64
 15  meandom   3168 non-null   float64
 16  mindom    3168 non-null   float64
 17  maxdom    3168 non-null   float64
 18  dfrange   3168 non-null   float64
 19  modindx   3168 non-null   float64
 20  label     3168 non-null   object

### 2.查看缺失值

In [4]:
# Draw the missingno nullity matrix for the data as loaded.
msno.matrix(df)

<matplotlib.axes._subplots.AxesSubplot at 0x1b7bd5605c8>

In [5]:
# Replace every 0 with NaN (in place) and redraw the nullity matrix so that
# zero entries show up as missing.
# NOTE(review): this treats zeros in every column as missing — confirm that no
# column carries legitimate zero values.
df.replace(0, np.nan, inplace=True)
msno.matrix(df)


<matplotlib.axes._subplots.AxesSubplot at 0x1b7bd7fed08>

In [6]:
# Show the missing-value pattern as a dendrogram.
msno.dendrogram(df)



<matplotlib.axes._subplots.AxesSubplot at 0x1b7bda50848>

In [7]:
# Count the missing values per column and plot them as a bar chart.
df_null = df.isnull().sum()
plt.figure(figsize=(8,7))
# x and y are passed by keyword: current seaborn releases no longer accept
# them as positional arguments.
g = sns.barplot(x=df_null.index, y=df_null.values)
# Tilt the column names so they do not overlap.
locs, labels = plt.xticks()
plt.setp(labels, rotation=45)
plt.ylabel('No of missing values')
plt.show()

### 3.数据类别分布

In [8]:
# Keep an independent copy of df for the distribution and correlation plots.
data = df.copy()

In [9]:
# Plot the class distribution of the label column (the last column) as a pie chart.
colors = ['pink','Lightblue']
dfs = data[data.columns[-1]]
label_counts = dfs.value_counts()
#plt.rcParams['font.sans-serif']=['SimHei'] # font needed to render the Chinese title
# Take the wedge labels from the counts themselves so they can never get out of
# step with the order in which value_counts() returns the classes.
plt.pie(label_counts,autopct='%1.1f%%',colors=colors,labels=label_counts.index,textprops={'fontsize':20,'color':'black'})
plt.axis('equal')
plt.title('voice 数据集 label 分布\n',)
plt.show()
print (data['label'].value_counts())

female    1584
male      1584
Name: label, dtype: int64


### 4.特征权重概览

In [10]:
# Preview the first two rows.
df.head(2)

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,0.059781,0.064241,0.032027,0.015071,0.090193,0.075122,12.863462,274.402906,0.893369,0.491918,...,0.059781,0.084279,0.015702,0.275862,0.007812,0.007812,0.007812,,,male
1,0.066009,0.06731,0.040229,0.019414,0.092666,0.073252,22.423285,634.613855,0.892193,0.513724,...,0.066009,0.107937,0.015826,0.25,0.009014,0.007812,0.054688,0.046875,0.052632,male


In [11]:
# RadViz plot of df grouped by the 'label' column, to compare the features at a glance.
pd.plotting.radviz(df,'label')

<matplotlib.axes._subplots.AxesSubplot at 0x1b7bb1bf548>

### 5.特征相关性

In [12]:
# Correlation heatmap of the numeric features.
# Restrict to numeric columns first: the string 'label' column cannot be
# correlated, and newer pandas raises instead of silently dropping it.
cor_mat = data.select_dtypes(include='number').corr()
# Boolean mask hiding the strict upper triangle; the matrix is symmetric, so
# the lower triangle plus the diagonal already shows every pair once.
mask = np.triu(np.ones(cor_mat.shape, dtype=bool), k=1)
fig=plt.gcf()
fig.set_size_inches(15,15)
# The mask was previously built but never handed to heatmap().
sns.heatmap(data=cor_mat,mask=mask,square=True,annot=True,cbar=True,cmap='Spectral')

<matplotlib.axes._subplots.AxesSubplot at 0x1b7bb1bf548>

In [13]:
# One kernel-density plot per feature column (all but the last, the label),
# with a separate curve for each label value.
for col in df.columns[:-1]:
    # 'height' is the current name of FacetGrid's former 'size' argument.
    sns.FacetGrid(df, hue="label", height=3).map(sns.kdeplot, col).add_legend()
    plt.show()

## 三.算法实现
### 1.标签二值化

In [14]:
# Inspect the first label values before encoding.
df['label'].head(4)

0    male
1    male
2    male
3    male
Name: label, dtype: object

In [15]:
# Encode the label as an integer: 'male' -> 1, every other value -> 0.
df['label'] = np.where(df['label']== 'male', 1, 0)

In [16]:
# Inspect the first label values after encoding.
df['label'].head(4)

0    1
1    1
2    1
3    1
Name: label, dtype: int32

In [17]:
# List the distinct encoded label values.
df['label'].unique()

array([1, 0])

### 2.标准化数据

In [40]:
# Turn the NaNs back into 0, then split df into the label vector and a
# standardised feature matrix.
# NOTE(review): the scaler is fitted on the whole dataset before the train/test
# split, so test-set statistics influence the scaling — confirm this is acceptable.
df.replace(np.nan,0, inplace=True)
y = df['label'].values
scaler = StandardScaler()
# fit_transform learns the per-column mean/std and applies them in one call.
x = scaler.fit_transform(df.drop(['label'],axis=1))

### 3.划分数据集

In [41]:
# Hold out 30% of the samples for testing (fixed seed, so the split is repeatable)
# and start the two lists that collect each model's name and test score.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2)
method_scores=[]
method_names=[]

In [39]:
%%timeit
# %%time
naive_bayes = GaussianNB()
naive_bayes.fit(x_train,y_train)

684 µs ± 44.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [29]:
# Score Gaussian Naive Bayes on the held-out split and record the result.
nb_score = naive_bayes.score(x_test,y_test)
print("Naive Bayes Score: {}".format(nb_score))
method_names.append("Naive Bayes")
method_scores.append(nb_score)

# Confusion matrix of the test-set predictions.
y_pred = naive_bayes.predict(x_test)
conf_mat = confusion_matrix(y_test,y_pred)
# Draw it as an annotated heatmap.
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat,annot=True,linewidths=0.5,linecolor="red",fmt=".0f",ax=ax)
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()

Naive Bayes Score: 0.8759200841219769


#### 选择两个主特征

In [33]:
# Keep only IQR and meanfun (plus the label), standardise them and make a
# fresh 70/30 split with the same seed as before.
# NOTE(review): this rebinds x_train / x_test / y_train / y_test, so when the
# notebook is run top to bottom every later model is trained on these two
# features only — confirm that is intended.
data = df.copy()
df_cut = data[['IQR', 'meanfun', 'label']]

y_cut = df_cut['label'].values
scaler = StandardScaler()
x_cut = scaler.fit_transform(df_cut.drop(['label'],axis=1))

x_train, x_test, y_train, y_test = train_test_split(x_cut, y_cut, test_size=0.3, random_state=2)

# train, test = train_test_split(voice_cut, test_size=0.2, random_state=2)
#
#
# x_train = train.iloc[:, :-1]
# y_train = train.iloc[:, -1]
# x_test = test.iloc[:, :-1]
# y_test = test.iloc[:, -1]

In [37]:
%%time
# %%timeit

# Refit Gaussian Naive Bayes on the current training split and time the fit.
naive_bayes = GaussianNB()
naive_bayes.fit(x_train,y_train)

Wall time: 6.98 ms


GaussianNB(priors=None, var_smoothing=1e-09)

In [38]:
# Score the refitted Gaussian Naive Bayes model on the held-out split.
nb_score = naive_bayes.score(x_test,y_test)
print("Naive Bayes Score: {}".format(nb_score))
# Recorded under its own label so this entry can be told apart from the earlier
# "Naive Bayes" entry in method_names.
method_names.append("Naive Bayes (IQR + meanfun)")
method_scores.append(nb_score)

# Confusion matrix of the test-set predictions.
y_pred = naive_bayes.predict(x_test)
conf_mat = confusion_matrix(y_test,y_pred)
# Draw it as an annotated heatmap.
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat,annot=True,linewidths=0.5,linecolor="red",fmt=".0f",ax=ax)
plt.xlabel("Predicted Values")
plt.ylabel("True Values")
plt.show()


Naive Bayes Score: 0.9652996845425867


#### KNN

In [43]:
# %%timeit

# Baseline k-nearest-neighbours classifier using a single neighbour.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(x_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [44]:
# Score the KNN model on the held-out split and record the result.
knn_score = knn.score(x_test,y_test)
print("Score: {}".format(knn_score))
method_names.append("KNN")
method_scores.append(knn_score)

# Confusion matrix of the test-set predictions.
y_pred = knn.predict(x_test)
conf_mat = confusion_matrix(y_test,y_pred)
# Draw it as an annotated heatmap.
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat,annot=True,linewidths=0.5,linecolor="red",fmt=".0f",ax=ax)
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()

Score: 0.9726603575184016


##### 寻找KNN的N值

In [53]:
# Compare k = 1..14 for KNN using 5-fold cross-validation on the training split
# only, so the held-out test set is not used to pick the hyper-parameter.
k_values = range(1,15)
score_list=[]
for each in k_values:
    knn2 = KNeighborsClassifier(n_neighbors=each)
    # Mean accuracy over the five folds for this k.
    score_list.append(model_selection.cross_val_score(knn2, x_train, y_train, cv=5).mean())

plt.rcParams['font.sans-serif']=['SimHei'] # font needed to render the Chinese title
plt.plot(k_values,score_list)
plt.xlabel("k values")
plt.ylabel("score")
plt.title('k取值对准确率的影响')

Text(0.5, 1.0, 'k取值对准确率的影响')

In [54]:
# Refit KNN with six neighbours and report its accuracy on the held-out split.
knn = KNeighborsClassifier(n_neighbors=6).fit(x_train,y_train)
print("KNN Method Score: {}".format(knn.score(x_test,y_test)))

# Confusion matrix of the test-set predictions.
y_pred = knn.predict(x_test)
conf_mat = confusion_matrix(y_test,y_pred)
# Draw it as an annotated heatmap.
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat,annot=True,linewidths=0.5,linecolor="red",fmt=".0f",ax=ax)
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()

KNN Method Score: 0.9726603575184016


#### LR

In [56]:
# %%timeit
# Logistic regression with default settings, fitted on the training split.
log_reg = LogisticRegression()
log_reg.fit(x_train,y_train) #Fitting

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [57]:
# Score logistic regression on the held-out split and record the result.
lr_score = log_reg.score(x_test,y_test)
print("Logistic Regression Score {}".format(lr_score))
method_names.append("Logistic Reg.")
method_scores.append(lr_score)

# Confusion matrix of the test-set predictions.
y_pred = log_reg.predict(x_test)
conf_mat = confusion_matrix(y_test,y_pred)
# Draw it as an annotated heatmap.
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat,annot=True,linewidths=0.5,linecolor="red",fmt=".0f",ax=ax)
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()

Logistic Regression Score 0.9726603575184016


In [58]:
# Names of the 20 feature columns (every column except 'label').
features = ['meanfreq', 'sd', 'median', 'Q25', 'Q75', 'IQR', 'skew', 'kurt',
       'sp.ent', 'sfm', 'mode', 'centroid', 'meanfun', 'minfun', 'maxfun',
       'meandom', 'mindom', 'maxdom', 'dfrange', 'modindx']

# Display names for the report visualizers, listed in the order of the sorted
# encoded labels. The encoding is female -> 0 and male -> 1, so 'women' has to
# come first; the reverse order would swap the two rows of every report.
classes = ['women', 'men']

# Unscaled feature frame and label series taken straight from df.
z = df[features]
c = df.label

In [59]:
# Classification-report visualisation (with per-class support) for a fresh
# Gaussian Naive Bayes model: fit on the training split, score on the test split.
bayes = GaussianNB()
visualizer = ClassificationReport(bayes, classes=classes, support=True)

visualizer.fit(x_train, y_train)
visualizer.score(x_test, y_test)
g = visualizer.poof()

In [60]:
# Classification-report visualisation for a KNN model with default settings:
# fit on the training split, score on the test split.
knns = KNeighborsClassifier()
visualizer = ClassificationReport(knns, classes=classes, support=True)

visualizer.fit(x_train, y_train)
visualizer.score(x_test, y_test)
g = visualizer.poof()

In [61]:
# Discrimination-threshold plot for logistic regression.
# Note that this is fitted on x, y rather than on the x_train / y_train split.
logistic = LogisticRegression()
visualizer = DiscriminationThreshold(logistic)

visualizer.fit(x, y)
visualizer.poof()

<matplotlib.axes._subplots.AxesSubplot at 0x1b7c87b9308>

In [63]:
# Fit a second logistic-regression model and report its own test accuracy
# (the score must come from log_reg2, the model fitted here, not from log_reg).
log_reg2 = LogisticRegression()
log_reg2.fit(x_train,y_train) #Fitting
print("Logistic Regression Score {}".format(log_reg2.score(x_test,y_test)))

Logistic Regression Score 0.9726603575184016


In [69]:
# Per-class predicted probabilities on the test split; the second column is
# kept separately.
# NOTE(review): y_probs_logit_left is not used by any cell visible here.
y_probs_logit = log_reg2.predict_proba(x_test)
y_probs_logit_left = y_probs_logit[:,1]

In [72]:
# Predict 1 only where the probability in the second column exceeds 0.68.
# The threshold is passed by keyword: it is keyword-only in current scikit-learn.
y_pred_logit = binarize(y_probs_logit, threshold=0.68)[:,1]
y_pred_logit

array([1., 1., 0., 1., 1., 0., 0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 1.,
       0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0.,
       1., 0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 1.,
       0., 1., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0.,
       1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 1., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.,
       0., 0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 1., 0., 1., 1., 0., 0.,
       0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 0., 1., 0.,
       1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1.,
       0., 1., 1., 1., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 1., 1., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0.,
       1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 0., 1., 0., 1., 1., 0.,
       1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 1., 0., 1., 0.,
       1., 0., 0., 0., 0.

In [74]:
# Confusion matrix for log_reg's own predict() output on the test split.
y_pred_orig = log_reg.predict(x_test)

confusion_matrix(y_test,y_pred_orig)



array([[495,  16],
       [ 10, 430]], dtype=int64)

In [75]:
# Confusion matrix for the predictions made with the 0.68 threshold.
confusion_matrix(y_test,y_pred_logit)

array([[501,  10],
       [ 14, 426]], dtype=int64)

In [76]:
# Classification-report visualisation for a fresh logistic-regression model:
# fit on the training split, score on the test split.
logis = LogisticRegression()
visualizer = ClassificationReport(logis, classes=classes, support=True)

visualizer.fit(x_train, y_train)
visualizer.score(x_test, y_test)
g = visualizer.poof()

##### 观察特征的重要性

In [77]:
# Feature-importance chart from a gradient-boosting classifier fitted on the
# feature frame z and label series c, drawn on a dedicated figure.
fig = plt.figure()
ax = fig.add_subplot()

viz = FeatureImportances(GradientBoostingClassifier(), ax=ax)
viz.fit(z, c)
viz.poof()

<matplotlib.axes._subplots.AxesSubplot at 0x1b7c88f7e88>

#### SVM

In [79]:
%%time
svclassifier = SVC(kernel='linear',gamma=0.000001)
svclassifier.fit(x_train, y_train)

Wall time: 44.9 ms


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1e-06, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [86]:
# Evaluate the linear SVC on the test split: confusion matrix, per-class
# report and overall accuracy.
y_pred = svclassifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(svclassifier.score(x_test,y_test))

[[496  15]
 [ 11 429]]
              precision    recall  f1-score   support

           0       0.98      0.97      0.97       511
           1       0.97      0.97      0.97       440

    accuracy                           0.97       951
   macro avg       0.97      0.97      0.97       951
weighted avg       0.97      0.97      0.97       951

0.9726603575184016


##### rbf核

In [93]:
%timeit
svclassifier1 = SVC(kernel='rbf')
svclassifier1.fit(x_train, y_train)

Wall time: 0 ns


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [94]:
# Test-split predictions of the RBF SVC.
y_pred1=svclassifier1.predict(x_test)

In [95]:
# Confusion matrix and per-class report for the RBF SVC predictions.
print(confusion_matrix(y_test,y_pred1))
print(classification_report(y_test,y_pred1))

[[500  11]
 [ 11 429]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       511
           1       0.97      0.97      0.97       440

    accuracy                           0.98       951
   macro avg       0.98      0.98      0.98       951
weighted avg       0.98      0.98      0.98       951



In [96]:
# Classification-report visualisation for the linear SVC (svclassifier).
visualizer = ClassificationReport(svclassifier, classes=classes, support=True)

visualizer.fit(x_train, y_train)
visualizer.score(x_test, y_test)
g = visualizer.poof()

In [97]:
# Classification-report visualisation for the RBF SVC (svclassifier1).
visualizer = ClassificationReport(svclassifier1, classes=classes, support=True)

visualizer.fit(x_train, y_train)
visualizer.score(x_test, y_test)
g = visualizer.poof()

#### RandomizedSearchCV寻找超参和最佳的C

In [104]:
# Search space for the randomised hyper-parameter search over the SVC.
# scipy.stats.uniform(loc, scale) samples from [loc, loc + scale], so C is drawn
# from [2, 1002] and gamma from [0.01, 10.01].
import scipy.stats  # a bare `import scipy` is not guaranteed to load the stats submodule

rand_list = {"C": scipy.stats.uniform(loc=2, scale=1000),
             "gamma": scipy.stats.uniform(loc=0.01, scale=10)}

In [106]:
%%time
random = RandomizedSearchCV(svclassifier1, param_distributions = rand_list, n_iter = 300, n_jobs = 4, cv=5, scoring='accuracy')
random.fit(x_train, y_train)

Wall time: 2min 16s


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                                 class_weight=None, coef0=0.0,
                                 decision_function_shape='ovr', degree=3,
                                 gamma='scale', kernel='rbf', max_iter=-1,
                                 probability=False, random_state=None,
                                 shrinking=True, tol=0.001, verbose=False),
                   iid='deprecated', n_iter=300, n_jobs=4,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001B7D43642C8>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001B7D3056DC8>},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring='accuracy', verbose=0)

In [107]:
# Best estimator, its mean cross-validated score and the winning parameters.
random.best_estimator_,random.best_score_,random.best_params_

(SVC(C=7.206341560522909, break_ties=False, cache_size=200, class_weight=None,
     coef0=0.0, decision_function_shape='ovr', degree=3,
     gamma=0.12878697185180532, kernel='rbf', max_iter=-1, probability=False,
     random_state=None, shrinking=True, tol=0.001, verbose=False),
 0.9824100624326357,
 {'C': 7.206341560522909, 'gamma': 0.12878697185180532})

In [108]:
# Per-class report for the search object's predictions on the test split.
y_pred2 = random.predict(x_test)
print(classification_report(y_test, y_pred2))



              precision    recall  f1-score   support

           0       0.98      0.98      0.98       511
           1       0.97      0.98      0.98       440

    accuracy                           0.98       951
   macro avg       0.98      0.98      0.98       951
weighted avg       0.98      0.98      0.98       951



In [110]:
# Classification-report visualisation wrapped around the SVC search object.
# NOTE(review): visualizer.fit() presumably re-runs the whole randomised
# search — confirm, since the search above took minutes.
visualizer = ClassificationReport(random, classes=classes, support=True)
visualizer.fit(x_train, y_train)
visualizer.score(x_test, y_test)
g = visualizer.poof()

In [113]:
%%time
rand_list2 = {"C": scipy.stats.uniform(2, 1000)}
random2 = RandomizedSearchCV(log_reg, param_distributions = rand_list2, n_iter = 300, n_jobs = 4, cv=5, scoring='accuracy')
random2.fit(x_train, y_train)

Wall time: 20 s


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=100,
                                                multi_class='auto', n_jobs=None,
                                                penalty='l2', random_state=None,
                                                solver='lbfgs', tol=0.0001,
                                                verbose=0, warm_start=False),
                   iid='deprecated', n_iter=300, n_jobs=4,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001B7D2F2E308>},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring='accuracy', verbose=0)

In [114]:
# Classification-report visualisation wrapped around the logistic-regression
# search object.
# NOTE(review): visualizer.fit() presumably re-runs the whole randomised
# search — confirm.
visualizer = ClassificationReport(random2, classes=classes, support=True)
visualizer.fit(x_train, y_train)
visualizer.score(x_test, y_test)
g = visualizer.poof()