In [1]:
%matplotlib inline

# 实验四  数据的分类预测
## 实验目的
+ 使用scikit-learn 包中的tree，贝叶斯，knn，对数据进行模型训练，尽量了解其原理及运用。
+ 使用不同分析三种分类器在实验中的性能比较，分析它们的特点。
## 实验报告
+ 本实验采用的数据集为house与segment。
+ 实验完成后要求提交实验报告与相应的实验结果。
+ 将实验报告和相关实验结果打包上传至ftp服务器上相关目录下：
    /上传作业/刘昆宏/数据挖掘/实验4/
+ 实验报告请在实验完成后的第二周周日结束前上传。


# PART1. 对house-votes-84数据集进行分类预测。

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

## 数据预处理
### 读取数据
这里发现有一些数据存在缺失的情况，需要进行缺失值填充。

In [3]:
df = pd.read_csv("data/house-votes-84.data", sep=',',header=None,na_values='?')
df.replace('y',1, inplace=True)
df.replace('n',0, inplace=True)
df.replace('democrat',1, inplace=True)
df.replace('republican',0, inplace=True)
col_names = ['party'] + [i+1 for i in range(16)]
df.columns = col_names
df

Unnamed: 0,party,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,,1.0,1.0,1.0,0.0,1.0
1,0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,
2,1,,1.0,1.0,,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3,1,0.0,1.0,1.0,0.0,,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,1,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
431,1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
432,0,0.0,,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0
433,0,0.0,0.0,0.0,1.0,1.0,1.0,,,,,0.0,1.0,1.0,1.0,0.0,1.0


共16个特征值，允许出现最多2个NaN：

In [4]:
threshold = 14
df_trainable = df.dropna(thresh=threshold)
df_trainable

Unnamed: 0,party,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,,1.0,1.0,1.0,0.0,1.0
1,0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,
2,1,,1.0,1.0,,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3,1,0.0,1.0,1.0,0.0,,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,1,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429,1,1.0,0.0,1.0,0.0,,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,,1.0,1.0
430,0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
431,1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
432,0,0.0,,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0


失去了20个样本，丢失率仅为4.5%，能够接受，所以这里选择去掉这些缺失率高的样本。

在政治投票中，同党派人士由于政见相合和政治利益，对于某一问题的看法往往相似。因此，此处的缺失值选择**同党派人士的众数**进行填充。

In [5]:
df_orig = df
df = df_trainable

df.loc[df['party']==1] = df.loc[df['party']==1].fillna(df.loc[df['party']==1].mode().T[0])

df.loc[df['party']==0] = df.loc[df['party']==0].fillna(df.loc[df['party']==0].mode().T[0])

df

Unnamed: 0,party,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
1,0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
2,1,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3,1,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,1,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429,1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
430,0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
431,1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
432,0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0


## 分割数据集
由于数据量太少，就不按照6:2:2的方式分配训练、开发、测试集了，选择传统的8:2分配训练和开发/测试集。

In [6]:
# 提取特征和标签
data = df.iloc[:,1:]
target = df.iloc[:,0]

xtrain,xtest,ytrain,ytest = train_test_split(data,target,test_size=0.2)

print("shape of xtrain" , xtrain.shape)
print("shape of xtest " , xtest.shape)
print("shape of ytrain" , ytrain.shape)
print("shape of ytest " , ytest.shape)

shape of xtrain (332, 16)
shape of xtest  (83, 16)
shape of ytrain (332,)
shape of ytest  (83,)


在进行分类之前，先构造训练和预测的函数

In [7]:
def train(clf, xtrain, ytrain, xtest, ytest):
    %time clf.fit(xtrain, ytrain)
    train_accuracy = clf.score(xtrain, ytrain)
    test_accuracy = clf.score(xtest, ytest)
    print("accuracy on training set =", train_accuracy)
    print("accuracy on testing  set =", test_accuracy)
    return clf

## 使用tree进行分类
构建决策树，查看准确率

In [8]:
clf = DecisionTreeClassifier()
tree_clf = train(clf, xtrain, ytrain, xtest, ytest)

Wall time: 2 ms
accuracy on training set = 1.0
accuracy on testing  set = 0.9759036144578314


使用默认的参数就达到了非常不错的准确率，并且仅仅使用了2毫秒就完成了训练。
## 使用Bayes进行分类
sklearn提供了三种朴素贝叶斯模型：高斯模型，多项式模型和伯努利模型。这里分别使用三种模型进行预测，对比表现。

In [9]:
# GaussianNB
clf = GaussianNB()
bayes_clf = train(clf, xtrain, ytrain, xtest, ytest)

Wall time: 5 ms
accuracy on training set = 0.9457831325301205
accuracy on testing  set = 0.9518072289156626


In [10]:
# MultinomialNB
clf = MultinomialNB()
_ = train(clf, xtrain, ytrain, xtest, ytest)

Wall time: 8 ms
accuracy on training set = 0.9096385542168675
accuracy on testing  set = 0.9156626506024096


In [11]:
# BernoulliNB
clf = BernoulliNB()
_ = train(clf, xtrain, ytrain, xtest, ytest)

Wall time: 3.97 ms
accuracy on training set = 0.9096385542168675
accuracy on testing  set = 0.9156626506024096


结果表明这个数据集上不论是性能还是正确率，都是高斯模型最优。
## 使用KNN算法
尝试使用1、3、5、7的KNN
1NN表现不错，但有些过拟合，其余的则表现较决策树有些一般。

In [12]:
# KNN
for i in [1,3,5,7]:
    clf = KNeighborsClassifier(n_neighbors=i, n_jobs=-1)
    print("result of {}NN:".format(i))
    _ = train(clf, xtrain, ytrain, xtest, ytest)
knn_1_clf = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)
knn_1_clf.fit(xtrain, ytrain)

result of 1NN:
Wall time: 3 ms
accuracy on training set = 1.0
accuracy on testing  set = 0.9397590361445783
result of 3NN:
Wall time: 2.03 ms
accuracy on training set = 0.9578313253012049
accuracy on testing  set = 0.9518072289156626
result of 5NN:
Wall time: 2.04 ms
accuracy on training set = 0.9337349397590361
accuracy on testing  set = 0.9518072289156626
result of 7NN:
Wall time: 1.98 ms
accuracy on training set = 0.9367469879518072
accuracy on testing  set = 0.927710843373494


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=1, p=2,
                     weights='uniform')

## 评估
对二分类问题，我使用AUC来评估分类器的性能。

先定义评估函数

In [13]:
def estimate(clf, x, y, name):
    '''
    Params: classifier clf, data x, label y and the name of classifier.

    Return: f1 and auc score of clf
    '''
    pred = clf.predict(x)
    auc = metrics.roc_auc_score(pred, y)
    print("----ESTIMATING CLASSIFIER: {}----".format(name))
    print("----AUC  Score = {} ----".format(auc))
    return auc

如下面所示，综合AUC和准确率来判断分类器性能，总体而言决策树最优，贝叶斯最差。

In [14]:
_ = estimate(tree_clf, data, target, "DecisionTree")
_ = estimate(bayes_clf, data, target, "Bayes")
_ = estimate(knn_1_clf, data, target, "1NN")

----ESTIMATING CLASSIFIER: DecisionTree----
----AUC  Score = 0.9937888198757764 ----
----ESTIMATING CLASSIFIER: Bayes----
----AUC  Score = 0.9430478798845796 ----
----ESTIMATING CLASSIFIER: 1NN----
----AUC  Score = 0.986703431372549 ----


## 总结
结果表明K的值需要根据数据集进行调整。
`house votes 84`数据集就像我进行缺失值填充的时候猜想的一样，相同党派的样本关联性非常大，各种特征十分相似，所以所有的分类方法都可以很方便地对其进行精确分类。政客投票很多情况下是出于“我是xx党，所以我需要投xxx”进行投票，所以决策树在这个数据集上表现出乎意料的好。