In [1]:
%matplotlib inline

# part2. 对segment数据集进行分类

In [2]:
from scipy.io import arff
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MinMaxScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

## 数据读取和预处理
segment数据集是arff格式文件，需要使用scipy来读取。
由于train数据集有1500条，test数据集也有800多条，这样分割有点浪费。这里把它们合并起来重新按照20%的比例划分训练集和测试集

In [3]:
data1 = arff.loadarff('data/segment-tain.txt')
data2 = arff.loadarff('data/segment-test.txt')
df = pd.concat([pd.DataFrame(data1[0]),pd.DataFrame(data2[0])],axis=0)
df

Unnamed: 0,region-centroid-col,region-centroid-row,region-pixel-count,short-line-density-5,short-line-density-2,vedge-mean,vegde-sd,hedge-mean,hedge-sd,intensity-mean,rawred-mean,rawblue-mean,rawgreen-mean,exred-mean,exblue-mean,exgreen-mean,value-mean,saturation-mean,hue-mean,class
0,38.0,189.0,9.0,0.0,0.0,1.000000,0.222222,6.222220,33.318500,29.074100,26.333300,35.22220,25.6667,-8.22222,18.44440,-10.22220,35.22220,0.271208,-2.04915,b'path'
1,25.0,199.0,9.0,0.0,0.0,1.111110,0.607407,1.055560,0.462963,17.518500,13.111100,17.88890,21.5556,-13.22220,1.11111,12.11110,21.55560,0.393002,2.69011,b'grass'
2,49.0,139.0,9.0,0.0,0.0,0.166667,0.077778,0.333333,0.088889,0.444444,0.000000,1.33333,0.0000,-1.33333,2.66667,-1.33333,1.33333,0.777778,-2.09440,b'foliage'
3,63.0,220.0,9.0,0.0,0.0,3.055560,15.263000,3.666670,6.088890,8.185190,6.555560,6.44444,11.5556,-4.88889,-5.22222,10.11110,11.55560,0.486717,2.09315,b'grass'
4,161.0,135.0,9.0,0.0,0.0,0.055556,0.136083,0.111111,0.172133,1.259260,0.777778,3.00000,0.0000,-1.44444,5.22222,-3.77778,3.00000,1.000000,-1.82221,b'window'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
805,221.0,111.0,9.0,0.0,0.0,0.611111,0.240741,0.388889,0.240741,1.333330,0.000000,4.00000,0.0000,-4.00000,8.00000,-4.00000,4.00000,1.000000,-2.09440,b'foliage'
806,44.0,79.0,9.0,0.0,0.0,0.444444,0.344265,0.777779,0.403686,107.741000,93.888900,126.55600,102.7780,-41.55560,56.44440,-14.88890,126.55600,0.258079,-2.37797,b'sky'
807,230.0,41.0,9.0,0.0,0.0,0.888891,0.688530,1.888890,1.241270,121.481000,110.222000,138.88900,115.3330,-33.77780,52.22220,-18.44440,138.88900,0.206392,-2.28080,b'sky'
808,217.0,77.0,9.0,0.0,0.0,1.555560,2.740740,1.666670,0.533333,40.222200,37.222200,48.22220,35.2222,-9.00000,24.00000,-15.00000,48.22220,0.269192,-1.93207,b'cement'


In [4]:
# normalize function
def z_score_normalize(series):
    '''
    Params:  a pandas serious

    Return: normalized version of the series
    '''
    return (series-series.mean())/series.std()

# to train a classifier
def train(clf, xtrain, ytrain, xtest, ytest):
    '''
    Params: a classifier object and dataset splited

    Return: a classifier trained
    '''
    %time clf.fit(xtrain, ytrain)
    train_accuracy = clf.score(xtrain, ytrain)
    test_accuracy = clf.score(xtest, ytest)
    print("accuracy on training set =", train_accuracy)
    print("accuracy on testing  set =", test_accuracy)
    return clf

因为要使用多项式贝叶斯模型，所以预处理归一化到`[0,1]`

In [5]:
data = df.iloc[:,:-1]
target = df.iloc[:,-1]
# preprocess the data
data = MinMaxScaler().fit_transform(data)

# preprocess the target
class2int = { k:v for v,k in enumerate(list(target.drop_duplicates()))}
int2class = { v:k for v,k in enumerate(list(target.drop_duplicates()))}
target = target.replace(class2int)

xtrain,xtest,ytrain,ytest = train_test_split(data,target,test_size=0.2)

print("shape of xtrain" , xtrain.shape)
print("shape of xtest " , xtest.shape)
print("shape of ytrain" , ytrain.shape)
print("shape of ytest " , ytest.shape)

shape of xtrain (1848, 19)
shape of xtest  (462, 19)
shape of ytrain (1848,)
shape of ytest  (462,)


In [6]:
target

0      0
1      1
2      2
3      1
4      3
      ..
805    2
806    4
807    4
808    6
809    3
Name: class, Length: 2310, dtype: int64

## 训练各种分类器
这是一个多类别问题，采取One VS Rest策略。

从下面的现象看出，由于这个数据集相对复杂，分类器并没有像上一个数据集中一样集体达到完美的水平。所以我选择三种算法各训练一些模型，然后挑几个看起来比较好的模型去进行进一步评估，探究到底哪一种算法在这个数据集上表现更好。
### 1. 决策树
决策树的表现十分出色，虽然时间在所有算法中比较长，但达到了极高的正确率

In [7]:
clf = OneVsRestClassifier(DecisionTreeClassifier())
tree_clf = train(clf, xtrain, ytrain, xtest, ytest)

Wall time: 72 ms
accuracy on training set = 1.0
accuracy on testing  set = 0.9567099567099567


### 2. 贝叶斯
总体而言，贝叶斯算法在性能表现上及其优异，但是准确率比较差。尤其是连续性的数据，在假设属性为离散布尔型的伯努利模型上准确率出奇的低（但是反其道而行之也可以有另外两种模型差不多的表现）。我选择了表现最好的高斯模型进入下一步的评估。

In [8]:
# GaussianNB
clf = OneVsRestClassifier(GaussianNB())
bayes_clf = train(clf, xtrain, ytrain, xtest, ytest)

# MultinomialNB
clf = OneVsRestClassifier(MultinomialNB())
_ = train(clf, xtrain, ytrain, xtest, ytest)

# BernoulliNB
clf = OneVsRestClassifier(BernoulliNB())
_ = train(clf, xtrain, ytrain, xtest, ytest)

Wall time: 16 ms
accuracy on training set = 0.7851731601731602
accuracy on testing  set = 0.8225108225108225
Wall time: 18 ms
accuracy on training set = 0.770021645021645
accuracy on testing  set = 0.7748917748917749
Wall time: 16 ms
accuracy on training set = 0.21915584415584416
accuracy on testing  set = 0.18614718614718614


### 3. KNN
和老师在课上讲的一样，很多时候是1聚类效果最好。

In [9]:
# KNN
for i in [1,3,5,7]:
    clf = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=i, n_jobs=-1))
    print("result of {}NN:".format(i))
    _ = train(clf, xtrain, ytrain, xtest, ytest)
knn_1_clf = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=1, n_jobs=-1))
knn_1_clf.fit(xtrain, ytrain)

result of 1NN:
Wall time: 81 ms
accuracy on training set = 1.0
accuracy on testing  set = 0.9761904761904762
result of 3NN:
Wall time: 62 ms
accuracy on training set = 0.9767316017316018
accuracy on testing  set = 0.9675324675324676
result of 5NN:
Wall time: 55 ms
accuracy on training set = 0.9691558441558441
accuracy on testing  set = 0.9545454545454546
result of 7NN:
Wall time: 53 ms
accuracy on training set = 0.9632034632034632
accuracy on testing  set = 0.9567099567099567


OneVsRestClassifier(estimator=KNeighborsClassifier(algorithm='auto',
                                                   leaf_size=30,
                                                   metric='minkowski',
                                                   metric_params=None,
                                                   n_jobs=-1, n_neighbors=1,
                                                   p=2, weights='uniform'),
                    n_jobs=None)

## 三种候选分类器的进一步评估
对分类器的评估方法有很多，我这里选择使用多分类常用的F1score来评估分类器。

首先建立评估函数：

In [10]:
def estimate(clf, x, y, name):
    '''
    Params: classifier clf, data x, label y and the name of classifier.

    Return: f1 and auc score of clf
    '''
    pred = clf.predict(x)
    f1 = metrics.f1_score(pred, y, average='micro')
    print("----ESTIMATING CLASSIFIER: {}----".format(name))
    print("----F1  Score = {} ----".format(f1))
    return f1

然后对每个分类器进行评估：

In [11]:
_ = estimate(tree_clf, data, target, "DecisionTree")
_ = estimate(bayes_clf, data, target, "Bayes")
_ = estimate(knn_1_clf, data, target, "1NN")

----ESTIMATING CLASSIFIER: DecisionTree----
----F1  Score = 0.9913419913419913 ----
----ESTIMATING CLASSIFIER: Bayes----
----F1  Score = 0.7926406926406926 ----
----ESTIMATING CLASSIFIER: 1NN----
----F1  Score = 0.9952380952380953 ----


## 总结
综上所述：

1. 这个数据集上，准确度决策树和1NN相当，3~7NN次之，贝叶斯最差，而计算性能反之。
2. 三个候选分类其中，1NN表现略微好于决策树，贝叶斯较差。