In [2]:
# Classification: KNN, Bayes, Decision Tree

###Iris 資料集算是最入門的機器學習演算法資料，透過花瓣以及花萼的長與寬來預測是屬於哪一種類的Iris花(Setosa, Virginica, Versicolour)，因為特徵數目不多，只有四個，並且類別項目只有三個，因此對於初學者來說非常好理解。
###若要載入Iris 資料集只要import sklearn的datasets之後，使用load_iris()就可以獲得資料，回傳的資料格式為dictionary，因此需要做一些資料處理才能變為表格的形式，在python只要有關表格的處理主要都使用pandas為主。
###感覺可以講解KNN跟K-Means的差別

In [3]:
# import package and datasets

# import iris datasets
from sklearn import datasets
iris = datasets.load_iris()

# split function for training set and testing set
#train_test_split是交叉驗證中常用的函式，功能是從樣本中隨機的按比例選取train_data和test_data
#train_test_split會自動將資料分為以下四種(X_train, X_test, y_train, y_test)

#train_data：所要劃分的樣本特徵集
#train_target：所要劃分的樣本結果
#test_size：樣本佔比，如果是整數的話就是樣本的數量
#random_state：是隨機數的種子。

#然後使用指令 Import train_test_split() 這個函式到我們的程式當中
from sklearn.model_selection import train_test_split 

#train_test_split() 所接受的變數其實非常單純，基本上為 3 項：『原始的資料』、『Seed』、『比例』

#原始的資料：就如同上方的 data 一般，是我們打算切成 Training data 以及 Test data 的原始資料
#Seed： 亂數種子，可以固定我們切割資料的結果 (為何需要設定亂數種子?請解釋。)
#比例：可以設定 train_size 或 test_size，只要設定一邊即可，範圍在 [0-1] 之間
#X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3)

# X= iris.data
# Y= iris.target

# assessment function: metrics
from sklearn import metrics

In [4]:
# visulize the data first
print(iris.keys()) #顯示keys 
print(iris.items()) #印出全部資料Key and Value
print(iris.target) #標籤值
print(iris.target_names) #標籤名稱
print(iris.feature_names) #特徵值名稱

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])
dict_items([('data', array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])
#找到標籤資料有(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])這些類別



In [5]:
# check the training and testing data~
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3)

#len()返回字串的長度
print(len(X_test))
print(len(X_train))
print(len(y_test))
print(len(y_train))

45
105
45
105


In [6]:
# KNN algorithm

# import algorithm from sklearn
from sklearn import neighbors

# build model
model2 = neighbors.KNeighborsClassifier(n_neighbors=3)

# training data
model2.fit(X_train, y_train) 

# predict data
y_pred=model2.predict(X_test)

# print result
print(y_pred)
print(y_test)

[2 2 0 0 1 1 0 2 0 0 1 0 2 2 0 1 1 2 1 1 1 1 0 2 2 1 0 1 2 0 1 1 2 2 2 0 1
 1 0 0 0 1 2 1 2]
[2 2 0 0 1 1 0 2 0 0 1 0 1 2 0 1 1 2 1 1 1 1 0 1 2 1 0 1 2 0 1 1 2 2 2 0 1
 1 0 0 0 1 2 2 2]


In [7]:
# print report
print(metrics.confusion_matrix(y_test,y_pred))
print(metrics.accuracy_score(y_test,y_pred))
print(metrics.classification_report(y_pred,y_test))

[[14  0  0]
 [ 0 16  2]
 [ 0  1 12]]
0.9333333333333333
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.89      0.94      0.91        17
           2       0.92      0.86      0.89        14

    accuracy                           0.93        45
   macro avg       0.94      0.93      0.93        45
weighted avg       0.93      0.93      0.93        45



In [8]:
# Naive Bayes Algorithm

In [9]:
# import package and datasets

# import iris datasets
from sklearn import datasets
iris = datasets.load_iris()

# split function for training set and testing set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3)

# assessment function: metrics
from sklearn import metrics

In [10]:
type(iris)

sklearn.utils.Bunch

In [13]:
y_train

array([2, 0, 0, 0, 2, 0, 0, 0, 1, 1, 2, 2, 2, 2, 0, 0, 2, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 2, 1, 0, 2, 1, 1, 2, 2, 0, 2, 0, 0, 2, 2, 0, 1,
       1, 1, 2, 2, 1, 2, 2, 2, 2, 0, 0, 2, 1, 2, 0, 0, 0, 2, 0, 1, 0, 0,
       1, 1, 2, 1, 2, 2, 0, 2, 2, 0, 1, 0, 0, 1, 1, 1, 2, 2, 0, 2, 1, 1,
       2, 1, 0, 1, 1, 2, 1, 0, 1, 2, 0, 0, 1, 2, 1, 2, 1])

In [12]:
X_train

array([[7.3, 2.9, 6.3, 1.8],
       [5.4, 3.9, 1.7, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [4.6, 3.2, 1.4, 0.2],
       [6.7, 2.5, 5.8, 1.8],
       [4.6, 3.6, 1. , 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.1, 3.8, 1.6, 0.2],
       [6. , 2.9, 4.5, 1.5],
       [5.6, 3. , 4.1, 1.3],
       [6.4, 3.1, 5.5, 1.8],
       [6.9, 3.2, 5.7, 2.3],
       [6. , 2.2, 5. , 1.5],
       [6.5, 3. , 5.2, 2. ],
       [5.1, 3.7, 1.5, 0.4],
       [5.5, 4.2, 1.4, 0.2],
       [6.1, 2.6, 5.6, 1.4],
       [5.4, 3.7, 1.5, 0.2],
       [4.9, 3.6, 1.4, 0.1],
       [5.7, 2.8, 4.5, 1.3],
       [4.8, 3.4, 1.9, 0.2],
       [5.5, 2.4, 3.7, 1. ],
       [4.4, 3. , 1.3, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [5.6, 2.9, 3.6, 1.3],
       [5.7, 2.9, 4.2, 1.3],
       [5.8, 2.7, 4.1, 1. ],
       [6.2, 2.2, 4.5, 1.5],
       [6.2, 3.4, 5.4, 2.3],
       [6.7, 3.1, 4.7, 1.5],
       [5.1, 3.5, 1.4, 0.3],
       [6.5, 3. , 5.8, 2.2],
       [6.5, 2.8, 4.6, 1.5],
       [5.5, 2.5, 4. , 1.3],
       [6.3, 2

In [3]:
# Naive Bayes  #記得講參數

# from sklearn.naive_bayes import GaussianNB
from sklearn import naive_bayes

# build model
model4 = naive_bayes.GaussianNB()
#kernel要調的好, 但會有過度學習的狀況

# training data
model4.fit(X_train, y_train) 

# predict
y_pred=model4.predict(X_test)

# print result
print(y_pred)
print(y_test)

[0 2 1 1 2 2 0 0 1 1 2 0 1 2 1 1 0 0 0 1 1 2 1 0 2 1 1 1 1 0 2 1 2 0 1 2 2
 1 2 1 2 2 0 2 2]
[0 2 2 1 1 1 0 0 1 1 2 0 1 2 1 1 0 0 0 1 1 2 1 0 2 1 1 1 1 0 2 1 2 0 1 2 2
 1 2 1 2 2 0 2 1]


In [11]:
print(iris.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [12]:
print(iris.data)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.

In [14]:
print(iris.target)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [15]:
print(len(iris.data))

150


In [16]:
print(len(iris.target))

150


In [17]:
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [10]:
# print report
print(metrics.confusion_matrix(y_test,y_pred)) #混淆矩陣
print(metrics.accuracy_score(y_test,y_pred)) #精確分數
print(metrics.classification_report(y_pred,y_test))

[[16  0  0]
 [ 0 13  1]
 [ 0  0 15]]
0.9777777777777777
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       0.93      1.00      0.96        13
           2       1.00      0.94      0.97        16

   micro avg       0.98      0.98      0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [11]:
# Decision Tree Algorithm

In [12]:
# import package and datasets

# import iris datasets
from sklearn import datasets
iris = datasets.load_iris()

# split function for training set and testing set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3)

# assessment function: metrics
from sklearn import metrics

In [13]:
# decision tree
from sklearn import tree

# build model
model1 = tree.DecisionTreeClassifier()

# training data
model1.fit(X_train, y_train) 

# predict data
y_pred=model1.predict(X_test)

# print reulst
print(y_pred)
print(y_test)


[1 2 0 1 1 1 2 2 2 0 1 1 0 0 2 2 2 2 0 1 1 1 1 1 2 1 0 1 2 1 1 2 0 0 0 1 1
 2 0 2 1 1 1 0 2]
[1 2 0 1 1 1 2 2 2 0 1 1 0 0 2 2 2 2 0 1 1 1 1 1 2 1 0 1 1 1 1 2 0 0 0 1 1
 2 0 2 1 2 1 0 2]


In [14]:
# print report
print(metrics.confusion_matrix(y_test,y_pred))
print(metrics.accuracy_score(y_test,y_pred))
print(metrics.classification_report(y_pred,y_test))

[[11  0  0]
 [ 0 19  1]
 [ 0  1 13]]
0.9555555555555556
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       0.95      0.95      0.95        20
           2       0.93      0.93      0.93        14

   micro avg       0.96      0.96      0.96        45
   macro avg       0.96      0.96      0.96        45
weighted avg       0.96      0.96      0.96        45

