In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np

# Load the Iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

# Sanity check: print the dataset description to confirm the species names,
# then peek at the first five encoded class labels.
print(iris.DESCR)
print(iris.target[:5])

# Features: columns 2 and 3 only (petal length and petal width).
# Labels: the integer-encoded species.
X, y = iris.data[:, [2, 3]], iris.target

# Hold out 30% of the samples for testing (test_size=0.3); the remaining
# 70% is used for training. A fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Report the distinct class labels and the shape of every split.
print('\nClass labels', np.unique(y))
print("\nX_train shape:", X_train.shape)
print("\nX_test shape:", X_test.shape)
print("\ny_train shape:", y_train.shape)
print("\ny_test shape:", y_test.shape)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [2]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd


# Standardize the features: the scaler learns its mean/std from the training
# split only, and that same transformation is then applied to the test split.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# Re-join the standardized splits, training rows first and test rows after them.
X_combined_std = np.concatenate((X_train_std, X_test_std), axis=0)
y_combined = np.concatenate((y_train, y_test))

print("X_train_std shape:", X_train_std.shape)
print("X_test_std shape:", X_test_std.shape)

# After standardization the first feature of the training split should have
# a mean of ~0 and a standard deviation of ~1.
print('Mean = ', X_train_std[:, 0].mean())
print('\nStandard deviation = ', X_train_std[:, 0].std())
print('\nfirst 10 stdandardized sample => \n', X_train_std[:10])

X_train_std shape: (105, 2)
X_test_std shape: (45, 2)
Mean =  -3.5738607840314564e-16

Standard deviation =  0.9999999999999999

first 10 stdandardized sample => 
 [[-0.18295039 -0.29318114]
 [ 0.93066067  0.7372463 ]
 [ 1.04202177  1.63887031]
 [ 0.6522579   0.35083601]
 [ 1.09770233  0.7372463 ]
 [ 0.03977182 -0.16437771]
 [ 1.26474398  1.38126345]
 [ 0.48521625  0.47963944]
 [-0.01590873 -0.16437771]
 [ 0.59657735  0.7372463 ]]


In [3]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

def plot_decision_region(X, y, classifier, test_idx=None, resolution=0.02):
    """Draw a classifier's decision regions over two features and overlay the samples.

    Everything is drawn through ``plt`` on the current figure. The function does
    not call ``plt.show()``, so the caller can still add a title, axis labels
    and a legend afterwards.

    Parameters
    ----------
    X : ndarray of shape (n_samples, 2)
        Feature matrix. Column 0 is plotted on the x-axis, column 1 on the y-axis.
    y : ndarray of shape (n_samples,)
        Class label of every row of ``X``.
    classifier : object
        Fitted estimator whose ``predict`` accepts an ``(n_points, 2)`` array
        and returns one label per row.
    test_idx : sequence or ndarray of row indices, optional
        Rows of ``X`` to highlight with a hollow black circle labelled
        'test set'. ``None`` or an empty sequence disables the highlight.
    resolution : float, default 0.02
        Step of the grid on which the classifier is evaluated.

    Raises
    ------
    IndexError
        If ``y`` holds more than five distinct classes (only five markers
        are defined).
    """
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # Predict on a dense grid that extends one unit beyond the data on every
    # side, then paint the predicted class of each grid point.
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # One scatter call per class so each class gets its own marker, colour
    # and legend entry.
    for idx, cl in enumerate(classes):
        marker = markers[idx]
        style = dict(alpha=0.8, c=colors[idx], marker=marker, label=cl)
        # The 'x' marker is drawn without an edge colour; every other marker
        # gets a black outline (presumably because matplotlib warns when an
        # edge colour is given to an unfilled marker).
        if marker != 'x':
            style['edgecolor'] = 'black'
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], **style)

    # A plain truthiness test (`if test_idx:`) raises ValueError for a NumPy
    # index array with more than one element, so check for None / emptiness
    # explicitly instead.
    if test_idx is not None and len(test_idx) > 0:
        X_test = X[test_idx, :]
        plt.scatter(X_test[:, 0], X_test[:, 1], c='none', edgecolor='black', alpha=1.0,
                    linewidth=1, marker='o', s=100, label='test set')

In [None]:
# 載入必要的套件
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Train and evaluate six different classifiers on the standardized petal features.
classifiers = {
    # `n_iter` was removed in scikit-learn 0.21, so `max_iter` is used instead.
    'Perceptron': Perceptron(max_iter=40, eta0=0.1, random_state=0),
    'Logistic Regression': LogisticRegression(C=100.0, random_state=0),
    'SVM': SVC(kernel='linear', C=1.0, random_state=0),
    # max_depth caps the height of the tree so it cannot grow deep enough to overfit.
    'Decision Tree': DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=0),
    # n_estimators is the number of trees in the forest: more trees can improve
    # accuracy, at a higher computational cost.
    'Random Forest': RandomForestClassifier(criterion='gini', n_estimators=50, random_state=0, n_jobs=2),
    'KNN': KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
}

# One (name, train accuracy, test accuracy, precision, recall, f1) tuple per classifier.
results = []

for name, clf in classifiers.items():
    clf.fit(X_train_std, y_train)

    # Accuracy on the data the model was fitted on.
    y_train_pred = clf.predict(X_train_std)
    train_accuracy = accuracy_score(y_train, y_train_pred)

    # Metrics on the held-out test split. Precision, recall and F1 are
    # macro-averaged over the classes.
    y_test_pred = clf.predict(X_test_std)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred, average='macro')
    recall = recall_score(y_test, y_test_pred, average='macro')
    f1 = f1_score(y_test, y_test_pred, average='macro')

    results.append((name, train_accuracy, test_accuracy, precision, recall, f1))

    print(f"{name} - Train Accuracy: {train_accuracy:.3f}, Test Accuracy: {test_accuracy:.3f}")

    # The combined array holds the training rows first, so the test rows are
    # the indices from len(y_train) up to the end.
    plot_decision_region(X_combined_std, y_combined, classifier=clf, test_idx=range(len(y_train), len(y_combined)))
    plt.title(name)
    plt.xlabel('Petal length [standardized]')
    plt.ylabel('Petal width [standardized]')
    plt.legend(loc='upper left')
    plt.show()


In [5]:
# Print one summary line per classifier.
print("Summary of accuracies:")
for name, train_accuracy, test_accuracy, precision, recall, f1 in results:
    print(
        f"{name}: Train Accuracy={train_accuracy:.3f}, Test Accuracy={test_accuracy:.3f}, "
        f"Precision={precision:.3f}, Recall={recall:.3f}, F1 Score={f1:.3f}"
    )

# Collect the same numbers into a table, one row per classifier.
summary_columns = ['Algorithm', '訓練準確度', '測試準確度', 'Precision', 'Recall', 'F1 Score']
results_df = pd.DataFrame(results, columns=summary_columns)
print(results_df)

Summary of accuracies:
Perceptron: Train Accuracy=0.924, Test Accuracy=0.889, Precision=0.895, Recall=0.896, F1 Score=0.891
Logistic Regression: Train Accuracy=0.952, Test Accuracy=0.978, Precision=0.972, Recall=0.981, F1 Score=0.976
SVM: Train Accuracy=0.952, Test Accuracy=0.978, Precision=0.972, Recall=0.981, F1 Score=0.976
Decision Tree: Train Accuracy=0.981, Test Accuracy=0.978, Precision=0.972, Recall=0.981, F1 Score=0.976
Random Forest: Train Accuracy=0.990, Test Accuracy=0.978, Precision=0.972, Recall=0.981, F1 Score=0.976
KNN: Train Accuracy=0.952, Test Accuracy=1.000, Precision=1.000, Recall=1.000, F1 Score=1.000
             Algorithm     訓練準確度     測試準確度  Precision    Recall  F1 Score
0           Perceptron  0.923810  0.888889   0.894843  0.895623  0.890620
1  Logistic Regression  0.952381  0.977778   0.972222  0.981481  0.975983
2                  SVM  0.952381  0.977778   0.972222  0.981481  0.975983
3        Decision Tree  0.980952  0.977778   0.972222  0.981481  0.975983


In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# KNN scored 100% on the test split -- is that too good to be true?
# Cross-validate KNN on the training split for an estimate that depends less
# on one particular train/test split.
knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')

# Standardize inside the pipeline so that, in every fold, the scaler is fitted
# only on that fold's training part. Scoring the pre-standardized X_train_std
# would let each validation fold influence the scaling statistics.
knn_pipeline = make_pipeline(StandardScaler(), knn)
scores = cross_val_score(knn_pipeline, X_train, y_train, cv=5)
print(f"Cross-validation scores for KNN: {scores}")
print(f"Mean cross-validation score for KNN: {scores.mean():.2f}")

Cross-validation scores for KNN: [0.85714286 1.         1.         0.95238095 0.95238095]
Mean cross-validation score for KNN: 0.95


## 個人小小結論
    KNN 的測試準確率達到 100%（訓練準確率約為 95%），有可能是因為 Iris 資料集較小且容易區分，而且只用花瓣長度與寬度這兩個特徵時，各類別幾乎可以線性分開，所以才會達到如此好的結果。再者，我擔心 KNN 只是直接記住訓練資料：如果 test data 與 train data 非常相似，這麼高的準確率很有可能是過度擬合造成的，導致模型在其他資料集上不會有如此好的表現。因此我另外用交叉驗證評估 KNN，平均準確率約為 0.95，得知該模型確實有不錯的表現；100% 的測試準確率應該是資料集本身的緣故。
    
    Logistic Regression、SVM 和 Decision Tree 的表現很接近，測試準確率和其他指標都在 0.97 左右，這顯示這些模型也非常適合該資料集。
    同時我也將 Random Forest 的 n_estimators 提高到 50，以達到更好的效果。