# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.base import clone
from itertools import combinations

# Load the Wine dataset; the raw file has no header row, so the column
# names are supplied explicitly.
wine = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header=None,
    names=[
        'Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
        'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
        'Proanthocyanins', 'Color intensity', 'Hue',
        'OD280/OD315 of diluted wines', 'Proline',
    ],
)

# Column 0 holds the class label; every other column is a feature.
y = wine.iloc[:, 0].values
X = wine.iloc[:, 1:].values

# Hold out 30% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Standardize with statistics estimated on the training split only, then
# apply the same transformation to the test split.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# Stack train rows first and test rows last, so the test samples occupy the
# trailing indices of the combined arrays.
X_combined_std = np.concatenate((X_train_std, X_test_std), axis=0)
y_combined = np.concatenate((y_train, y_test))

# 定義SBS演算法
class SBS:
    """Sequential Backward Selection of feature columns.

    Starting from all features, ``fit`` repeatedly removes one feature,
    keeping at each size the subset with the highest score on an internal
    validation split, until ``k_features`` columns remain.

    Parameters:
        estimator: model exposing ``fit`` and ``predict``; a clone of it is
            used, so the object passed in is not fitted by this class.
        k_features: number of features at which the elimination stops.
        scoring: callable ``scoring(y_true, y_pred)`` returning a score where
            larger is better.
        test_size: passed to ``train_test_split`` for the internal split.
        random_state: passed to ``train_test_split`` for the internal split.

    Attributes set by ``fit``:
        indices_: tuple of column indices of the final subset.
        subsets_: best subset at each size, from all features downwards.
        scores_: validation score of each entry in ``subsets_``.
        k_score_: score of the final subset (last entry of ``scores_``).
    """

    def __init__(self, estimator, k_features, scoring=accuracy_score, test_size=0.25, random_state=1):
        self.scoring = scoring
        # Clone so that fitting here never touches the caller's estimator.
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y):
        """Run the backward elimination on ``X``/``y`` and return ``self``."""
        X_fit, X_val, y_fit, y_val = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state
        )

        # Baseline: every column is still in play.
        n_remaining = X_fit.shape[1]
        self.indices_ = tuple(range(n_remaining))
        self.subsets_ = [self.indices_]
        self.scores_ = [
            self._calc_score(X_fit, y_fit, X_val, y_val, self.indices_)
        ]

        while n_remaining > self.k_features:
            # Every way of dropping exactly one of the remaining features.
            candidates = list(combinations(self.indices_, r=n_remaining - 1))
            candidate_scores = [
                self._calc_score(X_fit, y_fit, X_val, y_val, subset)
                for subset in candidates
            ]

            # argmax returns the first maximum, so ties keep the earliest
            # candidate in combination order.
            winner = np.argmax(candidate_scores)
            self.indices_ = candidates[winner]
            self.subsets_.append(self.indices_)
            self.scores_.append(candidate_scores[winner])
            n_remaining -= 1

        self.k_score_ = self.scores_[-1]
        return self

    def transform(self, X):
        """Return ``X`` restricted to the columns selected by ``fit``."""
        selected = self.indices_
        return X[:, selected]

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        """Fit on the given columns of the training data and score on the test data."""
        model = self.estimator
        model.fit(X_train[:, indices], y_train)
        predictions = model.predict(X_test[:, indices])
        return self.scoring(y_test, predictions)

# Select features with SBS wrapped around a KNN classifier.
# The elimination must run all the way down to two features: the classifiers
# trained on `k2` later in this file are plotted on a two-dimensional grid, so
# `k2` has to contain exactly two column indices. (Stopping at k_features=5
# and taking subsets_[-3] produced a 7-feature subset instead.)
knn = KNeighborsClassifier(n_neighbors=5)
sbs = SBS(knn, k_features=2)
sbs.fit(X_train_std, y_train)

# The last recorded subset is the two-feature one.
k2 = list(sbs.subsets_[-1])
feature_names_k2 = [wine.columns[i + 1] for i in k2]  # +1 because column 0 is 'Class label'
print(f'Selected features by KNN (indices): {k2}')
print(f'Selected features by KNN (names): {feature_names_k2}')

# Rank all features by random-forest importance (fitted on the
# unstandardized training data), most important first.
forest = RandomForestClassifier(n_estimators=500, random_state=1)
forest.fit(X_train, y_train)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]

# Keep the two highest-ranked features.
top2_indices = indices[:2]
feature_names_top2 = [wine.columns[i + 1] for i in top2_indices]  # +1 because column 0 is 'Class label'
print(f'Feature ranking by Random Forest (indices): {indices}')
print(f'Top 2 features by Random Forest (indices): {top2_indices}')
print(f'Top 2 features by Random Forest (names): {feature_names_top2}')

# 定義plot_decision_region函數
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

def plot_decision_region(X, y, classifier, test_idx=None, resolution=0.02):
    """Draw a classifier's decision regions over the first two columns of X.

    Parameters:
        X: 2-D array of samples; only columns 0 and 1 are used, and the
            classifier is queried with two-column inputs.
        y: class label for each row of X.
        classifier: fitted object whose ``predict`` accepts an (n, 2) array.
        test_idx: optional indices (range, list or array) of the rows of X to
            highlight as test samples; ``None`` or an empty sequence skips
            the highlighting.
        resolution: step of the grid on which predictions are evaluated.

    Draws on the current matplotlib figure and returns nothing. At most five
    distinct classes are supported, one marker/colour pair per class.
    """
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # Evaluate the classifier on a grid padded by one unit on every side.
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # One scatter series per class.
    for idx, cl in enumerate(classes):
        members = y == cl
        style = dict(alpha=0.8, c=colors[idx], marker=markers[idx], label=cl)
        # The 'x' marker is drawn without an edge colour, as in the original
        # code (presumably to avoid matplotlib's warning about edge colours
        # on unfilled markers).
        if markers[idx] != 'x':
            style['edgecolor'] = 'black'
        plt.scatter(x=X[members, 0], y=X[members, 1], **style)

    # Explicit None/emptiness check: a plain truthiness test raises
    # "truth value of an array is ambiguous" for NumPy index arrays.
    if test_idx is not None and len(test_idx) > 0:
        X_test = X[np.asarray(test_idx), :]
        plt.scatter(X_test[:, 0], X_test[:, 1], c='none', edgecolor='black', alpha=1.0,
                    linewidth=1, marker='o', s=100, label='test set')

# Six classifiers to train and compare on each two-feature subset.
classifiers = {
    'Perceptron': Perceptron(max_iter=40, eta0=0.1, random_state=0),
    'Logistic Regression': LogisticRegression(C=100.0, random_state=0),
    'SVM': SVC(kernel='linear', C=1.0, random_state=0),
    'Decision Tree': DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=0),
    'Random Forest': RandomForestClassifier(criterion='gini', n_estimators=50, random_state=0, n_jobs=2),
    'KNN': KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
}


def _evaluate_on_features(feature_idx, tag):
    """Fit, score and plot every entry of ``classifiers`` on two features.

    Reads the module-level ``classifiers``, ``X_train_std``, ``X_test_std``,
    ``X_combined_std``, ``y_train``, ``y_test``, ``y_combined`` and ``wine``.

    Parameters:
        feature_idx: the two feature-column indices (into the standardized
            feature matrices) to train and plot on.
        tag: short label of the selection method, used in printed lines and
            plot titles (e.g. ``"KNN"`` or ``"RF"``).

    Returns:
        A list of ``(name, accuracy, precision, recall, f1)`` tuples, one per
        classifier, in the iteration order of ``classifiers``.

    Raises:
        ValueError: if ``feature_idx`` does not contain exactly two indices;
            the decision-region plot is two-dimensional.
    """
    if len(feature_idx) != 2:
        raise ValueError(
            f"expected exactly 2 feature indices for plotting, got {len(feature_idx)}"
        )

    X_tr = X_train_std[:, feature_idx]
    X_te = X_test_std[:, feature_idx]
    X_all = X_combined_std[:, feature_idx]
    # Test rows are the trailing block of the combined arrays.
    test_rows = range(len(y_train), len(y_combined))

    results = []
    for name, clf in classifiers.items():
        clf.fit(X_tr, y_train)

        train_accuracy = accuracy_score(y_train, clf.predict(X_tr))
        y_test_pred = clf.predict(X_te)
        test_accuracy = accuracy_score(y_test, y_test_pred)

        # The reported accuracy is the mean of train and test accuracy.
        # NOTE(review): precision/recall/F1 below are test-only, so they are
        # not measured on the same data as this averaged accuracy.
        accuracy = (train_accuracy + test_accuracy) / 2

        precision = precision_score(y_test, y_test_pred, average='macro')
        recall = recall_score(y_test, y_test_pred, average='macro')
        f1 = f1_score(y_test, y_test_pred, average='macro')

        results.append((name, accuracy, precision, recall, f1))

        print(f"{name} with {tag} features - Accuracy: {accuracy:.3f}")

        plot_decision_region(X_all, y_combined, classifier=clf, test_idx=test_rows)
        plt.title(f"{name} with {tag} features")
        # +1 because column 0 of `wine` is 'Class label'.
        plt.xlabel(wine.columns[feature_idx[0] + 1])
        plt.ylabel(wine.columns[feature_idx[1] + 1])
        plt.legend(loc='upper left')
        plt.show()

    return results


# Train and test on the two features selected by SBS with KNN.
results_knn_features = _evaluate_on_features(k2, "KNN")

# Train and test on the two features ranked highest by the random forest.
results_rf_features = _evaluate_on_features(top2_indices, "RF")

# Print a per-classifier summary for each feature-selection method.
print("Summary of accuracies with KNN features:")
for name, accuracy, precision, recall, f1 in results_knn_features:
    print(f"{name}: Accuracy={accuracy:.3f}, Precision={precision:.3f}, Recall={recall:.3f}, F1 Score={f1:.3f}")

print("\nSummary of accuracies with RF features:")
for name, accuracy, precision, recall, f1 in results_rf_features:
    print(f"{name}: Accuracy={accuracy:.3f}, Precision={precision:.3f}, Recall={recall:.3f}, F1 Score={f1:.3f}")

# Collect the results into tables; both tables share the same columns.
result_columns = ['Algorithm', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
results_knn_df = pd.DataFrame(results_knn_features, columns=result_columns)
results_rf_df = pd.DataFrame(results_rf_features, columns=result_columns)

print("\nResults with KNN features:")
print(results_knn_df)

print("\nResults with RF features:")
print(results_rf_df)

# Render the tables as HTML.
results_knn_html = results_knn_df.to_html()
results_rf_html = results_rf_df.to_html()

# Save the HTML files. The encoding is explicit so the output does not depend
# on the platform's locale default.
with open("results_knn.html", "w", encoding="utf-8") as fh:
    fh.write(results_knn_html)

with open("results_rf.html", "w", encoding="utf-8") as fh:
    fh.write(results_rf_html)

print("HTML files have been created.")