<h3>使用資料: wine.csv</h3>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegressionCV
import numpy as np
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier


<h3>函式說明: </h3>
將資料集分割成訓練集和測試集，比例為 7:3。<br>
使用 StandardScaler() 進行資料標準化，並使用PCA降維至 2個維度<br>
使用 LogisticRegression 進行模型訓練，其中使用了三種不同的 solver，並計算其在測試集上的準確率。<br>
使用 LogisticRegressionCV 進行模型訓練，同樣使用了三種不同的 solver 和不同的正則化程度 (Cs)，並計算其在測試集上的準確率。<br>
使用 SVM 進行模型訓練，其中使用了不同的核函數 (linear、rbf、poly) 搭配正則化程度 (C=1)，並計算其在測試集上的準確率。<br>
使用 MLPClassifier 進行模型訓練，其中使用了隱藏層大小 (h=30)、不同的激活函數 (logistic、relu) 和求解器 (adam、sgd)，並計算其在測試集上的準確率。<br>
同樣的步驟，但是使用PCA降維後的資料集。<br>
最後，函式會回傳原始的資料集、各模型在原始資料集上的準確率、各模型在PCA後資料集上的準確率。

In [74]:
def  classifier (X,y,dataname,h):
    data=dataname
    # Split data into training and testing data 7:3
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30) 
    # Standardize data
    scaler = StandardScaler()
    X_train_ = scaler.fit_transform(X_train)
    X_test_ = scaler.fit_transform(X_test)

#PCA
    pca = PCA(n_components = 2).fit(X_train_)
    Z_train = pca.transform(X_train_)
    Z_test = pca.transform(X_test_)

#LogisticRegression
    LR=np.zeros(3)
    LR_CV=np.zeros(3)
    LR_pca=np.zeros(3)
    LR_CV_pca=np.zeros(3)

    Cs = np.logspace(-5, 5, 20)
    opts = dict(tol = 1e-6, max_iter = int(1e6), verbose=1)
    solver = ['lbfgs','liblinear','newton-cg'] 
    for s in range(3):
        clf_original = LogisticRegression(solver = solver[s], **opts)
        clf_original.fit(X_train_, y_train)
        y_pred = clf_original.predict(X_test_)
        # 測 試 資 料 之 準 確 率 回 報
        LR[s]= accuracy_score(y_test, y_pred)

        clf_PCA = LogisticRegression(solver = solver[s], **opts)
        clf_PCA.fit(Z_train, y_train)
        LR_pca[s]=clf_PCA.score(Z_test, y_test)

#LogisticRegressionCV
        clf_original = LogisticRegressionCV(solver = solver[s], Cs=Cs, **opts)
        clf_original.fit(X_train_, y_train)
        y_pred = clf_original.predict(X_test_)
        # 測 試 資 料 之 準 確 率 回 報
        LR_CV[s]= accuracy_score(y_test, y_pred)

        clf_PCA = LogisticRegressionCV(solver = solver[s], Cs=Cs, **opts)
        clf_PCA.fit(Z_train, y_train)
        LR_CV_pca[s]=clf_PCA.score(Z_test, y_test)    

 # SVM
    SVM=np.zeros((2,3))
    SVM_pca=np.zeros((2,3))
    C = 1 # SVM regularization parameter
    opts = [dict(C = C, tol = 1e-6, max_iter = int(1e6)),dict(C = C, decision_function_shape = 'ovo', tol = 1e-6, max_iter = int(1e6))]
    
    for i in range(2):
        for j in range(3):
            clf_svm = [SVC(kernel="linear", **opts[i]),\
            SVC(kernel="rbf", gamma=0.2, **opts[i]),\
             SVC(kernel="poly", degree=3, gamma="auto", **opts[i])]
             #LinearSVC(**opts[i]) ]

            clf_svm[j].fit(X_train, y_train)
            predictions = clf_svm[j].predict(X_test)
            SVM[i][j]= accuracy_score(y_test, predictions)
            
            clf_svm[j].fit(Z_train, y_train) #pca
            predictions = clf_svm[j].predict(Z_test)
            SVM_pca[i][j]= accuracy_score(y_test, predictions)

#MLPClassifier
    MLP= np.zeros((2,2))
    MLP_pca = np.zeros((2,2))
    hidden_layers = (h,)
    activation = ['logistic','relu']
    solver = ['adam','sgd']
    for i in range(2):
        for j in range(2):
            opts = dict(hidden_layer_sizes = hidden_layers, verbose = True, \
            activation = activation[i], tol = 1e-6, max_iter = int(1e6))
            clf_MLP = MLPClassifier(solver = solver[j], **opts)
            clf_MLP.fit(X_train, y_train)
            predictions_mlp = clf_MLP.predict(X_test)
            MLP[i][j]= accuracy_score(y_test, predictions_mlp)

            clf_MLP.fit(Z_train, y_train) #pca
            predictions_mlp = clf_MLP.predict(Z_test)
            MLP_pca[i][j]= accuracy_score(y_test, predictions_mlp)

    return data,LR,LR_CV,SVM,MLP,LR_pca,LR_CV_pca,SVM_pca,MLP_pca

In [75]:
# Read data
df = pd.read_csv('E:\\淺度機器學習\\data\\wine.csv')
X = np.array(df.iloc[:, :-1]) # 排 除 最 後 一 欄 標 籤
y = np.array(df.iloc[:, -1])

data,LR,LR_CV,SVM,MLP,LR_pca,LR_CV_pca,SVM_pca,MLP_pca =classifier(X,y,"wine.csv",30)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.4s finished


Iteration 1, loss = 1.27625767
Iteration 2, loss = 1.26876266
Iteration 3, loss = 1.26140735
Iteration 4, loss = 1.25418597
Iteration 5, loss = 1.24709967
Iteration 6, loss = 1.24015161
Iteration 7, loss = 1.23334497
Iteration 8, loss = 1.22668204
Iteration 9, loss = 1.22016354
Iteration 10, loss = 1.21378770
Iteration 11, loss = 1.20754899
Iteration 12, loss = 1.20143615
Iteration 13, loss = 1.19542911
Iteration 14, loss = 1.18949428
Iteration 15, loss = 1.18357731
Iteration 16, loss = 1.17759200
Iteration 17, loss = 1.17140381
Iteration 18, loss = 1.16480618
Iteration 19, loss = 1.15749067
Iteration 20, loss = 1.14902482
Iteration 21, loss = 1.13893241
Iteration 22, loss = 1.12722519
Iteration 23, loss = 1.11604196
Iteration 24, loss = 1.11142378
Iteration 25, loss = 1.11211819
Iteration 26, loss = 1.11085390
Iteration 27, loss = 1.10736055
Iteration 28, loss = 1.10194035
Iteration 29, loss = 1.09505073
Iteration 30, loss = 1.08855617
Iteration 31, loss = 1.08472192
Iteration 32, los

In [77]:
print("dataname = ",data,"\n","LR = ",LR,"\n","LR_CV= ",LR_CV,"\n","SVM= ",SVM,"\n","MLP= ",MLP,\
      "\n--------------\n",\
      "LR_pca= ",LR_pca,"\n","LR_CV_pca= ",LR_CV_pca,"\n","SVM_pca= ",SVM_pca,"\n","MLP_pca= ",MLP_pca)

dataname =  wine.csv 
 LR =  [0.98148148 0.96296296 0.98148148] 
 LR_CV=  [0.90740741 0.96296296 0.90740741] 
 SVM=  [[0.94444444 0.35185185 0.98148148]
 [0.94444444 0.35185185 0.98148148]] 
 MLP=  [[0.90740741 0.2962963 ]
 [0.2962963  0.33333333]] 
--------------
 LR_pca=  [0.88888889 0.87037037 0.88888889] 
 LR_CV_pca=  [0.87037037 0.87037037 0.87037037] 
 SVM_pca=  [[0.85185185 0.7962963  0.77777778]
 [0.85185185 0.7962963  0.77777778]] 
 MLP_pca=  [[0.85185185 0.90740741]
 [0.83333333 0.90740741]]


<h3>正確率最高的模型</h3>
將各個模型中不同參數下，正確率最高的模型挑選出來作為代表，進行比較

In [20]:
acc = {'LR': [0.9815, 0.8889],
        'LR_CV': [ 0.9074,0.8704 ],
        'SVM': [ 0.9815, 0.8519 ],
        'MLP': [0.9074,0.9074]       
        }
index = ['original', 'pca']
df = pd.DataFrame(acc, index=index)
df

Unnamed: 0,LR,LR_CV,SVM,MLP
original,0.9815,0.9074,0.9815,0.9074
pca,0.8889,0.8704,0.8519,0.9074


<h3>實驗結果觀察</h3>
根據以上實驗結果，對於沒有進行PCA的資料集，羅吉斯迴歸模型皆有0.9以上的正確率，其中以　solver = ['lbfgs','newton-cg'] 兩模型獲得最高正確率，而SVM模型中除了 kernel="rbf" 的模型正確率明顯較低之外，其他四種模型正確率皆在0.95左右，MLP模型則是唯一以 activation='logistic'、solver='adam' 為最高正確率模型。<br>
當 n_components＝2 時，可以發現雖然大部份沒有做PCA的資料集正確率比有做PCA的資料集來得低，但有做PCA的資料集在每個模型的不同參數搭配下，有更相近（穩定）的正確率，此外，可以特別注意到，在SVM模型中 kernel="rbf" 的模型，以及MLP模型中 activation='logistic'、solver='sgd' 和 activation='relu' 搭配 solver = ['adam','sgd'] 的模型，在有做PCA的情況下，有更高的正確率，其中以 activation='relu' 的模型有最高正確率。<br>
綜合以上觀察結果，對於此資料集，會建議使用經過PCA的資料集，藉由 MLP(activation='relu') 的模型進行建模預測會更有效率!!!