# Lab07: KNN, SVM, Data Preprocessing, and Scikit-learn Pipeline
<hr>

110062802 呂宸漢


## 1. Preprocess Data and Handle Missing Value

用`LabelEncoder`將字串資料轉成數字並將`?`設成`NaN`。


In [1]:
import pandas as pd

# Column headers for the UCI mushroom dataset: the target ('classes') comes
# first, followed by the 22 feature columns. The raw file has no header row.
column_name = [
    'classes', 'cap-shape', 'cap-surface', 'cap-color', 'bruises?', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
    'stalk-surface-below-ring', 'stalk-color-above-ring',
    'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
    'ring-type', 'spore-print-color', 'population', 'habitat',
]

# Location of the raw data file.
data_url = ('https://archive.ics.uci.edu/ml/machine-learning-databases/'
            'mushroom/agaricus-lepiota.data')

# Read without a header row, then attach the names defined above.
df = pd.read_csv(data_url, header=None, engine='python')
df.columns = column_name

# Preview the first rows.
df.head()


Unnamed: 0,classes,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [2]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# A single encoder instance is refit for the target and then for every
# feature, so after this cell `le` holds the mapping of the last column only.
le = LabelEncoder()
df['classes'] = le.fit_transform(df['classes'].values)

categorical_features = [
    'cap-shape', 'cap-surface', 'cap-color', 'bruises?', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
    'stalk-surface-below-ring', 'stalk-color-above-ring',
    'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
    'ring-type', 'spore-print-color', 'population', 'habitat',
]

# categories[k] lists every integer code produced for feature k, including
# the code that stood for '?' before it was replaced by NaN.
categories = []
for feature in categorical_features:
    codes = le.fit_transform(df[feature].values)
    labels = le.classes_.tolist()
    df[feature] = codes
    if '?' in labels:
        # '?' is the dataset's missing-value marker: turn its code into NaN.
        df[feature] = df[feature].mask(df[feature] == labels.index('?'))
    categories.append(np.arange(len(labels)))

df.head(5)


Unnamed: 0,classes,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


## 2. Split Data into Training and Testing Data


In [3]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns.
df_y = df['classes']
df_X = df.drop('classes', axis=1)

# Positions of the categorical features among the columns of df_X.
feature_columns = df_X.columns.to_list()
categorical_feature_indices = [feature_columns.index(name) for name in categorical_features]

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
split = train_test_split(df_X.values, df_y.values, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split


## 3. Preprocess Data and Evaluate Model

將資料依序經過下列方式處理並餵進model做training與prediction。
1. 利用`SimpleImputer`將`NaN`取代成最常出現的資料。
2. 將所有categorical features利用`OneHotEncoder`轉成one hot。
3. 利用`StandardScaler`做標準化。

由於兩個model在預測testing data的accuracy都為100%，因此兩個model都可以選為最佳的model。


In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

def _make_encoder():
    """Return a fresh, unfitted one-hot ColumnTransformer for the categorical columns."""
    return ColumnTransformer(
        [
            ('ohe', OneHotEncoder(categories=categories), categorical_feature_indices),
        ],
        remainder="passthrough",
        # Force dense output here rather than through OneHotEncoder's
        # `sparse=False`, which newer scikit-learn releases no longer accept.
        sparse_threshold=0,
    )


def _make_pipeline(classifier):
    """Return an impute -> one-hot -> standardize -> classify pipeline.

    Every call builds its own transformer instances, so fitting one pipeline
    never refits a step that belongs to another pipeline.
    """
    return Pipeline([
        # `np.nan` is the canonical spelling; the `np.NaN` alias is gone in NumPy 2.
        ("imr", SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
        ("ohe", _make_encoder()),
        ("scl", StandardScaler()),
        ("clf", classifier),
    ])


def _fit_and_report(header, pipeline):
    """Fit `pipeline` on the training split, print test metrics, return the predictions."""
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    print(header)
    print('Misclassified samples: %d' % (y_test != predictions).sum())
    print('Accuracy: %.4f' % accuracy_score(y_test, predictions))
    return predictions


# Standalone, unfitted encoder kept under its original name.
ohe = _make_encoder()

pipe_knn = _make_pipeline(KNeighborsClassifier())
pipe_svc = _make_pipeline(SVC())

y_pred = _fit_and_report('[KNN]', pipe_knn)
y_pred = _fit_and_report('\n[SVC]', pipe_svc)


[KNN]
Misclassified samples: 0
Accuracy: 1.0000

[SVC]
Misclassified samples: 0
Accuracy: 1.0000
