In [9]:
# inline plotting instead of popping out
%matplotlib inline

# python 3.8.8
import os, itertools, csv

from IPython.display import Image
from IPython.display import display

# numpy  1.22.4
import numpy as np

# pandas  1.2.4
import pandas as pd

# scikit-learn  0.24.1
from sklearn.compose import ColumnTransformer
from sklearn.datasets import make_moons
from sklearn.impute import SimpleImputer 
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.svm import SVC

# matplotlib  3.3.4
import matplotlib
matplotlib.rcParams.update({'font.size': 12})
plt = matplotlib.pyplot

# load utility classes/functions e.g., plot_decision_regions()
import urllib.request
urllib.request.urlretrieve("https://nthu-datalab.github.io/ml/labs/04-1_Perceptron_Adaline/lab04lib.py", "lab04lib.py")
from lab04lib import *

# Make output directory
if not os.path.exists("output/") : os.mkdir("output/")
    
import warnings
warnings.filterwarnings("ignore")

In [21]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/'
                 'mushroom/agaricus-lepiota.data', header=None, engine='python')
column_name = ['classes','cap-shape', 'cap-surface','cap-color','bruises?','odor',
               'gill-attachment','gill-spacing','gill-size','gill-color',
               'stalk-shape','stalk-root','stalk-surface-above-ring',
               'stalk-surface-below-ring','stalk-color-above-ring',
               'stalk-color-below-ring','veil-type','veil-color','ring-number',
               'ring-type','spore-print-color','population','habitat']
df.columns = column_name
df.head()

Unnamed: 0,classes,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [22]:
# encode label first
label_le = LabelEncoder()
df['classes'] = label_le.fit_transform(df['classes'].values)

label_category = LabelEncoder()
category_features = df.columns[1:]
categories = []


# transform categorical values into numerical values
# be careful that '?' will also be encoded
# we have to replace it to NaN in numerical
for i in category_features:
    df[i] = label_category.fit_transform(df[i].values)
    class_list = label_category.classes_.tolist()

    # replace '?' with 'NaN'
    if '?' in class_list:
        idx = class_list.index('?')
        df[i] = df[i].replace(idx, np.nan)
  
    categories.append(np.arange(len(class_list)))

display(df.head(15))

Unnamed: 0,classes,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
5,0,5,3,9,1,0,1,0,0,5,...,2,7,7,0,2,1,4,2,2,1
6,0,0,2,8,1,0,1,0,0,2,...,2,7,7,0,2,1,4,2,2,3
7,0,0,3,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,3,3
8,1,5,3,8,1,6,1,0,1,7,...,2,7,7,0,2,1,4,2,4,1
9,0,0,2,9,1,0,1,0,0,2,...,2,7,7,0,2,1,4,2,3,3


In [23]:
print(df.shape)

# drop rows with missing values
df_drop_row = df.dropna()
print(df_drop_row.shape)

(8124, 23)
(5644, 23)


In [40]:
# we perform one-hot encoding on both impute data and drop-row data
# impute_data = df_impute.values
drop_row_data = df_drop_row.values

# find the index of the categorical feature
catego_features_idx = []
for fea in category_features:
    catego_features_idx.append(df.columns.tolist().index(fea)-1)

# give the column index you want to do one-hot encoding
ohe = ColumnTransformer(
    [
        ("one_hot_encoder", OneHotEncoder(sparse = False, categories = "auto"), catego_features_idx)
    ],
    remainder = "passthrough"
)

# impute_onehot_data = ohe.fit_transform(impute_data)
# print('Impute: {}'.format(impute_data.shape))
# print('Impute one-hot: {}'.format(impute_onehot_data.shape))

drop_row_onehot_data = ohe.fit_transform(drop_row_data)
print('Drop row: {}'.format(drop_row_data.shape))
print('Drop row one-hot: {}'.format(drop_row_onehot_data.shape))

Drop row: (5644, 23)
Drop row one-hot: (5644, 95)


In [41]:
X = df.drop('classes', axis=1).values
Y = df['classes'].values

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

ohe = ColumnTransformer(
    [
        ('ohe', OneHotEncoder(categories = categories, sparse = False), catego_features_idx),
    ],
    remainder = "passthrough"
)

pipeLine_knn = Pipeline([
                        ("imr", SimpleImputer(missing_values = np.NaN, strategy = 'most_frequent')),
                        ("ohe", ohe),
                        ("scl", StandardScaler()),
                        ("clf", KNeighborsClassifier(n_neighbors = 10, p = 2, metric = "minkowski"))
])

pipeLine_svc = Pipeline([
                         ("imr", SimpleImputer(missing_values = np.NaN, strategy = 'most_frequent')),
                         ("ohe", ohe),
                         ("scl", StandardScaler()),
                         ("clf", SVC(kernel='rbf', random_state=0, gamma=0.001, C=100.0))
])


pipeLine_knn.fit(x_train, y_train)
y_pred = pipeLine_knn.predict(x_test)
print('[KNN]')
print('Misclassified samples: %d' % (y_test != y_pred).sum())
print('Accuracy: %.4f' % accuracy_score(y_test, y_pred))

pipeLine_svc.fit(x_train, y_train)
y_pred = pipeLine_svc.predict(x_test)
print('\n[SVC]')
print('Misclassified samples: %d' % (y_test != y_pred).sum())
print('Accuracy: %.4f' % accuracy_score(y_test, y_pred))
     

[KNN]
Misclassified samples: 0
Accuracy: 1.0000

[SVC]
Misclassified samples: 0
Accuracy: 1.0000


# PipeLine_svc
## choose best model

In [42]:
param_gamma = [0.0001, 0.001, 0.01, 0.1, 1.0]
param_C = [0.1, 1.0, 10.0, 100.0]

# here you can set parameter for different steps 
# by adding two underlines (__) between step name and parameter name
param_grid = [{'clf__C': param_C, 
               'clf__kernel': ['linear']},
              {'clf__C': param_C, 
               'clf__gamma': param_gamma, 
               'clf__kernel': ['rbf']}]

# set pipe_svm as the estimator
gs = GridSearchCV(
    estimator = pipeLine_svc, 
    param_grid = param_grid, 
    scoring = "accuracy",
    cv = 3
)

gs = gs.fit(x_train, y_train)
print('[SVC: grid search]')
print('Validation accuracy: %.3f' % gs.best_score_)
print(gs.best_params_)

clf = gs.best_estimator_
clf.fit(x_train, y_train)
print('Test accuracy: %.3f' % clf.score(x_test, y_test))

[SVC: grid search]
Validation accuracy: 1.000
{'clf__C': 0.1, 'clf__kernel': 'linear'}
Test accuracy: 1.000
