In [2]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import time

In [3]:
from sklearn.model_selection import GridSearchCV
def searchCV(X, y, params, estimator, folds=5):
    clf = GridSearchCV(estimator=estimator, param_grid=params, cv=folds)
    clf.fit(X, y)
    return clf, clf.best_estimator_, clf.best_score_, \
        clf.cv_results_.get('params'), clf.cv_results_.get('mean_test_score'), \
        clf.cv_results_.get('rank_test_score')

In [4]:
import matplotlib.pyplot as plt
import time

def preprocess(data: pd.DataFrame, percent: float):
    idx = int(data.shape[0] * percent)
    train = data[:idx]
    test = data[idx:]
    train_X = train.to_numpy()[:, 1:]
    train_y = train.to_numpy()[:, 0]
    test_X = test.to_numpy()[:, 1:]
    test_y = test.to_numpy()[:, 0]

    return train_X, train_y, test_X, test_y

def data_process() -> dict:
    data_path = "/content/drive/MyDrive/514_assignment2/letter-recognition.data"
    data = pd.read_csv(data_path, header=None)
    HK = data.loc[(data[0] == 'H') | (data[0] == 'K')]
    MY = data.loc[(data[0] == 'M') | (data[0] == 'Y')]
    LU = data.loc[(data[0] == 'L') | (data[0] == 'U')]
    return {
        'HK': preprocess(HK, .9),
        'MY': preprocess(MY, .9),
        'LU': preprocess(LU, .9)
    }

In [5]:
def plot_param_score(title, score, params, best_idx, dimensionReductionMethod):
    labels = []
    for _params in params:
        label = ''
        for param in _params:
            label += f'{param}: {_params[param]}\n'
        labels.append(label)
    plt.figure(figsize=(int(len(score)/1.5), 5))
    plt.ylabel("Scores")
    plt.xlabel("Hyperparams")
    plt.plot(labels, score, marker='o')
    plt.title(f'{title}\nBest Param(s): {labels[best_idx]}\nBest Score: {score[best_idx]}\nDimension Reduction Method: {dimensionReductionMethod}')
    plt.xticks(rotation=60)
    plt.tight_layout()
    for i in range(len(score)):
        plt.annotate(round(score[i], 5), (labels[i], score[i]))
    plt.savefig(f'{title}_{dimensionReductionMethod}')
    plt.clf()

In [None]:
#There are other dimension reduction methods I need, now I use PCA method, you can copy the method you need into the next module.
'''
    pca = PCA(n_components=4)
    ffs = SequentialFeatureSelector(KNeighborsClassifier(
        n_neighbors=3), n_features_to_select=4)
    bfe = SequentialFeatureSelector(KNeighborsClassifier(
        n_neighbors=3), n_features_to_select=4, direction='backward')
    forest = SelectFromModel(
        estimator=RandomForestClassifier(), max_features=4)
    '''

'''

        'pca': [
            pca.fit_transform(train_X),
            pca.transform(test_X)
        ],
    
        'forwardFeatureSelection': [
            ffs.fit_transform(train_X, train_y),
            ffs.transform(test_X)
        ],
        'backwardFeatureElimination': [
            bfe.fit_transform(train_X, train_y),
            bfe.transform(test_X)
        ],
        'randomForest': [
            forest.fit_transform(train_X, train_y), 
            forest.transform(test_X)
        ],
    }
'''

In [8]:
def dimension_reduction(train_X, test_X, train_y=None):

    pca = PCA(n_components=4)
    
    return {
        'none': [
            train_X,
            test_X
        ],

        'pca': [
            pca.fit_transform(train_X),
            pca.transform(test_X)
        ]
    }

    

In [None]:
#There are other models I used, now I use KNN Classifier, you can copy the method you need into the next module.
'''
                KNeighborsClassifier(): {
                    'n_neighbors': [1, 2, 3, 4, 5],
                    'algorithm': ('ball_tree', 'kd_tree', 'brute')
                },
                DecisionTreeClassifier(): {
                    'max_depth': [11, 14, 16, 17, 8],
                    'min_samples_leaf': [4, 5, 6, 7, 8]
                },
                SVC(): {
                    'C': [3.0, 4.0, 5.0, 6.0, 7.0],
                    'kernel': ['linear', 'poly', 'rbf'],
                    'max_iter':[100, 200, 300, 400, 500]
                },
                RandomForestClassifier(): {
                    'n_estimators': [100, 200, 300, 400, 500],
                    'max_depth': [None, 2, 4, 6, 8]
                }
                
            }
'''


In [9]:
if __name__ == '__main__':
    with open('results.yml', 'a') as f:
      pairs = data_process()
      for pair in pairs:
            f.write(f'\n###### Current Pair: {pair}')
            print('\n###### Current Pair: ', pair)
            train_X, train_y, test_X, test_y = pairs[pair]

            # Model, other models can be seen above.
            models = {
                KNeighborsClassifier(): {
                    'n_neighbors': [1, 2, 3, 4, 5],
                    'algorithm': ('ball_tree', 'kd_tree', 'brute')
                }
            }

            dReduction = dimension_reduction(train_X=train_X, test_X=test_X, train_y=train_y)
            plt_model_lb = []

            for model in models:
                f.write(f'\n##Model: {str(model)}')
                print(f'\n##Model: {str(model)}')
                
                for method in dReduction:
                    f.write(f"\n###Dimension Reduction Method: {method}")
                    print(f"\n###Dimension Reduction Method: {method}")
                    data = dReduction[method]
                  
                    cv_result = searchCV(X=data[0], y=train_y,
                                        params=models[model], estimator=model)
  
                    f.write(f"Best Estimator: {cv_result[1]}, params: {str(cv_result[3][cv_result[5].tolist().index(1)])}")
                    print(f"Best Estimator: {cv_result[1]}, params: {str(cv_result[3][cv_result[5].tolist().index(1)])}")
                    
                    plot_param_score(f'{pair}-{str(model)}',
                                    cv_result[4], cv_result[3], cv_result[5].tolist().index(1),method)
                    clf = cv_result[0]
                    score = clf.score(data[1], test_y)
                    f.write(f"Test Performance: {score}")
                    print(f"Test Performance: {score}")
                    
                    #you can uncomment the model you need
                    start = time.time()
                    KNeighborsClassifier()
                    #DecisionTreeClassifier()
                    #RandomForestClassifier()
                    #SVC()
                    end = time.time()
                    print("KNN classify time before tune:", end - start)


###### Current Pair:  HK

##Model: KNeighborsClassifier()

###Dimension Reduction Method: none
Best Estimator: KNeighborsClassifier(algorithm='brute', n_neighbors=1), params: {'algorithm': 'brute', 'n_neighbors': 1}
Test Performance: 0.9391891891891891
KNN classify time before tune: 8.821487426757812e-06

###Dimension Reduction Method: pca
Best Estimator: KNeighborsClassifier(algorithm='ball_tree'), params: {'algorithm': 'ball_tree', 'n_neighbors': 5}
Test Performance: 0.8918918918918919
KNN classify time before tune: 1.3589859008789062e-05

###### Current Pair:  MY

##Model: KNeighborsClassifier()

###Dimension Reduction Method: none
Best Estimator: KNeighborsClassifier(algorithm='ball_tree', n_neighbors=1), params: {'algorithm': 'ball_tree', 'n_neighbors': 1}
Test Performance: 1.0
KNN classify time before tune: 1.0013580322265625e-05

###Dimension Reduction Method: pca
Best Estimator: KNeighborsClassifier(algorithm='ball_tree', n_neighbors=3), params: {'algorithm': 'ball_tree', 'n_n

<Figure size 1000x500 with 0 Axes>

<Figure size 1000x500 with 0 Axes>

<Figure size 1000x500 with 0 Axes>

<Figure size 1000x500 with 0 Axes>

<Figure size 1000x500 with 0 Axes>

<Figure size 1000x500 with 0 Axes>