## Import

In [115]:
import scipy.io as sio
from scipy.stats import mode
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


%config InlineBackend.figure_format = 'retina'

## Define Constants and Methods - change this if adding new models/datasets

In [89]:
# Maximum number of rows kept from each dataset after shuffling.
CUTOFF_SIZE = 5000
# Number of candidate n_neighbors values generated for the KNN grid search.
K_LIST_SIZE = 26

# Fraction of the data held out as the test split, one entry per partition.
TEST_PARTITION_PERCENTAGES = [0.2, 0.5, 0.8]

# Sizes of the four axes of the result tables.
NUM_MODELS, NUM_DATASETS, NUM_TRIALS = 4, 3, 3
NUM_PARTITIONS = len(TEST_PARTITION_PERCENTAGES)

# Encode all attribute values used to index the accuracy table.
KNN, SVM, LR, RF = range(4)                          # model axis
PART_TEST_20, PART_TEST_50, PART_TEST_80 = range(3)  # partition axis
ADULT, COV, LETTER = range(3)                        # dataset axis

In [None]:
# Both result tables share the same 4-D layout and are indexed as
#   TABLE[MODEL][DSET][PART][TRIAL]
# ACCU_TABLE cells hold a (train_acc, val_acc, test_acc) tuple, e.g.
#   ACCU_TABLE[KNN][ADULT][PART_TEST_50][2] = (train_acc, val_acc, test_acc)
# BEST_PARAM_TABLE cells hold the best parameters found for the same run.
# dtype=object lets each cell store an arbitrary Python object; cells start at 0.
# ---------------- Running this cell will clear all data! ----------------
ACCU_TABLE, BEST_PARAM_TABLE = (
    np.zeros((NUM_MODELS, NUM_DATASETS, NUM_PARTITIONS, NUM_TRIALS),
             dtype=object)
    for _ in range(2)  # two independent arrays, not two names for one array
)

In [4]:
def print_shapes(*args):
    """Print the ``shape`` attribute of every argument, one per line.

    Arguments are handled in the order given; each must expose ``.shape``.
    """
    for shape in (item.shape for item in args):
        print(shape)

def ___():
    """Print a 50-character line of dashes (the wider separator)."""
    rule = "-" * 50
    print(rule)
    
def __():
    """Print a 25-character line of dashes (the narrower separator)."""
    rule = "-" * 25
    print(rule)

In [5]:
def store_acc_param(model_idx,
                    dset_idx,
                    part_idx,
                    trial_idx,
                    best_param,
                    train_acc,
                    val_acc,
                    test_acc):
    """Record the outcome of one run in the two global result tables.

    The four index arguments select the cell (model, dataset, partition,
    trial). ``best_param`` is written to BEST_PARAM_TABLE and the tuple
    ``(train_acc, val_acc, test_acc)`` to ACCU_TABLE at that same cell.
    """
    cell = (model_idx, dset_idx, part_idx, trial_idx)
    BEST_PARAM_TABLE[cell] = best_param
    ACCU_TABLE[cell] = (train_acc, val_acc, test_acc)

## Load dataset

In [6]:
# Each dataset is read without a header row (columns are labelled 0..n-1),
# shuffled, and truncated to the first CUTOFF_SIZE rows.

# Adult dataset
adult_set = pd.read_csv('adult.data', header=None,
                        skipinitialspace=True)
# data cleaning: drop every row that contains a '?' placeholder in any column.
# `axis` is passed by keyword because pandas 2.x no longer accepts it
# positionally in DataFrame.any().
adult_set = adult_set[~adult_set.eq('?').any(axis=1)]
adult_set = shuffle(adult_set)[:CUTOFF_SIZE]

# Covtype dataset
covtype_set = pd.read_csv('covtype.data', header=None,
                          skipinitialspace=True)
covtype_set = shuffle(covtype_set)[:CUTOFF_SIZE]

# Letter dataset
letter_set = pd.read_csv('letter-recognition.data.txt', header=None,
                         skipinitialspace=True)
letter_set = shuffle(letter_set)[:CUTOFF_SIZE]

In [7]:
# check shape of each dataset: prints one (rows, columns) tuple per DataFrame,
# in the order adult, covtype, letter
print_shapes(adult_set,covtype_set,letter_set)

(5000, 15)
(5000, 55)
(5000, 17)


## Data Preprocessing

In [8]:
# Pos-Neg dataset splitting criteria: print how many rows each candidate
# binary labelling would mark True / False.

# adult: positive when the last column equals ">50K"
print(adult_set[adult_set.columns[-1]].eq(">50K").value_counts())
___()
# covtype: 2 is the most frequent cover type, so it is chosen as the True class
print(covtype_set[covtype_set.columns[-1]].value_counts())
__()
print(covtype_set[covtype_set.columns[-1]].eq(2).value_counts())
___()
# letter: positive when the first column compares <= 'M'
print(letter_set[0].le('M').value_counts())

False    3762
True     1238
Name: 14, dtype: int64
--------------------------------------------------
2    2470
1    1782
3     345
7     160
6     152
5      67
4      24
Name: 54, dtype: int64
-------------------------
False    2530
True     2470
Name: 54, dtype: int64
--------------------------------------------------
True     2508
False    2492
Name: 0, dtype: int64


In [9]:
# label pos/neg data: overwrite each label column in place with a boolean
# (True = positive class). The label is the last column for adult and
# covtype, and the first column for letter.
adult_set[adult_set.columns[-1]] = adult_set[adult_set.columns[-1]].eq(">50K")
covtype_set[covtype_set.columns[-1]] = covtype_set[covtype_set.columns[-1]].eq(2)
letter_set[0] = letter_set[0].le('M') # y is the first column

In [10]:
# sanity check: the relabelling overwrote existing columns, so the shapes
# printed here should match the ones printed right after loading
print_shapes(adult_set,covtype_set,letter_set)

(5000, 15)
(5000, 55)
(5000, 17)


In [11]:
# subsets to feed into models
adult_X_trains = []
adult_X_tests = []
adult_Y_trains = []
adult_Y_tests = []

cov_X_trains = []
cov_X_tests = []
cov_Y_trains = []
cov_Y_tests = []

letter_X_trains = []
letter_X_tests = []
letter_Y_trains = []
letter_Y_tests = []

# every list above receives NUM_TRIALS * NUM_PARTITIONS entries
# (9 = 3 trials * 3 partitions with the current constants)

# PRECONDITION:
# adult_set.shape[0] == covtype_set.shape[0] == letter_set.shape[0] == 5000
for i in range(NUM_TRIALS):
    adult_set = shuffle(adult_set)
    covtype_set = shuffle(covtype_set)
    letter_set = shuffle(letter_set)

    # .loc slices are label-based and include both end points.
    # adult / covtype keep the label in the last column, letter in the first.
    adult_X = adult_set.loc[:,:13]; adult_Y = adult_set.loc[:,14]
    cov_X = covtype_set.loc[:,:53]; cov_Y = covtype_set.loc[:,54]
    letter_X = letter_set.loc[:,1:]; letter_Y = letter_set.loc[:,0]

    # one-hot encode the non-numeric feature columns
    adult_X = pd.get_dummies(adult_X)
    cov_X = pd.get_dummies(cov_X)
    letter_X = pd.get_dummies(letter_X)

    # get_dummies keeps the integer labels of the columns it leaves alone and
    # gives string labels to the encoded ones. Recent scikit-learn releases
    # refuse to fit on a DataFrame whose column labels mix types, so make
    # every label a string.
    adult_X.columns = adult_X.columns.astype(str)
    cov_X.columns = cov_X.columns.astype(str)
    letter_X.columns = letter_X.columns.astype(str)

    # Iterate over the shared constant (not a second hard-coded list) so the
    # number of splits per trial always equals NUM_PARTITIONS.
    for part_percent in TEST_PARTITION_PERCENTAGES:

        # working on adult dataset
        adult_X_train, adult_X_test, adult_Y_train, adult_Y_test = \
            train_test_split(adult_X, adult_Y, test_size=part_percent)

        adult_X_trains.append(adult_X_train)
        adult_X_tests.append(adult_X_test)
        adult_Y_trains.append(adult_Y_train)
        adult_Y_tests.append(adult_Y_test)

        # working on cov dataset
        cov_X_train, cov_X_test, cov_Y_train, cov_Y_test = \
            train_test_split(cov_X, cov_Y, test_size=part_percent)

        cov_X_trains.append(cov_X_train)
        cov_X_tests.append(cov_X_test)
        cov_Y_trains.append(cov_Y_train)
        cov_Y_tests.append(cov_Y_test)

        # working on letter dataset
        letter_X_train, letter_X_test, letter_Y_train, letter_Y_test = \
            train_test_split(letter_X, letter_Y, test_size=part_percent)

        letter_X_trains.append(letter_X_train)
        letter_X_tests.append(letter_X_test)
        letter_Y_trains.append(letter_Y_train)
        letter_Y_tests.append(letter_Y_test)

# POSTCONDITION:
# after the above process each list is ordered trial-major, i.e. the entry for
# (trial, partition) sits at index trial * NUM_PARTITIONS + partition:
# - trial 1
#   - [0]part1 (80% of the data used for training, 20% for testing)
#   - [1]part2 (50% of the data used for training)
#   - [2]part3 (20% of the data used for training)
# - trial 2
#   - [3]part1 (80% of the data used for training)
#     ...
# with each list having 9 = 3 trials * 3 partition entries
# and a total of 12 such lists (3 datasets * {X_train, X_test, Y_train, Y_test})

In [None]:
# reorder the datasets into
# - part 1
#   - [0]part1_trial1 (80% data)
#   - [1]part1_trial2 (80% data)
#   - [2]part1_trial3 (80% data)
# - part 2
#   - [3]part2_trial1 (50% data)
#     ...

# mylist=['a','b','c','d','e']
# myorder=[3,2,0,1,4]
# mylist = [ mylist[i] for i in myorder] #// mylist becomes [d,c,a,b,e]
# print mylist



## Train & Test

### KNN

In [None]:
# parameters
# Number of cross-validation folds. The KNN cells pass it to GridSearchCV as
# cv=CV_FOLD and also use it to cap the largest n_neighbors candidate at the
# share of the training rows left after one fold is held out.
CV_FOLD = 3

In [87]:
# KNN on adult dataset: grid-search n_neighbors for every (partition, trial)
# split and record the best parameters and accuracies.
for part_itr in range(NUM_PARTITIONS):
    for trial_itr in range(NUM_TRIALS):
        # The split lists are stored trial-major (all partitions of trial 0,
        # then trial 1, ...), so the stride is the number of partitions.
        data_idx = trial_itr * NUM_PARTITIONS + part_itr

        # extract input data from input data table
        adult_X_train = adult_X_trains[data_idx]
        adult_X_test = adult_X_tests[data_idx]
        adult_Y_train = adult_Y_trains[data_idx]
        adult_Y_test = adult_Y_tests[data_idx]

        # K_list elements vary by train size so it needs to be built inside
        # the loop. The largest K is capped at the (1 - 1/CV_FOLD) share of
        # the training rows, i.e. what remains once one CV fold is held out.
        K_list = np.linspace(start=1,
                             stop=adult_X_train.shape[0] * (1-1/CV_FOLD),
                             num=K_LIST_SIZE).astype(int)
        knn_param = {'n_neighbors':K_list}

        clf_knn = GridSearchCV(KNeighborsClassifier(algorithm="brute"),
                               knn_param,
                               cv=CV_FOLD,
                               return_train_score=True)
        clf_knn.fit(adult_X_train, adult_Y_train)

        # stored as (train_acc, val_acc, test_acc): mean CV train / validation
        # score of the best candidate, then its score on the held-out test split
        store_acc_param(KNN,ADULT,part_itr,trial_itr,
                        clf_knn.best_params_,
                        clf_knn.cv_results_['mean_train_score'][clf_knn.best_index_],
                        clf_knn.cv_results_['mean_test_score'][clf_knn.best_index_],
                        clf_knn.score(adult_X_test,adult_Y_test))

In [43]:
# KNN on cov dataset: grid-search n_neighbors for every (partition, trial)
# split and record the best parameters and accuracies.
for part_itr in range(NUM_PARTITIONS):
    for trial_itr in range(NUM_TRIALS):
        # The split lists are stored trial-major (all partitions of trial 0,
        # then trial 1, ...), so the stride is the number of partitions.
        data_idx = trial_itr * NUM_PARTITIONS + part_itr

        # extract input data from input data table
        cov_X_train = cov_X_trains[data_idx]
        cov_X_test = cov_X_tests[data_idx]
        cov_Y_train = cov_Y_trains[data_idx]
        cov_Y_test = cov_Y_tests[data_idx]

        # K_list elements vary by train size so it needs to be built inside
        # the loop. The largest K is capped at the (1 - 1/CV_FOLD) share of
        # the training rows, i.e. what remains once one CV fold is held out.
        K_list = np.linspace(start=1,
                             stop=cov_X_train.shape[0] * (1-1/CV_FOLD),
                             num=K_LIST_SIZE).astype(int)
        knn_param = {'n_neighbors':K_list}

        clf_knn = GridSearchCV(KNeighborsClassifier(algorithm="brute"),
                               knn_param,
                               cv=CV_FOLD,
                               return_train_score=True)
        clf_knn.fit(cov_X_train, cov_Y_train)

        # stored as (train_acc, val_acc, test_acc): mean CV train / validation
        # score of the best candidate, then its score on the held-out test split
        store_acc_param(KNN,COV,part_itr,trial_itr,
                        clf_knn.best_params_,
                        clf_knn.cv_results_['mean_train_score'][clf_knn.best_index_],
                        clf_knn.cv_results_['mean_test_score'][clf_knn.best_index_],
                        clf_knn.score(cov_X_test,cov_Y_test))

In [66]:
# KNN on letter dataset: grid-search n_neighbors for every (partition, trial)
# split and record the best parameters and accuracies.
for part_itr in range(NUM_PARTITIONS):
    for trial_itr in range(NUM_TRIALS):
        # The split lists are stored trial-major (all partitions of trial 0,
        # then trial 1, ...), so the stride is the number of partitions.
        data_idx = trial_itr * NUM_PARTITIONS + part_itr

        # extract input data from input data table
        letter_X_train = letter_X_trains[data_idx]
        letter_X_test = letter_X_tests[data_idx]
        letter_Y_train = letter_Y_trains[data_idx]
        letter_Y_test = letter_Y_tests[data_idx]

        # K_list elements vary by train size so it needs to be built inside
        # the loop. The largest K is capped at the (1 - 1/CV_FOLD) share of
        # the training rows, i.e. what remains once one CV fold is held out.
        K_list = np.linspace(start=1,
                             stop=letter_X_train.shape[0] * (1-1/CV_FOLD),
                             num=K_LIST_SIZE).astype(int)
        knn_param = {'n_neighbors':K_list}

        clf_knn = GridSearchCV(KNeighborsClassifier(algorithm="brute"),
                               knn_param,
                               cv=CV_FOLD,
                               return_train_score=True)
        clf_knn.fit(letter_X_train, letter_Y_train)

        # stored as (train_acc, val_acc, test_acc): mean CV train / validation
        # score of the best candidate, then its score on the held-out test split
        store_acc_param(KNN,LETTER,part_itr,trial_itr,
                        clf_knn.best_params_,
                        clf_knn.cv_results_['mean_train_score'][clf_knn.best_index_],
                        clf_knn.cv_results_['mean_test_score'][clf_knn.best_index_],
                        clf_knn.score(letter_X_test,letter_Y_test))

### SVM

In [68]:
# parameters
# Candidate values of C for the linear SVM grid search.
C_list_svm = [
    1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1,
    1, 10, 100, 1000,
]

# NOTE(review): widths / gammas are not referenced by any SVM cell in this
# notebook; they look like leftovers from a kernel-SVM experiment. The gamma
# formula below is kept exactly as written -- confirm it is the intended
# width-to-gamma conversion before using it.
widths = np.array([0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2])
gammas = 1 / np.sqrt(2 * widths)

svm_params_linear = dict(C=C_list_svm)

In [27]:
# SVM on adult dataset: grid-search C for a linear SVM on every
# (partition, trial) split and record the best parameters and accuracies.
for part_itr in range(NUM_PARTITIONS):
    for trial_itr in range(NUM_TRIALS):
        # The split lists are stored trial-major (all partitions of trial 0,
        # then trial 1, ...), so the stride is the number of partitions.
        data_idx = trial_itr * NUM_PARTITIONS + part_itr

        # extract input data from input data table
        # Standardise with the mean / std of the TRAINING split only and apply
        # that same transform to the test split. Scaling the test split with
        # its own statistics would put the two splits in different feature
        # spaces. The frames are converted to plain float arrays first so the
        # scaler does not depend on the DataFrame column labels.
        scaler = preprocessing.StandardScaler().fit(
            np.asarray(adult_X_trains[data_idx], dtype=float))
        adult_X_train = scaler.transform(
            np.asarray(adult_X_trains[data_idx], dtype=float))
        adult_X_test = scaler.transform(
            np.asarray(adult_X_tests[data_idx], dtype=float))
        adult_Y_train = adult_Y_trains[data_idx]
        adult_Y_test = adult_Y_tests[data_idx]

        # cv is pinned to CV_FOLD so validation scores use the same number of
        # folds as the KNN runs, independent of scikit-learn's default.
        clf_svm = GridSearchCV(estimator = LinearSVC(),
                               param_grid=svm_params_linear,
                               cv=CV_FOLD,
                               return_train_score=True)
        clf_svm.fit(adult_X_train,adult_Y_train)

        # stored as (train_acc, val_acc, test_acc): mean CV train / validation
        # score of the best candidate, then its score on the held-out test split
        store_acc_param(SVM,ADULT,part_itr,trial_itr,
                        clf_svm.best_params_,
                        clf_svm.cv_results_['mean_train_score'][clf_svm.best_index_],
                        clf_svm.cv_results_['mean_test_score'][clf_svm.best_index_],
                        clf_svm.score(adult_X_test,adult_Y_test))



In [69]:
# SVM on cov dataset: grid-search C for a linear SVM on every
# (partition, trial) split and record the best parameters and accuracies.
for part_itr in range(NUM_PARTITIONS):
    for trial_itr in range(NUM_TRIALS):
        # The split lists are stored trial-major (all partitions of trial 0,
        # then trial 1, ...), so the stride is the number of partitions.
        data_idx = trial_itr * NUM_PARTITIONS + part_itr

        # extract input data from input data table
        # Standardise with the mean / std of the TRAINING split only and apply
        # that same transform to the test split. Scaling the test split with
        # its own statistics would put the two splits in different feature
        # spaces. The frames are converted to plain float arrays first so the
        # scaler does not depend on the DataFrame column labels.
        scaler = preprocessing.StandardScaler().fit(
            np.asarray(cov_X_trains[data_idx], dtype=float))
        cov_X_train = scaler.transform(
            np.asarray(cov_X_trains[data_idx], dtype=float))
        cov_X_test = scaler.transform(
            np.asarray(cov_X_tests[data_idx], dtype=float))
        cov_Y_train = cov_Y_trains[data_idx]
        cov_Y_test = cov_Y_tests[data_idx]

        # cv is pinned to CV_FOLD so validation scores use the same number of
        # folds as the KNN runs, independent of scikit-learn's default.
        clf_svm = GridSearchCV(estimator = LinearSVC(),
                               param_grid=svm_params_linear,
                               cv=CV_FOLD,
                               return_train_score=True)
        clf_svm.fit(cov_X_train,cov_Y_train)

        # stored as (train_acc, val_acc, test_acc): mean CV train / validation
        # score of the best candidate, then its score on the held-out test split
        store_acc_param(SVM,COV,part_itr,trial_itr,
                        clf_svm.best_params_,
                        clf_svm.cv_results_['mean_train_score'][clf_svm.best_index_],
                        clf_svm.cv_results_['mean_test_score'][clf_svm.best_index_],
                        clf_svm.score(cov_X_test,cov_Y_test))

In [70]:
# SVM on letter dataset: grid-search C for a linear SVM on every
# (partition, trial) split and record the best parameters and accuracies.
for part_itr in range(NUM_PARTITIONS):
    for trial_itr in range(NUM_TRIALS):
        # The split lists are stored trial-major (all partitions of trial 0,
        # then trial 1, ...), so the stride is the number of partitions.
        data_idx = trial_itr * NUM_PARTITIONS + part_itr

        # extract input data from input data table
        # Standardise with the mean / std of the TRAINING split only and apply
        # that same transform to the test split. Scaling the test split with
        # its own statistics would put the two splits in different feature
        # spaces. The frames are converted to plain float arrays first so the
        # scaler does not depend on the DataFrame column labels.
        scaler = preprocessing.StandardScaler().fit(
            np.asarray(letter_X_trains[data_idx], dtype=float))
        letter_X_train = scaler.transform(
            np.asarray(letter_X_trains[data_idx], dtype=float))
        letter_X_test = scaler.transform(
            np.asarray(letter_X_tests[data_idx], dtype=float))
        letter_Y_train = letter_Y_trains[data_idx]
        letter_Y_test = letter_Y_tests[data_idx]

        # cv is pinned to CV_FOLD so validation scores use the same number of
        # folds as the KNN runs, independent of scikit-learn's default.
        clf_svm = GridSearchCV(estimator = LinearSVC(),
                               param_grid=svm_params_linear,
                               cv=CV_FOLD,
                               return_train_score=True)
        clf_svm.fit(letter_X_train,letter_Y_train)

        # stored as (train_acc, val_acc, test_acc): mean CV train / validation
        # score of the best candidate, then its score on the held-out test split
        store_acc_param(SVM,LETTER,part_itr,trial_itr,
                        clf_svm.best_params_,
                        clf_svm.cv_results_['mean_train_score'][clf_svm.best_index_],
                        clf_svm.cv_results_['mean_test_score'][clf_svm.best_index_],
                        clf_svm.score(letter_X_test,letter_Y_test))

### LR

In [41]:
# parameters
# Candidate values of C for the logistic-regression grid search.
C_list_lr = [
    1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1,
    1, 10, 100, 1000, 10000,
]

lr_params = dict(C=C_list_lr)

In [83]:
# LR on adult dataset: grid-search C for logistic regression on every
# (partition, trial) split and record the best parameters and accuracies.
for part_itr in range(NUM_PARTITIONS):
    for trial_itr in range(NUM_TRIALS):
        # The split lists are stored trial-major (all partitions of trial 0,
        # then trial 1, ...), so the stride is the number of partitions.
        data_idx = trial_itr * NUM_PARTITIONS + part_itr

        # extract input data from input data table
        adult_X_train = adult_X_trains[data_idx]
        adult_X_test = adult_X_tests[data_idx]
        adult_Y_train = adult_Y_trains[data_idx]
        adult_Y_test = adult_Y_tests[data_idx]

        # cv is pinned to CV_FOLD so validation scores use the same number of
        # folds as the KNN runs, independent of scikit-learn's default.
        clf_lr = GridSearchCV(estimator = LogisticRegression(),
                              param_grid=lr_params,
                              cv=CV_FOLD,
                              return_train_score=True)
        clf_lr.fit(adult_X_train,adult_Y_train)

        # stored as (train_acc, val_acc, test_acc): mean CV train / validation
        # score of the best candidate, then its score on the held-out test split
        store_acc_param(LR,ADULT,part_itr,trial_itr,
                        clf_lr.best_params_,
                        clf_lr.cv_results_['mean_train_score'][clf_lr.best_index_],
                        clf_lr.cv_results_['mean_test_score'][clf_lr.best_index_],
                        clf_lr.score(adult_X_test,adult_Y_test))

In [117]:
# LR on cov dataset: grid-search C for logistic regression on every
# (partition, trial) split and record the best parameters and accuracies.
for part_itr in range(NUM_PARTITIONS):
    for trial_itr in range(NUM_TRIALS):
        # The split lists are stored trial-major (all partitions of trial 0,
        # then trial 1, ...), so the stride is the number of partitions.
        data_idx = trial_itr * NUM_PARTITIONS + part_itr

        # extract input data from input data table
        cov_X_train = cov_X_trains[data_idx]
        cov_X_test = cov_X_tests[data_idx]
        cov_Y_train = cov_Y_trains[data_idx]
        cov_Y_test = cov_Y_tests[data_idx]

        # cv is pinned to CV_FOLD so validation scores use the same number of
        # folds as the KNN runs, independent of scikit-learn's default.
        clf_lr = GridSearchCV(estimator = LogisticRegression(),
                              param_grid=lr_params,
                              cv=CV_FOLD,
                              return_train_score=True)
        clf_lr.fit(cov_X_train,cov_Y_train)

        # stored as (train_acc, val_acc, test_acc): mean CV train / validation
        # score of the best candidate, then its score on the held-out test split
        store_acc_param(LR,COV,part_itr,trial_itr,
                        clf_lr.best_params_,
                        clf_lr.cv_results_['mean_train_score'][clf_lr.best_index_],
                        clf_lr.cv_results_['mean_test_score'][clf_lr.best_index_],
                        clf_lr.score(cov_X_test,cov_Y_test))

In [118]:
# LR on letter dataset: grid-search C for logistic regression on every
# (partition, trial) split and record the best parameters and accuracies.
for part_itr in range(NUM_PARTITIONS):
    for trial_itr in range(NUM_TRIALS):
        # The split lists are stored trial-major (all partitions of trial 0,
        # then trial 1, ...), so the stride is the number of partitions.
        data_idx = trial_itr * NUM_PARTITIONS + part_itr

        # extract input data from input data table
        letter_X_train = letter_X_trains[data_idx]
        letter_X_test = letter_X_tests[data_idx]
        letter_Y_train = letter_Y_trains[data_idx]
        letter_Y_test = letter_Y_tests[data_idx]

        # cv is pinned to CV_FOLD so validation scores use the same number of
        # folds as the KNN runs, independent of scikit-learn's default.
        clf_lr = GridSearchCV(estimator = LogisticRegression(),
                              param_grid=lr_params,
                              cv=CV_FOLD,
                              return_train_score=True)
        clf_lr.fit(letter_X_train,letter_Y_train)

        # stored as (train_acc, val_acc, test_acc): mean CV train / validation
        # score of the best candidate, then its score on the held-out test split
        store_acc_param(LR,LETTER,part_itr,trial_itr,
                        clf_lr.best_params_,
                        clf_lr.cv_results_['mean_train_score'][clf_lr.best_index_],
                        clf_lr.cv_results_['mean_test_score'][clf_lr.best_index_],
                        clf_lr.score(letter_X_test,letter_Y_test))

### Random Forest

In [116]:
# parameters
# Random-forest grid: a single forest size, and several candidate values for
# max_features.
NUM_TREES = [200]
NUM_FEATURES = [1, 2, 4, 6, 8, 12, 16, 20]
rf_params = dict(n_estimators=NUM_TREES, max_features=NUM_FEATURES)

In [120]:
# RF on adult dataset: grid-search max_features for a random forest on every
# (partition, trial) split and record the best parameters and accuracies.
for part_itr in range(NUM_PARTITIONS):
    for trial_itr in range(NUM_TRIALS):
        # The split lists are stored trial-major (all partitions of trial 0,
        # then trial 1, ...), so the stride is the number of partitions.
        data_idx = trial_itr * NUM_PARTITIONS + part_itr

        # extract input data from input data table
        adult_X_train = adult_X_trains[data_idx]
        adult_X_test = adult_X_tests[data_idx]
        adult_Y_train = adult_Y_trains[data_idx]
        adult_Y_test = adult_Y_tests[data_idx]

        # cv is pinned to CV_FOLD so validation scores use the same number of
        # folds as the KNN runs, independent of scikit-learn's default.
        clf_rf = GridSearchCV(estimator = RandomForestClassifier(),
                              param_grid=rf_params,
                              cv=CV_FOLD,
                              return_train_score=True)
        clf_rf.fit(adult_X_train,adult_Y_train)

        # stored as (train_acc, val_acc, test_acc): mean CV train / validation
        # score of the best candidate, then its score on the held-out test split
        store_acc_param(RF,ADULT,part_itr,trial_itr,
                        clf_rf.best_params_,
                        clf_rf.cv_results_['mean_train_score'][clf_rf.best_index_],
                        clf_rf.cv_results_['mean_test_score'][clf_rf.best_index_],
                        clf_rf.score(adult_X_test,adult_Y_test))

In [122]:
# RF on cov dataset: grid-search max_features for a random forest on every
# (partition, trial) split and record the best parameters and accuracies.
for part_itr in range(NUM_PARTITIONS):
    for trial_itr in range(NUM_TRIALS):
        # The split lists are stored trial-major (all partitions of trial 0,
        # then trial 1, ...), so the stride is the number of partitions.
        data_idx = trial_itr * NUM_PARTITIONS + part_itr

        # extract input data from input data table
        cov_X_train = cov_X_trains[data_idx]
        cov_X_test = cov_X_tests[data_idx]
        cov_Y_train = cov_Y_trains[data_idx]
        cov_Y_test = cov_Y_tests[data_idx]

        # cv is pinned to CV_FOLD so validation scores use the same number of
        # folds as the KNN runs, independent of scikit-learn's default.
        clf_rf = GridSearchCV(estimator = RandomForestClassifier(),
                              param_grid=rf_params,
                              cv=CV_FOLD,
                              return_train_score=True)
        clf_rf.fit(cov_X_train,cov_Y_train)

        # stored as (train_acc, val_acc, test_acc): mean CV train / validation
        # score of the best candidate, then its score on the held-out test split
        store_acc_param(RF,COV,part_itr,trial_itr,
                        clf_rf.best_params_,
                        clf_rf.cv_results_['mean_train_score'][clf_rf.best_index_],
                        clf_rf.cv_results_['mean_test_score'][clf_rf.best_index_],
                        clf_rf.score(cov_X_test,cov_Y_test))

In [None]:
# RF on letter dataset: grid-search max_features for a random forest on every
# (partition, trial) split and record the best parameters and accuracies.
for part_itr in range(NUM_PARTITIONS):
    for trial_itr in range(NUM_TRIALS):
        # The split lists are stored trial-major (all partitions of trial 0,
        # then trial 1, ...), so the stride is the number of partitions.
        data_idx = trial_itr * NUM_PARTITIONS + part_itr

        # extract input data from input data table
        letter_X_train = letter_X_trains[data_idx]
        letter_X_test = letter_X_tests[data_idx]
        letter_Y_train = letter_Y_trains[data_idx]
        letter_Y_test = letter_Y_tests[data_idx]

        # This cell uses its own grid whose max_features candidates stop at 16
        # (the letter frame has 17 columns, one of which is the label).
        # cv is pinned to CV_FOLD so validation scores use the same number of
        # folds as the KNN runs, independent of scikit-learn's default.
        clf_rf = GridSearchCV(estimator = RandomForestClassifier(),
                              param_grid={
                                'n_estimators':NUM_TREES,
                                'max_features':[1,2,4,6,8,12,16]
                              },
                              cv=CV_FOLD,
                              return_train_score=True)
        clf_rf.fit(letter_X_train,letter_Y_train)

        # stored as (train_acc, val_acc, test_acc): mean CV train / validation
        # score of the best candidate, then its score on the held-out test split
        store_acc_param(RF,LETTER,part_itr,trial_itr,
                        clf_rf.best_params_,
                        clf_rf.cv_results_['mean_train_score'][clf_rf.best_index_],
                        clf_rf.cv_results_['mean_test_score'][clf_rf.best_index_],
                        clf_rf.score(letter_X_test,letter_Y_test))

## Save Outputs

In [121]:
# dump raw accuracies & best_params to NumPy .npy files (binary, not csv)
# NOTE(review): both tables are created with dtype=object, which np.save
# stores via pickle -- reading them back presumably needs
# np.load(..., allow_pickle=True); confirm against the loading code.
np.save('best_params_4.npy',BEST_PARAM_TABLE)
np.save('accuracies_4.npy',ACCU_TABLE)

## Evaluate Outputs & Results