##### Preparing the Datasets

In [1]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

# Load the Wine dataset and hold out a test split.
# The fixed random_state keeps the split identical across re-runs.
wine = load_wine()
(
    wine_X_train,
    wine_X_test,
    wine_y_train,
    wine_y_test,
) = train_test_split(
    wine.data,
    wine.target,
    random_state=309,
)

In [2]:
import numpy as np

# Parse each zip file exactly once. Column 0 is the digit label and columns
# 1-256 are the features; `usecols` limits parsing to those 257 columns, the
# same columns the separate feature/label reads selected before.
zip_test = np.genfromtxt('zip.test', delimiter=' ', usecols=np.arange(0, 257))
zip_train = np.genfromtxt('zip.train', delimiter=' ', usecols=np.arange(0, 257))

# Split features from labels. Labels are parsed as floats and then cast to int
# for both files, so the two files go through the same conversion path.
zip_test_X = zip_test[:, 1:]
zip_test_Y = zip_test[:, 0].astype(int)
zip_train_X = zip_train[:, 1:]
zip_train_Y = zip_train[:, 0].astype(int)

# Pool the two files and draw a fresh train/test split with a fixed seed
# (test rows first, then train rows, matching the original concatenation order).
zip_X = np.concatenate([zip_test_X, zip_train_X])
zip_Y = np.concatenate([zip_test_Y, zip_train_Y])
zip_X_train, zip_X_test, zip_y_train, zip_y_test = train_test_split(zip_X, zip_Y, random_state=309)


In [3]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Baseline: SVC with default hyperparameters, fitted on the Wine training split.
# `svm` stays bound to this fitted model for the test-set scoring that follows.
svm = SVC()
svm.fit(wine_X_train, wine_y_train)

# Average the per-fold scores from cross_val_score (default folds and scoring)
# on the training split.
wine_cv_scores = cross_val_score(svm, wine_X_train, wine_y_train)
generalization_accuracy = wine_cv_scores.mean()
print("Generalization Accuracy for the Wine Dataset: ", generalization_accuracy)

Generalization Accuracy for the Wine Dataset:  0.6692307692307693


In [4]:
# Test error rate = 1 - score on the held-out Wine split.
# Relies on `svm` still being the model fitted on the Wine training data.
wine_test_accuracy = svm.score(wine_X_test, wine_y_test)
test_error_rate = 1 - wine_test_accuracy
print("Test Error Rate for Wine Dataset: ", test_error_rate)

Test Error Rate for Wine Dataset:  0.37777777777777777


In [5]:
# Baseline: SVC with default hyperparameters, fitted on the Zip training split.
# This rebinds `svm`, so from here on it refers to the Zip model.
svm = SVC()
svm.fit(zip_X_train, zip_y_train)

# Average the per-fold scores from cross_val_score (default folds and scoring)
# on the training split.
zip_cv_scores = cross_val_score(svm, zip_X_train, zip_y_train)
generalization_accuracy = zip_cv_scores.mean()
print("Generalization Accuracy for the Zip Dataset: ", generalization_accuracy)


Generalization Accuracy for the Zip Dataset:  0.9710318158210045


In [6]:
# Test error rate = 1 - score on the held-out Zip split.
# Relies on `svm` being the model fitted on the Zip training data.
zip_test_accuracy = svm.score(zip_X_test, zip_y_test)
test_error_rate = 1 - zip_test_accuracy
print("Test Error Rate for Zip Dataset: ", test_error_rate)

Test Error Rate for Zip Dataset:  0.03354838709677421


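4. For the Zip dataset, the test error rate (≈ 0.034) is very close to the complement of the cross-validated generalization accuracy (1 − 0.971 ≈ 0.029), so cross-validation on the training set gave a good estimate of performance on unseen data. For the Wine dataset, the test error rate (≈ 0.378) is about 0.047, or roughly 5 percentage points, higher than the complement of the generalization accuracy (1 − 0.669 ≈ 0.331). The Wine test split is small (an error rate of 0.378 corresponds to 17 of 45 samples), so a gap of this size is more likely there than on the much larger Zip split. (NEEDS WORK)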

In [15]:
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import GridSearchCV

# One pipeline per preprocessing strategy, each feeding a default SVC.
scalers = [Normalizer(), StandardScaler(), MinMaxScaler(), RobustScaler()]
pipes = [make_pipeline(scaler, SVC()) for scaler in scalers]

# Hyperparameter grid for the SVC step; the 'svc__' prefix targets the
# pipeline step that make_pipeline names after the SVC class.
param_grid = {'svc__C': [0.01, 0.1, 1, 10, 100],
              'svc__gamma': [0.001, 0.01, 0.1, 1, 10, 100]}

# Tune each pipeline on the Wine training split with 5-fold CV, then report
# its cross-validation accuracy and held-out test performance.
for pipe in pipes:
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
    grid.fit(wine_X_train, wine_y_train)

    # Score the test split once and reuse it for both the accuracy and the
    # error rate, instead of running prediction twice.
    test_score = grid.score(wine_X_test, wine_y_test)

    print("--------------- Results for " + str(pipe.steps[0][1]) + " ---------------")
    print("Best cross-validation accuracy:", grid.best_score_)
    print("Test set score:", test_score)
    print("Test error rate:", 1 - test_score)
    print("Best Parameters:", grid.best_params_)

--------------- Results for Normalizer() ---------------
Best cross-validation accuracy: 0.9091168091168091
Test set score: 0.8888888888888888
Test error rate: 0.11111111111111116
Best Parameters: {'svc__C': 100, 'svc__gamma': 10}
--------------- Results for StandardScaler() ---------------
Best cross-validation accuracy: 0.9851851851851852
Test set score: 1.0
Test error rate: 0.0
Best Parameters: {'svc__C': 1, 'svc__gamma': 0.01}
--------------- Results for MinMaxScaler() ---------------
Best cross-validation accuracy: 0.9851851851851852
Test set score: 0.9777777777777777
Test error rate: 0.022222222222222254
Best Parameters: {'svc__C': 10, 'svc__gamma': 0.1}
--------------- Results for RobustScaler() ---------------
Best cross-validation accuracy: 0.9851851851851852
Test set score: 0.9777777777777777
Test error rate: 0.022222222222222254
Best Parameters: {'svc__C': 10, 'svc__gamma': 0.01}


In [16]:
# Repeat the scaler comparison on the Zip data, reusing `pipes` and
# `param_grid` from the Wine grid-search cell. n_jobs=-1 is passed so the
# search can run in parallel.
for pipe in pipes:
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
    grid.fit(zip_X_train, zip_y_train)

    # Score the test split once and reuse it for both the accuracy and the
    # error rate, instead of running prediction twice.
    test_score = grid.score(zip_X_test, zip_y_test)

    print("--------------- Results for " + str(pipe.steps[0][1]) + " ---------------")
    print("Best cross-validation accuracy:", grid.best_score_)
    print("Test set score:", test_score)
    print("Test error rate:", 1 - test_score)
    print("Best Parameters:", grid.best_params_)

--------------- Results for Normalizer() ---------------
Best cross-validation accuracy: 0.9731830733867112
Test set score: 0.9724731182795698
Test error rate: 0.027526881720430163
Best Parameters: {'svc__C': 100, 'svc__gamma': 1}
--------------- Results for StandardScaler() ---------------
Best cross-validation accuracy: 0.9664426651856651
Test set score: 0.9625806451612903
Test error rate: 0.03741935483870973
Best Parameters: {'svc__C': 10, 'svc__gamma': 0.001}
--------------- Results for MinMaxScaler() ---------------
Best cross-validation accuracy: 0.9690239274309252
Test set score: 0.9673118279569892
Test error rate: 0.03268817204301078
Best Parameters: {'svc__C': 10, 'svc__gamma': 0.01}
--------------- Results for RobustScaler() ---------------
Best cross-validation accuracy: 0.9209759182980823
Test set score: 0.9359139784946237
Test error rate: 0.06408602150537634
Best Parameters: {'svc__C': 100, 'svc__gamma': 0.001}


In [None]:
def cross_conformal_predictor(predictor = SVC(), k_folds:int = 5):
    """Placeholder for a cross-conformal predictor; not implemented yet.

    The body is currently just ``pass``, so calling this function returns
    ``None`` and neither argument is used.

    Args:
        predictor: Defaults to an ``SVC()`` instance. Presumably the
            underlying estimator to wrap -- TODO confirm once implemented.
        k_folds: Defaults to 5. Presumably the number of folds used to build
            the conformal predictor -- TODO confirm once implemented.

    NOTE(review): the default ``SVC()`` is created once, when this ``def`` is
    executed, so every call that omits ``predictor`` receives the same
    instance. Keep that in mind if the implementation fits it in place.
    """
    pass