In [1]:
import numpy as np
import pandas as pd
import  matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
import warnings

In [2]:
# WIne dataset
from sklearn.datasets import load_wine
wine = load_wine()
print(wine.data.shape)

(178, 13)


In [3]:
# USPS dataset
USPS_train = np.genfromtxt("zip.train")
USPS_test = np.genfromtxt("zip.test")
usps_data = np.concatenate((USPS_train, USPS_test), axis = 0)

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(wine.data, wine.target, random_state = 107)

In [5]:
usps_x = usps_data[:,:-1]
usps_y = usps_data[:,-1].astype(int)
usps_X_train, usps_X_test, usps_Y_train, usps_Y_test = train_test_split(usps_x, usps_y, random_state = 107)

In [6]:
warnings.filterwarnings('ignore')
svm = SVC()
score_wine = cross_val_score(svm, X_train, Y_train)
wine_train = np.mean(score_wine)
print('accuracy of wine:', wine_train)

accuracy of wine: 0.7293447293447294


In [7]:
from sklearn import preprocessing
svm =SVC()
encoder = preprocessing.LabelEncoder()
score_encoded = encoder.fit_transform(usps_Y_train)
score_usps = cross_val_score(svm, usps_X_train, usps_Y_train)
usps_train = np.mean(score_usps)
print('accuracy of usps:', usps_train)

accuracy of usps: 0.9905349603780668


In [8]:
svm.fit(X_train, Y_train)
test_error_wine = 1 - svm.score(X_test, Y_test)
print('Test error rate of wine:', test_error_wine)

Test error rate of wine: 0.37777777777777777


In [9]:
svm.fit(usps_X_train, usps_Y_train)
test_error_usps = 1 - svm.score(usps_X_test, usps_Y_test)
print('Test error rate of usps:', test_error_usps)

Test error rate of usps: 0.007311827956989259


## Observations.

* Both accuracy and test error rate for both datasets are similar.

In [10]:
from sklearn.model_selection import GridSearchCV
param_grid = {'svc__C': [0.01, 0.1, 1, 10, 100],'svc__gamma': [0.001, 0.01, 0.1, 1, 10, 100]}

In [11]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, Normalizer


In [12]:
# Creating pipelines for different Normalization
MMS = make_pipeline(MinMaxScaler(), SVC())
SS = make_pipeline(StandardScaler(), SVC())
RS = make_pipeline(RobustScaler(), SVC())
Norm = make_pipeline(Normalizer(), SVC())

In [13]:
# WIne dataset.
print('__Wine Dataset__')
mms_grid_wine = GridSearchCV(MMS, param_grid = param_grid)
mms_grid_wine.fit(X_train, Y_train)
print('accuracy of MinMaxScaler:', mms_grid_wine.best_score_)

ss_grid_wine = GridSearchCV(SS, param_grid = param_grid)
ss_grid_wine.fit(X_train, Y_train)
print('accuracy of StandardScaler:', ss_grid_wine.best_score_)

rs_grid_wine = GridSearchCV(RS, param_grid = param_grid)
rs_grid_wine.fit(X_train, Y_train)
print('accuracy of RobustScaler:', rs_grid_wine.best_score_)

n_grid_wine = GridSearchCV(Norm, param_grid = param_grid)
n_grid_wine.fit(X_train, Y_train)
print('accuracy of Normalizer:', n_grid_wine.best_score_)

__Wine Dataset__
accuracy of MinMaxScaler: 0.9925925925925926
accuracy of StandardScaler: 0.9925925925925926
accuracy of RobustScaler: 0.9925925925925926
accuracy of Normalizer: 0.9401709401709402


In [14]:
# usps dataset.
param_grid1 = {'svc__C': [0.01, 1, 10],'svc__gamma': [0.001, 0.01, 0.1]}
print('__USPS Dataset__')
mms_grid_usps = GridSearchCV(MMS, param_grid = param_grid1)
mms_grid_usps.fit(usps_X_train, usps_Y_train)
print('accuracy of MinMaxScaler:', mms_grid_usps.best_score_)

ss_grid_usps = GridSearchCV(SS, param_grid = param_grid1)
ss_grid_usps.fit(usps_X_train, usps_Y_train)
print('accuracy of StandardScaler:', ss_grid_usps.best_score_)

rs_grid_usps = GridSearchCV(RS, param_grid = param_grid1)
rs_grid_usps.fit(usps_X_train, usps_Y_train)
print('accuracy of RobustScaler:', rs_grid_usps.best_score_)

n_grid_usps = GridSearchCV(Norm, param_grid = param_grid1)
n_grid_usps.fit(usps_X_train, usps_Y_train)
print('accuracy of Normalizer:', n_grid_usps.best_score_)

__USPS Dataset__
accuracy of MinMaxScaler: 0.9939765405244184
accuracy of StandardScaler: 0.9967014804872907
accuracy of RobustScaler: 0.9886709554002561
accuracy of Normalizer: 0.9931160169286702


In [15]:
# WIne dataset.
print('__Wine Dataset__')
mms_predict_wine = mms_grid_wine.predict(X_test)
print('Test Score of MinMaxScaler:', np.mean(Y_test == mms_predict_wine))

ss_predict_wine = ss_grid_wine.predict(X_test)
print('Test Score of StandardScaler:', np.mean(Y_test == ss_predict_wine))

rs_predict_wine = rs_grid_wine.predict(X_test)
print('Test Score of RobustScaler:', np.mean(Y_test == rs_predict_wine))

n_predict_wine = n_grid_wine.predict(X_test)
print('Test Score of Normalizer:', np.mean(Y_test == n_predict_wine))

__Wine Dataset__
Test Score of MinMaxScaler: 0.9333333333333333
Test Score of StandardScaler: 0.9111111111111111
Test Score of RobustScaler: 0.9555555555555556
Test Score of Normalizer: 0.8888888888888888


In [16]:
# USPS Dataset.
print('__USPS Dataset__')
mms_predict_usps = mms_grid_usps.predict(usps_X_test)
print('Test Score of MinMaxScaler:', np.mean(usps_Y_test == mms_predict_usps))

ss_predict_usps = ss_grid_usps.predict(usps_X_test)
print('Test Score of StandardScaler:', np.mean(usps_Y_test == ss_predict_usps))

rs_predict_usps = rs_grid_usps.predict(usps_X_test)
print('Test Score of RobustScaler:', np.mean(usps_Y_test == rs_predict_usps))

n_predict_usps = n_grid_usps.predict(usps_X_test)
print('Test Score of Normalizer:', np.mean(usps_Y_test == n_predict_usps))

__USPS Dataset__
Test Score of MinMaxScaler: 0.995268817204301
Test Score of StandardScaler: 0.9965591397849463
Test Score of RobustScaler: 0.9879569892473118
Test Score of Normalizer: 0.9944086021505376


## Observations.

* For Wine Dataset.
    * MinMaxScaler and RobustScaler are better considering the change in performance across all the four types of Normalization
    
* For USPS Dataset.
    * MinMaxScaler and Normalizer are better considering the change in performance.

## MLP

In [17]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(random_state = 107)
scores_mlp_wine = cross_val_score(mlp, X_train,Y_train)
accuracy_mlp_wine = np.mean(scores_mlp_wine)

print('Initial accuracy with MLP:', accuracy_mlp_wine)
print('accuracy of wine:', wine_train)

Initial accuracy with MLP: 0.5116809116809116
accuracy of wine: 0.7293447293447294


In [18]:
mlp.fit(X_train, Y_train)
predict_mlp_wine = mlp.predict(X_test)
print('Test error rate with MLP', np.mean(Y_test != predict_mlp_wine))
print('Test error of wine:', test_error_wine)

Test error rate with MLP 0.5555555555555556
Test error of wine: 0.37777777777777777


In [19]:
# USPS Dataset.
mlp = MLPClassifier(random_state = 107)
scores_mlp_usps = cross_val_score(mlp, usps_X_train,usps_Y_train)
accuracy_mlp_usps = np.mean(scores_mlp_usps)

print('Initial accuracy with MLP:', accuracy_mlp_usps)
print('accuracy of usps:', usps_train)

Initial accuracy with MLP: 0.9922558018749068
accuracy of usps: 0.9905349603780668


In [20]:
mlp.fit(usps_X_train, usps_Y_train)
predict_mlp_usps = mlp.predict(usps_X_test)
print('Test error rate with MLP', np.mean(usps_Y_test != predict_mlp_usps))
print('Test error of usps:', test_error_usps)

Test error rate with MLP 0.005161290322580645
Test error of usps: 0.007311827956989259


In [21]:
# WIne dataset.
param_grid = {'mlpclassifier__solver': ['lbfgs','adam'],
              'mlpclassifier__hidden_layer_sizes': [[10],[10,10],[10,10,10]],
              'mlpclassifier__activation':['relu']}

MMS_wine = make_pipeline(MinMaxScaler(), mlp)
grid = GridSearchCV(MMS_wine, param_grid = param_grid)
grid.fit(X_train, Y_train)
predict_MMS_wine = grid.predict(X_test)
print('Test Error by (MinMaxScaler) Classifier on Wine dataset', np.mean(Y_test != predict_MMS_wine))
print('Test Error by MinMaxScaler on Wine dataset',np.mean(Y_test != MMS_wine))

Norm_wine = make_pipeline(Normalizer(), mlp)
grid_norm = GridSearchCV(Norm_wine, param_grid = param_grid)
grid_norm.fit(X_train, Y_train)
predict_Norm_wine = grid_norm.predict(X_test)
print('Test error by (Normalizer) classifier on wine dataset', np.mean(Y_test != predict_Norm_wine))
print('Test error by Normalizer on wine dataset', np.mean(Y_test != Norm_wine))

Test Error by (MinMaxScaler) Classifier on Wine dataset 0.15555555555555556
Test Error by MinMaxScaler on Wine dataset 1.0
Test error by (Normalizer) classifier on wine dataset 0.06666666666666667
Test error by Normalizer on wine dataset 1.0


In [22]:
# USPS Dataset.
param_grid = {'mlpclassifier__solver': ['lbfgs','adam'],
              'mlpclassifier__hidden_layer_sizes': [[10],[10,10],[10,10,10]],
              'mlpclassifier__activation':['relu']}

MMS_usps = make_pipeline(MinMaxScaler(), mlp)
usps_grid = GridSearchCV(MMS_usps, param_grid = param_grid)
usps_grid.fit(usps_X_train, usps_Y_train)
predict_MMS_usps = usps_grid.predict(usps_X_test)
print('Test Error by (MinMaxScaler) Classifier on USPS dataset', np.mean(usps_Y_test != predict_MMS_usps))
print('Test Error by MinMaxScaler on USPS dataset',np.mean(usps_Y_test != MMS_usps))

Norm_usps = make_pipeline(Normalizer(), mlp)
usps_grid_norm = GridSearchCV(Norm_usps, param_grid = param_grid)
usps_grid_norm.fit(usps_X_train, usps_Y_train)
predict_Norm_usps = usps_grid_norm.predict(usps_X_test)
print('Test error by (Normalizer) classifier on USPS dataset', np.mean(usps_Y_test != predict_Norm_usps))
print('Test error by Normalizer on USPS dataset', np.mean(usps_Y_test != Norm_usps))

Test Error by (MinMaxScaler) Classifier on USPS dataset 0.005591397849462366
Test Error by MinMaxScaler on USPS dataset 1.0
Test error by (Normalizer) classifier on USPS dataset 0.004731182795698925
Test error by Normalizer on USPS dataset 1.0


## Observations.

* For wine dataset.

In [23]:
print('Initial accuracy with MLP:', accuracy_mlp_wine)
print('accuracy of wine:', wine_train)
print('Test error rate with MLP', np.mean(Y_test != predict_mlp_wine))
print('Test error of wine:', test_error_wine)
print('Test Error by (MinMaxScaler) Classifier on Wine dataset', np.mean(Y_test != predict_MMS_wine))
print('Test Error by MinMaxScaler on Wine dataset',np.mean(Y_test != MMS_wine))
print('Test error by (Normalizer) classifier on wine dataset', np.mean(Y_test != predict_Norm_wine))
print('Test error by Normalizer on wine dataset', np.mean(Y_test != Norm_wine))

Initial accuracy with MLP: 0.5116809116809116
accuracy of wine: 0.7293447293447294
Test error rate with MLP 0.5555555555555556
Test error of wine: 0.37777777777777777
Test Error by (MinMaxScaler) Classifier on Wine dataset 0.15555555555555556
Test Error by MinMaxScaler on Wine dataset 1.0
Test error by (Normalizer) classifier on wine dataset 0.06666666666666667
Test error by Normalizer on wine dataset 1.0


* For USPS dataset.

In [24]:
print('Initial accuracy with MLP:', accuracy_mlp_usps)
print('accuracy of usps:', usps_train)
print('Test error rate with MLP', np.mean(usps_Y_test != predict_mlp_usps))
print('Test error of usps:', test_error_usps)
print('Test Error by (MinMaxScaler) Classifier on USPS dataset', np.mean(usps_Y_test != predict_MMS_usps))
print('Test Error by MinMaxScaler on USPS dataset',np.mean(usps_Y_test != MMS_usps))
print('Test error by (Normalizer) classifier on USPS dataset', np.mean(usps_Y_test != predict_Norm_usps))
print('Test error by Normalizer on USPS dataset', np.mean(usps_Y_test != Norm_usps))

Initial accuracy with MLP: 0.9922558018749068
accuracy of usps: 0.9905349603780668
Test error rate with MLP 0.005161290322580645
Test error of usps: 0.007311827956989259
Test Error by (MinMaxScaler) Classifier on USPS dataset 0.005591397849462366
Test Error by MinMaxScaler on USPS dataset 1.0
Test error by (Normalizer) classifier on USPS dataset 0.004731182795698925
Test error by Normalizer on USPS dataset 1.0
