In [1]:
#importing mnist dataset and randomizing samples before dividing in training and testing
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_mldata
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt


# import warnings
# warnings.filterwarnings('ignore')


try:
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784', version=1, cache=True)
except ImportError:
    from sklearn.datasets import fetch_mldata
    mnist = fetch_mldata('MNIST original')


X, y = mnist["data"], mnist["target"]

X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

np.random.seed(42)
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

In [56]:
#scaling the dataset and making 3 batches of training data and 2 validation sets. tr: half. rest half dividd in 2
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

m = X_train.shape[0]

int1, int2 = int( m/2), int(m/4)
X_tr_b1, X_tr_b2, X_tr_b3 = X_train_scaled[0:int1], X_train_scaled[int1:int1 + int2], X_train_scaled[int1 + int2:]

print("training shape:", X_tr_b1.shape)
print("validation sets shape:", X_tr_b2.shape)         

#taking small batches to test
x_tr = X_tr_b1[:6000]
y_tr = y_train[:6000]
print('small batch size ', x_tr.shape[0])

ValueError: could not convert string to float: '<1H OCEAN'

In [4]:
#testing different technques to see which is the best estimator with limited samples of 6000

#Using LinearSVC to fit and make predictions
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

svm_linear = LinearSVC(random_state = 42, C = 0.5)

svm_linear.fit(x_tr,y_tr)

y_pred = svm_linear.predict(x_tr)
tr_ac = accuracy_score(y_tr, y_pred)
print('LINEARSVC: Training acc: ', tr_ac)

y_predv = svm_linear.predict(X_tr_b2)
v_ac = accuracy_score(y_train[int1:int1+int2], y_predv)
print('LINEARSVC: Validation acc: ', v_ac)


#using poly kernel with degree 3
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

svm_poly_reg = SVC(kernel = 'poly', degree = 5,  C = 0.1, gamma = 'scale')
svm_poly_reg.fit(x_tr, y_tr)

y_pred = svm_poly_reg.predict(x_tr)
tr_ac = accuracy_score(y_tr, y_pred)
print('POLY KERNEL: Training acc: ', tr_ac)

y_predv = svm_poly_reg.predict(X_tr_b2)
v_ac = accuracy_score(y_train[int1:int1+int2], y_predv)
print('POLY KERNEL: Validation acc: ', v_ac)

#using rbf kernel
clf = SVC(C=1.0, kernel='rbf', gamma='scale', random_state=42, decision_function_shape='ovr')
clf.fit(x_tr, y_tr) 

y_pred = clf.predict(x_tr)
tr_ac = accuracy_score(y_tr, y_pred)
print('RBF KERNEL: Training acc: ', tr_ac)

v_sc = clf.predict(X_tr_b2)
v_ac = accuracy_score(y_train[int1 :int2 + int1], v_sc)
print('RBF KERNEL: Validation acc: ', v_ac)



LINEARSVC: Training acc:  0.9941666666666666
LINEARSVC: Validation acc:  0.8377333333333333
POLY KERNEL: Training acc:  0.243
POLY KERNEL: Validation acc:  0.20546666666666666
RBF KERNEL: Training acc:  0.9835
RBF KERNEL: Validation acc:  0.9304666666666667


In [3]:
#using rbf kernel with randomized search on limited dataset to get better hyperparams

from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, reciprocal, uniform

clf = SVC(C=0.1, kernel='rbf', random_state=42, decision_function_shape='ovr')

params = {
    'C': uniform(5 , 20),
     'gamma':   reciprocal(0.001, 0.1)# [0.0001, 0.001, 0.5]
}
 
newgridsearch = RandomizedSearchCV(clf, param_distributions = params, n_iter=10, cv = 3, scoring='accuracy')#, verbose = True)
print('done with search. now gonna fit')
r_grid_search = newgridsearch.fit(x_tr, y_tr)
print('adfdsf')
r_cv = r_grid_search.cv_results_

for mean_score, params in zip(r_cv["mean_test_score"], r_cv["params"]):
    print((mean_score), params)



done with search. now gonna fit
adfdsf
0.9336666666666666 {'C': 10.197786954824865, 'gamma': 0.001513186272679838}
0.20933333333333334 {'C': 9.111710799145637, 'gamma': 0.06719156480223124}
0.9325 {'C': 5.214011801409708, 'gamma': 0.001848939794318145}
0.18033333333333335 {'C': 20.160792702435387, 'gamma': 0.09114265653213442}
0.20766666666666667 {'C': 18.763705910048422, 'gamma': 0.06993087682473523}
0.7723333333333333 {'C': 22.10471513421927, 'gamma': 0.009118854871587641}
0.6356666666666667 {'C': 11.670464907157104, 'gamma': 0.018385511949522124}
0.9355 {'C': 15.671068219080436, 'gamma': 0.0011263118134606108}
0.678 {'C': 24.145943401650122, 'gamma': 0.01381957639278581}
0.37166666666666665 {'C': 22.489835833553386, 'gamma': 0.029461872248440116}


In [4]:
final_model = r_grid_search.best_estimator_
final_model

SVC(C=15.671068219080436, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0011263118134606108,
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)

In [5]:
from sklearn.metrics import accuracy_score
final_model.fit(X_train_scaled, y_train)
y_pred = final_model.predict(X_train_scaled)
tr_ac = accuracy_score(y_train, y_pred)
print('RBF KERNEL: Training acc: ', tr_ac)

RBF KERNEL: Training acc:  0.9994666666666666


In [6]:
#testing dataset
X_test_scaled = scaler.fit_transform(X_test)
y_pred = final_model.predict(X_test_scaled)
tr_ac = accuracy_score(y_test, y_pred)
print('RBF KERNEL: Testing acc: ', tr_ac)

RBF KERNEL: Testing acc:  0.9721


In [164]:
#------------------ Californiahousing
import os
import tarfile
from six.moves import urllib
import hashlib
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# import warnings
# warnings.filterwarnings('ignore')


  
HOUSING_PATH = "datasets/housing/housing.csv"

 
def load_housing_data(housing_path = HOUSING_PATH):
    return pd.read_csv(housing_path)

housing = load_housing_data()
mh = housing.copy()
y = (housing["median_house_value"]/100000)

mh.drop(['median_house_value', 'ocean_proximity'] ,axis = 1, inplace = True)
X = mh 


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [165]:


num_pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('imputing', SimpleImputer(strategy = 'median')),
    
])

X_train_scaled = num_pipe.fit_transform(X_train)
X_test_scaled = num_pipe.fit_transform(X_test)

In [166]:
#linearSVR
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error

reg_l = LinearSVR(random_state = 42)
reg_l.fit(X_train_scaled, y_train)
y_pred = reg_l.predict(X_train_scaled)
mse = mean_squared_error(y_train, y_pred)
print('Linear SVR MSE: ',mse)

#SVR
from sklearn.svm import SVR
reg_svr = SVR(kernel = 'linear', C = 1)
reg_svr.fit(X_train_scaled, y_train)
y_pred = reg_svr.predict(X_train_scaled)
mse = mean_squared_error(y_train, y_pred)
print('SVR(linear) MSE: ',mse)

#rbf kernel
from sklearn.svm import SVR
reg_svr = SVR( kernel = 'rbf', C = 1)
reg_svr.fit(X_train_scaled, y_train)
y_pred = reg_svr.predict(X_train_scaled)
mse = mean_squared_error(y_train, y_pred)
print('RBF SVR MSE: ',mse)



Linear SVR MSE:  0.49756610100542215
SVR(linear) MSE:  0.49712035321522663
RBF SVR MSE:  0.3165258088441752


In [167]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, reciprocal, uniform

params = {
    'C': uniform(15 , 20),
     'gamma':   reciprocal(0.0001, 0.01),# [0.0001, 0.001, 0.5]
    'epsilon' : [0.00001, 0.0001]
}
 
newgridsearch = RandomizedSearchCV(reg_svr, param_distributions = params, n_iter=20, cv = 3, scoring = "neg_mean_squared_error")#, verbose = True)
r_grid_search = newgridsearch.fit(X_train[:2000], y_train[:2000])

r_cv = r_grid_search.cv_results_

for mean_score, params in zip(r_cv["mean_test_score"], r_cv["params"]):
    print(-(mean_score), params)



1.2716675307031036 {'C': 29.812024447057, 'epsilon': 1e-05, 'gamma': 0.0008737339605848272}
1.3132813313391567 {'C': 22.79918377945657, 'epsilon': 0.0001, 'gamma': 0.005914672909572354}
1.4269802127023523 {'C': 33.21318317178179, 'epsilon': 1e-05, 'gamma': 0.000154240995784087}
1.302880282626853 {'C': 20.320468596023233, 'epsilon': 1e-05, 'gamma': 0.0023933765933444864}
1.3125333926264793 {'C': 28.833945975804188, 'epsilon': 0.0001, 'gamma': 0.005170349182943951}
1.3310344011242494 {'C': 25.44572996264208, 'epsilon': 1e-05, 'gamma': 0.00022305228196174462}
1.3212450683952635 {'C': 33.14785254498749, 'epsilon': 0.0001, 'gamma': 0.0002362960902142455}
1.3107599940669445 {'C': 16.49300766907983, 'epsilon': 1e-05, 'gamma': 0.004153998145767329}
1.3081567822838525 {'C': 30.176001672077557, 'epsilon': 1e-05, 'gamma': 0.003315316466483531}
1.3102413830290973 {'C': 28.922595311477643, 'epsilon': 1e-05, 'gamma': 0.003946957578035454}
1.2827266783350593 {'C': 15.358913572813027, 'epsilon': 1e-05

In [163]:
final_model = r_grid_search.best_estimator_
final_model.fit(X_train_scaled, y_train)
y_pred = final_model.predict(X_train_scaled)
mse = mean_squared_error(y_train, y_pred)
print('Training: RBF SVR MSE after randomizedsearch: ',mse)

y_pred = final_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print('Testing: RBF SVR MSE after randomizedsearch: ',mse)

SVR(C=25.2848007410704, cache_size=200, coef0=0.0, degree=3, epsilon=1e-05,
  gamma=0.00050990606601943, kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)



0.949968822217229

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])