<a href="https://colab.research.google.com/github/AndrewDavidRatnam/Support-Vector-Machines/blob/main/Support_Vector_Machines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Support Vector Machines on the Wine Dataset and California Housing Dataset

## SVM on the Wine Dataset
- Load the data
- Explore and understand the data
- Preprocess the data for SVM classifiers
- Compare results with SGD, Linear Regression?
- Conclusion


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load the Wine recognition dataset. as_frame=True returns pandas objects
# (wine.data as a DataFrame, wine.target as a Series) rather than NumPy arrays.
from sklearn.datasets import load_wine

wine = load_wine(as_frame=True)

In [None]:
# Print the description bundled with the dataset (attributes, classes, summary statistics).
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0.98  3.88    2.29  0.63
    Fl

In [None]:
# Quick sanity check: number of samples, the distinct target labels, and the class names.
len(wine.data), set(wine.target), wine.target_names

(178, {0, 1, 2}, array(['class_0', 'class_1', 'class_2'], dtype='<U7'))

In [None]:
# Hold out a test set. test_size is not passed, so scikit-learn's default split
# proportion applies; random_state=42 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(wine.data, wine.target, random_state=42)

In [None]:
# Peek at the first training rows. The non-sequential index shows the split was
# shuffled, and the columns sit on very different scales (e.g. hue vs. proline).
X_train.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
100,12.08,2.08,1.7,17.5,97.0,2.23,2.17,0.26,1.4,3.3,1.27,2.96,710.0
122,12.42,4.43,2.73,26.5,102.0,2.2,2.13,0.43,1.71,2.08,0.92,3.12,365.0
154,12.58,1.29,2.1,20.0,103.0,1.48,0.58,0.53,1.4,7.6,0.58,1.55,640.0
51,13.83,1.65,2.6,17.2,94.0,2.45,2.99,0.22,2.29,5.6,1.24,3.37,1265.0


In [None]:
# Matching targets: integer class ids that share X_train's index.
y_train.head()

2      0
100    1
122    1
154    2
51     0
Name: target, dtype: int64

In [None]:
# Linear SVM classifier. The features are standardized inside the same pipeline
# so that scaling is learned from the training data only.
lin_clf = make_pipeline(
    StandardScaler(),
    # Generous iteration cap (ten million) so the solver is not cut off early.
    LinearSVC(max_iter=10_000_000, random_state=42),
)
lin_clf.fit(X_train, y_train)


In [None]:
# Pipeline step 1 is the LinearSVC (step 0 is the scaler). n_iter_ reports how many
# iterations the solver ran, for comparison against the max_iter cap.
lin_clf[1].n_iter_

195

In [None]:
# Mean cross-validated score on the training set. Neither cv nor scoring is passed,
# so the default splitter and the estimator's default scorer are used.
cross_val_score(lin_clf, X_train, y_train).mean()

In [None]:
# Evaluate the linear SVM on the held-out test set.
# The targets are class ids, not quantities, so accuracy is used here: a squared
# error would wrongly treat confusing class 0 with class 2 as "worse" than
# confusing class 0 with class 1. Accuracy is also what the tuned SVC is scored
# with further down, which keeps the two models comparable.
from sklearn.metrics import accuracy_score

y_pred = lin_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9777777777777777

Using Kernelized SVM

In [None]:
# Kernelized SVM: SVC with its default kernel, behind the same scaling step.
svm_clf = make_pipeline(
    StandardScaler(),
    SVC(random_state=42),
)
# Mean cross-validated score on the training set (default cv and scorer).
cross_val_score(svm_clf, X_train, y_train).mean()

0.9698005698005698

In [None]:
# Hyperparameter tuning of the kernel SVM with a randomized search.
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform, uniform

# Keys follow the "<step>__<param>" convention; "svc" is the step name that
# make_pipeline derives from the SVC class.
param_distrib = {
    "svc__gamma": loguniform(0.001, 0.1),  # log-uniform between 0.001 and 0.1
    "svc__C": uniform(1, 10),  # scipy's uniform(loc, scale): samples from [1, 11]
}
# TODO: also search over the kernel (e.g. RBF and poly).
rnd_search = RandomizedSearchCV(
    svm_clf,
    param_distrib,
    n_iter=100,
    cv=5,
    random_state=42,
)
rnd_search.fit(X_train, y_train)

In [None]:
# Best sampled hyperparameters and their mean cross-validated score.
rnd_search.best_params_, rnd_search.best_score_

({'svc__C': 9.925589984899778, 'svc__gamma': 0.011986281799901188},
 0.9925925925925926)

In [None]:
# Score the tuned search object on the held-out test set.
rnd_search.score(X_test, y_test)

0.9777777777777777

## SVM on the California Housing Dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVR, SVR #doing regression lol
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
# Fallback if fetch_california_housing does not work: read the sample CSV instead
# (the path below is where it is found on Colab) and split features from the target.
# df = pd.read_csv("/content/sample_data/california_housing_train.csv")
# X_train, y_train = df.drop("median_house_value", axis=1), df["median_house_value"]

In [None]:
# Feature matrix and regression target from the fetched dataset.
X, y = housing.data, housing.target

In [None]:
# Keep 20% of the rows for testing, then fit a linear SVM regressor on
# standardized features.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
lin_svr = make_pipeline(
    StandardScaler(),
    LinearSVR(max_iter=5_000, random_state=42),
)
lin_svr.fit(X_train, y_train)

In [None]:
# Pipeline step 1 is the LinearSVR; check the iterations used against the max_iter=5_000 cap.
lin_svr[1].n_iter_

2519

In [None]:
from sklearn.metrics import mean_squared_error

# Error of the linear SVR on the training set itself (not the held-out test set).
y_pred = lin_svr.predict(X_train)
train_mse = mean_squared_error(y_train, y_pred)
# Shown as (MSE, RMSE). The target is in units of $100,000, so an RMSE of
# about 0.98 means a typical error of roughly $98,000.
train_mse, np.sqrt(train_mse)

(0.9595484665813285, 0.979565447829459)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# `uniform` (not `uniform_direction`) is the distribution used for C below;
# importing it here keeps this cell runnable on its own.
from scipy.stats import loguniform, uniform

# Kernel SVR behind the same standardization step as the linear model.
svm_reg = make_pipeline(StandardScaler(), SVR())

# Keys follow the "<step>__<param>" convention; "svr" is the step name that
# make_pipeline derives from the SVR class.
param_distrib = {
    "svr__gamma": loguniform(0.001, 0.1),
    "svr__C": uniform(1, 10),  # scipy's uniform(loc, scale): samples from [1, 11]
}
rnd_search_cv = RandomizedSearchCV(svm_reg, param_distrib, random_state=42, n_iter=100, cv=3)
# Search on the first 2,000 training rows only: kernel SVM training cost grows at
# least quadratically with the number of samples, so the full set would be slow.
rnd_search_cv.fit(X_train[:2000], y_train[:2000])

In [None]:
# Best sampled hyperparameters, their mean cross-validated score, and the refit pipeline.
rnd_search_cv.best_params_, rnd_search_cv.best_score_, rnd_search_cv.best_estimator_

({'svr__C': 4.63629602379294, 'svr__gamma': 0.08781408196485979},
 0.7572774474774365,
 Pipeline(steps=[('standardscaler', StandardScaler()),
                 ('svr', SVR(C=4.63629602379294, gamma=0.08781408196485979))]))

In [None]:
# Cross-validate the best pipeline on the full training set (the search itself only
# saw the first 2,000 rows). The scorer returns negated MSE, so the leading minus
# flips it back to a positive MSE per fold.
-cross_val_score(rnd_search_cv.best_estimator_, X_train, y_train,
                 cv=3, scoring="neg_mean_squared_error")

array([0.34992382, 0.32241241, 0.34923475])

In [None]:
# Final check: RMSE of the tuned SVR on the held-out test set.
y_pred = rnd_search_cv.best_estimator_.predict(X_test)
# Take the square root explicitly instead of passing squared=False: that keyword
# was deprecated and later removed from mean_squared_error, while np.sqrt works
# on every scikit-learn version and matches the training-error cell above.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

0.5854732265172238