# **SVM - Support Vector Machine (ML-CUP22)**

In [None]:
# Install packages
!pip install keras_tuner

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import model_selection
import tensorflow as tf

# SVM
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor

# Keras tuner
import keras_tuner

# Cross validation
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, log_loss

In [None]:
# Mount google drive to access data loaded on Drive
# NOTE: `google.colab` is imported here rather than in the import cell above;
# presumably this cell only works inside a Google Colab runtime.
# force_remount=True presumably redoes the mount even if one is already present — TODO confirm against the Colab docs
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

**Definition of functions**

In [None]:
## Definition of Mean Euclidean Error (MEE): metric used for performance evaluation of the model
def MEE(y_true, y_pred):
  """Return the Mean Euclidean Error between targets and predictions.

  For every sample (row) the Euclidean norm of `y_true - y_pred` is computed,
  and the norms are averaged over all samples.

  Args:
    y_true: array-like of shape (n_samples, n_outputs) or (n_samples,),
      e.g. a NumPy array or a pandas DataFrame/Series.
    y_pred: array-like with the same shape as `y_true`.

  Returns:
    The mean of the per-sample Euclidean distances (NumPy float).

  Raises:
    ValueError: if the two inputs do not have the same shape.
  """
  # Work on plain arrays: subtracting two pandas objects aligns them on their
  # index/column labels, which yields NaN when the labels differ.
  true_arr = np.asarray(y_true, dtype=float)
  pred_arr = np.asarray(y_pred, dtype=float)

  # Refuse silent broadcasting, e.g. (n, 1) against (n,) would become (n, n)
  if true_arr.shape != pred_arr.shape:
    raise ValueError(
        f'MEE: shape mismatch between y_true {true_arr.shape} and y_pred {pred_arr.shape}')

  # A 1-D input is treated as a single-output problem (one column)
  if true_arr.ndim == 1:
    true_arr = true_arr.reshape(-1, 1)
    pred_arr = pred_arr.reshape(-1, 1)

  eucl_norm = np.linalg.norm(true_arr - pred_arr, ord=2, axis=1)
  return np.mean(eucl_norm)

---
## **Data Preparation**

In [None]:
# Read the ML-CUP training dataset from Drive
path = '/content/drive/MyDrive/data/Data_CUP/ML-CUP22-TR.csv'
# Column labels: a1 ... a9 followed by 'x' and 'y'
col_names = [f'a{i}' for i in range(1, 10)] + ['x', 'y']

# '#' is passed as the comment character, so text after it is not parsed
data = pd.read_csv(filepath_or_buffer=path, names=col_names, comment='#')
data.shape

In [None]:
# Show the first 5 rows of the loaded data
data.head(n=5)

In [None]:
# Per column: does it contain at least one missing (NaN) value?
data.isna().any()

In [None]:
# Hold out 15% of the data as the test set; the remaining 85% is the design set
design, test = train_test_split(data, test_size=0.15, shuffle=True, random_state=42)

# Split the design set again into train (80%) and validation (20%)
train, val = train_test_split(design, test_size=0.20, shuffle=True, random_state=42)

# Report the size of every split
for split_name, split_frame in (('design', design), ('test', test), ('train', train), ('val', val)):
  print(f'{split_name} shape: {split_frame.shape}')

In [None]:
# Descriptive statistics on design data
# (computed on the design set only, so the held-out test rows are not part of the summary)
design.describe()

In [None]:
def _features_and_targets(frame):
  """Return (all columns but the last two, the 'x' and 'y' columns) of `frame`."""
  return frame.iloc[:, :-2], frame.loc[:, ['x', 'y']]

# Input / target separation for every split
X_design, y_design = _features_and_targets(design)
X_train, y_train = _features_and_targets(train)
X_val, y_val = _features_and_targets(val)
X_test, y_test = _features_and_targets(test)

# Shape report: one two-line paragraph per split, paragraphs separated by a blank line
_shape_report = [
    f'X_{tag} shape: {features.shape}\ny_{tag} shape: {targets.shape}'
    for tag, features, targets in (
        ('design', X_design, y_design),
        ('train', X_train, y_train),
        ('val', X_val, y_val),
        ('test', X_test, y_test),
    )
]
print('\n\n'.join(_shape_report))

---
## **Preliminary Experimental Phase**

In [None]:
# Baseline model: an RBF-kernel SVR wrapped in MultiOutputRegressor to handle the two targets
svr_baseline = SVR(kernel='rbf', C=4, epsilon=0.001, gamma='scale')
model = MultiOutputRegressor(svr_baseline)

In [None]:
# Fit the model
# Trained on the training split only; the validation split is used afterwards for the MEE check
model.fit(X_train, y_train)

In [None]:
# Predict the validation data
# y_pred holds the predictions of the fitted model for the rows of X_val
y_pred = model.predict(X_val)

In [None]:
# Compute MEE on the validation set
# Arguments follow the MEE(y_true, y_pred) signature: ground truth first, predictions second
mee = MEE(y_val, y_pred)
print(mee)

---
## **GridSearch**

### **Coarse-grained GridSearch (linear & rbf kernel functions)**

In [None]:
# Search space of the coarse-grained grid search (linear & rbf kernels)
def build_model(hp):
  """Build the SVR model for one trial of the coarse linear/rbf grid search.

  Args:
    hp: hyperparameter object supplied by Keras Tuner; every `hp.Choice`
      call declares one dimension of the grid and returns the value to use
      for the current trial.

  Returns:
    A `MultiOutputRegressor` wrapping an `SVR` built with the selected
    kernel, C, epsilon and gamma.

  Note: `build_model` is defined again further down in this notebook, so this
  cell must have been the last `build_model` cell run before `tuner1` is created.
  """
  svr_params = {
      'kernel': hp.Choice('kernel', ['linear', 'rbf']),
      'C': hp.Choice('C', [1e-3, 1e-2, 1e-1, 1.0, 10.0, 100.0]),
      'epsilon': hp.Choice('epsilon', [1e-2, 1e-1, 1.0, 10.0]),
      'gamma': hp.Choice('gamma', ['auto', 'scale']),
  }
  return MultiOutputRegressor(SVR(**svr_params))

In [None]:
# Define Keras Tuner for the coarse-grained linear/rbf grid search
tuner1 = keras_tuner.tuners.SklearnTuner(
    # Define hyperparameter optimization algorithm
    oracle=keras_tuner.oracles.GridSearchOracle(
        # The scorer below returns the raw MSE, hence the objective is minimised
        objective = keras_tuner.Objective('score', 'min'),
        max_consecutive_failed_trials=1),
    # Evaluate on MSE
    scoring=metrics.make_scorer(metrics.mean_squared_error),
    # Function that builds the model (and declares the grid) for each trial
    hypermodel = build_model,
    # Define KFold with k=5
    cv=model_selection.KFold(5),
    # Dedicated project name: without it every tuner in this notebook would
    # share KerasTuner's default project folder, and a tuner created later
    # would reload the saved state of an earlier search instead of running its own.
    project_name='svm_coarse_linear_rbf',
    )

In [None]:
%%time
# Search for the best hyperparameter configuration
# Runs the search of tuner1 on the design set; the %%time cell magic reports how long the cell took
tuner1.search(X_design, y_design)

In [None]:
# Summary results
# Shows the results summary of the coarse linear/rbf search (tuner1)
tuner1.results_summary()

### **Coarse-grained GridSearch (poly)**

In [None]:
# Search space of the coarse-grained grid search (polynomial kernel)
def build_model(hp):
  """Build the SVR model for one trial of the coarse polynomial-kernel grid search.

  Args:
    hp: hyperparameter object supplied by Keras Tuner; `hp.Fixed` pins the
      kernel to 'poly', every `hp.Choice` call declares one dimension of the
      grid and returns the value to use for the current trial.

  Returns:
    A `MultiOutputRegressor` wrapping an `SVR` built with the poly kernel and
    the selected degree, C, epsilon and gamma.

  Note: `build_model` is also defined in other cells of this notebook, so this
  cell must have been the last `build_model` cell run before `tuner2` is created.
  """
  svr_params = {
      'kernel': hp.Fixed('kernel','poly'),
      'degree': hp.Choice('degree', [2, 3, 4]),
      'C': hp.Choice('C', [1e-3, 1e-2, 1e-1, 1.0, 10.0, 100.0]),
      'epsilon': hp.Choice('epsilon', [1e-2, 1e-1, 1.0, 10.0]),
      'gamma': hp.Choice('gamma', ['auto', 'scale']),
  }
  return MultiOutputRegressor(SVR(**svr_params))

In [None]:
# Define Keras Tuner for the coarse-grained polynomial-kernel grid search
tuner2 = keras_tuner.tuners.SklearnTuner(
    # Define hyperparameter optimization algorithm
    oracle=keras_tuner.oracles.GridSearchOracle(
        # The scorer below returns the raw MSE, hence the objective is minimised
        objective = keras_tuner.Objective('score', 'min'),
        max_consecutive_failed_trials=1),
    # Evaluate on MSE
    scoring=metrics.make_scorer(metrics.mean_squared_error),
    # Function that builds the model (and declares the grid) for each trial
    hypermodel = build_model,
    # Define KFold with k=5
    cv=model_selection.KFold(5),
    # Dedicated project name: without it this tuner would use KerasTuner's
    # default project folder and reload the saved state of a search that
    # was run earlier from the same working directory.
    project_name='svm_coarse_poly',
    )

In [None]:
%%time
# Search for the best hyperparameter configuration
# Runs the search of tuner2 on the design set; the %%time cell magic reports how long the cell took
tuner2.search(X_design, y_design)

In [None]:
# Summary results
# Shows the results summary of the coarse polynomial-kernel search (tuner2)
tuner2.results_summary()

### **Fine-grained GridSearch**

In [None]:
# Search space of the fine-grained grid search (rbf kernel only)
def build_model(hp):
  """Build the SVR model for one trial of the fine-grained rbf grid search.

  Args:
    hp: hyperparameter object supplied by Keras Tuner; `hp.Int` declares C
      (1 to 10, step 1), `hp.Float` declares epsilon (0.01 to 1, step 0.01)
      and `hp.Choice` declares gamma. Each call returns the value to use for
      the current trial.

  Returns:
    A `MultiOutputRegressor` wrapping an rbf-kernel `SVR` built with the
    selected C, epsilon and gamma.

  Note: `build_model` is also defined in earlier cells of this notebook, so
  this cell must have been the last `build_model` cell run before `tuner_fine`
  is created.
  """
  svr_params = {
      'kernel': 'rbf',
      'C': hp.Int('C', min_value=1, max_value=10, step=1),
      'epsilon': hp.Float('epsilon', min_value=0.01, max_value=1, step=0.01),
      'gamma': hp.Choice('gamma', ['auto', 'scale']),
  }
  return MultiOutputRegressor(SVR(**svr_params))

In [None]:
# Define Keras Tuner for the fine-grained rbf grid search
tuner_fine = keras_tuner.tuners.SklearnTuner(
    # Define hyperparameter optimization algorithm
    oracle=keras_tuner.oracles.GridSearchOracle(
        # The scorer below returns the raw MSE, hence the objective is minimised
        objective = keras_tuner.Objective('score', 'min'),
        max_consecutive_failed_trials=1),
    # Evaluate on MSE
    scoring=metrics.make_scorer(metrics.mean_squared_error),
    # Function that builds the model (and declares the grid) for each trial
    hypermodel = build_model,
    # Define KFold with k=5
    cv=model_selection.KFold(5),
    # Dedicated project name: without it this tuner would use KerasTuner's
    # default project folder and reload the saved state of a search that
    # was run earlier from the same working directory.
    project_name='svm_fine_rbf',
    )

In [None]:
%%time
# Search for the best hyperparameter configuration
# Runs the search of tuner_fine on the design set; the %%time cell magic reports how long the cell took
tuner_fine.search(X_design, y_design)

In [None]:
# Summary results
# Shows the results summary of the fine-grained rbf search (tuner_fine)
tuner_fine.results_summary()

----
## **K-Fold Cross Validation best model**

In [None]:
# Best model: rbf-kernel SVR with the chosen hyperparameters, wrapped for the two targets
best_svr = SVR(kernel='rbf', C=4, epsilon=0.22, gamma='auto')
best_model = MultiOutputRegressor(best_svr)

In [None]:
# K-fold Cross Validation (k=5) of the best model on the design set, scored with MEE
mee_scorer = metrics.make_scorer(MEE)
MEE_score = cross_val_score(estimator=best_model, X=X_design, y=y_design, scoring=mee_scorer, cv=5)

In [None]:
# Mean and standard deviation of the MEE over the cross-validation folds
mee_cv_mean = np.mean(MEE_score)
mee_cv_std = np.std(MEE_score)
print("Mean MEE: ", mee_cv_mean)
print("Standard Deviation MEE: ", mee_cv_std)

Mean MEE:  1.4460880227324244
Standard Deviation MEE:  0.06062418254977978


---
## **Model Assessment**

In [None]:
# Fit the model on design set
# Refits best_model on the whole design set (the train and validation rows together)
best_model.fit(X_design, y_design)

In [None]:
# Predict the design data
y_pred_design = best_model.predict(X_design)

# Compute MeanEuclideanError on design set
# Arguments follow the MEE(y_true, y_pred) signature: ground truth first, predictions second
mee_design = MEE(y_design, y_pred_design)
print(mee_design)

In [None]:
# Predict the test data
y_pred_TS = best_model.predict(X_test)

# Compute MeanEuclideanError on test set
# Arguments follow the MEE(y_true, y_pred) signature: ground truth first, predictions second
mee_TS = MEE(y_test, y_pred_TS)
print(mee_TS)

## **Prediction Plots**

In [None]:
# y prediction
plt.figure(figsize=(8, 6))
sns.set_theme(style="darkgrid")
ax = plt.gca()
# Black points: true targets; green points: predicted y (column 1) against the true x
sns.scatterplot(x=y_test['x'], y=y_test['y'], color='k', label='y_true', ax=ax)
sns.scatterplot(x=y_test['x'], y=y_pred_TS[:, 1], color='g', label='y_pred', ax=ax)
ax.set_title('Support Vector y prediction', fontsize=14)
ax.set_xlabel('x', fontsize=14)
ax.set_ylabel('y', fontsize=14)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
ax.legend()
plt.show()

In [None]:
# x prediction
plt.figure(figsize=(8, 6))
sns.set_theme(style="darkgrid")
ax = plt.gca()
# Black points: true targets; green points: predicted x (column 0) against the true y
sns.scatterplot(x=y_test['x'], y=y_test['y'], color='k', label='x_true', ax=ax)
sns.scatterplot(x=y_pred_TS[:, 0], y=y_test['y'], color='g', label='x_pred', ax=ax)
ax.set_title('Support Vector x prediction', fontsize=14)
ax.set_xlabel('x', fontsize=14)
ax.set_ylabel('y', fontsize=14)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
ax.legend()
plt.show()