In [None]:
# Trying out a Grid search CV for hyperparameter tuning of SVM model
# Import all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import plotly.express as px
import plotly.graph_objs as go
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score

In [None]:
# Proof-of-concept dataset: read the Iris measurements from a CSV file located
# in the working directory. The resulting DataFrame feeds the SVM classifier
# and GridSearchCV hyperparameter-tuning cells further down.
data = pd.read_csv(filepath_or_buffer="Iris.csv")

In [None]:
# First look at the raw data: class counts, then the shape of each feature.
plt.hist(data['Species'], bins=10)
plt.show()

# One histogram + KDE per measurement, in this order:
#   * petal features - look far from normally distributed, closer to two
#     separate distributions; likely the focus of feature engineering
#   * sepal features - look closer to normally distributed than the petal ones
for feature in ('PetalLengthCm', 'PetalWidthCm', 'SepalLengthCm', 'SepalWidthCm'):
    sns.displot(data[feature], bins=10, kde=True)

In [None]:
# Prepare column dtypes: integer-encode the target, keep the measurements
# numeric, drop the identifier column, then inspect the result.
FEATURE_COLUMNS = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

# Species is the only categorical column. cat.codes numbers the classes from 0,
# so shift by one to avoid a zero label.
data['Species'] = data['Species'].astype('category').cat.codes + 1

# The four measurements are continuous quantities. Casting them to 'category'
# (as this cell used to) turns every distinct value into a label, which is not
# what the box plots, the correlation matrix or the SVM downstream should see,
# so they are kept (or coerced back to) numeric instead.
data[FEATURE_COLUMNS] = data[FEATURE_COLUMNS].apply(pd.to_numeric)

# Id is an identifier, not a measurement, so it must not end up among the
# features. errors='ignore' keeps the cell re-runnable and tolerant of a CSV
# that has no Id column.
data = data.drop(columns=['Id'], errors='ignore')

data.info()
data.head()

In [None]:
# Split the data into a feature matrix and target, then into train/test sets.
# Features are selected by name rather than by position: the positional slice
# `iloc[:, :-1]` also picked up the Id column, feeding a row identifier to the
# model. errors='ignore' makes this work whether or not Id is still present.
X = data.drop(columns=['Species', 'Id'], errors='ignore')
y = data['Species']  # Target variable

# 70% training, 30% testing split; fixed random_state so model evaluation is
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Shorter column names for the plotting cells below. The rename is applied to
# `data` only, after the split, so X_train / X_test keep the original names.
column_mapping = {
    'SepalLengthCm': 'SepalLength',
    'SepalWidthCm': 'SepalWidth',
    'PetalLengthCm': 'PetalLength',
    'PetalWidthCm': 'PetalWidth'
}

# Apply the renaming to the DataFrame
data = data.rename(columns=column_mapping)

In [None]:
# Sepal length per species as a plotly express box plot.
# update_layout / update_traces hand back the figure they modify, so the whole
# figure is built as one chained expression.
fig_Sepal_Length = (
    px.box(
        data,
        x='Species',
        y='SepalLength',
        color='Species',
        title='Sepal Length Distribution by Iris Species',
        labels={'SepalLength': 'Sepal Length (cm)', 'Species': 'Iris Species'},
        template='plotly_white',
    )
    # Layout: legend on, explicit font sizes, grouped boxes, white plot area
    .update_layout(
        showlegend=True,
        xaxis_title_font={'size': 14},
        yaxis_title_font={'size': 14},
        title_font={'size': 16},
        boxmode='group',
        plot_bgcolor='white',
    )
    # Trace styling for the sample points drawn next to each box
    .update_traces(
        jitter=0.5,  # spread the points horizontally
        marker={
            'size': 12,  # large points
            'opacity': 0.6,  # semi-transparent so overlaps stay visible
            'line': {'width': 1, 'color': 'darkgrey'},  # point outline
            'symbol': 'circle',
        },
        pointpos=-1.8,  # offset of the points relative to the box
        hoveron='points+boxes',  # hover on both points and boxes
    )
)
fig_Sepal_Length.show()


In [None]:
# Sepal width per species as a plotly express box plot.
# update_layout / update_traces hand back the figure they modify, so the whole
# figure is built as one chained expression.
fig_Sepal_Width = (
    px.box(
        data,
        x='Species',
        y='SepalWidth',
        color='Species',
        title='Sepal Width Distribution by Iris Species',
        labels={'SepalWidth': 'Sepal Width (cm)', 'Species': 'Iris Species'},
        template='plotly_white',
    )
    # Layout: legend on, explicit font sizes, grouped boxes, white plot area
    .update_layout(
        showlegend=True,
        xaxis_title_font={'size': 14},
        yaxis_title_font={'size': 14},
        title_font={'size': 16},
        boxmode='group',
        plot_bgcolor='white',
    )
    # Trace styling for the sample points drawn next to each box
    .update_traces(
        jitter=0.5,  # spread the points horizontally
        marker={
            'size': 12,  # large points
            'opacity': 0.6,  # semi-transparent so overlaps stay visible
            'line': {'width': 1, 'color': 'darkgrey'},  # point outline
            'symbol': 'circle',
        },
        pointpos=-1.8,  # offset of the points relative to the box
        hoveron='points+boxes',  # hover on both points and boxes
    )
)
fig_Sepal_Width.show()

In [None]:
# Petal length per species as a plotly express box plot.
# The title previously read "Sepal Length ..." (copy-paste from the sepal
# cell); it now names the feature that is actually plotted on the y axis.
fig_Petal_Length = px.box(data,
             x='Species',
             y='PetalLength',
             color='Species',
             title='Petal Length Distribution by Iris Species',
             labels={'PetalLength': 'Petal Length (cm)',
                    'Species': 'Iris Species'},
             template='plotly_white')
# Customize the layout: legend on, explicit font sizes, grouped boxes,
# white plot area
fig_Petal_Length.update_layout(
    showlegend=True,
    xaxis_title_font=dict(size=14),
    yaxis_title_font=dict(size=14),
    title_font=dict(size=16),
    boxmode='group',
    plot_bgcolor='white'
)
# Style the sample points drawn next to each box and enable hover on them
fig_Petal_Length.update_traces(
    jitter=0.5,  # Spread the points horizontally
    marker=dict(
        size=12,  # Large points
        opacity=0.6,  # Semi-transparent so overlaps stay visible
        line=dict(width=1, color='darkgrey'),  # Point outline
        symbol='circle'
    ),
    pointpos=-1.8,  # Offset of the points relative to the box
    hoveron='points+boxes'  # Hover on both points and boxes
)
fig_Petal_Length.show()


In [None]:
# Petal width per species as a plotly express box plot.
# update_layout / update_traces hand back the figure they modify, so the whole
# figure is built as one chained expression.
fig_Petal_Width = (
    px.box(
        data,
        x='Species',
        y='PetalWidth',
        color='Species',
        title='Petal Width Distribution by Iris Species',
        labels={'PetalWidth': 'Petal Width (cm)', 'Species': 'Iris Species'},
        template='plotly_white',
    )
    # Layout: legend on, explicit font sizes, grouped boxes, white plot area
    .update_layout(
        showlegend=True,
        xaxis_title_font={'size': 14},
        yaxis_title_font={'size': 14},
        title_font={'size': 16},
        boxmode='group',
        plot_bgcolor='white',
    )
    # Trace styling for the sample points drawn next to each box
    .update_traces(
        jitter=0.5,  # spread the points horizontally
        marker={
            'size': 12,  # large points
            'opacity': 0.6,  # semi-transparent so overlaps stay visible
            'line': {'width': 1, 'color': 'darkgrey'},  # point outline
            'symbol': 'circle',
        },
        pointpos=-1.8,  # offset of the points relative to the box
        hoveron='points+boxes',  # hover on both points and boxes
    )
)
fig_Petal_Width.show()

In [None]:
# Pairwise correlations between the columns of `data`, drawn as a heatmap with
# square cells to show how the features relate to each other.
feature_correlations = data.corr()
sns.heatmap(feature_correlations, square=True)

In [None]:
# Baseline: an SVC with default settings, scored on the held-out test set,
# followed by a cross-validated grid search over its hyperparameters.
svm = SVC()
param_grid = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'rbf', 'poly'],
    gamma=['scale', 'auto'],
    degree=[3, 4, 5],  # only relevant for 'poly'
)

# Fit the baseline, then report balanced accuracy and macro-averaged F1
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

balanced_acc_baseline = balanced_accuracy_score(y_test, y_pred_svm)
print(f"Balanced Accuracy: {balanced_acc_baseline:.4f}")

f1_macro = f1_score(y_test, y_pred_svm, average='macro')
print(f"Macro F1-score for baseline model: {f1_macro:.4f}")

# 5-fold grid search on the training data only, ranked by plain accuracy
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Winning parameter combination and its mean cross-validated score
print("Best parameters:", grid_search.best_params_)
print("Best accuracy:", grid_search.best_score_)


In [None]:
# SVM classifier rebuilt with the best parameters found by GridSearchCV.
# The parameters are read from `grid_search` (fitted in the previous cell)
# instead of being copied in by hand, so they cannot go stale when the data or
# the search space changes.
SVM = SVC(**grid_search.best_params_)

# Fit on the training set and predict the held-out test set
y_pred_SVM = SVM.fit(X_train, y_train).predict(X_test)

# Balanced accuracy of the tuned model. Stored under its own name so the
# baseline score in `balanced_acc_baseline` stays available for comparison.
balanced_acc_tuned = balanced_accuracy_score(y_test, y_pred_SVM)
print(f"Balanced Accuracy after hyperparameter tuning: {balanced_acc_tuned:.4f}")

# Macro F1 of the tuned model. The print previously showed the baseline's
# `f1_macro` by mistake; it now reports the tuned score.
f1_macro_SVM = f1_score(y_test, y_pred_SVM, average='macro')
print(f"Macro F1-score for hyperparameter-tuned model: {f1_macro_SVM:.4f}")



