In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score


## SVM - Support Vector Machines

On this notebook I'll apply the SVM model to predict the Dry days from the dataset where I preceed the Exploratory Data Analysis, Feature Engineering and data preprocessing on the _weather_classification_End-to-End_EDA_ notebook.

In [45]:
# Split the dataset into features and target variable

# df = features
df = pd.read_csv(r"../datasets/processed_weather_data.csv")

# df1 = target variable (Dryness)
df1 = pd.read_csv(r"../datasets/weather_data_target.csv")

In [46]:
df.head()

Unnamed: 0,num__Temperature,num__Humidity,num__Wind Speed,num__Precipitation (%),num__Atmospheric Pressure,num__UV Index,num__Visibility (km),cat__Cloud Cover_cloudy,cat__Cloud Cover_overcast,cat__Cloud Cover_partly cloudy,cat__Season_Spring,cat__Season_Summer,cat__Season_Winter,cat__Location_inland,cat__Location_mountain
0,-0.294931,0.212404,-0.048086,0.887629,0.134203,-0.520104,-0.582231,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1,1.143035,1.351385,-0.192836,0.543291,0.150602,0.776424,1.345768,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
2,0.625367,-0.233285,-0.409962,-1.178401,0.346579,0.257813,0.010999,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1.085516,0.707613,-1.206089,0.887629,0.549008,0.776424,-1.323769,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.452811,0.261924,1.037543,0.386773,-0.40749,-0.77941,-0.878846,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [47]:
df1.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Dryness
0,14.0,73,9.5,82.0,partly cloudy,1010.82,2,Winter,3.5,inland,0
1,39.0,96,8.5,71.0,partly cloudy,1011.43,7,Spring,10.0,inland,0
2,30.0,64,7.0,16.0,clear,1018.72,5,Spring,5.5,mountain,0
3,38.0,83,1.5,82.0,clear,1026.25,7,Spring,1.0,coastal,0
4,27.0,74,17.0,66.0,overcast,990.67,1,Winter,2.5,mountain,0


In [48]:
X = df
y = df1["Dryness"]

In [49]:
y.value_counts()

Dryness
0    12618
1      582
Name: count, dtype: int64

In [50]:
X.shape

(13200, 15)

UPSAMPLING THE DATASET TO AVOID UNBALANCE

In [51]:
smote = SMOTE(random_state=42)
X_upsampled, y_upsampled = smote.fit_resample(X, y)

y_upsampled.value_counts()

Dryness
0    12618
1    12618
Name: count, dtype: int64

### Splitting dataset - training and testing

In [52]:
X_train_up, X_test_up, y_train_up, y_test_up = train_test_split(X_upsampled, y_upsampled, test_size = 0.2, random_state = 42)
X_train_up.shape, X_test_up.shape, y_train_up.shape, y_test_up.shape

((20188, 15), (5048, 15), (20188,), (5048,))

### Applying the Support Vector Machine

In [53]:
from itertools import product

kernels = ['linear', 'poly', 'rbf', 'sigmoid']
c_values = [1, 100, 1000]
outcome = []

for c,kernel in product(c_values, kernels):
    svc_model = SVC(kernel=kernel, C=c)
    svc_model.fit(X_train_up, y_train_up)
    Y_pred = svc_model.predict(X_test_up)
    acs = accuracy_score(y_test_up, Y_pred)
    outcome.append((kernel, c, acs))

for kernel, c, acs in outcome:
    print(f'Kernel = {kernel} | C = {c} | Accuracy Score = {acs}')

Kernel = linear | C = 1 | Accuracy Score = 0.9712757527733756
Kernel = poly | C = 1 | Accuracy Score = 0.9849445324881141
Kernel = rbf | C = 1 | Accuracy Score = 0.9839540412044374
Kernel = sigmoid | C = 1 | Accuracy Score = 0.8676703645007924
Kernel = linear | C = 100 | Accuracy Score = 0.9706814580031695
Kernel = poly | C = 100 | Accuracy Score = 0.9920760697305864
Kernel = rbf | C = 100 | Accuracy Score = 0.9950475435816165
Kernel = sigmoid | C = 100 | Accuracy Score = 0.8631141045958796
Kernel = linear | C = 1000 | Accuracy Score = 0.970879556259905
Kernel = poly | C = 1000 | Accuracy Score = 0.993066561014263
Kernel = rbf | C = 1000 | Accuracy Score = 0.9966323296354992
Kernel = sigmoid | C = 1000 | Accuracy Score = 0.8631141045958796


### Rerunning the loop with the cross validation for each selected number of folds 

In [None]:
for kernel, c in product(kernels, c_values):
    model = SVC(kernel=kernel, C=c)
    cv_scores = cross_val_score(model, X_train_up, y_train_up, cv=5)
    print(f"Kernel = {kernel} | C = {c} | CV Accuracy = {cv_scores}")

Kernel = linear | C = 1 | CV Accuracy = [0.97077761 0.96384349 0.97300644 0.9742383  0.97052267]
Kernel = linear | C = 100 | CV Accuracy = [0.9717682  0.96433878 0.97251114 0.9742383  0.97101808]


#### Key findings:

 * Based on both accuracy and 5-fold cross-validation scores, the SVM model demonstrated consistently high precision (above 97%) when using RBF and polynomial kernels, particularly at higher C values. Linear kernels also performed well, but sigmoid showed a noticeable drop in accuracy (~86%), suggesting it may not be well-suited for this task.

 * These results indicate that the SVM model is highly effective in classifying dry days when the appropriate kernel and regularization strength are selected. This has potential applications in early wildfire risk detection, where understanding dry-day patterns __(temperature, humidity and precipitation)__ could contribute to preventative planning.

 * Between all Kernels the only with the highest confidence is the:
 Kernel = rbf | C = 1000 | demonstrating accuracy results above 99.5%.

 
