In [61]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats # For statistical tests
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

# Load the raw weather observations (original, human-readable columns).
df1 = pd.read_csv(r"../datasets/weather_classification_data.csv")
# Load the already-preprocessed feature table. No preprocessing happens in this
# cell; the file is only read from disk.
df = pd.read_csv(r"../datasets/processed_weather_data.csv")

### Machine learning 

* ML model selection
    - At this point in the project, I start building a __Logistic Regression__ model to practise and consolidate the knowledge acquired during the classes (2025/06/16).

In [62]:
# Preview the preprocessed feature table (columns carry num__/cat__ prefixes).
df.head()

Unnamed: 0,num__Temperature,num__Humidity,num__Wind Speed,num__Precipitation (%),num__Atmospheric Pressure,num__UV Index,num__Visibility (km),cat__Cloud Cover_cloudy,cat__Cloud Cover_overcast,cat__Cloud Cover_partly cloudy,cat__Season_Spring,cat__Season_Summer,cat__Season_Winter,cat__Location_inland,cat__Location_mountain
0,-0.294931,0.212404,-0.048086,0.887629,0.134203,-0.520104,-0.582231,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1,1.143035,1.351385,-0.192836,0.543291,0.150602,0.776424,1.345768,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
2,0.625367,-0.233285,-0.409962,-1.178401,0.346579,0.257813,0.010999,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1.085516,0.707613,-1.206089,0.887629,0.549008,0.776424,-1.323769,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.452811,0.261924,1.037543,0.386773,-0.40749,-0.77941,-0.878846,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [63]:
# Preview the raw observations as loaded from the CSV.
df1.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,14.0,73,9.5,82.0,partly cloudy,1010.82,2,Winter,3.5,inland,Rainy
1,39.0,96,8.5,71.0,partly cloudy,1011.43,7,Spring,10.0,inland,Cloudy
2,30.0,64,7.0,16.0,clear,1018.72,5,Spring,5.5,mountain,Sunny
3,38.0,83,1.5,82.0,clear,1026.25,7,Spring,1.0,coastal,Sunny
4,27.0,74,17.0,66.0,overcast,990.67,1,Winter,2.5,mountain,Rainy


In [64]:
# Derive the binary target 'Dryness': 1 = dry weather, 0 = not dry.
# A row counts as dry only when all three conditions hold at the same time:
# precipitation below 10, humidity below 50 and temperature above 25.
low_precipitation = df1["Precipitation (%)"] < 10
low_humidity = df1["Humidity"] < 50
high_temperature = df1["Temperature"] > 25
is_dry = low_precipitation & low_humidity & high_temperature
df1['Dryness'] = np.where(is_dry, 1, 0)

In [65]:
# Drop the original multi-class label so that only the binary 'Dryness' target
# remains in the raw frame.
# errors='ignore' keeps this cell re-runnable: df1 is reassigned here, so a
# second execution would otherwise raise KeyError (the column is already gone).
df1 = df1.drop(columns=['Weather Type'], errors='ignore')

In [66]:
# Check the result: 'Dryness' has been added and 'Weather Type' removed.
df1.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Dryness
0,14.0,73,9.5,82.0,partly cloudy,1010.82,2,Winter,3.5,inland,0
1,39.0,96,8.5,71.0,partly cloudy,1011.43,7,Spring,10.0,inland,0
2,30.0,64,7.0,16.0,clear,1018.72,5,Spring,5.5,mountain,0
3,38.0,83,1.5,82.0,clear,1026.25,7,Spring,1.0,coastal,0
4,27.0,74,17.0,66.0,overcast,990.67,1,Winter,2.5,mountain,0


In [67]:
# Optional export of the labelled frame to a new CSV file (left disabled):
# df1.to_csv(r"../datasets/weather_data_target.csv", index=False)

In [68]:
# Features come from the preprocessed table, the target from the raw table.
# The two frames are read from different CSV files and are paired purely by
# row position, so fail fast if they do not line up row for row.
X = df

y = df1['Dryness']

assert len(X) == len(y), "feature rows and target rows differ in count"
assert X.index.equals(y.index), "feature and target indexes are not aligned"

# NOTE(review): 'Dryness' is computed from Precipitation, Humidity and
# Temperature, and the num__ versions of those columns are part of X, so the
# target is a deterministic function of the inputs -- confirm this is intended.

In [69]:
# Number of target rows.
y.shape

(13200,)

In [70]:
# Class balance of the target (1 = dry, 0 = not dry).
y.value_counts()

Dryness
0    12618
1      582
Name: count, dtype: int64

1 for Dry, 0 for Not Dry

In [71]:
# First few target values.
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Dryness, dtype: int64

### Logistic Regression

In [72]:
# Hold out 20 % of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified although the target is heavily
# imbalanced -- consider stratify=y.
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

# Shapes of the four pieces, in the order X_train, X_test, y_train, y_test.
tuple(part.shape for part in split_parts)

((10560, 15), (2640, 15), (10560,), (2640,))

__UP SAMPLING TRAINING DATAPOINTS__

In [73]:
# Oversample the minority class with SMOTE. Only the training split is passed
# in, so X_test / y_test are left untouched for evaluation.
smote = SMOTE(random_state=42)
X_train_upsampled, y_train_upsampled = smote.fit_resample(X_train, y_train)

In [74]:
# Shapes after resampling: the training set grows, the test set keeps its size.
X_train_upsampled.shape, X_test.shape, y_train_upsampled.shape, y_test.shape

((20168, 15), (2640, 15), (20168,), (2640,))

In [75]:
# Fit a logistic regression classifier on the SMOTE-resampled training data.
# max_iter is raised to 1000 (scikit-learn's default is 100).
lr = LogisticRegression(max_iter=1000, random_state=16)
lr.fit(X=X_train_upsampled, y=y_train_upsampled)

In [76]:
# Predict class labels (0 / 1) for the held-out test split.
y_pred = lr.predict(X_test)
y_pred

array([1, 0, 0, ..., 0, 0, 0], shape=(2640,))

In [77]:
# Side-by-side view of true vs. predicted labels; the frame keeps y_test's row index.
diff_df = pd.DataFrame({'Actual' : y_test, 'Predicted': y_pred})
diff_df

Unnamed: 0,Actual,Predicted
4111,0,1
10607,0,0
7372,0,0
11786,0,0
12227,0,0
...,...,...
2543,0,0
96,0,0
2474,0,0
2522,0,0


In [78]:
# Confusion matrix: rows are the actual classes, columns the predicted classes,
# both in label order (0 = not dry, 1 = dry), i.e. [[TN, FP], [FN, TP]].
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

array([[2404,  130],
       [   1,  105]])

### Observations:
#### True Negatives (TN) = 2404
The model correctly predicted 2404 observations as "Not Dry".
#### False Positives (FP) = 130
The model incorrectly predicted 130 observations as "Dry" when they were actually not dry (Type I error).
#### False Negatives (FN) = 1
The model incorrectly predicted 1 observation as "Not Dry" when it was actually dry (Type II error).
#### True Positives (TP) = 105
The model correctly predicted 105 observations as "Dry".

In [79]:
# Per-class precision / recall / F1 on the test split.
# The display names are given in label order: 0 first, then 1.
target_names = ['Not dry weather', 'dry weather']
report_text = classification_report(y_test, y_pred, target_names=target_names)
print(report_text)

                 precision    recall  f1-score   support

Not dry weather       1.00      0.95      0.97      2534
    dry weather       0.45      0.99      0.62       106

       accuracy                           0.95      2640
      macro avg       0.72      0.97      0.79      2640
   weighted avg       0.98      0.95      0.96      2640



### Observations:
* Accuracy = 95% → The model correctly classifies 95% of all instances in the test set.

* Precision (Positive Predictive Value):
 - "Not Dry weather": 1.00 → When the model predicts "Not Dry", it is correct almost 100% of the time (2404 of 2405 predictions).
 - "Dry weather": 0.45 → When the model predicts "Dry", it is correct 45% of the time.

* Recall (Sensitivity/True Positive Rate):
 - Not Dry weather: 0.95 → The model correctly identifies 95% of all actual "Not Dry" cases.
 - Dry weather: 0.99 → The model correctly identifies 99% of all actual "dry weather" cases.

* F1-score (Balance between Precision & Recall):
 - Not Dry: 0.97
 - Dry: 0.62
 - The F1-score for "Dry" is lower because its precision is low (many false positives), even though its recall is very high.

### Key Observations
The logistic regression model demonstrated strong predictive power in identifying dry weather conditions, achieving a high recall rate of 99%, which means it successfully captured nearly all actual dry days.

 * Catching nearly every dry day is a critical factor in reducing wildfire risks.

 * However, its precision of 45% indicates a high rate of false positives, where many days are mistakenly classified as dry.

 * Despite this limitation, the model serves as a valuable early-warning tool by ensuring that risky conditions are rarely missed.

### Future Actions for improvements 
To improve precision and reduce unnecessary alerts, the next step involves implementing more advanced classification algorithms.

 * A Random Forest model can better capture complex patterns in the data through deeper decision paths.


### Conclusion

This approach supports my learning journey and strengthens my career goal in Data Science, particularly in applying Machine Learning to predict and prevent wildfire risks—contributing to both the prevention and recovery of degraded areas. By starting with a simpler model like logistic regression and progressing toward more advanced algorithms such as Random Forest, this notebook represents a key step in moving from foundational techniques to more complex, precise solutions. These improvements aim not only to enhance precision but also to increase the overall reliability of predictions in real-world environmental applications.