In [29]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import pickle

In [30]:
df = pd.read_csv("FWI Cleaned.csv")

print("Dataset loaded successfully")
print("Shape:", df.shape)

df.head()

Dataset loaded successfully
Shape: (244, 17)


Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region,Region_encoded,Classes_encoded
0,1,6,2012,29.0,57,18.0,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not,0,0,2
1,2,6,2012,29.0,61,13.0,1.25,64.4,4.1,7.6,1.0,3.9,0.4,not,0,0,2
2,3,6,2012,26.0,82,21.5,1.25,47.7375,2.5,7.1,0.3,2.7,0.1,not,0,0,2
3,4,6,2012,25.0,89,13.0,1.25,47.7375,1.3,6.9,0.0,1.7,0.0,not,0,0,2
4,5,6,2012,27.0,77,16.0,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not,0,0,2


In [31]:
df = df.dropna(subset=["FWI"])

print("After removing missing FWI values:")
print("Shape:", df.shape)

After removing missing FWI values:
Shape: (243, 17)


In [32]:
target = "FWI"

features = [
    "Temperature", "RH", "Ws", "Rain",
    "FFMC", "DMC", "DC", "ISI", "BUI"
]

X = df[features]
y = df[target]

print("Selected Features:", features)

Selected Features: ['Temperature', 'RH', 'Ws', 'Rain', 'FFMC', 'DMC', 'DC', 'ISI', 'BUI']


In [33]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (194, 9)
Testing set shape: (49, 9)


In [34]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling completed")


Feature scaling completed


In [35]:
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

print("scaler.pkl saved successfully")

scaler.pkl saved successfully


In [36]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso()
}

results = []

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    results.append({
        "Model": name,
        "MAE": mean_absolute_error(y_test, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
        "R2 Score": r2_score(y_test, y_pred)
    })

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Model,MAE,RMSE,R2 Score
0,Linear Regression,0.526735,1.115519,0.958943
1,Ridge Regression,0.47161,0.861265,0.975526
2,Lasso Regression,0.910876,1.131535,0.957755


In [37]:
results_df.sort_values(by="R2 Score", ascending=False)

Unnamed: 0,Model,MAE,RMSE,R2 Score
1,Ridge Regression,0.47161,0.861265,0.975526
0,Linear Regression,0.526735,1.115519,0.958943
2,Lasso Regression,0.910876,1.131535,0.957755


In [38]:
ridge = Ridge()

param_grid = {
    "alpha": [0.01, 0.1, 1, 10, 100]
}

grid = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid,
    cv=5,
    scoring="r2"
)

grid.fit(X_train_scaled, y_train)

print("Best Alpha:", grid.best_params_)
print("Best Cross-Validated R2 Score:", grid.best_score_)


Best Alpha: {'alpha': 1}
Best Cross-Validated R2 Score: 0.9691378508528823


In [39]:
best_ridge = grid.best_estimator_

y_pred = best_ridge.predict(X_test_scaled)

print("Final Ridge Regression Performance:")
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred))
)
print("R2 Score:", r2_score(y_test, y_pred))


Final Ridge Regression Performance:
MAE: 0.4716097304336704
RMSE: 0.8612650072466204
R2 Score: 0.9755256995839695


In [40]:
with open("ridge.pkl", "wb") as f:
    pickle.dump(best_ridge, f)

print("ridge.pkl saved successfully")

ridge.pkl saved successfully


In [41]:
print(
    "Ridge Regression was selected as the final model because it "
    "handles multicollinearity among correlated weather features "
    "and demonstrated better generalization performance during "
    "cross-validation."
)

Ridge Regression was selected as the final model because it handles multicollinearity among correlated weather features and demonstrated better generalization performance during cross-validation.


In [42]:
final_summary = {
    "Model": "Ridge Regression",
    "Best Alpha": grid.best_params_["alpha"],
    "MAE": mean_absolute_error(y_test, y_pred),
    "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
    "R2 Score": r2_score(y_test, y_pred)
}

final_summary


{'Model': 'Ridge Regression',
 'Best Alpha': 1,
 'MAE': 0.4716097304336704,
 'RMSE': np.float64(0.8612650072466204),
 'R2 Score': 0.9755256995839695}