**04: Model development**

Importing required modules

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

#extra
import os
import pickle
import datetime
import warnings

In [2]:
pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

Directory variables

In [3]:
transformed_data_path = r"C:\Data\Study\Ai_adventures\Projects\Machine_Learning\Regression\DengAi_Disease spread prediction\outputs\transformed_data.pkl"
outputs = r"C:\Data\Study\Ai_adventures\Projects\Machine_Learning\Regression\DengAi_Disease spread prediction\models"

Loading data

In [4]:
def load_transformed_data(file_path):
    with open(file_path, "rb") as file:
        return pickle.load(file)

In [5]:
data = load_transformed_data(transformed_data_path)
data.head()

Unnamed: 0,city,day,month,day_of_week,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,total_cases
0,1,30,4,0,1990,18,-0.016536,-0.045015,0.110036,-0.082612,-0.744185,-0.943302,-1.214591,-1.619633,-1.387852,-0.329611,0.335308,-1.566752,-0.744185,-1.587135,-0.368641,-1.291053,-0.300128,-1.448836,-1.484469,-0.272887,4
1,1,7,5,0,1990,19,0.37702,0.3015,-0.478241,-0.436839,-0.314551,-0.487879,-0.700195,-0.930554,-0.676998,-0.111301,-0.40883,-0.523304,-0.314551,-0.930743,-0.651463,-0.492902,-0.74996,-0.388845,-0.047147,-0.800602,5
2,1,14,5,0,1990,20,-0.792355,0.568155,-0.567387,-0.187652,0.097705,-0.071461,-0.367098,-0.035477,-0.919344,0.354113,0.061403,0.354093,0.097705,-0.04538,-0.73863,-0.492902,-0.645358,-0.133716,0.351279,0.798169,4
3,1,21,5,0,1990,21,0.034186,1.163593,0.537103,0.709611,-0.616682,0.081599,-0.090599,-0.124142,-0.400195,0.187881,-0.685225,0.075074,-0.616682,-0.16452,-0.584496,0.08583,-0.401578,0.476215,0.685422,-1.191951,3
4,1,28,5,0,1990,22,0.592251,1.300171,0.861593,0.847845,-0.96724,0.483642,0.267015,0.263463,-0.152622,0.471071,-0.810182,0.096439,-0.96724,0.215625,-0.025532,1.544761,0.952789,1.553328,1.088514,-1.032905,6


Features and labels

In [6]:
X = data.drop(columns=["total_cases"])
y = data["total_cases"]
X.shape, y.shape

((1114, 26), (1114,))

Splitting data

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

Model selection 

In [8]:
#we will examine performance on base models and tune top 2 models

In [9]:
models = [
    ("KNeighborsRegressor", KNeighborsRegressor()),
    ("SVR", SVR()),
    ("DecisionTreeRegressor", DecisionTreeRegressor()),
    ("LinearRegression", LinearRegression()),
    ("RandomForestRegressor", RandomForestRegressor()),
    ("GradientBoostingRegressor", GradientBoostingRegressor()),
    ("VotingRegressor", VotingRegressor(estimators=[
        ("knn", KNeighborsRegressor()),
        ("svr", SVR()),
        ("dt", DecisionTreeRegressor()),
        ("lr", LinearRegression()),
        ("rf", RandomForestRegressor()),
        ("gr", GradientBoostingRegressor())
    ]))
]

In [10]:
# Evaluate models and store performance metrics
model_metrics = []

for model_name, model in models:
    if isinstance(model, str):
        continue
    model.fit(X_train, y_train)
    y_preds = model.predict(X_test)
    mse = mean_squared_error(y_test, y_preds)
    mae = mean_absolute_error(y_test, y_preds)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_preds)
    model_metrics.append({
        "Model_name": model_name,
        "Model":model,
        "MSE": mse,
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
    })

In [11]:
model_summary = pd.DataFrame(model_metrics)

# Display model performance
print("Model Performance Summary:")
model_summary.sort_values(["MAE","MSE","RMSE"])

Model Performance Summary:


Unnamed: 0,Model_name,Model,MSE,MAE,RMSE,R2
4,RandomForestRegressor,"(DecisionTreeRegressor(max_features='auto', ra...",302.697976,10.91417,17.398218,0.610705
5,GradientBoostingRegressor,([DecisionTreeRegressor(criterion='friedman_ms...,337.536058,11.345014,18.372154,0.5659
6,VotingRegressor,"VotingRegressor(estimators=[('knn', KNeighbors...",429.433845,12.689953,20.722786,0.447712
2,DecisionTreeRegressor,DecisionTreeRegressor(),699.165919,14.869955,26.441746,0.100814
0,KNeighborsRegressor,KNeighborsRegressor(),582.36574,15.181166,24.132255,0.251029
1,SVR,SVR(),855.417094,17.035444,29.247514,-0.100138
3,LinearRegression,LinearRegression(),857.063081,18.499839,29.27564,-0.102255


In [12]:
# thus we will tune RandomForestRegressor and GradientBoostingRegressor

In [13]:
rf_param_grid = {
    'n_estimators': [100, 200, 500],            
    'max_depth': [None, 1, 3],           
    'min_samples_split': [2, 5, 10],          
    'min_samples_leaf': [1, 2, 4]              
}

In [14]:
gb_param_grid = {
    'n_estimators': [100, 200, 500],           
    'learning_rate': [0.01, 0.1, 0.2],         
    'max_depth': [None, 1, 3],                  
    'min_samples_split': [2, 3, 4],           
    'min_samples_leaf': [1, 2, 3],           
}

In [15]:
rf_rand_search = RandomizedSearchCV(RandomForestRegressor(), param_distributions=rf_param_grid, n_iter=10, cv=8)
gb_rand_search = RandomizedSearchCV(GradientBoostingRegressor(), param_distributions=gb_param_grid, n_iter=10, cv=8)

In [16]:
rf_rand_search.fit(X_train,y_train)
gb_rand_search.fit(X_train,y_train)

RandomizedSearchCV(cv=8, estimator=GradientBoostingRegressor(),
                   param_distributions={'learning_rate': [0.01, 0.1, 0.2],
                                        'max_depth': [None, 1, 3],
                                        'min_samples_leaf': [1, 2, 3],
                                        'min_samples_split': [2, 3, 4],
                                        'n_estimators': [100, 200, 500]})

In [17]:
print(f"rf_rand_search best params: {rf_rand_search.best_params_}")
best_rf_model = rf_rand_search.best_estimator_

rf_rand_search best params: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': None}


In [18]:
print(f"gb_rand_search best params: {gb_rand_search.best_params_}")
best_gb_model = gb_rand_search.best_estimator_

gb_rand_search best params: {'n_estimators': 500, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_depth': 3, 'learning_rate': 0.01}


In [19]:
model_list = [(model_name, model) for model_name, model in models]
model_list.append(("RandomForestRegressor_Tuned", best_rf_model))
model_list.append(("GradientBoostingRegressor_Tuned", best_gb_model))

In [20]:
model_metrics = []
for model_name, model in model_list[-2:]:
    if isinstance(model, str):
        continue
    model.fit(X_train, y_train)
    y_preds = model.predict(X_test)
    mse = mean_squared_error(y_test, y_preds)
    mae = mean_absolute_error(y_test, y_preds)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_preds)
    model_metrics.append({
        "Model_name": model_name,
        "Model":model,
        "MSE": mse,
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
    })

In [21]:
model_summary = pd.concat([model_summary,pd.DataFrame(model_metrics)],ignore_index=True)

In [22]:
# Display model performance
print("Model Performance Summary:")
model_summary.sort_values(["MAE","MSE","RMSE"])

Model Performance Summary:


Unnamed: 0,Model_name,Model,MSE,MAE,RMSE,R2
8,GradientBoostingRegressor_Tuned,([DecisionTreeRegressor(criterion='friedman_ms...,297.167681,10.670331,17.238552,0.617818
4,RandomForestRegressor,"(DecisionTreeRegressor(max_features='auto', ra...",302.697976,10.91417,17.398218,0.610705
7,RandomForestRegressor_Tuned,"(DecisionTreeRegressor(max_features='auto', mi...",312.007628,10.940162,17.663738,0.598732
5,GradientBoostingRegressor,([DecisionTreeRegressor(criterion='friedman_ms...,337.536058,11.345014,18.372154,0.5659
6,VotingRegressor,"VotingRegressor(estimators=[('knn', KNeighbors...",429.433845,12.689953,20.722786,0.447712
0,KNeighborsRegressor,KNeighborsRegressor(),582.36574,15.181166,24.132255,0.251029
2,DecisionTreeRegressor,DecisionTreeRegressor(),699.165919,14.869955,26.441746,0.100814
1,SVR,SVR(),855.417094,17.035444,29.247514,-0.100138
3,LinearRegression,LinearRegression(),857.063081,18.499839,29.27564,-0.102255


In [23]:
final_model = model_summary.nsmallest(1,columns="MAE").Model.values.item()

In [24]:
final_model

GradientBoostingRegressor(learning_rate=0.01, min_samples_leaf=3,
                          min_samples_split=3, n_estimators=500)

In [25]:
print(f"final model: {final_model}")

final model: GradientBoostingRegressor(learning_rate=0.01, min_samples_leaf=3,
                          min_samples_split=3, n_estimators=500)


Saving trained models with final model

In [26]:
for model_name, model in model_list:
    model_filename = os.path.join(outputs, f"{model_name}.pkl")
    with open(model_filename, "wb") as model_file:
        pickle.dump(model, model_file)
print("Trained models saved successfully.")

Trained models saved successfully.


In [27]:
with open(r"C:\Data\Study\Ai_adventures\Projects\Machine_Learning\Regression\DengAi_Disease spread prediction\models\final_model.pkl","wb") as file:
    pickle.dump(final_model, file)