<a href="https://colab.research.google.com/github/DHarley22/Prediction_case_malaria_mozambique/blob/1-xgboost-predictor-python-version/model_building_mozambique.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install gdown optuna


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
import os
import numpy as np
import xgboost as xgb
import sklearn
import gdown
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split as split_into_two
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from xgboost import XGBRegressor

In [None]:
%%capture

# The mozambique dataset is hosted here
# https://drive.google.com/file/d/1uHGOjxlxfDYY5E-aL9HVRGvheDa_wNyq/view?usp=sharing
dataset_drive_id = '1uHGOjxlxfDYY5E-aL9HVRGvheDa_wNyq'
dataset_output_file = 'downloaded_sheet.xlsx'

# The shape file relatd to the dataset is hosted here
# https://drive.google.com/drive/folders/14UJ7ZXWmNeL28sYAv6dObNsC42kQr4Ja?usp=sharing
shape_file_drive_id = '14UJ7ZXWmNeL28sYAv6dObNsC42kQr4Ja'
shape_output_folder = 'shape_files'

gdown.download_folder(id=shape_file_drive_id, output=shape_output_folder, quiet=False)
gdown.download(id=dataset_drive_id, output=dataset_output_file, quiet=False)


In [None]:
# Load the files from the same relative locations the download cell wrote to,
# instead of assuming the Colab-specific "/content" working directory.
# NOTE(review): the dataset is saved under an ".xlsx" name but parsed as CSV;
# confirm the Drive file really is CSV and consider renaming the output file.
dataset = pd.read_csv(dataset_output_file)
shape_file = gpd.read_file(os.path.join(shape_output_folder, "shape_file.shp"))

In [None]:
# (Left) join the dataset to the shape_file using the "district" field
merged_dataset = dataset.merge(
    shape_file[["district", "geometry"]],
    on="district",
    how="left"
)

# A district listed more than once in the shape file would silently duplicate
# dataset rows through the join, so make that visible.
if len(merged_dataset) != len(dataset):
    print(
        f"Warning: the join changed the row count from {len(dataset)} to "
        f"{len(merged_dataset)}; check the shape file for duplicated districts"
    )

# Convert the DataFrame into a GeoDataFrame so geometric attributes can be derived
shaped_dataset = gpd.GeoDataFrame(merged_dataset, geometry="geometry")

# With a left join, districts missing from the shape file get a null geometry,
# which would turn into NaN spatial features further down without any error.
unmatched_districts = shaped_dataset.loc[shaped_dataset.geometry.isna(), "district"].unique()
if len(unmatched_districts) > 0:
    print(
        f"Warning: {len(unmatched_districts)} district(s) have no geometry in the "
        f"shape file: {sorted(map(str, unmatched_districts))}"
    )

# Reproject to a projected CRS (metres) before measuring areas and centroids
shaped_dataset = shaped_dataset.to_crs(epsg=3857)

# Calculate the area and centroid using the projected CRS.
# Note: EPSG:3857 (Web Mercator) does not preserve area, so "area" is only an
# approximation; use an equal-area projection if exact areas are needed.
shaped_dataset["area"] = shaped_dataset.geometry.area
shaped_dataset["centroid_x"] = shaped_dataset.geometry.centroid.x
shaped_dataset["centroid_y"] = shaped_dataset.geometry.centroid.y

### Missing data handling

Rules:

- If {some condition} then {we do this}
- If {some condition} then {we do this}

In [None]:
# Handle missing data
# TODO:

### Train/Val/Test Split

Rules:

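- We will use the rows from March to June 2018 as the held-out test set; the remaining rows are randomly split 70/30 into training and validation sets.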


In [None]:
TARGET_DISEASE_FIELD = "malaria_cases_u5"


# Columns fed to the model, grouped by theme
relevant_features = [
    # Environmental features
    'tmin', 'tmax', 'precipitation', 'ndvi', 'RH',
    # Socio-economic features
    'prop_poor', 'prop_Rural', 'prop_drinking_TreatedWater',
    # Spatial features
    'centroid_x', 'centroid_y',
]

# Hold-out mask: the rows from March to June 2018 are reserved for testing
test_data_predicate = (
    (shaped_dataset['year'] == 2018)
    & (shaped_dataset['month'].between(3, 6))
)

# Test set (features / target)
X_test = shaped_dataset.loc[test_data_predicate, relevant_features]
y_test = shaped_dataset.loc[test_data_predicate, TARGET_DISEASE_FIELD]

# Everything outside the hold-out window, as features X and target y
X = shaped_dataset.loc[~test_data_predicate, relevant_features]
y = shaped_dataset.loc[~test_data_predicate, TARGET_DISEASE_FIELD]

print(X.shape, y.shape)

# Random 70/30 split of the remaining rows into training and validation sets
X_train, X_val, y_train, y_val = split_into_two(
    X,
    y,
    test_size=0.3,
    random_state=42,
)




(5088, 10) (5088,)


In [None]:

# Hyperparameter search for the XGBoost regressor with Optuna.

OPTUNA_SEED = 42             # seeds the sampler so the search is reproducible
N_TRIALS = 1000              # number of Optuna trials to run
EARLY_STOPPING_ROUNDS = 20   # CV stops after this many rounds without improvement

# Build the training DMatrix once instead of once per trial
cv_dtrain = xgb.DMatrix(X_train, label=y_train)


def objective(trial):
    """Score one hyperparameter configuration with 5-fold cross-validation.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The mean test RMSE across folds at the last round kept by xgb.cv.
        With early stopping the returned history is cut at the best round.
    """
    # Booster parameters (search space)
    params = {
        'verbosity': 0,
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'objective': 'reg:squarederror',  # Regression objective
        'eval_metric': 'rmse',            # Root Mean Squared Error
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        'gamma': trial.suggest_float('gamma', 0.01, 10, log=True),
        'subsample': trial.suggest_float('subsample', 0.01, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-7, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-7, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 0, 10),
    }

    # 'n_estimators' is not a parameter of the native xgb.cv / xgb.train API;
    # there the number of trees is given by num_boost_round. It is still
    # sampled under the same name so study.best_params keeps that key.
    max_boost_rounds = trial.suggest_int('n_estimators', 5, 5000)

    # Cross-validation (per-round logging disabled: Optuna already logs a
    # summary line for every finished trial)
    cv_results = xgb.cv(
        params=params,
        dtrain=cv_dtrain,
        nfold=5,
        metrics='rmse',
        num_boost_round=max_boost_rounds,
        as_pandas=True,
        verbose_eval=False,
        seed=0,
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    )

    # Keep the number of rounds that actually produced the score, since it can
    # be much smaller than the sampled upper bound
    trial.set_user_attr('best_num_boost_round', len(cv_results))

    return float(cv_results['test-rmse-mean'].iloc[-1])


# Run Optuna optimization (minimization problem)
study = optuna.create_study(
    study_name="xgboost_model_study",
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=OPTUNA_SEED),
)
study.optimize(objective, n_trials=N_TRIALS)

# Best hyperparameters
print("Best hyperparameters:", study.best_params)


[I 2025-01-28 23:18:28,153] A new study created in memory with name: xgboost_model_study


[0]	train-rmse:1719.83702+28.44118	test-rmse:1724.11189+116.20863
[50]	train-rmse:437.15463+15.10728	test-rmse:852.87053+81.79799
[88]	train-rmse:296.86054+12.40243	test-rmse:845.63384+78.61251


[I 2025-01-28 23:18:46,581] Trial 0 finished with value: 844.011353490672 and parameters: {'booster': 'gbtree', 'learning_rate': 0.07601163674492135, 'max_depth': 15, 'gamma': 0.09663792942226598, 'subsample': 0.6883562503913834, 'reg_alpha': 0.0001778847397262791, 'reg_lambda': 0.0010899520586166416, 'colsample_bytree': 0.8142450282621815, 'min_child_weight': 7, 'n_estimators': 1996}. Best is trial 0 with value: 844.011353490672.


[0]	train-rmse:1821.88011+29.48188	test-rmse:1818.30172+122.91806
[50]	train-rmse:1759.00447+27.13149	test-rmse:1760.42923+115.09066
[100]	train-rmse:1699.26787+23.12886	test-rmse:1705.49628+110.24363
[150]	train-rmse:1647.97228+20.63433	test-rmse:1659.15391+104.76887
[200]	train-rmse:1599.69062+19.74186	test-rmse:1614.86322+99.76910
[250]	train-rmse:1557.90245+18.73941	test-rmse:1577.25070+95.19078
[300]	train-rmse:1516.84641+16.99610	test-rmse:1540.85802+92.62792
[350]	train-rmse:1479.31669+14.01070	test-rmse:1507.81516+90.63444
[400]	train-rmse:1440.17185+13.99203	test-rmse:1473.81657+86.07100
[450]	train-rmse:1406.75453+13.88201	test-rmse:1444.41056+82.33046
[500]	train-rmse:1373.42409+14.17345	test-rmse:1414.69388+78.97510
[550]	train-rmse:1345.52556+12.97321	test-rmse:1390.51686+77.40674
[600]	train-rmse:1317.99407+12.74403	test-rmse:1366.83670+75.31122
[650]	train-rmse:1291.89595+12.37119	test-rmse:1343.66457+73.37115
[700]	train-rmse:1267.46305+12.04606	test-rmse:1322.41597+72.

[I 2025-01-28 23:40:49,969] Trial 1 finished with value: 1180.1704665605764 and parameters: {'booster': 'dart', 'learning_rate': 0.0020791703039784734, 'max_depth': 5, 'gamma': 0.12248771941055511, 'subsample': 0.12907229536906292, 'reg_alpha': 3.078227945458243e-06, 'reg_lambda': 5.49739383581436e-06, 'colsample_bytree': 0.4661992859739765, 'min_child_weight': 2, 'n_estimators': 1205}. Best is trial 0 with value: 844.011353490672.


[0]	train-rmse:1820.26774+29.35022	test-rmse:1816.74656+122.88441
[50]	train-rmse:1674.60148+25.77104	test-rmse:1689.56459+111.46638
[100]	train-rmse:1543.32755+22.73667	test-rmse:1576.26347+101.53769
[150]	train-rmse:1429.22210+20.22948	test-rmse:1482.58901+93.46156
[200]	train-rmse:1325.64144+18.26763	test-rmse:1399.34688+86.58594
[250]	train-rmse:1234.60825+16.64073	test-rmse:1329.49157+81.50700
[300]	train-rmse:1151.48389+15.76337	test-rmse:1266.00446+77.34569
[350]	train-rmse:1076.28354+14.75557	test-rmse:1209.24729+73.89683
[400]	train-rmse:1008.50863+13.94121	test-rmse:1159.83603+71.15075
[450]	train-rmse:947.99528+13.46704	test-rmse:1118.04284+68.96575
[500]	train-rmse:893.72365+13.20022	test-rmse:1081.31251+67.34555
[550]	train-rmse:845.69185+12.92845	test-rmse:1051.23736+66.35497
[600]	train-rmse:801.53553+12.34101	test-rmse:1024.34990+65.96069
[650]	train-rmse:761.52998+12.37085	test-rmse:1001.09638+65.38124
[700]	train-rmse:726.32082+12.39670	test-rmse:981.48177+64.71560
[7

[I 2025-01-28 23:43:47,744] Trial 2 finished with value: 842.1446760902779 and parameters: {'booster': 'gbtree', 'learning_rate': 0.0022261763858362335, 'max_depth': 24, 'gamma': 0.012595601154787132, 'subsample': 0.778677744816563, 'reg_alpha': 2.261407167214719e-06, 'reg_lambda': 0.021416539300779663, 'colsample_bytree': 0.5480149886225263, 'min_child_weight': 6, 'n_estimators': 4605}. Best is trial 2 with value: 842.1446760902779.


[0]	train-rmse:1820.66735+29.42620	test-rmse:1817.01893+122.89810
[50]	train-rmse:1715.06694+25.60135	test-rmse:1728.51846+115.17513
[100]	train-rmse:1617.21037+23.07917	test-rmse:1649.02365+107.70315
[150]	train-rmse:1538.15906+20.99103	test-rmse:1589.35911+102.42460
[200]	train-rmse:1459.71302+19.63933	test-rmse:1527.24763+95.39843
[250]	train-rmse:1392.78899+19.37917	test-rmse:1478.40765+90.02398
[300]	train-rmse:1328.46165+17.49575	test-rmse:1430.44057+86.68682
[350]	train-rmse:1264.32322+15.35723	test-rmse:1380.38136+82.51940
[400]	train-rmse:1203.96573+14.68034	test-rmse:1334.02299+78.26206
[450]	train-rmse:1152.65011+14.83400	test-rmse:1297.90649+74.23865
[500]	train-rmse:1102.85320+14.80240	test-rmse:1260.87459+71.53488
[550]	train-rmse:1060.37992+14.11421	test-rmse:1232.03860+69.64636
[600]	train-rmse:1021.34930+14.21553	test-rmse:1206.04279+67.95872
[650]	train-rmse:983.44965+13.58693	test-rmse:1181.20611+65.77347
[700]	train-rmse:949.75285+13.54969	test-rmse:1159.43652+64.39

In [None]:
# Here we will use the best parameters to train the model
d_matrix_train = xgb.DMatrix(X_train, label=y_train)
d_matrix_val = xgb.DMatrix(X_val, label=y_val)
d_matrix_test = xgb.DMatrix(X_test, label=y_test)

# study.best_params only holds the values sampled by Optuna: it includes
# 'n_estimators' (not a native booster parameter; it is the upper bound on
# boosting rounds) and lacks the fixed objective / metric used in the search.
final_params = dict(study.best_params)
final_num_boost_round = final_params.pop("n_estimators")
final_params.update({
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
})

best_model = xgb.train(
    final_params,
    d_matrix_train,
    num_boost_round=final_num_boost_round,
    evals=[(d_matrix_train, 'train'), (d_matrix_val, 'val')],
    # Same patience as the cross-validation that scored the trial; the last
    # entry of `evals` ('val') is the one monitored for early stopping.
    early_stopping_rounds=20,
    verbose_eval=50
)
# When early stopping triggers, best_model.best_iteration holds the best round
