<a href="https://colab.research.google.com/github/DHarley22/Prediction_case_malaria_mozambique/blob/1-xgboost-predictor-python-version/model_building_mozambique.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install gdown optuna


In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
import os
import numpy as np
import xgboost as xgb
import sklearn
import gdown
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split as split_into_two
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from xgboost import XGBRegressor

In [3]:
%%capture

# The mozambique dataset is hosted here
# https://drive.google.com/file/d/1uHGOjxlxfDYY5E-aL9HVRGvheDa_wNyq/view?usp=sharing
dataset_drive_id = '1uHGOjxlxfDYY5E-aL9HVRGvheDa_wNyq'
dataset_output_file = 'downloaded_sheet.xlsx'

# The shape file relatd to the dataset is hosted here
# https://drive.google.com/drive/folders/14UJ7ZXWmNeL28sYAv6dObNsC42kQr4Ja?usp=sharing
shape_file_drive_id = '14UJ7ZXWmNeL28sYAv6dObNsC42kQr4Ja'
shape_output_folder = 'shape_files'

gdown.download_folder(id=shape_file_drive_id, output=shape_output_folder, quiet=False)
gdown.download(id=dataset_drive_id, output=dataset_output_file, quiet=False)


In [4]:
# Read from the same relative locations the files were downloaded to, instead
# of hard-coding Colab's /content directory, so the notebook also runs when the
# working directory is somewhere else.
# NOTE(review): the file is named *.xlsx but is parsed with read_csv — confirm
# the hosted file really is CSV (otherwise pd.read_excel is needed).
dataset = pd.read_csv(dataset_output_file)
shape_file = gpd.read_file(os.path.join(shape_output_folder, "shape_file.shp"))

In [5]:
# Attach each district's polygon to the tabular rows (left join on "district",
# so every dataset row is kept even if no shape matches).
district_shapes = shape_file[["district", "geometry"]]
merged_rows = dataset.merge(district_shapes, how="left", on="district")

# Promote the merged frame to a GeoDataFrame and reproject it to EPSG:3857 so
# that area and centroid are computed in a projected CRS.
# NOTE(review): EPSG:3857 (Web Mercator) is not an equal-area projection, so
# the "area" values are distorted — confirm this is acceptable.
shaped_dataset = gpd.GeoDataFrame(merged_rows, geometry="geometry").to_crs(epsg=3857)

# Derived spatial columns: polygon area and centroid coordinates.
district_centroids = shaped_dataset.geometry.centroid
shaped_dataset["area"] = shaped_dataset.geometry.area
shaped_dataset["centroid_x"] = district_centroids.x
shaped_dataset["centroid_y"] = district_centroids.y

### Missing data handling

Rules:

- If {some condition} then {we do this}
- If {some condition} then {we do this}

In [6]:
# Handle missing data
# TODO:

### Train/Val/Test Split

Rules:

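- We will use the rows from March to June 2018 as the test set; the remaining rows are split 70/30 at random into training and validation sets.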


In [7]:
# Column the models are trained to predict
TARGET_DISEASE_FIELD = "malaria_cases_u5"

# Model inputs, grouped by theme
relevant_features = [
    # Environmental features
    'tmin', 'tmax', 'precipitation', 'ndvi', 'RH',
    # Socio-economic features
    'prop_poor', 'prop_Rural', 'prop_drinking_TreatedWater',
    # Spatial features
    'centroid_x', 'centroid_y',
]

# Rows held out for testing: March through June (inclusive) of 2018
is_2018 = shaped_dataset['year'] == 2018
is_march_to_june = shaped_dataset['month'].between(3, 6)
test_data_predicate = is_2018 & is_march_to_june

# Held-out test set
test_rows = shaped_dataset[test_data_predicate]
X_test = test_rows[relevant_features]
y_test = test_rows[TARGET_DISEASE_FIELD]

# Everything else, as features X and target y
remaining_rows = shaped_dataset[~test_data_predicate]
X = remaining_rows[relevant_features]
y = remaining_rows[TARGET_DISEASE_FIELD]

print(X.shape, y.shape)

# Random 70/30 split of the remaining rows into training and validation sets
X_train, X_val, y_train, y_val = split_into_two(
    X,
    y,
    test_size=0.3,
    random_state=42,
)




(5088, 10) (5088,)


In [9]:
def objective(trial: optuna.Trial) -> float:
    """Toy Optuna objective: squared distance of a sampled point from the origin.

    Asks ``trial`` for two floats, "x" and then "y", each in [-5, 5], and
    returns ``x**2 + y**2``.
    """
    squares = [trial.suggest_float(axis, -5, 5) ** 2 for axis in ("x", "y")]
    return squares[0] + squares[1]


# Alternative sampler from OptunaHub, kept here for reference:
# module = optunahub.load_module(package="samplers/auto_sampler")
# study = optuna.create_study(sampler=module.AutoSampler())

# In-memory study with Optuna's defaults, run for 10 trials
study = optuna.create_study()
study.optimize(objective, n_trials=10)

# Report the value and parameters of the best trial
best_trial = study.best_trial
print(best_trial.value, best_trial.params)

[I 2025-01-28 14:24:22,943] A new study created in memory with name: no-name-55269359-6bf0-48a3-92f7-7bb08cd63e67
[I 2025-01-28 14:24:22,948] Trial 0 finished with value: 22.49027132418556 and parameters: {'x': 0.7306398637570002, 'y': 4.6857695967337865}. Best is trial 0 with value: 22.49027132418556.
[I 2025-01-28 14:24:22,952] Trial 1 finished with value: 14.056700057359325 and parameters: {'x': 3.3899831368128694, 'y': 1.6014725690700118}. Best is trial 1 with value: 14.056700057359325.
[I 2025-01-28 14:24:22,956] Trial 2 finished with value: 18.992867507613074 and parameters: {'x': -3.3314086394111717, 'y': -2.8097302334689855}. Best is trial 1 with value: 14.056700057359325.
[I 2025-01-28 14:24:22,958] Trial 3 finished with value: 1.9205653153291942 and parameters: {'x': -1.1102004951576072, 'y': 0.8294698161361858}. Best is trial 3 with value: 1.9205653153291942.
[I 2025-01-28 14:24:22,961] Trial 4 finished with value: 8.344014770453134 and parameters: {'x': 2.5562116575678413, 

1.9205653153291942 {'x': -1.1102004951576072, 'y': 0.8294698161361858}


In [12]:
# Fixed settings shared by the regression runs
base_params = dict(
    verbosity=0,
    booster='gbtree',
    objective='reg:squarederror',  # squared-error regression
    eval_metric='rmse',            # root mean squared error
)

# Hand-supplied hyperparameter values
params = dict(
    learning_rate=0.09963558437961703,
    max_depth=4,
    gamma=6.140489891496016e-05,
    subsample=0.7329970919203579,
    reg_alpha=1.5492015574694834e-05,
    reg_lambda=0.002889192981174229,
    colsample_bytree=0.7341652208091972,
    min_child_weight=0,
    n_estimators=443,
)

# Layer the fixed settings on top of the hyperparameters (fixed settings win
# on any key clash and are appended after the hyperparameter keys).
params = {**params, **base_params}
print("Final Parameters:", params)

# 5-fold cross-validation on the training split; boosting stops once the
# validation RMSE has not improved for 10 consecutive rounds.
train_matrix = xgb.DMatrix(X_train, label=y_train)
five_folds = KFold(n_splits=5)
cv_results = xgb.cv(
    params=params,
    dtrain=train_matrix,
    num_boost_round=50000,
    folds=five_folds,
    metrics='rmse',
    early_stopping_rounds=10,
    as_pandas=True,
    verbose_eval=50,
    seed=0,
)




Final Parameters: {'learning_rate': 0.09963558437961703, 'max_depth': 4, 'gamma': 6.140489891496016e-05, 'subsample': 0.7329970919203579, 'reg_alpha': 1.5492015574694834e-05, 'reg_lambda': 0.002889192981174229, 'colsample_bytree': 0.7341652208091972, 'min_child_weight': 0, 'n_estimators': 443, 'verbosity': 0, 'booster': 'gbtree', 'objective': 'reg:squarederror', 'eval_metric': 'rmse'}
[0]	train-rmse:1735.63164+29.99006	test-rmse:1734.40992+137.68426
[50]	train-rmse:866.11104+11.48655	test-rmse:1008.84925+56.91576
[100]	train-rmse:707.79288+13.81674	test-rmse:921.10594+50.70601
[150]	train-rmse:615.71407+12.47847	test-rmse:889.51619+49.38066
[200]	train-rmse:548.94559+10.11135	test-rmse:872.32877+49.65981
[250]	train-rmse:496.32228+7.22785	test-rmse:866.89885+47.99747
[300]	train-rmse:452.16041+6.59803	test-rmse:863.63021+47.93792
[310]	train-rmse:444.39670+6.46992	test-rmse:864.11956+47.63337


In [None]:

# Define the objective function for Optuna
def objective(trial):
    """Optuna objective: cross-validated RMSE of one XGBoost configuration.

    Samples a hyperparameter configuration from ``trial`` and evaluates it with
    5-fold cross-validation on ``X_train`` / ``y_train`` (read from the
    notebook's global scope).

    Args:
        trial: The ``optuna.Trial`` used to sample the hyperparameters.

    Returns:
        The minimum mean validation RMSE (``test-rmse-mean``) over the boosting
        rounds, or ``float('inf')`` if the evaluation raised an exception.
    """
    # Hyperparameter search space (booster parameters only)
    params = {
        'verbosity': 0,
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        'objective': 'reg:squarederror',  # Regression objective
        'eval_metric': 'rmse',            # Root Mean Squared Error
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 0, 10),
    }

    # The native xgboost API takes the number of trees via `num_boost_round`;
    # `n_estimators` is not a booster parameter, so it is kept out of `params`
    # and only recorded on the trial under its usual name.
    num_boost_round = trial.suggest_int('n_estimators', 100, 1000)

    # The cross-validation call is the step that can actually fail for a bad
    # configuration, so it has to live inside the try block for the fallback
    # below to have any effect.
    try:
        cv_results = xgb.cv(
            params=params,
            dtrain=xgb.DMatrix(X_train, label=y_train),
            folds=KFold(n_splits=5),
            metrics='rmse',
            num_boost_round=num_boost_round,
            as_pandas=True,
            verbose_eval=50,
            early_stopping_rounds=10,
            seed=0,  # explicit, matching the baseline cross-validation run
        )
        return cv_results['test-rmse-mean'].min()  # Return the minimum RMSE
    except Exception as e:
        # Report the failure so it stays visible in the cell output
        print(f"Trial failed with exception: {e}")
        # A very large value marks the trial as the worst possible for a
        # minimization study
        return float('inf')
# Run the Optuna optimization (minimization problem).
# The sampler is seeded so that re-running the notebook proposes the same
# sequence of trials.
study = optuna.create_study(
    study_name="xgboost_model_study",
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=2)

# Best hyperparameters
# print("Best hyperparameters:", study.best_params)


[I 2025-01-28 14:40:56,693] A new study created in memory with name: xgboost_model_study


[0]	train-rmse:1823.06031+36.50767	test-rmse:1817.69878+149.91198
[50]	train-rmse:1818.67185+36.66557	test-rmse:1813.47115+150.02874
[100]	train-rmse:1814.94732+36.79940	test-rmse:1809.85389+150.36170
[150]	train-rmse:1811.65067+36.90678	test-rmse:1806.64505+150.73042
[200]	train-rmse:1808.70661+36.99169	test-rmse:1803.77944+151.08450
[250]	train-rmse:1806.06078+37.05793	test-rmse:1801.20582+151.41208
[300]	train-rmse:1803.66821+37.10868	test-rmse:1798.88046+151.71183
[350]	train-rmse:1801.49108+37.14671	test-rmse:1796.76606+151.98502
[400]	train-rmse:1799.49765+37.17424	test-rmse:1794.83123+152.23350
[450]	train-rmse:1797.66124+37.19320	test-rmse:1793.04967+152.45925
[500]	train-rmse:1795.95943+37.20513	test-rmse:1791.39939+152.66424
[542]	train-rmse:1794.62005+37.21068	test-rmse:1790.10078+152.82158


[I 2025-01-28 14:40:58,865] Trial 0 finished with value: 1790.100781801315 and parameters: {'booster': 'gblinear', 'learning_rate': 0.002753680942762691, 'max_depth': 10, 'gamma': 0.00220622818381016, 'subsample': 0.4708139065125287, 'reg_alpha': 1.631672431296607e-05, 'reg_lambda': 3.1509989017498665e-06, 'colsample_bytree': 0.22830260137573088, 'min_child_weight': 1, 'n_estimators': 543}. Best is trial 0 with value: 1790.100781801315.


[0]	train-rmse:1809.66580+36.40265	test-rmse:1804.77652+147.48467
[50]	train-rmse:1423.00643+13.25575	test-rmse:1462.73277+96.97814
[100]	train-rmse:1190.90471+8.65370	test-rmse:1260.43602+74.79056
[150]	train-rmse:1049.50473+7.66450	test-rmse:1151.88599+65.59548
[200]	train-rmse:941.91733+7.73058	test-rmse:1071.37299+59.49048
[250]	train-rmse:865.27804+8.71239	test-rmse:1019.76787+55.69555
[300]	train-rmse:806.32393+9.58161	test-rmse:981.65656+54.43342
[350]	train-rmse:761.20495+9.99372	test-rmse:955.35853+53.15081
[400]	train-rmse:722.10982+11.34937	test-rmse:933.45562+50.89833
[450]	train-rmse:691.73653+12.01483	test-rmse:920.60522+50.99076
[500]	train-rmse:665.71317+11.66290	test-rmse:907.95061+50.80302


In [None]:
# Inputs for the final fit. These names were previously used without being
# defined anywhere in the notebook, so the cell failed on a fresh kernel.
dtrain_malaria = xgb.DMatrix(X_train, label=y_train)
dval_malaria = xgb.DMatrix(X_val, label=y_val)

# xgb.cv with early stopping returns the evaluation history truncated at the
# best iteration, so its length is the number of boosting rounds to train for.
# NOTE(review): this uses the `cv_results` / `params` of the baseline
# cross-validation cell — confirm that is the intended configuration.
best_boost_rounds = len(cv_results)

# Train the final booster, reporting train/validation metrics every 50 rounds
bst_malaria = xgb.train(
    params,
    dtrain_malaria,
    num_boost_round=best_boost_rounds,
    evals=[(dtrain_malaria, 'train'), (dval_malaria, 'val')],
    verbose_eval=50
)
