<a href="https://colab.research.google.com/github/DHarley22/Prediction_case_malaria_mozambique/blob/1-xgboost-predictor-python-version/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install gdown optuna lightgbm catboost


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
import os
import numpy as np
import xgboost as xgb
import sklearn
import gdown
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split as split_into_two
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [None]:
%%capture

# The mozambique dataset is hosted here
# https://drive.google.com/file/d/1uHGOjxlxfDYY5E-aL9HVRGvheDa_wNyq/view?usp=sharing
dataset_drive_id = '1uHGOjxlxfDYY5E-aL9HVRGvheDa_wNyq'
dataset_output_file = 'downloaded_sheet.xlsx'

# The shape file relatd to the dataset is hosted here
# https://drive.google.com/drive/folders/14UJ7ZXWmNeL28sYAv6dObNsC42kQr4Ja?usp=sharing
shape_file_drive_id = '14UJ7ZXWmNeL28sYAv6dObNsC42kQr4Ja'
shape_output_folder = 'shape_files'

gdown.download_folder(id=shape_file_drive_id, output=shape_output_folder, quiet=False)
gdown.download(id=dataset_drive_id, output=dataset_output_file, quiet=False)


In [None]:
# Read from the same relative locations the download cell wrote to, so this
# cell does not depend on the working directory being Colab's /content.
# NOTE(review): the dataset file name ends in .xlsx but it is parsed as CSV —
# confirm the actual format of the hosted file.
dataset = pd.read_csv(dataset_output_file)
shape_file = gpd.read_file(os.path.join(shape_output_folder, "shape_file.shp"))

In [None]:
# Attach each district's geometry to the tabular rows. This is a left join on
# "district", so every dataset row is kept.
district_geometries = shape_file[["district", "geometry"]]
dataset_with_geometry = pd.merge(
    dataset, district_geometries, how="left", on="district"
)

# Promote the merged table to a GeoDataFrame and reproject it to EPSG:3857,
# the CRS in which the geometric quantities below are computed.
projected_dataset = gpd.GeoDataFrame(
    dataset_with_geometry, geometry="geometry"
).to_crs(epsg=3857)

# Derive per-row area and centroid coordinates from the projected geometry.
projected_centroids = projected_dataset.geometry.centroid
shaped_dataset = projected_dataset.assign(
    area=projected_dataset.geometry.area,
    centroid_x=projected_centroids.x,
    centroid_y=projected_centroids.y,
)

In [None]:
# Column the models are trained to predict
TARGET_DISEASE_FIELD = "malaria_cases_u5"

# Model inputs, grouped by theme
environmental_features = ['tmin', 'tmax', 'precipitation', 'ndvi', 'RH']
socio_economic_features = ['prop_poor', 'prop_Rural', 'prop_drinking_TreatedWater']
spatial_features = ['centroid_x', 'centroid_y']
relevant_features = environmental_features + socio_economic_features + spatial_features

# Hold-out mask: rows from March to June of 2018 are reserved for testing
in_test_year = shaped_dataset['year'] == 2018
in_test_months = shaped_dataset['month'].between(3, 6)
test_data_predicate = in_test_year & in_test_months

# Hold-out rows -> test features / target
holdout_rows = shaped_dataset[test_data_predicate]
X_test = holdout_rows[relevant_features]
y_test = holdout_rows[TARGET_DISEASE_FIELD]

# Everything else -> features / target used for training and validation
modelling_rows = shaped_dataset[~test_data_predicate]
X = modelling_rows[relevant_features]
y = modelling_rows[TARGET_DISEASE_FIELD]

print(X.shape, y.shape)

# 70 % train / 30 % validation, with a fixed seed
X_train, X_val, y_train, y_val = split_into_two(
    X, y, test_size=0.3, random_state=42
)




(5088, 10) (5088,)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# One seed for both the forest and the Optuna sampler, so that re-running the
# notebook reproduces the same sequence of trials.
SEARCH_SEED = 42


def objective(trial):
    """Score one random-forest configuration proposed by Optuna.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean 5-fold cross-validated RMSE on (X_train, y_train); lower is better.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
    }

    model = RandomForestRegressor(**params, random_state=SEARCH_SEED, n_jobs=-1)
    neg_rmse = cross_val_score(
        model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error'
    ).mean()

    # The scorer returns the negated RMSE; flip the sign so the study minimizes RMSE.
    return -neg_rmse


# Seed the sampler explicitly: without it each run explores different
# hyperparameters and may report a different "best" configuration.
study = optuna.create_study(
    study_name="random_forest_study",
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=SEARCH_SEED),
)
study.optimize(objective, n_trials=100)

print("Best hyperparameters:", study.best_params)


[I 2025-01-29 10:21:38,181] A new study created in memory with name: random_forest_study
[I 2025-01-29 10:21:53,853] Trial 0 finished with value: 1291.4632417375792 and parameters: {'n_estimators': 758, 'max_depth': 5, 'min_samples_split': 19, 'min_samples_leaf': 10, 'max_features': 'sqrt'}. Best is trial 0 with value: 1291.4632417375792.
[I 2025-01-29 10:22:15,905] Trial 1 finished with value: 958.7682858501336 and parameters: {'n_estimators': 633, 'max_depth': 16, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 'sqrt'}. Best is trial 1 with value: 958.7682858501336.
[I 2025-01-29 10:22:28,324] Trial 2 finished with value: 1077.4630311599951 and parameters: {'n_estimators': 448, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 'sqrt'}. Best is trial 1 with value: 958.7682858501336.
[I 2025-01-29 10:22:45,408] Trial 3 finished with value: 949.6613138843462 and parameters: {'n_estimators': 545, 'max_depth': 16, 'min_samples_split': 13, 'm

Best hyperparameters: {'n_estimators': 890, 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None}
