<a href="https://colab.research.google.com/github/DHarley22/Prediction_case_malaria_mozambique/blob/1-xgboost-predictor-python-version/catboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install gdown optuna lightgbm catboost


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
import os
import numpy as np
import xgboost as xgb
import sklearn
import gdown
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split as split_into_two
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [None]:
%%capture

# The mozambique dataset is hosted here
# https://drive.google.com/file/d/1uHGOjxlxfDYY5E-aL9HVRGvheDa_wNyq/view?usp=sharing
dataset_drive_id = '1uHGOjxlxfDYY5E-aL9HVRGvheDa_wNyq'
dataset_output_file = 'downloaded_sheet.xlsx'

# The shape file relatd to the dataset is hosted here
# https://drive.google.com/drive/folders/14UJ7ZXWmNeL28sYAv6dObNsC42kQr4Ja?usp=sharing
shape_file_drive_id = '14UJ7ZXWmNeL28sYAv6dObNsC42kQr4Ja'
shape_output_folder = 'shape_files'

gdown.download_folder(id=shape_file_drive_id, output=shape_output_folder, quiet=False)
gdown.download(id=dataset_drive_id, output=dataset_output_file, quiet=False)


In [None]:
# Read the inputs from the same relative locations the download step wrote them
# to, instead of assuming the Colab-specific "/content" working directory.
# NOTE(review): the dataset file carries an .xlsx name but is parsed as CSV
# here — confirm the real format of the hosted file.
dataset = pd.read_csv(dataset_output_file)
shape_file = gpd.read_file(os.path.join(shape_output_folder, "shape_file.shp"))

In [None]:
# Attach each district's geometry to the dataset rows (left join on "district"),
# wrap the result as a GeoDataFrame and reproject it to EPSG:3857.
district_shapes = shape_file[["district", "geometry"]]
merged_rows = dataset.merge(district_shapes, on="district", how="left")
shaped_dataset = gpd.GeoDataFrame(merged_rows, geometry="geometry").to_crs(epsg=3857)

# Derive the spatial features (area and centroid coordinates) from the
# reprojected geometries; the centroid series is computed once and reused.
district_centroids = shaped_dataset.geometry.centroid
shaped_dataset["area"] = shaped_dataset.geometry.area
shaped_dataset["centroid_x"] = district_centroids.x
shaped_dataset["centroid_y"] = district_centroids.y

In [None]:
TARGET_DISEASE_FIELD = "malaria_cases_u5"

# Model inputs, grouped by kind and concatenated in this order.
environmental_features = ['tmin', 'tmax', 'precipitation', 'ndvi', 'RH']
socio_economic_features = ['prop_poor', 'prop_Rural', 'prop_drinking_TreatedWater']
spatial_features = ['centroid_x', 'centroid_y', 'area']
relevant_features = environmental_features + socio_economic_features + spatial_features

# Rows from March through June 2018 are held out for testing.
is_2018 = shaped_dataset['year'] == 2018
is_march_to_june = shaped_dataset['month'].between(3, 6)
test_data_predicate = is_2018 & is_march_to_june

# Filter the rows once per side, then pick features and target from each side.
held_out_rows = shaped_dataset[test_data_predicate]
remaining_rows = shaped_dataset[~test_data_predicate]

X_test = held_out_rows[relevant_features]
y_test = held_out_rows[TARGET_DISEASE_FIELD]

X = remaining_rows[relevant_features]
y = remaining_rows[TARGET_DISEASE_FIELD]

print(X.shape, y.shape)

# Random 70/30 split of the non-test rows into training and validation parts.
X_train, X_val, y_train, y_val = split_into_two(X, y, test_size=0.3, random_state=42)




(5088, 10) (5088,)


In [None]:
def objective(trial):
    """Score one Optuna trial for a CatBoost regressor.

    Samples `depth`, `learning_rate`, `l2_leaf_reg` and `iterations` from the
    trial, then evaluates the resulting model with 5-fold cross-validation on
    `X_train` / `y_train`.

    Returns the mean cross-validated RMSE (lower is better).
    """
    model = CatBoostRegressor(
        depth=trial.suggest_int('depth', 2, 10),
        learning_rate=trial.suggest_float('learning_rate', 1e-5, 0.1, log=True),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1e-5, 10, log=True),
        iterations=trial.suggest_int('iterations', 100, 5000),
        verbose=0,
    )

    neg_rmse_per_fold = cross_val_score(
        model,
        X_train,
        y_train,
        cv=5,
        scoring='neg_root_mean_squared_error',
    )

    # The scorer reports negated RMSE; flip the sign to get a value to minimise.
    return -neg_rmse_per_fold.mean()

# Seed the sampler so the hyperparameter search, and therefore the best trial
# it reports, can be reproduced on a re-run. TPESampler is the sampler Optuna
# already uses by default for single-objective studies; only the seed is new.
study = optuna.create_study(
    study_name="catboost_model_study",
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

print("Best hyperparameters:", study.best_params)

[I 2025-01-29 13:21:12,974] A new study created in memory with name: catboost_model_study
[I 2025-01-29 13:21:28,818] Trial 0 finished with value: 874.6608294538921 and parameters: {'depth': 3, 'learning_rate': 0.04067714466871558, 'l2_leaf_reg': 0.004469207689830918, 'iterations': 2523}. Best is trial 0 with value: 874.6608294538921.
[I 2025-01-29 13:21:57,652] Trial 1 finished with value: 855.3845742915167 and parameters: {'depth': 5, 'learning_rate': 0.014758970576737505, 'l2_leaf_reg': 0.41284989175180653, 'iterations': 2954}. Best is trial 1 with value: 855.3845742915167.
[I 2025-01-29 13:22:08,917] Trial 2 finished with value: 1756.2106002229825 and parameters: {'depth': 6, 'learning_rate': 0.00014228848412001485, 'l2_leaf_reg': 0.0033531647362556915, 'iterations': 642}. Best is trial 1 with value: 855.3845742915167.
[I 2025-01-29 13:25:54,771] Trial 3 finished with value: 1542.3197367351167 and parameters: {'depth': 10, 'learning_rate': 0.00022607671547098988, 'l2_leaf_reg': 0.0

In [24]:
# Hyperparameters hard-coded from the search (noted by the author as coming
# from trial 22).
best_params = {
    'depth': 8,
    'learning_rate': 0.011833211189298013,
    'l2_leaf_reg': 1.1690439227702154,
    'iterations': 4941,
}

# Fit the final regressor with those hyperparameters on X / y.
final_model = CatBoostRegressor(verbose=0, **best_params)
final_model.fit(X, y)

# Predict the test set and show the first five predictions.
y_pred = final_model.predict(X_test)
print(f"Sample Predictions: {y_pred[:5]}")

Sample Predictions: [5326.86530594 4956.17383897 4172.9055363  3656.72163205 3078.42213316]


In [27]:
# Read the `Malaria_prediction` column of the ground-truth CSV from the working
# directory; the RMSE computation compares it against the model predictions.
# NOTE(review): no visible cell downloads this file — confirm where it comes from.
truth = pd.read_csv("./groud_truth.csv")["Malaria_prediction"]

In [28]:
# NOTE(review): this repeats the `final_model.predict(X_test)` call whose result
# is already stored in `y_pred`; the RMSE cell reads `y_pred`, not `y_predicted`.
y_predicted = final_model.predict(X_test)

In [31]:
# `mean_squared_error` is already imported in the notebook's top import cell,
# so it is not re-imported here.
# Root-mean-squared error between the ground-truth values and the predictions.
rmse = np.sqrt(mean_squared_error(truth, y_pred))
print(f"Final Test RMSE: {rmse:.4f}")

Final Test RMSE: 700.5730
