## Model long COVID Intensity using Gradient Boosting (GB)

In [2]:
import pathlib
from pprint import pprint
import numpy as np
from package.data.utils import *

from sklearn.ensemble import GradientBoostingRegressor

import plotly
import plotly.express as px
import matplotlib.pyplot as plt

from package.data.lifeline_dataset import LifeLineDataSet
from package.data.data_manager import DataManager

## Load the dataset

In [3]:
# Project root, taken as three levels above the notebook's working directory.
data_path = pathlib.Path().absolute().parent.parent.parent

# Directory that holds the merged CSV extracts.
merged_dir = data_path / "data" / "extract" / "merged"

# Wrap the merged extract in the project's dataset class, with long-COVID
# intensity as the regression target.
dataset = LifeLineDataSet(
    data_path=merged_dir,
    dataset_name="merged_vaccin_only_1_full.csv",
    target_variable="long_covid_intensity",
)

# Encode the data; kept as the cell's last expression so any returned value is displayed.
dataset.get_encoded_data()

In [4]:
# Split proportions, named so they can be tuned in one place.
TRAIN_SIZE = 0.7
# NOTE(review): the exact meaning of this proportion is defined by
# DataManager.split_data_train_val_test, which is not visible here — confirm.
VAL_TEST_PROP = 0.8

# Build the manager and split the encoded dataset into train / validation / test parts.
data_manager = DataManager()
data_manager.split_data_train_val_test(
    dataset,
    train_size=TRAIN_SIZE,
    val_test_prop=VAL_TEST_PROP,
)

In [5]:
# Report the feature-matrix shape of each split, in train / val / test order.
for split_attr in ("train_dataset", "val_dataset", "test_dataset"):
    print(getattr(data_manager, split_attr).features.shape)

## Gradient Boosting

In [6]:
from package.model.stat_models import StatisticalModels
from package.model.statistical_models.gradient_boosting_regressor import StatGradientBoostingRegressor
from package.data.scaler import StandardScaler
from package.data.utils import round_off_rating
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

In [7]:
# Gradient Boosting configuration file, located relative to the parent of the
# notebook's working directory.
CONFIG_PATH = pathlib.Path().resolve().parent.joinpath(
    "configurations", "models", "gradient_boosting.ini"
)

# Wrap the project's GB regressor in the generic StatisticalModels interface,
# using the "DEFAULT" section of the configuration file.
gb_model = StatisticalModels(
    StatGradientBoostingRegressor,
    config_path=CONFIG_PATH,
    config_name="DEFAULT",
    name="gradient_boosting",
    # scaler=StandardScaler  (feature scaling is left disabled)
)

In [8]:
# Display the parameters held by the model wrapper
# (presumably those loaded from the "DEFAULT" section of CONFIG_PATH — TODO confirm).
gb_model.params

Train the model

In [9]:
# Fit the Gradient Boosting wrapper on the training split produced by data_manager.
gb_model.train(data_manager.train_dataset)

Predict on the test dataset

In [16]:
# Predict on the held-out test split with the GB model.
predictions = gb_model.predict(data_manager.test_dataset)

# Pass every prediction, and every flattened test target, through round_off_rating.
y_predict_rounded = np.array(list(map(round_off_rating, predictions)))
test_labels_rounded = np.array(
    list(map(round_off_rating, data_manager.test_dataset.targets.ravel()))
)

Compute the evaluation metrics by comparing the GB model's predictions with the ground-truth observations

In [11]:
from package.model.utils import compute_metrics

In [17]:
# Score the rounded predictions against the rounded test targets.
# NOTE(review): the meaning of index=-1 is defined in
# package.model.utils.compute_metrics, which is not visible here — confirm.
compute_metrics(test_labels_rounded, y_predict_rounded, index=-1)

## Model interpretation 
Visualize the feature importances obtained using GB

In [18]:
from package.model.random_forest import plot_feature_importances

In [19]:
# Plot the feature importances exposed by the wrapped estimator against the
# dataset's feature names.
# NOTE(review): this reaches through the wrapper's private `_model` attribute, and
# the plotting helper is imported from the random_forest module; presumably it is
# model-agnostic — confirm.
plot_feature_importances(gb_model._model.model.feature_importances_, dataset.feature_list)

## Cross-validation

In [21]:
# Gradient Boosting configuration file, located relative to the parent of the
# notebook's working directory.
CONFIG_PATH = pathlib.Path().resolve().parent.joinpath(
    "configurations", "models", "gradient_boosting.ini"
)

# Build a fresh, untrained Gradient Boosting wrapper.
# NOTE(review): despite the `rf_` prefix this is a Gradient Boosting model; the
# name looks like a leftover from a random-forest notebook.
rf_model = StatisticalModels(
    StatGradientBoostingRegressor,
    config_path=CONFIG_PATH,
    config_name="DEFAULT",
    name="gradient_boosting",
    # scaler=StandardScaler  (feature scaling is left disabled)
)

In [22]:
# Run 5-fold cross-validation on the full dataset.
# Use the freshly constructed, untrained instance built in the previous cell
# (`rf_model`, which is a Gradient Boosting model despite its name) rather than
# `gb_model`, which has already been fitted on the training split; otherwise the
# fresh instance is never used and cross-validation starts from a fitted model.
# NOTE(review): this rebinds `predictions`, which earlier held the test-set predictions.
all_metrics, models, predictions, true_labels = rf_model.cross_validate(dataset, n_splits=5)

In [23]:
from package.model.utils import compute_metrics

# Per-fold accumulators.
# NOTE(review): despite its name, `maes` stores metrics["mape"]; confirm whether
# MAE or MAPE was intended. The list is collected but not reported below.
maes = []
pearson_tests = []
test_value = []
p_values = []

# Derive the fold count from the cross-validation output instead of hard-coding 5,
# so this cell stays correct if n_splits is changed.
n_folds = len(predictions)
for i in range(n_folds):
    metrics = compute_metrics(true_labels[i], predictions[i], index=-1)
    maes.append(metrics["mape"])

    # Keep the full Pearson result and also unpack its statistic and p-value.
    pearson_result = metrics["pearson"]
    pearson_tests.append(pearson_result)
    test_value.append(pearson_result.statistic)
    p_values.append(pearson_result.pvalue)

# Mean Pearson statistic and mean p-value across folds.
print(np.mean(test_value))
print(np.mean(p_values))