## Modeling long COVID intensity with a Random Forest (RF)

In [2]:
import pathlib
import numpy as np
from package.data.utils import *

from package.data.lifeline_dataset import LifeLineDataSet
from package.data.data_manager import DataManager

## Load the dataset

In [3]:
# The data root sits three directory levels above the current working directory.
data_path = pathlib.Path().absolute().parent.parent.parent
merged_dir = data_path / "data" / "extract" / "merged"

# Wrap the merged CSV extract; `long_covid_intensity` is the regression target.
dataset = LifeLineDataSet(
    data_path=merged_dir,
    dataset_name="merged_vaccin_only_1_full.csv",
    target_variable="long_covid_intensity",
)

# Kept as the cell's last expression so the notebook displays its return value.
# NOTE(review): presumably this also prepares the encoded features used below — confirm.
dataset.get_encoded_data()

In [4]:
# Split the dataset; the resulting subsets are read back below as
# `data_manager.train_dataset`, `.val_dataset` and `.test_dataset`.
# NOTE(review): `train_size=0.7` presumably reserves 70% of the rows for
# training and `val_test_prop=0.8` presumably sets how the remainder is
# divided between validation and test — confirm against DataManager.
data_manager = DataManager()
data_manager.split_data_train_val_test(dataset, train_size=0.7, val_test_prop=0.8)

In [None]:
# Print the feature-matrix shape of each split, in train / val / test order.
for split_name in ("train_dataset", "val_dataset", "test_dataset"):
    print(getattr(data_manager, split_name).features.shape)

## Random Forest

In [7]:
from package.model.stat_models import StatisticalModels
from package.model.statistical_models.random_forest_regressor import StatRandomForestRegressor
from package.data.scaler import StandardScaler
from package.data.utils import round_off_rating
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

In [8]:
# Random-forest configuration file, resolved relative to the parent of the
# current working directory.
CONFIG_PATH = pathlib.Path().resolve().parent.joinpath(
    "configurations", "models", "random_forest.ini"
)

# Keyword options for the wrapper: read the "DEFAULT" section of the config
# and hand over the StandardScaler class as the scaler.
rf_options = {
    "config_path": CONFIG_PATH,
    "config_name": "DEFAULT",
    "name": "random_forest",
    "scaler": StandardScaler,
}
rf_model = StatisticalModels(StatRandomForestRegressor, **rf_options)

In [None]:
# Display the model's parameters (shown as the cell output).
rf_model.params

Train the model

In [11]:
# Fit the random forest on the training split only.
rf_model.train(data_manager.train_dataset)

Predict on the test dataset

In [12]:
# Raw model output for the held-out test split.
predictions = rf_model.predict(data_manager.test_dataset)

# Apply `round_off_rating` element-wise to predictions and flattened targets so
# both can be compared after rounding.
# NOTE(review): presumably this snaps values onto the rating scale — confirm
# in package.data.utils.
y_predict_rounded = np.array(list(map(round_off_rating, predictions)))
test_labels_rounded = np.array(
    list(map(round_off_rating, data_manager.test_dataset.targets.ravel()))
)

Evaluate performance on the test set using error metrics (MAE, MSE, MAPE), on both the raw and the rounded values

In [None]:
# Report each error metric twice: first on the raw targets/predictions, then on
# their rounded counterparts.
# NOTE(review): MAPE divides by the true value, so targets at or near zero can
# make it arbitrarily large — check the target range before relying on it.
test_labels = data_manager.test_dataset.targets.ravel()
for title, metric in (
    ("MAE", mean_absolute_error),
    ("MSE", mean_squared_error),
    ("MAPE", mean_absolute_percentage_error),
):
    print(title)
    print(metric(test_labels, predictions))
    print(metric(test_labels_rounded, y_predict_rounded))

## Model interpretation

Feature importance

In [62]:
from package.model.random_forest import plot_feature_importances

In [None]:
# Plot the fitted estimator's `feature_importances_` against the dataset's
# feature names. This reaches through the wrapper's private `_model` attribute.
plot_feature_importances(rf_model._model.model.feature_importances_, dataset.feature_list)

Visualize the decision trees

In [None]:
# Train a deliberately small forest so that individual trees stay readable
# when plotted in the following cells.
# NOTE(review): scikit-learn's RandomForestRegressor spells this parameter
# `n_estimators`; confirm that the wrapper really accepts `n_estimator`,
# otherwise this override may be ignored or rejected (the printed params and
# the estimator count printed in the next cell show what was actually applied).
rf_model_small = StatisticalModels(StatRandomForestRegressor,
                             config_path=CONFIG_PATH,
                             config_name="DEFAULT",
                             name="random_forest",
                             n_estimator=10,
                             max_depth=3
                             # No scaler is passed for this model (left disabled):
                             #scaler=StandardScaler
                            )
print(rf_model_small.params)
rf_model_small.train(data_manager.train_dataset)

In [None]:
# Individual fitted trees of the small forest.
trees = rf_model_small._model.model.estimators_
print(len(trees))

# Keep the second tree (index 1) for the visualizations below.
estimator = trees[1]

In [None]:
# Draw the selected tree with scikit-learn's built-in plotter on a 15x10 inch
# figure. `filled=True` colours the nodes; the return value is bound to `_` so
# the notebook does not echo it.
from sklearn import tree
from matplotlib import pyplot as plt
fig = plt.figure(figsize=(15,10))
_ = tree.plot_tree(estimator, feature_names=dataset.feature_list, filled=True)


In [None]:
# Render the same tree with dtreeviz, using the full dataset's features and
# targets (not only the training split) to populate the node plots.
# NOTE(review): `dtreeviz.trees.dtreeviz` is the legacy (1.x) entry point;
# newer dtreeviz releases expose `dtreeviz.model(...)` instead — confirm the
# installed version.
from dtreeviz.trees import dtreeviz
viz = dtreeviz(estimator, dataset.features, dataset.targets,
                target_name="target",
                feature_names=dataset.feature_list)
# Last expression of the cell, so the notebook displays the visualization.
viz

### Cross-validation

In [55]:
# Rebuild the model for cross-validation. This rebinds `CONFIG_PATH` (same
# location as before) and `rf_model`; unlike the first model, no scaler is
# passed here.
CONFIG_PATH = pathlib.Path().resolve().parent.joinpath(
    "configurations", "models", "random_forest.ini"
)

rf_model = StatisticalModels(
    StatRandomForestRegressor,
    name="random_forest",
    config_name="DEFAULT",
    config_path=CONFIG_PATH,
)

In [None]:
# Run 5-fold cross-validation on the full dataset. Note that this rebinds
# `predictions`, which previously held the test-set predictions.
# NOTE(review): `predictions` and `true_labels` are indexed per fold in the
# next cell — presumably one entry per split; confirm against cross_validate.
all_metrics, models, predictions, true_labels = rf_model.cross_validate(dataset, n_splits=5, )

In [None]:
# Per-fold statistical test: recompute the metrics for every cross-validation
# fold, collect the Pearson test result, and report the mean test statistic
# and mean p-value across folds.
from package.model.utils import compute_metrics

maes = []
pearson_tests = []
test_value = []
p_values = []

# Derive the number of folds from the cross-validation output instead of
# hard-coding 5, so this cell stays correct if `n_splits` is changed.
for fold in range(len(predictions)):
    metrics = compute_metrics(true_labels[fold], predictions[fold], index=-1)
    pearson = metrics["pearson"]

    # NOTE(review): the list is called `maes` but is filled from the "mape"
    # entry — confirm which metric was intended.
    maes.append(metrics["mape"])
    pearson_tests.append(pearson)
    test_value.append(pearson.statistic)
    p_values.append(pearson.pvalue)

# NOTE(review): an arithmetic mean of p-values is only a rough summary, not a
# combined significance test.
print(np.mean(test_value))
print(np.mean(p_values))