## Model long COVID Intensity using Linear Regression (LR)

In [2]:
import pathlib
from pprint import pprint
import numpy as np
from package.data.utils import *

from sklearn.ensemble import RandomForestRegressor

import plotly
import plotly.express as px
import matplotlib.pyplot as plt

from package.data.lifeline_dataset import LifeLineDataSet
from package.data.data_manager import DataManager

## Load the dataset

In [3]:
# Project root: three directory levels above the current working directory.
data_path = pathlib.Path().absolute().parent.parent.parent

# The merged extract lives under <root>/data/extract/merged.
merged_dir = data_path / "data" / "extract" / "merged"

# Load the merged CSV with "long_covid_intensity" as the regression target.
dataset = LifeLineDataSet(
    data_path=merged_dir,
    dataset_name="merged_vaccin_only_1_full.csv",
    target_variable="long_covid_intensity",
)

# Left as the last expression so the notebook echoes whatever it returns.
dataset.get_encoded_data()

In [4]:
# Split settings passed to DataManager.split_data_train_val_test.
# NOTE(review): val_test_prop presumably controls how the non-training
# remainder is divided between validation and test — confirm in DataManager.
split_options = {"train_size": 0.7, "val_test_prop": 0.8}

data_manager = DataManager()
data_manager.split_data_train_val_test(dataset, **split_options)

In [None]:
# Print the feature-matrix shape of each split, in train / val / test order.
for split_attr in ("train_dataset", "val_dataset", "test_dataset"):
    print(getattr(data_manager, split_attr).features.shape)

## Linear Ridge Regression

In [6]:
from package.model.stat_models import StatisticalModels
from package.model.statistical_models.linear_regression_regressor import StatLinearRegressionRegressor
from package.data.scaler import StandardScaler
from package.data.utils import round_off_rating
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

In [9]:
# Configuration file for the linear-regression model, located relative to the
# parent of the current working directory.
CONFIG_PATH = pathlib.Path().resolve().parent / "configurations" / "models" / "linear_regression.ini"

# Wrap the linear-regression regressor using the DEFAULT section of the config.
# The model name is "linear_regression" so that it matches the regressor class,
# the configuration file, and the cross-validation cell further down (it was a
# copy-paste leftover reading "random_forest").
lr_model = StatisticalModels(
    StatLinearRegressionRegressor,
    config_path=CONFIG_PATH,
    config_name="DEFAULT",
    name="linear_regression",
    # scaler=StandardScaler,  # feature scaling is intentionally left disabled
)

In [None]:
# Echo the model's `params` attribute in the notebook output.
# NOTE(review): presumably the hyperparameters read from CONFIG_PATH — confirm.
lr_model.params

Train the model on the training split.

In [11]:
# Call the model's train method with the training split held by data_manager.
lr_model.train(data_manager.train_dataset)

Predict on the test split, then round both the predictions and the true targets with `round_off_rating`.

In [12]:
# Raw model output for the held-out test split.
predictions = lr_model.predict(data_manager.test_dataset)

# Apply round_off_rating element-wise to the predictions and to the flattened
# test targets so both can be compared after rounding.
y_predict_rounded = np.array(list(map(round_off_rating, predictions)))
test_labels_rounded = np.array(
    list(map(round_off_rating, data_manager.test_dataset.targets.ravel()))
)

Compute the metrics (MAE, MSE, MAPE) on the test split, first for the raw predictions and then for the rounded ones.

In [None]:
# Evaluate each error metric twice: first on the raw targets/predictions,
# then on their rounded counterparts.
test_labels = data_manager.test_dataset.targets.ravel()

metric_table = (
    ("MAE", mean_absolute_error),
    ("MSE", mean_squared_error),
    ("MAPE", mean_absolute_percentage_error),
)
for metric_title, metric_fn in metric_table:
    print(metric_title)
    print(metric_fn(test_labels, predictions))
    print(metric_fn(test_labels_rounded, y_predict_rounded))

## Cross Validation

In [8]:
# Same configuration file as before: <cwd parent>/configurations/models/linear_regression.ini
CONFIG_PATH = pathlib.Path().resolve().parent.joinpath(
    "configurations", "models", "linear_regression.ini"
)

# Fresh, untrained model instance to be used for cross-validation.
model_options = {
    "config_path": CONFIG_PATH,
    "config_name": "DEFAULT",
    "name": "linear_regression",
    # "scaler": StandardScaler,  # feature scaling is left disabled
}
lr_model = StatisticalModels(StatLinearRegressionRegressor, **model_options)

In [None]:
# Cross-validate on the full dataset with 5 splits; the call returns four
# values, unpacked here. The next cell indexes `predictions` and `true_labels`
# by fold number (presumably one entry per split — confirm in cross_validate).
all_metrics, models, predictions, true_labels = lr_model.cross_validate(dataset, n_splits=5)

In [None]:
from package.model.utils import compute_metrics

# Number of folds to summarise; must match the n_splits value given to
# cross_validate.
N_SPLITS = 5

# Per-fold accumulators. `mapes` holds the value stored under the "mape" key
# (the list was previously called `maes`, which did not match what it stores).
mapes = []
pearson_tests = []
test_value = []
p_values = []

for fold in range(N_SPLITS):
    metrics = compute_metrics(true_labels[fold], predictions[fold], index=-1)
    mapes.append(metrics["mape"])

    # NOTE(review): the "pearson" entry exposes .statistic and .pvalue, which
    # looks like a scipy.stats.pearsonr result — confirm in compute_metrics.
    pearson = metrics["pearson"]
    pearson_tests.append(pearson)
    test_value.append(pearson.statistic)
    p_values.append(pearson.pvalue)

# Mean Pearson statistic and mean p-value across the folds.
print(np.mean(test_value))
print(np.mean(p_values))