In [3]:
from darts import TimeSeries
import pandas as pd
from darts.models import NaiveSeasonal
from darts.models import NaiveMean
from darts import TimeSeries
from sklearn.metrics import mean_absolute_percentage_error
from typing import Dict
from darts.models import (StatsForecastAutoARIMA, StatsForecastAutoETS, 
                          StatsForecastAutoTheta, StatsForecastAutoCES,
                          FourTheta, KalmanForecaster, CatBoostModel, Croston
                         )

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the processed dataset and wrap its target columns into a darts TimeSeries.
df = pd.read_parquet("../../data/processed/dataset.parquet")
series = TimeSeries.from_dataframe(
    df,
    time_col='ds',
    # Every column except the timestamp and the engineered 'feat' columns is a target.
    value_cols=[
        column
        for column in df.columns
        if 'feat' not in column and column != 'ds'
    ],
)

In [5]:
# Display the loaded frame to inspect its columns (long frames are rendered truncated).
df

Unnamed: 0,Денситометр,КТ,КТ с КУ 1 зона,КТ с КУ 2 и более зон,ММГ,МРТ,МРТ с КУ 1 зона,МРТ с КУ 2 и более зон,РГ,Флюорограф,ds,feat_КТ с КУ 1 зона_lag-4_КТ,feat_КТ_lag-1_КТ с КУ 1 зона,feat_МРТ_lag-6_КТ с КУ 2 и более зон,feat_МРТ с КУ 1 зона_lag-1_МРТ,feat_МРТ_lag-1_МРТ с КУ 1 зона,feat_КТ с КУ 1 зона_lag-1_ММГ
1,17.0,6146,43.0,100.0,483,415,169.0,2.0,12450,392.0,2022-01-03,6146.0,43.0,100.0,415.0,169.0,483.0
2,1026.0,10868,424.0,451.0,9567,2156,669.0,9.0,48904,22626.0,2022-01-10,6146.0,43.0,100.0,415.0,169.0,483.0
3,910.0,12266,430.0,490.0,8791,2162,710.0,14.0,47364,20496.0,2022-01-17,6146.0,424.0,100.0,2156.0,669.0,9567.0
4,679.0,12793,336.0,471.0,7465,2066,667.0,7.0,40234,15227.0,2022-01-24,6146.0,430.0,100.0,2162.0,710.0,8791.0
5,571.0,13235,302.0,446.0,6124,1900,609.0,6.0,36502,12586.0,2022-01-31,6146.0,336.0,100.0,2066.0,667.0,7465.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104,1294.0,3313,491.0,476.0,14856,1327,715.0,11.0,81751,5979.0,2023-12-25,4473.0,473.0,784.0,1681.0,863.0,16615.0
105,84.0,950,70.0,73.0,1185,544,131.0,1.0,16148,241.0,2024-01-01,4293.0,491.0,800.0,1327.0,715.0,14856.0
106,1427.0,3317,431.0,369.0,13964,1461,593.0,12.0,80644,5394.0,2024-01-08,4764.0,70.0,743.0,544.0,131.0,1185.0
107,1816.0,3939,563.0,518.0,17769,1712,809.0,17.0,98705,6580.0,2024-01-15,4087.0,431.0,782.0,1461.0,593.0,13964.0


In [7]:
def cross_val(
        models: Dict[str, object],
        series: TimeSeries,
        remains_rows_to_val: int = 24,
        n_rows_to_forecast: int = 4,
        step: int = 1,
) -> Dict[str, Dict[str, pd.Series]]:
    """
    Run an expanding-window cross-validation of several models on one series.

    The last ``remains_rows_to_val`` rows of ``series`` are held out. The first
    split trains on every row before them; each following split grows the
    training window by ``step`` rows. Every split forecasts the
    ``n_rows_to_forecast`` rows that directly follow its training window, and no
    split forecasts past the end of ``series``.

    Args:
        models (Dict[str, object]): Models to evaluate, keyed by a display name.
            Each model must provide ``fit(train)`` and ``predict(n)``.
        series (TimeSeries): The series to split. It must support ``len()`` and
            integer slicing.
        remains_rows_to_val (int, optional): Number of trailing rows held out of
            the first training window. Defaults to 24.
        n_rows_to_forecast (int, optional): Forecast horizon of every split.
            Defaults to 4.
        step (int, optional): Number of rows the training window grows by between
            consecutive splits. Defaults to 1.

    Returns:
        Dict[str, Dict[str, pd.Series]]: One entry per split, keyed by
        ``'<n_rows_to_train> + <n_rows_to_forecast>'``. Each value maps a model
        name to the per-column MAPE Series produced by
        ``calc_mape_for_each_column``.

    Raises:
        ValueError: If ``n_rows_to_forecast`` or ``step`` is smaller than 1, if
            ``remains_rows_to_val`` is smaller than ``n_rows_to_forecast``, or if
            ``remains_rows_to_val`` leaves no rows to train on.
    """
    n_rows_total = len(series)

    # Validate up front: a non-positive step or an oversized hold-out would
    # otherwise yield an empty result or wrongly sliced windows without any error.
    if n_rows_to_forecast < 1:
        raise ValueError("n_rows_to_forecast must be at least 1")
    if step < 1:
        raise ValueError("step must be at least 1")
    if remains_rows_to_val < n_rows_to_forecast:
        raise ValueError("remains_rows_to_val must be greater than or equal to n_rows_to_forecast")
    if remains_rows_to_val >= n_rows_total:
        raise ValueError("remains_rows_to_val must be smaller than the length of the series")

    first_train_size = n_rows_total - remains_rows_to_val
    # Largest training window that still leaves a full forecast horizon after it.
    last_train_size = n_rows_total - n_rows_to_forecast

    cross_val_results = {}
    for n_rows_to_train in range(first_train_size, last_train_size + 1, step):
        print(f"n_rows_to_train: {n_rows_to_train}", f"i_rows_for_forecast: {n_rows_to_train + 1} - {n_rows_to_train + n_rows_to_forecast}")
        train = series[:n_rows_to_train]
        val = series[n_rows_to_train:n_rows_to_train + n_rows_to_forecast]

        iteration_results = {}
        for name, model in models.items():
            # fit() is called on the caller's model object, so it is re-fitted on every split.
            model.fit(train)
            preds = model.predict(n_rows_to_forecast)
            iteration_results[name] = calc_mape_for_each_column(val, preds)

        cross_val_results[f'{n_rows_to_train} + {n_rows_to_forecast}'] = iteration_results

    return cross_val_results


def represent_cross_validation_results(
    models: Dict[str, object],
    cross_val_results: Dict[str, Dict[str, pd.Series]]
) -> None:
    """
    Display one captioned table of cross-validation scores per model.

    For every model name, the per-split score Series are stacked into a table
    with one row per validation split, and the rows of ``DataFrame.describe()``
    for that table are appended below. The table is shown with ``display`` and
    captioned with the model name.

    Args:
        models (Dict[str, object]): The evaluated models; only the keys are used.
        cross_val_results (Dict[str, Dict[str, pd.Series]]): Results keyed by
            validation split, each mapping a model name to its score Series.

    Returns:
        None
    """
    for model_name in models:
        # Collect this model's scores from every split, keyed by split label.
        scores_by_split = {
            split_label: split_scores[model_name]
            for split_label, split_scores in cross_val_results.items()
        }

        # Transpose so that each split becomes a row and each score entry a column.
        scores_table = pd.DataFrame(scores_by_split).T
        summary_rows = scores_table.describe()
        table_with_stats = pd.concat([scores_table, summary_rows])

        display(table_with_stats.style.set_caption(f'{model_name}'))


def calc_mape_for_each_column(
        val: TimeSeries,
        preds: TimeSeries,
) -> pd.Series:
    """
    Compute the MAPE between actual and predicted values, column by column.

    The columns are taken from ``preds``; ``val`` is indexed with the same
    column names, so it must contain each of them.

    Args:
        val (TimeSeries): The actual values.
        preds (TimeSeries): The predicted values.

    Returns:
        pd.Series: The result of ``mean_absolute_percentage_error`` for each
        column, indexed by column name.
    """
    return pd.Series({
        column_name: mean_absolute_percentage_error(
            val[column_name].values(),
            preds[column_name].values(),
        )
        for column_name in preds.columns
    })



# Models to evaluate, keyed by the label that captions each results table.
# NOTE(review): the key 'NavieMean' looks like a typo for 'NaiveMean'; it is a
# runtime label, so confirm nothing depends on it before renaming.
models = {
    'NavieMean' : NaiveMean(),
    # Alternative models, currently disabled:
    #'KalmanForecaster': KalmanForecaster(dim_x=12),
    #'CatBoostModel': CatBoostModel(lags=26),
}
# Cross-validate with the default window settings, then render one table per model.
ress = cross_val(models, series)
represent_cross_validation_results(models, ress)



n_rows_to_train: 84 n_rows_for_forecast: 84 - 88
n_rows_to_train: 85 n_rows_for_forecast: 85 - 89
n_rows_to_train: 86 n_rows_for_forecast: 86 - 90
n_rows_to_train: 87 n_rows_for_forecast: 87 - 91
n_rows_to_train: 88 n_rows_for_forecast: 88 - 92
n_rows_to_train: 89 n_rows_for_forecast: 89 - 93
n_rows_to_train: 90 n_rows_for_forecast: 90 - 94
n_rows_to_train: 91 n_rows_for_forecast: 91 - 95
n_rows_to_train: 92 n_rows_for_forecast: 92 - 96
n_rows_to_train: 93 n_rows_for_forecast: 93 - 97
n_rows_to_train: 94 n_rows_for_forecast: 94 - 98
n_rows_to_train: 95 n_rows_for_forecast: 95 - 99
n_rows_to_train: 96 n_rows_for_forecast: 96 - 100
n_rows_to_train: 97 n_rows_for_forecast: 97 - 101
n_rows_to_train: 98 n_rows_for_forecast: 98 - 102
n_rows_to_train: 99 n_rows_for_forecast: 99 - 103
n_rows_to_train: 100 n_rows_for_forecast: 100 - 104
n_rows_to_train: 101 n_rows_for_forecast: 101 - 105
n_rows_to_train: 102 n_rows_for_forecast: 102 - 106
n_rows_to_train: 103 n_rows_for_forecast: 103 - 107
n_ro

Unnamed: 0,Денситометр,КТ,КТ с КУ 1 зона,КТ с КУ 2 и более зон,ММГ,МРТ,МРТ с КУ 1 зона,МРТ с КУ 2 и более зон,РГ,Флюорограф
84 + 4,0.142643,0.512368,0.12328,0.049092,0.196927,0.163911,0.036479,0.482927,0.157329,0.220791
85 + 4,0.147386,0.407202,0.084606,0.070173,0.207788,0.17611,0.02266,0.380893,0.190157,0.220101
86 + 4,0.132387,0.272382,0.07757,0.102566,0.158061,0.167886,0.032488,0.378733,0.169489,0.324252
87 + 4,0.150695,0.197734,0.059224,0.137212,0.166221,0.14651,0.022306,0.30976,0.196733,0.334562
88 + 4,0.169021,0.139552,0.083957,0.174461,0.188893,0.134135,0.044754,0.268473,0.220298,0.340382
89 + 4,0.199495,0.085357,0.113534,0.225942,0.207281,0.114715,0.069639,0.375964,0.232903,0.355534
90 + 4,0.249618,0.06123,0.14795,0.245889,0.280716,0.116474,0.084509,0.360089,0.299173,0.240917
91 + 4,0.268087,0.033211,0.177012,0.279956,0.29517,0.132259,0.123695,0.435487,0.309397,0.240508
92 + 4,0.286007,0.022567,0.189093,0.294819,0.305004,0.136981,0.133836,0.526247,0.312077,0.237427
93 + 4,0.305635,0.022894,0.145008,0.275536,0.293004,0.146177,0.122237,0.557646,0.304101,0.276151
