# Ridge and Huber regression

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, Ridge, HuberRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from scipy import optimize

SEED = 333

In [6]:
def transform_df(df: pd.DataFrame, encoder: OneHotEncoder) -> pd.DataFrame:
    # select all object columns
    obj_cols = df.select_dtypes('object')
    # apply transformation by encoder fitted on the training split
    transformed_cols = encoder.transform(obj_cols)
    # get new feature names
    feature_names = encoder.get_feature_names_out()
    # create new dataframe
    transformed_df = pd.DataFrame(
        transformed_cols, index=df.index, columns=feature_names).astype(bool)
    new_df = pd.concat(
        [df.select_dtypes(exclude='object'), transformed_df], axis=1)
    return new_df


def eval_tscv(tscv: TimeSeriesSplit, alpha: float, X, y, weights, verbose: bool = True):
    rmses = []
    maes = []
    for train_index, test_index in tscv.split(X):
        scaler = StandardScaler()
        if isinstance(X, pd.DataFrame):
            X_train = scaler.fit_transform(X.iloc[train_index, :])
            X_test = scaler.transform(X.iloc[test_index, :])
        else:
            X_train = scaler.fit_transform(X[train_index, :])
            X_test = scaler.transform(X[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]
        tmp_ridge = Ridge(alpha=alpha, random_state=SEED)
        tmp_ridge.fit(X_train, y_train)
        y_pred = tmp_ridge.predict(X_test)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        mae = mean_absolute_error(y_test, y_pred)
        if verbose:
            print("pred", tmp_ridge.predict(X_test),
                  "test:", y_test, "rmse", rmse, "mae", mae)
            print(tmp_ridge.coef_)
            print(tmp_ridge.intercept_)
            print()
        rmses.append(rmse)
        maes.append(mae)

    weighted_rmse = np.average(rmses, weights=weights)
    weighted_mae = np.average(maes, weights=weights)
    sum_weighted_rmse = np.sum(rmses * weights)
    sum_weighted_mae = np.sum(maes * weights)
    print("rmse", weighted_rmse)
    print("mae", weighted_mae)
    print("sum weighted rmse", sum_weighted_rmse)
    print("sum weighted mae", sum_weighted_mae)

def eval_tscv_huber(tscv: TimeSeriesSplit, alpha: float, X, y, weights, verbose: bool = True):
    rmses = []
    maes = []
    for train_index, test_index in tscv.split(X):
        scaler = StandardScaler()
        if isinstance(X, pd.DataFrame):
            X_train = scaler.fit_transform(X.iloc[train_index, :])
            X_test = scaler.transform(X.iloc[test_index, :])
        else:
            X_train = scaler.fit_transform(X[train_index, :])
            X_test = scaler.transform(X[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]
        tmp_ridge = HuberRegressor(alpha=alpha)
        tmp_ridge.fit(X_train, y_train)
        y_pred = tmp_ridge.predict(X_test)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        mae = mean_absolute_error(y_test, y_pred)
        if verbose:
            print("pred", tmp_ridge.predict(X_test),
                  "test:", y_test, "rmse", rmse, "mae", mae)
            print(tmp_ridge.coef_)
            print(tmp_ridge.intercept_)
            print()
        rmses.append(rmse)
        maes.append(mae)

    weighted_rmse = np.average(rmses, weights=weights)
    weighted_mae = np.average(maes, weights=weights)
    sum_weighted_rmse = np.sum(rmses * weights)
    sum_weighted_mae = np.sum(maes * weights)
    print("rmse", weighted_rmse)
    print("mae", weighted_mae)
    print("sum weighted rmse", sum_weighted_rmse)
    print("sum weighted mae", sum_weighted_mae)

## Prepare Data

In [7]:
orig_df = pd.read_csv('../dataset/team_A_dataset.csv')
orig_df.shape

(2478, 117)

In [8]:
display(orig_df.isna().sum())

month                               0
year                                0
kraj                                0
general_thefts                      0
break_in_thefts                     0
                                   ..
noveHlasenaAUvolnenaVPM             0
obsazenaAZrusenaVPM                 0
absolventiSkolAMladistvi            0
uchazeciOZamestnaniUoZMuzi_ratio    0
uchazeciOZamestnaniUoZZeny_ratio    0
Length: 117, dtype: int64

In [9]:
orig_df.describe()

Unnamed: 0,month,year,general_thefts,break_in_thefts,avg_monthly_salary,celkem,m_do_65,z_do_65,celkem_w2,celkem_w3,...,avg_natural_gas_price,uchazeciOZamestnaniUoZ,uchazeciOZamestnaniUoZZeny,uchazeciOZamestnaniUoZMuzi,noveHlaseniUchazeci,noveHlasenaAUvolnenaVPM,obsazenaAZrusenaVPM,absolventiSkolAMladistvi,uchazeciOZamestnaniUoZMuzi_ratio,uchazeciOZamestnaniUoZZeny_ratio
count,2478.0,2478.0,2478.0,2478.0,2478.0,266.0,266.0,266.0,266.0,266.0,...,2478.0,2478.0,2478.0,2478.0,2478.0,2478.0,2478.0,2478.0,2478.0,2478.0
mean,6.423729,2015.881356,489.349475,233.924939,28453.470944,1394.827068,347.5,618.56015,2762.87218,4105.74812,...,3.499783,27967.878531,14174.182809,13793.695722,3371.029459,1975.188458,1899.223164,1480.798628,0.486333,0.513667
std,3.429731,4.259787,681.754931,214.294818,6730.770886,5885.779007,785.677495,2600.293961,8979.240558,11466.193698,...,1.23672,17887.5741,8725.67403,9234.474479,1770.41761,1716.588043,1706.252205,1062.353914,0.030833,0.030833
min,1.0,2009.0,19.0,19.0,17704.0,-26447.0,-3008.0,-12127.0,-24614.0,-22643.0,...,1.7147,5329.0,2808.0,2413.0,817.0,0.0,0.0,165.0,0.408138,0.431466
25%,3.0,2012.0,139.0,80.0,22815.0,276.25,104.25,125.25,543.25,801.25,...,2.664,13350.25,6925.75,6407.5,1965.0,883.25,893.25,633.0,0.464473,0.491328
50%,6.0,2016.0,252.0,162.0,26431.5,509.5,177.5,247.5,1072.5,1629.0,...,3.2143,23996.0,12384.5,11521.0,2892.0,1529.0,1450.0,1244.0,0.486783,0.513217
75%,9.0,2020.0,479.0,282.0,33427.0,1317.0,446.5,660.0,3221.25,5672.75,...,4.08,34709.0,17910.5,17183.5,4402.5,2536.25,2360.0,1895.0,0.508672,0.535527
max,12.0,2023.0,4599.0,1107.0,53070.0,60636.0,6854.0,27163.0,74113.0,83545.0,...,8.7705,93714.0,43201.0,50572.0,12418.0,12114.0,17285.0,6058.0,0.568534,0.591862


In [10]:
orig_df.columns

Index(['month', 'year', 'kraj', 'general_thefts', 'break_in_thefts',
       'avg_monthly_salary', 'celkem', 'm_do_65', 'z_do_65', 'celkem_w2',
       ...
       'avg_natural_gas_price', 'uchazeciOZamestnaniUoZ',
       'uchazeciOZamestnaniUoZZeny', 'uchazeciOZamestnaniUoZMuzi',
       'noveHlaseniUchazeci', 'noveHlasenaAUvolnenaVPM', 'obsazenaAZrusenaVPM',
       'absolventiSkolAMladistvi', 'uchazeciOZamestnaniUoZMuzi_ratio',
       'uchazeciOZamestnaniUoZZeny_ratio'],
      dtype='object', length=117)

### Add Lagged seasonal Data

In [11]:
orig_df

Unnamed: 0,month,year,kraj,general_thefts,break_in_thefts,avg_monthly_salary,celkem,m_do_65,z_do_65,celkem_w2,...,avg_natural_gas_price,uchazeciOZamestnaniUoZ,uchazeciOZamestnaniUoZZeny,uchazeciOZamestnaniUoZMuzi,noveHlaseniUchazeci,noveHlasenaAUvolnenaVPM,obsazenaAZrusenaVPM,absolventiSkolAMladistvi,uchazeciOZamestnaniUoZMuzi_ratio,uchazeciOZamestnaniUoZZeny_ratio
0,1,2009,HKK,271.0,174.0,19132.0,,,,,...,5.0707,17240,8445,8795,4781,1417,2948,1221,0.510151,0.489849
1,1,2009,JHC,275.0,181.0,19576.0,,,,,...,5.0707,20611,10352,10259,5538,1053,2289,1512,0.497744,0.502256
2,1,2009,JHM,909.0,286.0,21065.0,,,,,...,5.0707,48155,24333,23822,9824,3140,4985,3078,0.494694,0.505306
3,1,2009,KVK,198.0,96.0,18652.0,,,,,...,5.0707,15399,7386,8013,3325,496,1050,960,0.520358,0.479642
4,1,2009,LBK,343.0,130.0,19653.0,,,,,...,5.0707,19266,9563,9703,4814,1134,1665,1126,0.503633,0.496367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,PLK,145.0,130.0,41220.0,703.0,195.0,386.0,1320.0,...,2.7372,11322,6538,4784,2360,1473,1221,809,0.422540,0.577460
2474,9,2023,STC,262.0,254.0,42990.0,985.0,333.0,502.0,1860.0,...,2.7372,30943,17887,13056,5929,2502,1777,2149,0.421937,0.578063
2475,9,2023,ULK,243.0,200.0,39664.0,186.0,80.0,69.0,410.0,...,2.7372,30864,17728,13136,4871,1380,1217,2084,0.425609,0.574391
2476,9,2023,VYS,62.0,39.0,39315.0,334.0,117.0,163.0,752.0,...,2.7372,9409,5303,4106,2006,885,865,769,0.436391,0.563609


In [12]:
seasonal_cols = ['avg_monthly_salary', 'general_thefts', 'break_in_thefts', 'noveHlaseniUchazeci',
                  'absolventiSkolAMladistvi', 'noveHlasenaAUvolnenaVPM', 'obsazenaAZrusenaVPM']

for col in seasonal_cols:
    orig_df[col + '_prev_year'] = orig_df[col].shift(168)

display(orig_df)

Unnamed: 0,month,year,kraj,general_thefts,break_in_thefts,avg_monthly_salary,celkem,m_do_65,z_do_65,celkem_w2,...,absolventiSkolAMladistvi,uchazeciOZamestnaniUoZMuzi_ratio,uchazeciOZamestnaniUoZZeny_ratio,avg_monthly_salary_prev_year,general_thefts_prev_year,break_in_thefts_prev_year,noveHlaseniUchazeci_prev_year,absolventiSkolAMladistvi_prev_year,noveHlasenaAUvolnenaVPM_prev_year,obsazenaAZrusenaVPM_prev_year
0,1,2009,HKK,271.0,174.0,19132.0,,,,,...,1221,0.510151,0.489849,,,,,,,
1,1,2009,JHC,275.0,181.0,19576.0,,,,,...,1512,0.497744,0.502256,,,,,,,
2,1,2009,JHM,909.0,286.0,21065.0,,,,,...,3078,0.494694,0.505306,,,,,,,
3,1,2009,KVK,198.0,96.0,18652.0,,,,,...,960,0.520358,0.479642,,,,,,,
4,1,2009,LBK,343.0,130.0,19653.0,,,,,...,1126,0.503633,0.496367,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,PLK,145.0,130.0,41220.0,703.0,195.0,386.0,1320.0,...,809,0.422540,0.577460,38290.0,142.0,158.0,2411.0,734.0,1353.0,1529.0
2474,9,2023,STC,262.0,254.0,42990.0,985.0,333.0,502.0,1860.0,...,2149,0.421937,0.578063,39839.0,274.0,310.0,6165.0,1913.0,3029.0,3157.0
2475,9,2023,ULK,243.0,200.0,39664.0,186.0,80.0,69.0,410.0,...,2084,0.425609,0.574391,36731.0,250.0,227.0,4932.0,1812.0,1279.0,1022.0
2476,9,2023,VYS,62.0,39.0,39315.0,334.0,117.0,163.0,752.0,...,769,0.436391,0.563609,36588.0,60.0,57.0,2182.0,779.0,917.0,992.0


In [13]:
for i in range(len(orig_df)):
    for col in seasonal_cols:
        if np.isnan(orig_df.loc[orig_df.index[i], col + '_prev_year']):
            orig_df.loc[orig_df.index[i], col + '_prev_year'] = orig_df.loc[orig_df.index[i], col]

In [14]:
display(orig_df)

Unnamed: 0,month,year,kraj,general_thefts,break_in_thefts,avg_monthly_salary,celkem,m_do_65,z_do_65,celkem_w2,...,absolventiSkolAMladistvi,uchazeciOZamestnaniUoZMuzi_ratio,uchazeciOZamestnaniUoZZeny_ratio,avg_monthly_salary_prev_year,general_thefts_prev_year,break_in_thefts_prev_year,noveHlaseniUchazeci_prev_year,absolventiSkolAMladistvi_prev_year,noveHlasenaAUvolnenaVPM_prev_year,obsazenaAZrusenaVPM_prev_year
0,1,2009,HKK,271.0,174.0,19132.0,,,,,...,1221,0.510151,0.489849,19132.0,271.0,174.0,4781.0,1221.0,1417.0,2948.0
1,1,2009,JHC,275.0,181.0,19576.0,,,,,...,1512,0.497744,0.502256,19576.0,275.0,181.0,5538.0,1512.0,1053.0,2289.0
2,1,2009,JHM,909.0,286.0,21065.0,,,,,...,3078,0.494694,0.505306,21065.0,909.0,286.0,9824.0,3078.0,3140.0,4985.0
3,1,2009,KVK,198.0,96.0,18652.0,,,,,...,960,0.520358,0.479642,18652.0,198.0,96.0,3325.0,960.0,496.0,1050.0
4,1,2009,LBK,343.0,130.0,19653.0,,,,,...,1126,0.503633,0.496367,19653.0,343.0,130.0,4814.0,1126.0,1134.0,1665.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,PLK,145.0,130.0,41220.0,703.0,195.0,386.0,1320.0,...,809,0.422540,0.577460,38290.0,142.0,158.0,2411.0,734.0,1353.0,1529.0
2474,9,2023,STC,262.0,254.0,42990.0,985.0,333.0,502.0,1860.0,...,2149,0.421937,0.578063,39839.0,274.0,310.0,6165.0,1913.0,3029.0,3157.0
2475,9,2023,ULK,243.0,200.0,39664.0,186.0,80.0,69.0,410.0,...,2084,0.425609,0.574391,36731.0,250.0,227.0,4932.0,1812.0,1279.0,1022.0
2476,9,2023,VYS,62.0,39.0,39315.0,334.0,117.0,163.0,752.0,...,769,0.436391,0.563609,36588.0,60.0,57.0,2182.0,779.0,917.0,992.0


### Dataset version 1 - predicting uchazeciOZamestnaniUoZZeny_ratio, all rows, without refugee info -> big (economic) model

In [35]:
drop_cols = [f"celkem_w{w}" for w in range(2,20)]
drop_cols += [f"m_do_65_w{w}" for w in range(2,20)]
drop_cols += [f"z_do_65_w{w}" for w in range(2,20)]
drop_cols += [f"m_do_65_w{w}_ratio" for w in range(2,20)]
drop_cols += [f"z_do_65_w{w}_ratio" for w in range(2,20)]
drop_cols += ["celkem", "m_do_65", "z_do_65","m_do_65_ratio", "z_do_65_ratio", "uchazeciOZamestnaniUoZ",	"uchazeciOZamestnaniUoZZeny", "uchazeciOZamestnaniUoZMuzi", "uchazeciOZamestnaniUoZMuzi_ratio"]


In [36]:
df1 = orig_df.drop(columns=drop_cols)
display(df1.columns)
display(df1.info())

Index(['month', 'year', 'kraj', 'general_thefts', 'break_in_thefts',
       'avg_monthly_salary', 'monthly_min_wage',
       'monthly_inflation_rate_wrt_last_year', 'reer', 'bilance',
       'avg_energy_price', 'avg_gasoline_price', 'avg_natural_gas_price',
       'noveHlaseniUchazeci', 'noveHlasenaAUvolnenaVPM', 'obsazenaAZrusenaVPM',
       'absolventiSkolAMladistvi', 'uchazeciOZamestnaniUoZZeny_ratio'],
      dtype='object')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2478 entries, 0 to 2477
Data columns (total 18 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   month                                 2478 non-null   int64  
 1   year                                  2478 non-null   int64  
 2   kraj                                  2478 non-null   object 
 3   general_thefts                        2478 non-null   float64
 4   break_in_thefts                       2478 non-null   float64
 5   avg_monthly_salary                    2478 non-null   float64
 6   monthly_min_wage                      2478 non-null   int64  
 7   monthly_inflation_rate_wrt_last_year  2478 non-null   float64
 8   reer                                  2478 non-null   float64
 9   bilance                               2478 non-null   float64
 10  avg_energy_price                      2478 non-null   float64
 11  avg_gasoline_pric

None

In [37]:
df1

Unnamed: 0,month,year,kraj,general_thefts,break_in_thefts,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,bilance,avg_energy_price,avg_gasoline_price,avg_natural_gas_price,noveHlaseniUchazeci,noveHlasenaAUvolnenaVPM,obsazenaAZrusenaVPM,absolventiSkolAMladistvi,uchazeciOZamestnaniUoZZeny_ratio
0,1,2009,HKK,271.0,174.0,19132.0,8000,2.2,89.45,-2.730,70.209,1.1317,5.0707,4781,1417,2948,1221,0.489849
1,1,2009,JHC,275.0,181.0,19576.0,8000,2.2,89.45,-2.730,70.209,1.1317,5.0707,5538,1053,2289,1512,0.502256
2,1,2009,JHM,909.0,286.0,21065.0,8000,2.2,89.45,-2.730,70.209,1.1317,5.0707,9824,3140,4985,3078,0.505306
3,1,2009,KVK,198.0,96.0,18652.0,8000,2.2,89.45,-2.730,70.209,1.1317,5.0707,3325,496,1050,960,0.479642
4,1,2009,LBK,343.0,130.0,19653.0,8000,2.2,89.45,-2.730,70.209,1.1317,5.0707,4814,1134,1665,1126,0.496367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,PLK,145.0,130.0,41220.0,17300,6.9,115.41,11.874,138.980,1.7623,2.7372,2360,1473,1221,809,0.577460
2474,9,2023,STC,262.0,254.0,42990.0,17300,6.9,115.41,11.874,138.980,1.7623,2.7372,5929,2502,1777,2149,0.578063
2475,9,2023,ULK,243.0,200.0,39664.0,17300,6.9,115.41,11.874,138.980,1.7623,2.7372,4871,1380,1217,2084,0.574391
2476,9,2023,VYS,62.0,39.0,39315.0,17300,6.9,115.41,11.874,138.980,1.7623,2.7372,2006,885,865,769,0.563609


categorical column is kraj (month and year are probably not - in terms of time series)

In [41]:
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")

obj_cols = df1.select_dtypes('object')
encoder.fit(obj_cols)

df1 = transform_df(df1, encoder=encoder)

In [42]:
df1

Unnamed: 0,month,year,general_thefts,break_in_thefts,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,bilance,avg_energy_price,...,kraj_LBK,kraj_MSK,kraj_OLK,kraj_PAK,kraj_PHA,kraj_PLK,kraj_STC,kraj_ULK,kraj_VYS,kraj_ZLK
0,1,2009,271.0,174.0,19132.0,8000,2.2,89.45,-2.730,70.209,...,False,False,False,False,False,False,False,False,False,False
1,1,2009,275.0,181.0,19576.0,8000,2.2,89.45,-2.730,70.209,...,False,False,False,False,False,False,False,False,False,False
2,1,2009,909.0,286.0,21065.0,8000,2.2,89.45,-2.730,70.209,...,False,False,False,False,False,False,False,False,False,False
3,1,2009,198.0,96.0,18652.0,8000,2.2,89.45,-2.730,70.209,...,False,False,False,False,False,False,False,False,False,False
4,1,2009,343.0,130.0,19653.0,8000,2.2,89.45,-2.730,70.209,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,145.0,130.0,41220.0,17300,6.9,115.41,11.874,138.980,...,False,False,False,False,False,True,False,False,False,False
2474,9,2023,262.0,254.0,42990.0,17300,6.9,115.41,11.874,138.980,...,False,False,False,False,False,False,True,False,False,False
2475,9,2023,243.0,200.0,39664.0,17300,6.9,115.41,11.874,138.980,...,False,False,False,False,False,False,False,True,False,False
2476,9,2023,62.0,39.0,39315.0,17300,6.9,115.41,11.874,138.980,...,False,False,False,False,False,False,False,False,True,False


In [43]:
df1.isna().sum()

month                                   0
year                                    0
general_thefts                          0
break_in_thefts                         0
avg_monthly_salary                      0
monthly_min_wage                        0
monthly_inflation_rate_wrt_last_year    0
reer                                    0
bilance                                 0
avg_energy_price                        0
avg_gasoline_price                      0
avg_natural_gas_price                   0
noveHlaseniUchazeci                     0
noveHlasenaAUvolnenaVPM                 0
obsazenaAZrusenaVPM                     0
absolventiSkolAMladistvi                0
uchazeciOZamestnaniUoZZeny_ratio        0
kraj_JHC                                0
kraj_JHM                                0
kraj_KVK                                0
kraj_LBK                                0
kraj_MSK                                0
kraj_OLK                                0
kraj_PAK                          

### Dataset version 2 - predicting uchazeciOZamestnaniUoZZeny_ratio, refugees rows -> refugee model

In [32]:
drop_cols2 = [f"celkem_w{w}" for w in range(2,20)]
drop_cols2 += [f"m_do_65_w{w}" for w in range(2,20)]
drop_cols2 += [f"z_do_65_w{w}" for w in range(2,20)]
drop_cols2 += ["celkem", "m_do_65", "z_do_65", "uchazeciOZamestnaniUoZ", "uchazeciOZamestnaniUoZZeny", "uchazeciOZamestnaniUoZMuzi", "uchazeciOZamestnaniUoZMuzi_ratio"]

In [33]:
df2 = orig_df.drop(columns=drop_cols2)

In [34]:
df2 = df2[(df2.year > 2022) | ((df2.year == 2022) & (df2.month > 2))]
df2

Unnamed: 0,month,year,kraj,general_thefts,break_in_thefts,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,m_do_65_ratio,...,obsazenaAZrusenaVPM,absolventiSkolAMladistvi,uchazeciOZamestnaniUoZZeny_ratio,avg_monthly_salary_prev_year,general_thefts_prev_year,break_in_thefts_prev_year,noveHlaseniUchazeci_prev_year,absolventiSkolAMladistvi_prev_year,noveHlasenaAUvolnenaVPM_prev_year,obsazenaAZrusenaVPM_prev_year
2212,3,2022,HKK,60.0,53.0,34689.0,16200,12.7,107.78,0.117205,...,2094,527,0.507011,33032.0,53.0,32.0,1412.0,555.0,2091.0,1775.0
2213,3,2022,JHC,113.0,71.0,33998.0,16200,12.7,107.78,0.123191,...,4076,521,0.520700,32213.0,75.0,68.0,1852.0,727.0,3097.0,2820.0
2214,3,2022,JHM,224.0,327.0,37027.0,16200,12.7,107.78,0.129461,...,6142,1624,0.511907,34989.0,137.0,193.0,3620.0,1751.0,5790.0,5105.0
2215,3,2022,KVK,54.0,63.0,32424.0,16200,12.7,107.78,0.117503,...,1588,372,0.532359,30189.0,56.0,55.0,1166.0,457.0,1241.0,1148.0
2216,3,2022,LBK,129.0,116.0,33745.0,16200,12.7,107.78,0.095375,...,3957,566,0.532358,31956.0,72.0,55.0,1547.0,601.0,2858.0,2790.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,PLK,145.0,130.0,41220.0,17300,6.9,115.41,0.277383,...,1221,809,0.577460,38290.0,142.0,158.0,2411.0,734.0,1353.0,1529.0
2474,9,2023,STC,262.0,254.0,42990.0,17300,6.9,115.41,0.338071,...,1777,2149,0.578063,39839.0,274.0,310.0,6165.0,1913.0,3029.0,3157.0
2475,9,2023,ULK,243.0,200.0,39664.0,17300,6.9,115.41,0.430108,...,1217,2084,0.574391,36731.0,250.0,227.0,4932.0,1812.0,1279.0,1022.0
2476,9,2023,VYS,62.0,39.0,39315.0,17300,6.9,115.41,0.350299,...,865,769,0.563609,36588.0,60.0,57.0,2182.0,779.0,917.0,992.0


In [35]:
from sklearn.preprocessing import OneHotEncoder


encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")

obj_cols = df2.select_dtypes('object')
encoder.fit(obj_cols)

df2 = transform_df(df2, encoder=encoder)

In [36]:
display(df2)

Unnamed: 0,month,year,general_thefts,break_in_thefts,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,m_do_65_ratio,z_do_65_ratio,...,kraj_LBK,kraj_MSK,kraj_OLK,kraj_PAK,kraj_PHA,kraj_PLK,kraj_STC,kraj_ULK,kraj_VYS,kraj_ZLK
2212,3,2022,60.0,53.0,34689.0,16200,12.7,107.78,0.117205,0.417727,...,False,False,False,False,False,False,False,False,False,False
2213,3,2022,113.0,71.0,33998.0,16200,12.7,107.78,0.123191,0.430853,...,False,False,False,False,False,False,False,False,False,False
2214,3,2022,224.0,327.0,37027.0,16200,12.7,107.78,0.129461,0.426214,...,False,False,False,False,False,False,False,False,False,False
2215,3,2022,54.0,63.0,32424.0,16200,12.7,107.78,0.117503,0.437780,...,False,False,False,False,False,False,False,False,False,False
2216,3,2022,129.0,116.0,33745.0,16200,12.7,107.78,0.095375,0.436593,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,145.0,130.0,41220.0,17300,6.9,115.41,0.277383,0.549075,...,False,False,False,False,False,True,False,False,False,False
2474,9,2023,262.0,254.0,42990.0,17300,6.9,115.41,0.338071,0.509645,...,False,False,False,False,False,False,True,False,False,False
2475,9,2023,243.0,200.0,39664.0,17300,6.9,115.41,0.430108,0.370968,...,False,False,False,False,False,False,False,True,False,False
2476,9,2023,62.0,39.0,39315.0,17300,6.9,115.41,0.350299,0.488024,...,False,False,False,False,False,False,False,False,True,False


### Dataset version 1.2 predicting uchazeciOZamestnaniUoZZeny, all rows, without refugee info -> big economic model

In [372]:
drop_cols = [f"celkem_w{w}" for w in range(2,20)]
drop_cols += [f"m_do_65_w{w}" for w in range(2,20)]
drop_cols += [f"z_do_65_w{w}" for w in range(2,20)]
drop_cols += [f"m_do_65_w{w}_ratio" for w in range(2,20)]
drop_cols += [f"z_do_65_w{w}_ratio" for w in range(2,20)]
drop_cols += ["celkem", "m_do_65", "z_do_65","m_do_65_ratio", "z_do_65_ratio", "uchazeciOZamestnaniUoZ",	"uchazeciOZamestnaniUoZZeny_ratio", "uchazeciOZamestnaniUoZMuzi", "uchazeciOZamestnaniUoZMuzi_ratio"]

In [373]:
df12 = orig_df.drop(columns=drop_cols)
display(df12.columns)
display(df12.info())

Index(['month', 'year', 'kraj', 'general_thefts', 'break_in_thefts',
       'avg_monthly_salary', 'monthly_min_wage',
       'monthly_inflation_rate_wrt_last_year', 'reer', 'bilance',
       'avg_energy_price', 'avg_gasoline_price', 'avg_natural_gas_price',
       'uchazeciOZamestnaniUoZZeny', 'noveHlaseniUchazeci',
       'noveHlasenaAUvolnenaVPM', 'obsazenaAZrusenaVPM',
       'absolventiSkolAMladistvi', 'avg_monthly_salary_prev_year',
       'general_thefts_prev_year', 'break_in_thefts_prev_year',
       'noveHlaseniUchazeci_prev_year', 'absolventiSkolAMladistvi_prev_year',
       'noveHlasenaAUvolnenaVPM_prev_year', 'obsazenaAZrusenaVPM_prev_year'],
      dtype='object')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2478 entries, 0 to 2477
Data columns (total 25 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   month                                 2478 non-null   int64  
 1   year                                  2478 non-null   int64  
 2   kraj                                  2478 non-null   object 
 3   general_thefts                        2478 non-null   float64
 4   break_in_thefts                       2478 non-null   float64
 5   avg_monthly_salary                    2478 non-null   float64
 6   monthly_min_wage                      2478 non-null   int64  
 7   monthly_inflation_rate_wrt_last_year  2478 non-null   float64
 8   reer                                  2478 non-null   float64
 9   bilance                               2478 non-null   float64
 10  avg_energy_price                      2478 non-null   float64
 11  avg_gasoline_pric

None

In [374]:
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")

obj_cols = df12.select_dtypes('object')
encoder.fit(obj_cols)

df12 = transform_df(df12, encoder=encoder)

### Dataset version 2.2 - predicting uchazeciOZamestnaniUoZZeny, refugees rows -> small refugee model

In [15]:
drop_cols2 = [f"celkem_w{w}" for w in range(2,20)]
drop_cols2 += [f"m_do_65_w{w}" for w in range(2,20)]
drop_cols2 += [f"z_do_65_w{w}" for w in range(2,20)]
drop_cols2 += ["celkem", "m_do_65", "z_do_65", "uchazeciOZamestnaniUoZ", "uchazeciOZamestnaniUoZZeny_ratio", "uchazeciOZamestnaniUoZMuzi", "uchazeciOZamestnaniUoZMuzi_ratio"]

In [16]:
df22 = orig_df.drop(columns=drop_cols2)
df22 = df22[(df22.year > 2022) | ((df22.year == 2022) & (df22.month > 2))]
df22

Unnamed: 0,month,year,kraj,general_thefts,break_in_thefts,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,m_do_65_ratio,...,noveHlasenaAUvolnenaVPM,obsazenaAZrusenaVPM,absolventiSkolAMladistvi,avg_monthly_salary_prev_year,general_thefts_prev_year,break_in_thefts_prev_year,noveHlaseniUchazeci_prev_year,absolventiSkolAMladistvi_prev_year,noveHlasenaAUvolnenaVPM_prev_year,obsazenaAZrusenaVPM_prev_year
2212,3,2022,HKK,60.0,53.0,34689.0,16200,12.7,107.78,0.117205,...,2010,2094,527,33032.0,53.0,32.0,1412.0,555.0,2091.0,1775.0
2213,3,2022,JHC,113.0,71.0,33998.0,16200,12.7,107.78,0.123191,...,3249,4076,521,32213.0,75.0,68.0,1852.0,727.0,3097.0,2820.0
2214,3,2022,JHM,224.0,327.0,37027.0,16200,12.7,107.78,0.129461,...,6761,6142,1624,34989.0,137.0,193.0,3620.0,1751.0,5790.0,5105.0
2215,3,2022,KVK,54.0,63.0,32424.0,16200,12.7,107.78,0.117503,...,2096,1588,372,30189.0,56.0,55.0,1166.0,457.0,1241.0,1148.0
2216,3,2022,LBK,129.0,116.0,33745.0,16200,12.7,107.78,0.095375,...,3192,3957,566,31956.0,72.0,55.0,1547.0,601.0,2858.0,2790.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,PLK,145.0,130.0,41220.0,17300,6.9,115.41,0.277383,...,1473,1221,809,38290.0,142.0,158.0,2411.0,734.0,1353.0,1529.0
2474,9,2023,STC,262.0,254.0,42990.0,17300,6.9,115.41,0.338071,...,2502,1777,2149,39839.0,274.0,310.0,6165.0,1913.0,3029.0,3157.0
2475,9,2023,ULK,243.0,200.0,39664.0,17300,6.9,115.41,0.430108,...,1380,1217,2084,36731.0,250.0,227.0,4932.0,1812.0,1279.0,1022.0
2476,9,2023,VYS,62.0,39.0,39315.0,17300,6.9,115.41,0.350299,...,885,865,769,36588.0,60.0,57.0,2182.0,779.0,917.0,992.0


In [17]:
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")

obj_cols = df22.select_dtypes('object')
encoder.fit(obj_cols)

df22 = transform_df(df22, encoder=encoder)

In [19]:
df22.columns

Index(['month', 'year', 'general_thefts', 'break_in_thefts',
       'avg_monthly_salary', 'monthly_min_wage',
       'monthly_inflation_rate_wrt_last_year', 'reer', 'm_do_65_ratio',
       'z_do_65_ratio', 'm_do_65_w2_ratio', 'm_do_65_w3_ratio',
       'm_do_65_w4_ratio', 'm_do_65_w5_ratio', 'm_do_65_w6_ratio',
       'm_do_65_w7_ratio', 'm_do_65_w8_ratio', 'm_do_65_w9_ratio',
       'm_do_65_w10_ratio', 'm_do_65_w11_ratio', 'm_do_65_w12_ratio',
       'm_do_65_w13_ratio', 'm_do_65_w14_ratio', 'm_do_65_w15_ratio',
       'm_do_65_w16_ratio', 'm_do_65_w17_ratio', 'm_do_65_w18_ratio',
       'm_do_65_w19_ratio', 'z_do_65_w2_ratio', 'z_do_65_w3_ratio',
       'z_do_65_w4_ratio', 'z_do_65_w5_ratio', 'z_do_65_w6_ratio',
       'z_do_65_w7_ratio', 'z_do_65_w8_ratio', 'z_do_65_w9_ratio',
       'z_do_65_w10_ratio', 'z_do_65_w11_ratio', 'z_do_65_w12_ratio',
       'z_do_65_w13_ratio', 'z_do_65_w14_ratio', 'z_do_65_w15_ratio',
       'z_do_65_w16_ratio', 'z_do_65_w17_ratio', 'z_do_65_w18_ratio'

In [20]:
display(df22)

Unnamed: 0,month,year,general_thefts,break_in_thefts,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,m_do_65_ratio,z_do_65_ratio,...,kraj_LBK,kraj_MSK,kraj_OLK,kraj_PAK,kraj_PHA,kraj_PLK,kraj_STC,kraj_ULK,kraj_VYS,kraj_ZLK
2212,3,2022,60.0,53.0,34689.0,16200,12.7,107.78,0.117205,0.417727,...,False,False,False,False,False,False,False,False,False,False
2213,3,2022,113.0,71.0,33998.0,16200,12.7,107.78,0.123191,0.430853,...,False,False,False,False,False,False,False,False,False,False
2214,3,2022,224.0,327.0,37027.0,16200,12.7,107.78,0.129461,0.426214,...,False,False,False,False,False,False,False,False,False,False
2215,3,2022,54.0,63.0,32424.0,16200,12.7,107.78,0.117503,0.437780,...,False,False,False,False,False,False,False,False,False,False
2216,3,2022,129.0,116.0,33745.0,16200,12.7,107.78,0.095375,0.436593,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,145.0,130.0,41220.0,17300,6.9,115.41,0.277383,0.549075,...,False,False,False,False,False,True,False,False,False,False
2474,9,2023,262.0,254.0,42990.0,17300,6.9,115.41,0.338071,0.509645,...,False,False,False,False,False,False,True,False,False,False
2475,9,2023,243.0,200.0,39664.0,17300,6.9,115.41,0.430108,0.370968,...,False,False,False,False,False,False,False,True,False,False
2476,9,2023,62.0,39.0,39315.0,17300,6.9,115.41,0.350299,0.488024,...,False,False,False,False,False,False,False,False,True,False


### Dataset version 2.3 predicting uchazeciOZamestnaniUoZZeny, refugees rows, without cumulative lags see David -> small refugee model

In [49]:
drop_cols2 = [f"celkem_w{w}" for w in range(2,20)]
#drop_cols2 += [f"m_do_65_w{w}" for w in range(2,19)]
#drop_cols2 += [f"z_do_65_w{w}" for w in range(2,19)]
drop_cols2 += [f"m_do_65_w{w}_ratio" for w in range(2,20)]
drop_cols2 += [f"z_do_65_w{w}_ratio" for w in range(2,20)]
drop_cols2 += ["celkem", "m_do_65_ratio", "z_do_65_ratio", "uchazeciOZamestnaniUoZ", "uchazeciOZamestnaniUoZZeny_ratio", "uchazeciOZamestnaniUoZMuzi", "uchazeciOZamestnaniUoZMuzi_ratio"]

In [50]:
df23 = orig_df.drop(columns=drop_cols2)
df23 = df23[(df23.year > 2022) | ((df23.year == 2022) & (df23.month > 2))]
df23

Unnamed: 0,month,year,kraj,general_thefts,break_in_thefts,avg_monthly_salary,m_do_65,z_do_65,m_do_65_w2,m_do_65_w3,...,noveHlasenaAUvolnenaVPM,obsazenaAZrusenaVPM,absolventiSkolAMladistvi,avg_monthly_salary_prev_year,general_thefts_prev_year,break_in_thefts_prev_year,noveHlaseniUchazeci_prev_year,absolventiSkolAMladistvi_prev_year,noveHlasenaAUvolnenaVPM_prev_year,obsazenaAZrusenaVPM_prev_year
2212,3,2022,HKK,60.0,53.0,34689.0,1280.0,4562.0,1280.0,1280.0,...,2010,2094,527,33032.0,53.0,32.0,1412.0,555.0,2091.0,1775.0
2213,3,2022,JHC,113.0,71.0,33998.0,1558.0,5449.0,1558.0,1558.0,...,3249,4076,521,32213.0,75.0,68.0,1852.0,727.0,3097.0,2820.0
2214,3,2022,JHM,224.0,327.0,37027.0,3138.0,10331.0,3138.0,3138.0,...,6761,6142,1624,34989.0,137.0,193.0,3620.0,1751.0,5790.0,5105.0
2215,3,2022,KVK,54.0,63.0,32424.0,1152.0,4292.0,1152.0,1152.0,...,2096,1588,372,30189.0,56.0,55.0,1166.0,457.0,1241.0,1148.0
2216,3,2022,LBK,129.0,116.0,33745.0,992.0,4541.0,992.0,992.0,...,3192,3957,566,31956.0,72.0,55.0,1547.0,601.0,2858.0,2790.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,PLK,145.0,130.0,41220.0,195.0,386.0,374.0,591.0,...,1473,1221,809,38290.0,142.0,158.0,2411.0,734.0,1353.0,1529.0
2474,9,2023,STC,262.0,254.0,42990.0,333.0,502.0,637.0,1052.0,...,2502,1777,2149,39839.0,274.0,310.0,6165.0,1913.0,3029.0,3157.0
2475,9,2023,ULK,243.0,200.0,39664.0,80.0,69.0,191.0,349.0,...,1380,1217,2084,36731.0,250.0,227.0,4932.0,1812.0,1279.0,1022.0
2476,9,2023,VYS,62.0,39.0,39315.0,117.0,163.0,227.0,364.0,...,885,865,769,36588.0,60.0,57.0,2182.0,779.0,917.0,992.0


In [51]:
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")

obj_cols = df23.select_dtypes('object')
encoder.fit(obj_cols)

df23 = transform_df(df23, encoder=encoder)

In [52]:
df23.columns

Index(['month', 'year', 'general_thefts', 'break_in_thefts',
       'avg_monthly_salary', 'm_do_65', 'z_do_65', 'm_do_65_w2', 'm_do_65_w3',
       'm_do_65_w4', 'm_do_65_w5', 'm_do_65_w6', 'm_do_65_w7', 'm_do_65_w8',
       'm_do_65_w9', 'm_do_65_w10', 'm_do_65_w11', 'm_do_65_w12',
       'm_do_65_w13', 'm_do_65_w14', 'm_do_65_w15', 'm_do_65_w16',
       'm_do_65_w17', 'm_do_65_w18', 'm_do_65_w19', 'z_do_65_w2', 'z_do_65_w3',
       'z_do_65_w4', 'z_do_65_w5', 'z_do_65_w6', 'z_do_65_w7', 'z_do_65_w8',
       'z_do_65_w9', 'z_do_65_w10', 'z_do_65_w11', 'z_do_65_w12',
       'z_do_65_w13', 'z_do_65_w14', 'z_do_65_w15', 'z_do_65_w16',
       'z_do_65_w17', 'z_do_65_w18', 'z_do_65_w19', 'monthly_min_wage',
       'monthly_inflation_rate_wrt_last_year', 'reer', 'bilance',
       'avg_energy_price', 'avg_gasoline_price', 'avg_natural_gas_price',
       'uchazeciOZamestnaniUoZZeny', 'noveHlaseniUchazeci',
       'noveHlasenaAUvolnenaVPM', 'obsazenaAZrusenaVPM',
       'absolventiSkolAMladistv

### Dataset version 3 - predicting uchazeciOZamestnaniUoZZeny, all rows, imputation on refugee rows in years before -> big refugee model

In [397]:
drop_cols2 = [f"celkem_w{w}" for w in range(2,20)]
drop_cols2 += [f"m_do_65_w{w}" for w in range(2,20)]
drop_cols2 += [f"z_do_65_w{w}" for w in range(2,20)]
drop_cols2 += ["celkem", "m_do_65", "z_do_65", "uchazeciOZamestnaniUoZ", "uchazeciOZamestnaniUoZZeny_ratio", "uchazeciOZamestnaniUoZMuzi", "uchazeciOZamestnaniUoZMuzi_ratio"]

In [398]:
df3 = orig_df.drop(columns=drop_cols2)
df3

Unnamed: 0,month,year,kraj,general_thefts,break_in_thefts,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,m_do_65_ratio,...,noveHlasenaAUvolnenaVPM,obsazenaAZrusenaVPM,absolventiSkolAMladistvi,avg_monthly_salary_prev_year,general_thefts_prev_year,break_in_thefts_prev_year,noveHlaseniUchazeci_prev_year,absolventiSkolAMladistvi_prev_year,noveHlasenaAUvolnenaVPM_prev_year,obsazenaAZrusenaVPM_prev_year
0,1,2009,HKK,271.0,174.0,19132.0,8000,2.2,89.45,,...,1417,2948,1221,19132.0,271.0,174.0,4781.0,1221.0,1417.0,2948.0
1,1,2009,JHC,275.0,181.0,19576.0,8000,2.2,89.45,,...,1053,2289,1512,19576.0,275.0,181.0,5538.0,1512.0,1053.0,2289.0
2,1,2009,JHM,909.0,286.0,21065.0,8000,2.2,89.45,,...,3140,4985,3078,21065.0,909.0,286.0,9824.0,3078.0,3140.0,4985.0
3,1,2009,KVK,198.0,96.0,18652.0,8000,2.2,89.45,,...,496,1050,960,18652.0,198.0,96.0,3325.0,960.0,496.0,1050.0
4,1,2009,LBK,343.0,130.0,19653.0,8000,2.2,89.45,,...,1134,1665,1126,19653.0,343.0,130.0,4814.0,1126.0,1134.0,1665.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,PLK,145.0,130.0,41220.0,17300,6.9,115.41,0.277383,...,1473,1221,809,38290.0,142.0,158.0,2411.0,734.0,1353.0,1529.0
2474,9,2023,STC,262.0,254.0,42990.0,17300,6.9,115.41,0.338071,...,2502,1777,2149,39839.0,274.0,310.0,6165.0,1913.0,3029.0,3157.0
2475,9,2023,ULK,243.0,200.0,39664.0,17300,6.9,115.41,0.430108,...,1380,1217,2084,36731.0,250.0,227.0,4932.0,1812.0,1279.0,1022.0
2476,9,2023,VYS,62.0,39.0,39315.0,17300,6.9,115.41,0.350299,...,885,865,769,36588.0,60.0,57.0,2182.0,779.0,917.0,992.0


In [399]:
df3 = df3.fillna(0)
df3

Unnamed: 0,month,year,kraj,general_thefts,break_in_thefts,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,m_do_65_ratio,...,noveHlasenaAUvolnenaVPM,obsazenaAZrusenaVPM,absolventiSkolAMladistvi,avg_monthly_salary_prev_year,general_thefts_prev_year,break_in_thefts_prev_year,noveHlaseniUchazeci_prev_year,absolventiSkolAMladistvi_prev_year,noveHlasenaAUvolnenaVPM_prev_year,obsazenaAZrusenaVPM_prev_year
0,1,2009,HKK,271.0,174.0,19132.0,8000,2.2,89.45,0.000000,...,1417,2948,1221,19132.0,271.0,174.0,4781.0,1221.0,1417.0,2948.0
1,1,2009,JHC,275.0,181.0,19576.0,8000,2.2,89.45,0.000000,...,1053,2289,1512,19576.0,275.0,181.0,5538.0,1512.0,1053.0,2289.0
2,1,2009,JHM,909.0,286.0,21065.0,8000,2.2,89.45,0.000000,...,3140,4985,3078,21065.0,909.0,286.0,9824.0,3078.0,3140.0,4985.0
3,1,2009,KVK,198.0,96.0,18652.0,8000,2.2,89.45,0.000000,...,496,1050,960,18652.0,198.0,96.0,3325.0,960.0,496.0,1050.0
4,1,2009,LBK,343.0,130.0,19653.0,8000,2.2,89.45,0.000000,...,1134,1665,1126,19653.0,343.0,130.0,4814.0,1126.0,1134.0,1665.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,PLK,145.0,130.0,41220.0,17300,6.9,115.41,0.277383,...,1473,1221,809,38290.0,142.0,158.0,2411.0,734.0,1353.0,1529.0
2474,9,2023,STC,262.0,254.0,42990.0,17300,6.9,115.41,0.338071,...,2502,1777,2149,39839.0,274.0,310.0,6165.0,1913.0,3029.0,3157.0
2475,9,2023,ULK,243.0,200.0,39664.0,17300,6.9,115.41,0.430108,...,1380,1217,2084,36731.0,250.0,227.0,4932.0,1812.0,1279.0,1022.0
2476,9,2023,VYS,62.0,39.0,39315.0,17300,6.9,115.41,0.350299,...,885,865,769,36588.0,60.0,57.0,2182.0,779.0,917.0,992.0


In [400]:
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")

obj_cols = df3.select_dtypes('object')
encoder.fit(obj_cols)

df3 = transform_df(df3, encoder=encoder)

### Dataset version 3.1 -predicting uchazeciOZamestnaniUoZZeny, all rows, imputation on refugee rows in years before, without ratios -> big refugee model

In [56]:
drop_cols2 = [f"celkem_w{w}" for w in range(2,20)]
#drop_cols2 += [f"m_do_65_w{w}" for w in range(2,19)]
#drop_cols2 += [f"z_do_65_w{w}" for w in range(2,19)]
drop_cols2 += [f"m_do_65_w{w}_ratio" for w in range(2,20)]
drop_cols2 += [f"z_do_65_w{w}_ratio" for w in range(2,20)]
drop_cols2 += ["celkem", "m_do_65_ratio", "z_do_65_ratio", "uchazeciOZamestnaniUoZ", "uchazeciOZamestnaniUoZZeny_ratio", "uchazeciOZamestnaniUoZMuzi", "uchazeciOZamestnaniUoZMuzi_ratio"]

In [57]:
df31 = orig_df.drop(columns=drop_cols2)
df31

Unnamed: 0,month,year,kraj,general_thefts,break_in_thefts,avg_monthly_salary,m_do_65,z_do_65,m_do_65_w2,m_do_65_w3,...,noveHlasenaAUvolnenaVPM,obsazenaAZrusenaVPM,absolventiSkolAMladistvi,avg_monthly_salary_prev_year,general_thefts_prev_year,break_in_thefts_prev_year,noveHlaseniUchazeci_prev_year,absolventiSkolAMladistvi_prev_year,noveHlasenaAUvolnenaVPM_prev_year,obsazenaAZrusenaVPM_prev_year
0,1,2009,HKK,271.0,174.0,19132.0,,,,,...,1417,2948,1221,19132.0,271.0,174.0,4781.0,1221.0,1417.0,2948.0
1,1,2009,JHC,275.0,181.0,19576.0,,,,,...,1053,2289,1512,19576.0,275.0,181.0,5538.0,1512.0,1053.0,2289.0
2,1,2009,JHM,909.0,286.0,21065.0,,,,,...,3140,4985,3078,21065.0,909.0,286.0,9824.0,3078.0,3140.0,4985.0
3,1,2009,KVK,198.0,96.0,18652.0,,,,,...,496,1050,960,18652.0,198.0,96.0,3325.0,960.0,496.0,1050.0
4,1,2009,LBK,343.0,130.0,19653.0,,,,,...,1134,1665,1126,19653.0,343.0,130.0,4814.0,1126.0,1134.0,1665.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,PLK,145.0,130.0,41220.0,195.0,386.0,374.0,591.0,...,1473,1221,809,38290.0,142.0,158.0,2411.0,734.0,1353.0,1529.0
2474,9,2023,STC,262.0,254.0,42990.0,333.0,502.0,637.0,1052.0,...,2502,1777,2149,39839.0,274.0,310.0,6165.0,1913.0,3029.0,3157.0
2475,9,2023,ULK,243.0,200.0,39664.0,80.0,69.0,191.0,349.0,...,1380,1217,2084,36731.0,250.0,227.0,4932.0,1812.0,1279.0,1022.0
2476,9,2023,VYS,62.0,39.0,39315.0,117.0,163.0,227.0,364.0,...,885,865,769,36588.0,60.0,57.0,2182.0,779.0,917.0,992.0


In [58]:
df31 = df31.fillna(0)
df31

Unnamed: 0,month,year,kraj,general_thefts,break_in_thefts,avg_monthly_salary,m_do_65,z_do_65,m_do_65_w2,m_do_65_w3,...,noveHlasenaAUvolnenaVPM,obsazenaAZrusenaVPM,absolventiSkolAMladistvi,avg_monthly_salary_prev_year,general_thefts_prev_year,break_in_thefts_prev_year,noveHlaseniUchazeci_prev_year,absolventiSkolAMladistvi_prev_year,noveHlasenaAUvolnenaVPM_prev_year,obsazenaAZrusenaVPM_prev_year
0,1,2009,HKK,271.0,174.0,19132.0,0.0,0.0,0.0,0.0,...,1417,2948,1221,19132.0,271.0,174.0,4781.0,1221.0,1417.0,2948.0
1,1,2009,JHC,275.0,181.0,19576.0,0.0,0.0,0.0,0.0,...,1053,2289,1512,19576.0,275.0,181.0,5538.0,1512.0,1053.0,2289.0
2,1,2009,JHM,909.0,286.0,21065.0,0.0,0.0,0.0,0.0,...,3140,4985,3078,21065.0,909.0,286.0,9824.0,3078.0,3140.0,4985.0
3,1,2009,KVK,198.0,96.0,18652.0,0.0,0.0,0.0,0.0,...,496,1050,960,18652.0,198.0,96.0,3325.0,960.0,496.0,1050.0
4,1,2009,LBK,343.0,130.0,19653.0,0.0,0.0,0.0,0.0,...,1134,1665,1126,19653.0,343.0,130.0,4814.0,1126.0,1134.0,1665.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,PLK,145.0,130.0,41220.0,195.0,386.0,374.0,591.0,...,1473,1221,809,38290.0,142.0,158.0,2411.0,734.0,1353.0,1529.0
2474,9,2023,STC,262.0,254.0,42990.0,333.0,502.0,637.0,1052.0,...,2502,1777,2149,39839.0,274.0,310.0,6165.0,1913.0,3029.0,3157.0
2475,9,2023,ULK,243.0,200.0,39664.0,80.0,69.0,191.0,349.0,...,1380,1217,2084,36731.0,250.0,227.0,4932.0,1812.0,1279.0,1022.0
2476,9,2023,VYS,62.0,39.0,39315.0,117.0,163.0,227.0,364.0,...,885,865,769,36588.0,60.0,57.0,2182.0,779.0,917.0,992.0


In [59]:
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")

obj_cols = df31.select_dtypes('object')
encoder.fit(obj_cols)

df31 = transform_df(df31, encoder=encoder)

### Dataset version 4 - predicting uchazeciOZamestnaniUoZZeny, refugee rows -> small economic model

In [78]:
drop_cols = [f"celkem_w{w}" for w in range(2,20)]
drop_cols += [f"m_do_65_w{w}" for w in range(2,20)]
drop_cols += [f"z_do_65_w{w}" for w in range(2,20)]
drop_cols += [f"m_do_65_w{w}_ratio" for w in range(2,20)]
drop_cols += [f"z_do_65_w{w}_ratio" for w in range(2,20)]
drop_cols += ["celkem", "m_do_65", "z_do_65","m_do_65_ratio", "z_do_65_ratio", "uchazeciOZamestnaniUoZ",	"uchazeciOZamestnaniUoZZeny_ratio", "uchazeciOZamestnaniUoZMuzi", "uchazeciOZamestnaniUoZMuzi_ratio"]

In [79]:
df4 = orig_df.drop(columns=drop_cols)
df4 = df4[(df4.year > 2022) | ((df4.year == 2022) & (df4.month > 2))]
df4

Unnamed: 0,month,year,kraj,general_thefts,break_in_thefts,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,bilance,...,noveHlasenaAUvolnenaVPM,obsazenaAZrusenaVPM,absolventiSkolAMladistvi,avg_monthly_salary_prev_year,general_thefts_prev_year,break_in_thefts_prev_year,noveHlaseniUchazeci_prev_year,absolventiSkolAMladistvi_prev_year,noveHlasenaAUvolnenaVPM_prev_year,obsazenaAZrusenaVPM_prev_year
2212,3,2022,HKK,60.0,53.0,34689.0,16200,12.7,107.78,-12.741,...,2010,2094,527,33032.0,53.0,32.0,1412.0,555.0,2091.0,1775.0
2213,3,2022,JHC,113.0,71.0,33998.0,16200,12.7,107.78,-12.741,...,3249,4076,521,32213.0,75.0,68.0,1852.0,727.0,3097.0,2820.0
2214,3,2022,JHM,224.0,327.0,37027.0,16200,12.7,107.78,-12.741,...,6761,6142,1624,34989.0,137.0,193.0,3620.0,1751.0,5790.0,5105.0
2215,3,2022,KVK,54.0,63.0,32424.0,16200,12.7,107.78,-12.741,...,2096,1588,372,30189.0,56.0,55.0,1166.0,457.0,1241.0,1148.0
2216,3,2022,LBK,129.0,116.0,33745.0,16200,12.7,107.78,-12.741,...,3192,3957,566,31956.0,72.0,55.0,1547.0,601.0,2858.0,2790.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,PLK,145.0,130.0,41220.0,17300,6.9,115.41,11.874,...,1473,1221,809,38290.0,142.0,158.0,2411.0,734.0,1353.0,1529.0
2474,9,2023,STC,262.0,254.0,42990.0,17300,6.9,115.41,11.874,...,2502,1777,2149,39839.0,274.0,310.0,6165.0,1913.0,3029.0,3157.0
2475,9,2023,ULK,243.0,200.0,39664.0,17300,6.9,115.41,11.874,...,1380,1217,2084,36731.0,250.0,227.0,4932.0,1812.0,1279.0,1022.0
2476,9,2023,VYS,62.0,39.0,39315.0,17300,6.9,115.41,11.874,...,885,865,769,36588.0,60.0,57.0,2182.0,779.0,917.0,992.0


In [80]:
df4.columns

Index(['month', 'year', 'kraj', 'general_thefts', 'break_in_thefts',
       'avg_monthly_salary', 'monthly_min_wage',
       'monthly_inflation_rate_wrt_last_year', 'reer', 'bilance',
       'avg_energy_price', 'avg_gasoline_price', 'avg_natural_gas_price',
       'uchazeciOZamestnaniUoZZeny', 'noveHlaseniUchazeci',
       'noveHlasenaAUvolnenaVPM', 'obsazenaAZrusenaVPM',
       'absolventiSkolAMladistvi', 'avg_monthly_salary_prev_year',
       'general_thefts_prev_year', 'break_in_thefts_prev_year',
       'noveHlaseniUchazeci_prev_year', 'absolventiSkolAMladistvi_prev_year',
       'noveHlasenaAUvolnenaVPM_prev_year', 'obsazenaAZrusenaVPM_prev_year'],
      dtype='object')

In [81]:
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")

obj_cols = df4.select_dtypes('object')
encoder.fit(obj_cols)

df4 = transform_df(df4, encoder=encoder)

In [82]:
df4

Unnamed: 0,month,year,general_thefts,break_in_thefts,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,bilance,avg_energy_price,...,kraj_LBK,kraj_MSK,kraj_OLK,kraj_PAK,kraj_PHA,kraj_PLK,kraj_STC,kraj_ULK,kraj_VYS,kraj_ZLK
2212,3,2022,60.0,53.0,34689.0,16200,12.7,107.78,-12.741,187.10,...,False,False,False,False,False,False,False,False,False,False
2213,3,2022,113.0,71.0,33998.0,16200,12.7,107.78,-12.741,187.10,...,False,False,False,False,False,False,False,False,False,False
2214,3,2022,224.0,327.0,37027.0,16200,12.7,107.78,-12.741,187.10,...,False,False,False,False,False,False,False,False,False,False
2215,3,2022,54.0,63.0,32424.0,16200,12.7,107.78,-12.741,187.10,...,False,False,False,False,False,False,False,False,False,False
2216,3,2022,129.0,116.0,33745.0,16200,12.7,107.78,-12.741,187.10,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,145.0,130.0,41220.0,17300,6.9,115.41,11.874,138.98,...,False,False,False,False,False,True,False,False,False,False
2474,9,2023,262.0,254.0,42990.0,17300,6.9,115.41,11.874,138.98,...,False,False,False,False,False,False,True,False,False,False
2475,9,2023,243.0,200.0,39664.0,17300,6.9,115.41,11.874,138.98,...,False,False,False,False,False,False,False,True,False,False
2476,9,2023,62.0,39.0,39315.0,17300,6.9,115.41,11.874,138.98,...,False,False,False,False,False,False,False,False,True,False


## Ridge Regression

### df1

In [97]:
n_splits = len(df2) - 1
n_splits
y_col_name = "uchazeciOZamestnaniUoZZeny_ratio"
X = df1.drop(columns=y_col_name).to_numpy()
y = df1[y_col_name].to_numpy()

q = 1
exps = np.linspace(0, n_splits-1, num=n_splits)
weights = np.flip(np.power(q, exps))
weights

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [98]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=1)

def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X[train_index, :])
        X_test = scaler.transform(X[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = Ridge(alpha=alpha, random_state=SEED)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x
display(res)


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )


In [104]:
rmses = []
maes = []
for train_index, test_index in tscv.split(X):
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X[train_index, :])
    X_test = scaler.transform(X[test_index, :])
    y_train = y[train_index]
    y_test = y[test_index]
    tmp_ridge = Ridge(alpha=best_alpha, random_state=SEED)
    tmp_ridge.fit(X_train, y_train)
    y_pred = tmp_ridge.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mae = mean_absolute_error(y_test, y_pred)
    print("pred", tmp_ridge.predict(X_test),
          "test:", y_test, "rmse", rmse, "mae", mae)
    rmses.append(rmse)
    maes.append(mae)

weighted_rmse = np.average(rmses, weights=weights)
weighted_mae = np.average(maes, weights=weights)
print(weighted_rmse)
print(weighted_mae)

pred [0.52503995] test: [0.52069956] rmse 0.004340394110931012 mae 0.004340394110931012
pred [0.5162329] test: [0.51190695] rmse 0.004325946848066309 mae 0.004325946848066309
pred [0.49292837] test: [0.53235933] rmse 0.03943095884244707 mae 0.03943095884244707
pred [0.52931933] test: [0.53235796] rmse 0.003038627351448664 mae 0.003038627351448664
pred [0.49777629] test: [0.48715286] rmse 0.010623423629164519 mae 0.010623423629164519
pred [0.50681871] test: [0.50655175] rmse 0.00026696223861355506 mae 0.00026696223861355506
pred [0.51373343] test: [0.51663405] rmse 0.0029006161104331296 mae 0.0029006161104331296
pred [0.50905451] test: [0.53051357] rmse 0.021459057489093714 mae 0.021459057489093714
pred [0.53307562] test: [0.53394841] rmse 0.0008727860771845508 mae 0.0008727860771845508
pred [0.54000142] test: [0.54000142] rmse 2.5654922630735655e-11 mae 2.5654922630735655e-11
pred [0.52075008] test: [0.54799567] rmse 0.027245591060119367 mae 0.027245591060119367
pred [0.51932986] test:

In [78]:
res

 message: Solution found.
 success: True
  status: 0
     fun: [ 5.378e+00]
       x: [ 4.239e+01]
     nit: 33
    nfev: 33

In [146]:
from sklearn.model_selection import train_test_split
import statsmodels.api as sma

X = df1.drop(columns=y_col_name).to_numpy()
X = sma.add_constant(X)
y = df1[y_col_name].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=84, shuffle=False)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [147]:
y_train

array([0.48984919, 0.50225608, 0.50530578, ..., 0.56236532, 0.51740295,
       0.50510294])

In [148]:
X_train

array([[ 0.        , -1.56048665, -1.61150444, ..., -0.2773501 ,
        -0.2773501 , -0.2773501 ],
       [ 0.        , -1.56048665, -1.61150444, ..., -0.2773501 ,
        -0.2773501 , -0.2773501 ],
       [ 0.        , -1.56048665, -1.61150444, ..., -0.2773501 ,
        -0.2773501 , -0.2773501 ],
       ...,
       [ 0.        , -0.98477313,  1.79056049, ...,  3.60555128,
        -0.2773501 , -0.2773501 ],
       [ 0.        , -0.98477313,  1.79056049, ..., -0.2773501 ,
         3.60555128, -0.2773501 ],
       [ 0.        , -0.98477313,  1.79056049, ..., -0.2773501 ,
        -0.2773501 ,  3.60555128]])

In [149]:
best_alpha

31.333883214358604

In [150]:
from sklearn.metrics import r2_score


model = sma.OLS(endog=y_train, exog=X_train)
#fit = model.fit_regularized(alpha=best_alpha, L1_wt= 0)
fit = model.fit()
cmp = y_test, fit.predict(X_test)
mse = mean_squared_error(*cmp)
mae = mean_absolute_error(*cmp)
r2 = r2_score(*cmp)

print(mse, mae, r2)
fit.summary()

0.306420352563224 0.553398020485247 -775.8103348602441


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.002
Model:,OLS,Adj. R-squared (uncentered):,-0.01
Method:,Least Squares,F-statistic:,0.1635
Date:,"Tue, 26 Dec 2023",Prob (F-statistic):,1.0
Time:,13:29:40,Log-Likelihood:,-1795.9
No. Observations:,2394,AIC:,3650.0
Df Residuals:,2365,BIC:,3818.0
Df Model:,29,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.914e-19,2.43e-18,-0.120,0.905,-5.07e-18,4.48e-18
x1,0.0156,0.018,0.862,0.389,-0.020,0.051
x2,0.0144,0.069,0.210,0.834,-0.120,0.149
x3,-0.0020,0.030,-0.066,0.947,-0.061,0.057
x4,-0.0049,0.025,-0.192,0.848,-0.055,0.045
x5,-0.0203,0.085,-0.240,0.811,-0.187,0.146
x6,-0.0035,0.094,-0.038,0.970,-0.187,0.180
x7,0.0084,0.030,0.275,0.783,-0.051,0.068
x8,0.0004,0.045,0.008,0.994,-0.087,0.088

0,1,2,3
Omnibus:,11.072,Durbin-Watson:,0.001
Prob(Omnibus):,0.004,Jarque-Bera (JB):,11.196
Skew:,-0.167,Prob(JB):,0.00371
Kurtosis:,2.965,Cond. No.,1.5e+16


### df12 (big economic)

In [441]:
test_size = 14 #kraje (1 for previous results)

n_splits = int(len(df2) / test_size) - 1
n_splits
y_col_name = "uchazeciOZamestnaniUoZZeny"
X = df12.drop(columns=y_col_name).to_numpy()
y = df12[y_col_name].to_numpy()

q = 1
exps = np.linspace(0, n_splits-1, num=n_splits)
weights = np.flip(np.power(q, exps))
weights

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1.])

In [421]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X[train_index, :])
        X_test = scaler.transform(X[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = Ridge(alpha=alpha, random_state=SEED)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x
display(res)


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )


 message: Solution found.
 success: True
  status: 0
     fun: 1613.0780252662955
       x: 174.593881316761
     nit: 14
    nfev: 14

In [345]:
#orig
eval_tscv(tscv, best_alpha, X, y, weights)

pred [4318.12112924] test: [6163] rmse 1844.8788707649655 mae 1844.8788707649655
[ -354.0232617     55.12018157  -531.05074921   125.72496106
   372.1335161   -543.52013927  -244.7116453  -1370.36712532
   177.41447709   208.12460256   896.10297077  -110.88255759
  1079.77117507    72.35219627  -262.45326212  3591.70905819
  -271.23894394  1656.89260384  -530.95900102  -128.46683345
  2767.3601654    256.33167428  -510.94112719  1238.67338213
  -234.15852396  1458.74858806  1913.29153155  -432.93319705
  -183.85775562]
14668.425666516066

pred [16650.19766437] test: [16681] rmse 30.802335627558932 mae 30.802335627558932
[ -354.24455149    55.7318608   -529.58565493   124.31901909
   369.78989968  -542.23723453  -239.22923111 -1375.4007729
   177.7284007    209.05276782   897.25457221  -109.09079557
  1078.94815367    71.17327067  -260.82185415  3592.03798755
  -269.1606981   1656.82721364  -530.95008837  -128.44802676
  2767.05765865   256.21499876  -511.00886235  1239.34484479
  -234.

In [391]:
#seasonal lagged
eval_tscv(tscv, best_alpha, X, y, weights)

pred [4416.47905979] test: [6163] rmse 1746.5209402129076 mae 1746.5209402129076
[-432.85213807   24.99843051 -284.49567169  188.74934351  154.69416548
 -380.45814759 -265.76890671 -939.15847081  165.69336428   23.20626042
  725.66775919  -32.00975119  684.07714015   39.60848329 -196.79467343
 2351.86378968  482.4508369    65.12092962   -3.32172771  733.82786794
 1842.47383971    3.74108379 -132.90436406 -313.01500019 1169.64944228
 -436.43782675 -142.54702374 2125.5097793   126.53465458 -485.93278585
  738.93454682 -233.82942698 1049.98466063 1534.60399034 -443.68211229
 -213.32554644]
14668.425666516061

pred [15859.52455417] test: [16681] rmse 821.4754458269745 mae 821.4754458269745
[-433.0635024    25.74210827 -283.75769151  187.70489299  153.60639516
 -379.55011799 -261.19739203 -942.56621692  165.89702071   23.59280989
  725.98866305  -30.54315975  683.55769038   39.04041913 -195.31466164
 2351.99230497  481.05321931   66.2766041    -3.36014932  733.28263592
 1843.10368134    3.6

In [422]:
#test size 14 -> no leakage?, lagged seasonal
eval_tscv(tscv, best_alpha, X, y, weights)

pred [ 5403.99734003  4352.13824005 15816.30024464  2636.24719065
  4891.85569022 20455.06171999  6920.11450584  2803.93315485
 11492.95524361  3984.71002134 14373.98221614 16462.23911937
  3508.90313035  4220.1743681 ] test: [ 5178  5847 16277  4202  5945 20046  7026  4416 12774  5884 14967 16129
  4939  5083] rmse 1112.804369405609 mae 951.9274409720499
[-4.78984345e+02 -3.02667226e+01 -3.84569102e+02  2.10315621e+02
  1.40303431e+02 -4.03263868e+02 -2.55077635e+02 -1.07581303e+03
  1.69391799e+02  6.87791371e+01  7.59840896e+02 -3.92853659e+01
  5.14695595e+02  2.26823893e+00 -1.98243044e+02  2.51971504e+03
  5.81675760e+02  8.66509765e+01 -7.89209570e+01  5.95487467e+02
  1.75436297e+03 -3.49829269e+01 -1.28544408e+02 -2.43544310e+02
  1.38701159e+03 -4.17719190e+02 -8.87049505e+01  2.41769355e+03
  2.09809334e+02 -4.45740519e+02  9.59154638e+02 -1.73589711e+02
  1.29611914e+03  1.73137258e+03 -4.22355312e+02 -1.65593921e+02]
14639.297843665787

pred [ 5137.18979227  4100.57611953 

huber regressor

In [445]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X[train_index, :])
        X_test = scaler.transform(X[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = HuberRegressor(alpha=alpha)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x
display(res)


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )


 message: Solution found.
 success: True
  status: 0
     fun: 1398.078744342449
       x: 0.06939024445873626
     nit: 35
    nfev: 35

In [447]:
eval_tscv_huber(tscv, best_alpha, X, y, weights)

pred [ 5210.07822198  4718.1050757  16323.07166229  2860.16763846
  5217.75497582 20502.10732869  7133.58244137  3134.46610421
 11735.28660616  4437.16765053 14842.49030156 16907.54353407
  3817.85617643  4445.53611229] test: [ 5178  5847 16277  4202  5945 20046  7026  4416 12774  5884 14967 16129
  4939  5083] rmse 883.4226264883124 mae 733.4680390861886
[ -501.59490415   -96.14187781  -368.53385225   272.44299026
    77.99911852  -293.55102989  -250.42107065 -1038.32317277
   177.3456885    146.85035004   613.1508484    -71.36730998
   326.94778465   -60.70296361  -167.650859    2712.5976394
   633.10377886    60.43791434   -71.14349294   503.65609908
  1793.44280482   -91.31672968   -27.4559627    -99.45404782
  1529.92318435  -320.10643866    34.00451821  2498.67925905
   322.84365572  -330.4973413   1051.14450438   -43.67127589
  1423.76934786  1877.15815144  -302.20060146   -56.90619461]
14459.7045855671

pred [ 4891.05314209  4446.79106155 15596.78177121  2687.71794458
  4945.46

### Ridge Regression df2

In [None]:
n_splits = len(df2) - 1
n_splits
y_col_name = "uchazeciOZamestnaniUoZZeny_ratio"
X = df2.drop(columns=y_col_name)
y = df2[y_col_name].to_numpy()

q = 1
exps = np.linspace(0, n_splits-1, num=n_splits)
weights = np.flip(np.power(q, exps))
weights

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [None]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=1)

# do only every 14 -> do not leak data from other kraj in the month


def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X.iloc[train_index, :])
        X_test = scaler.transform(X.iloc[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = Ridge(alpha=alpha, random_state=SEED)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )


In [None]:
display(res)

 message: Solution found.
 success: True
  status: 0
     fun: 0.021607479938802538
       x: 999.999970046681
     nit: 36
    nfev: 36

In [None]:
eval_tscv(tscv, best_alpha, X, y)

pred [0.50701135] test: [0.52069956] rmse 0.01368820895008993 mae 0.01368820895008993
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
pred [0.48674641] test: [0.51190695] rmse 0.025160547459305405 mae 0.025160547459305405
[ 0.          0.          0.00011098  0.00011098 -0.00011098  0.
  0.          0.          0.00011098  0.00011098  0.00011098  0.00011098
  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098
  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098
  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098
  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098
  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098
  0.00011098  0.00011098  0.00011098  0.00011098  0.          0.
  0.          0.          0.00011098  0.00011098  0.0

In [None]:
from sklearn.model_selection import train_test_split
import statsmodels.api as sma

X = df2.drop(columns=y_col_name).to_numpy(dtype=float)
X = sma.add_constant(X)
y = df2[y_col_name].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=84, shuffle=False)

#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)
X

array([[1.000e+00, 3.000e+00, 2.022e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+00, 3.000e+00, 2.022e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+00, 3.000e+00, 2.022e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [1.000e+00, 9.000e+00, 2.023e+03, ..., 1.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+00, 9.000e+00, 2.023e+03, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [1.000e+00, 9.000e+00, 2.023e+03, ..., 0.000e+00, 0.000e+00,
        1.000e+00]])

In [None]:
from sklearn.metrics import r2_score


#model = sma.OLS(endog=y_train, exog=X_train)
model = sma.RLM(endog=y_train, exog=X_train, M=sma.robust.norms.HuberT())
#fit = model.fit_regularized(alpha=best_alpha, L1_wt= 0)
fit = model.fit()
cmp = y_test, fit.predict(X_test)
mse = mean_squared_error(*cmp)
mae = mean_absolute_error(*cmp)
r2 = r2_score(*cmp)

print(mse, mae, r2)
fit.summary()

2.5723027266328016 0.6737336793644056 -6520.079052754033


0,1,2,3
Dep. Variable:,y,No. Observations:,182.0
Model:,RLM,Df Residuals:,127.0
Method:,IRLS,Df Model:,54.0
Norm:,HuberT,,
Scale Est.:,mad,,
Cov Type:,H1,,
Date:,"Tue, 26 Dec 2023",,
Time:,14:23:39,,
No. Iterations:,50,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,2.1e-07,1.29e-07,1.623,0.105,-4.37e-08,4.64e-07
x1,-0.0028,0.003,-0.822,0.411,-0.009,0.004
x2,0.0004,0.000,1.623,0.105,-8.76e-05,0.001
x3,1.625e-05,2.45e-05,0.664,0.507,-3.17e-05,6.43e-05
x4,1.422e-05,2.62e-05,0.544,0.587,-3.7e-05,6.55e-05
x5,2.022e-06,1.32e-06,1.526,0.127,-5.75e-07,4.62e-06
x6,5.545e-06,5.13e-05,0.108,0.914,-9.5e-05,0.000
x7,0.0053,0.004,1.208,0.227,-0.003,0.014
x8,-0.0065,0.006,-1.100,0.271,-0.018,0.005


In [None]:
df2.columns

Index(['month', 'year', 'general_thefts', 'break_in_thefts',
       'avg_monthly_salary', 'monthly_min_wage',
       'monthly_inflation_rate_wrt_last_year', 'reer', 'm_do_65_ratio',
       'z_do_65_ratio', 'm_do_65_w2_ratio', 'm_do_65_w3_ratio',
       'm_do_65_w4_ratio', 'm_do_65_w5_ratio', 'm_do_65_w6_ratio',
       'm_do_65_w7_ratio', 'm_do_65_w8_ratio', 'm_do_65_w9_ratio',
       'm_do_65_w10_ratio', 'm_do_65_w11_ratio', 'm_do_65_w12_ratio',
       'm_do_65_w13_ratio', 'm_do_65_w14_ratio', 'm_do_65_w15_ratio',
       'm_do_65_w16_ratio', 'm_do_65_w17_ratio', 'm_do_65_w18_ratio',
       'm_do_65_w19_ratio', 'z_do_65_w2_ratio', 'z_do_65_w3_ratio',
       'z_do_65_w4_ratio', 'z_do_65_w5_ratio', 'z_do_65_w6_ratio',
       'z_do_65_w7_ratio', 'z_do_65_w8_ratio', 'z_do_65_w9_ratio',
       'z_do_65_w10_ratio', 'z_do_65_w11_ratio', 'z_do_65_w12_ratio',
       'z_do_65_w13_ratio', 'z_do_65_w14_ratio', 'z_do_65_w15_ratio',
       'z_do_65_w16_ratio', 'z_do_65_w17_ratio', 'z_do_65_w18_ratio'

### Ridge Regression df22 (small refugee)

In [449]:
test_size = 14 #kraje (1 for previous results)
n_splits = int(len(df2) / test_size) - 1

y_col_name = "uchazeciOZamestnaniUoZZeny"
X = df22.drop(columns=y_col_name)
y = df22[y_col_name].to_numpy()

q = 1
exps = np.linspace(0, n_splits-1, num=n_splits)
weights = np.flip(np.power(q, exps))
weights

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1.])

In [429]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X.iloc[train_index, :])
        X_test = scaler.transform(X.iloc[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = Ridge(alpha=alpha, random_state=SEED)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )


In [430]:
display(res)

 message: Solution found.
 success: True
  status: 0
     fun: 2578.244746775478
       x: 137.0231113314598
     nit: 16
    nfev: 16

In [431]:
#test size 14 + lagged seasonal q 1
eval_tscv(tscv, best_alpha, X, y, weights=weights)

pred [ 4631.37511471  4016.63652812  7965.36545987  5100.97376651
  6010.80324301 11687.6743519   5205.66009983  2452.3807791
  9175.45879403  2336.02634815  9584.33575182  8580.41595545
  1603.54829773  3184.83852314] test: [ 5178  5847 16277  4202  5945 20046  7026  4416 12774  5884 14967 16129
  4939  5083] rmse 4446.474475447661 mae 3507.6472146910014
[  0.           0.         148.14834349 190.47607889  62.72928721
   0.           0.           0.         -49.50337593  -4.50037686
 -49.50337593 -49.50337593 -49.50337593 -49.50337593 -49.50337593
 -49.50337593 -49.50337593 -49.50337593 -49.50337593 -49.50337593
 -49.50337593 -49.50337593 -49.50337593 -49.50337593 -49.50337593
 -49.50337593 -49.50337593 -49.50337593  -4.50037686  -4.50037686
  -4.50037686  -4.50037686  -4.50037686  -4.50037686  -4.50037686
  -4.50037686  -4.50037686  -4.50037686  -4.50037686  -4.50037686
  -4.50037686  -4.50037686  -4.50037686  -4.50037686  -4.50037686
  -4.50037686   0.           0.           0.    

In [427]:
#test size 14 + lagged seasonal q 0.95
eval_tscv(tscv, best_alpha, X, y, weights=weights)

pred [ 4938.33077722  4349.63608464  7743.99615034  5527.67931834
  6346.94656295 11257.05691809  5484.72920598  2882.71414977
  9033.27113073  2762.29211977  9346.21974107  8394.67600564
  1974.74109353  3554.47276697] test: [ 5178  5847 16277  4202  5945 20046  7026  4416 12774  5884 14967 16129
  4939  5083] rmse 4514.080830174226 mae 3469.392124109729
[  0.           0.         130.27757547 166.38795744  59.19543471
   0.           0.           0.         -47.78021046  -2.42320914
 -47.78021046 -47.78021046 -47.78021046 -47.78021046 -47.78021046
 -47.78021046 -47.78021046 -47.78021046 -47.78021046 -47.78021046
 -47.78021046 -47.78021046 -47.78021046 -47.78021046 -47.78021046
 -47.78021046 -47.78021046 -47.78021046  -2.42320914  -2.42320914
  -2.42320914  -2.42320914  -2.42320914  -2.42320914  -2.42320914
  -2.42320914  -2.42320914  -2.42320914  -2.42320914  -2.42320914
  -2.42320914  -2.42320914  -2.42320914  -2.42320914  -2.42320914
  -2.42320914   0.           0.           0.    

In [388]:
#lagged seasonal
eval_tscv(tscv, best_alpha, X, y, weights=weights)

pred [5315.] test: [6163] rmse 848.0 mae 848.0
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]
5315.0

pred [4488.74383093] test: [16681] rmse 12192.256169065251 mae 12192.256169065251
[ 0.          0.          6.19593607  6.19593607 -6.19593607  0.
  0.          0.          6.19593607  6.19593607  6.19593607  6.19593607
  6.19593607  6.19593607  6.19593607  6.19593607  6.19593607  6.19593607
  6.19593607  6.19593607  6.19593607  6.19593607  6.19593607  6.19593607
  6.19593607  6.19593607  6.19593607  6.19593607  6.19593607  6.19593607
  6.19593607  6.19593607  6.19593607  6.19593607  6.19593607  6.19593607
  6.19593607  6.19593607  6.19593607  6.19593607  6.19593607  6.19593607
  6.19593607  6.19593607  6.19593607  6.19593607  0.          0.
  0.          0.          6.19593607  6.19593607  6.19593607 -6.1959

huber

In [451]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X.iloc[train_index, :])
        X_test = scaler.transform(X.iloc[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = HuberRegressor(alpha=alpha)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x
display(res)


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )


 message: Solution found.
 success: True
  status: 0
     fun: 3020.426686326278
       x: 0.10237466121072303
     nit: 35
    nfev: 35

In [452]:
eval_tscv_huber(tscv, best_alpha, X, y, weights)

pred [5190.13876516 4839.16271896 6735.93196058 5738.53780646 6266.83338922
 8953.28913396 5575.73535219 3801.18143523 8354.14838635 3966.51012366
 8098.47258043 7193.42316753 2989.57467024 4250.57791712] test: [ 5178  5847 16277  4202  5945 20046  7026  4416 12774  5884 14967 16129
  4939  5083] rmse 5173.891428520332 mae 3607.178751041881
[ 0.00000000e+00  0.00000000e+00  9.06457808e+01  1.08310873e+02
  6.75526649e+01  0.00000000e+00  3.25488613e-16  2.60390891e-15
 -3.28020369e+01  3.33870244e+00 -3.28020369e+01 -3.28020369e+01
 -3.28020369e+01 -3.28020369e+01 -3.28020369e+01 -3.28020369e+01
 -3.28020369e+01 -3.28020369e+01 -3.28020369e+01 -3.28020369e+01
 -3.28020369e+01 -3.28020369e+01 -3.28020369e+01 -3.28020369e+01
 -3.28020369e+01 -3.28020369e+01 -3.28020369e+01 -3.28020369e+01
  3.33870244e+00  3.33870244e+00  3.33870244e+00  3.33870244e+00
  3.33870244e+00  3.33870244e+00  3.33870244e+00  3.33870244e+00
  3.33870244e+00  3.33870244e+00  3.33870244e+00  3.33870244e+00
  3.338

other

In [214]:
from sklearn.model_selection import train_test_split
import statsmodels.api as sma

X = df22.drop(columns=y_col_name).to_numpy(dtype=float)
X = sma.add_constant(X)
y = df22[y_col_name].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=84, shuffle=False)

#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)
X

array([[1.000e+00, 3.000e+00, 2.022e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+00, 3.000e+00, 2.022e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+00, 3.000e+00, 2.022e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [1.000e+00, 9.000e+00, 2.023e+03, ..., 1.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+00, 9.000e+00, 2.023e+03, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [1.000e+00, 9.000e+00, 2.023e+03, ..., 0.000e+00, 0.000e+00,
        1.000e+00]])

In [215]:
from sklearn.metrics import r2_score


#model = sma.OLS(endog=y_train, exog=X_train)
model = sma.RLM(endog=y_train, exog=X_train, M=sma.robust.norms.HuberT())
#fit = model.fit_regularized(alpha=best_alpha, L1_wt= 0)
fit = model.fit()
cmp = y_test, fit.predict(X_test)
mse = mean_squared_error(*cmp)
mae = mean_absolute_error(*cmp)
r2 = r2_score(*cmp)

print(mse, mae, r2)
fit.summary()

754650964.9366258 14904.609519244133 -21.388132825907917


0,1,2,3
Dep. Variable:,y,No. Observations:,182.0
Model:,RLM,Df Residuals:,127.0
Method:,IRLS,Df Model:,54.0
Norm:,HuberT,,
Scale Est.:,mad,,
Cov Type:,H1,,
Date:,"Tue, 26 Dec 2023",,
Time:,15:04:43,,
No. Iterations:,43,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.0008,0.004,-0.212,0.832,-0.008,0.007
x1,-45.3331,97.774,-0.464,0.643,-236.966,146.300
x2,-1.5983,7.537,-0.212,0.832,-16.371,13.175
x3,0.5706,0.711,0.803,0.422,-0.822,1.964
x4,-0.0924,0.759,-0.122,0.903,-1.580,1.395
x5,-0.0349,0.038,-0.908,0.364,-0.110,0.040
x6,-0.2526,1.488,-0.170,0.865,-3.169,2.664
x7,-130.5758,127.972,-1.020,0.308,-381.397,120.245
x8,190.6420,171.402,1.112,0.266,-145.300,526.584


In [None]:
df2.columns

Index(['month', 'year', 'general_thefts', 'break_in_thefts',
       'avg_monthly_salary', 'monthly_min_wage',
       'monthly_inflation_rate_wrt_last_year', 'reer', 'm_do_65_ratio',
       'z_do_65_ratio', 'm_do_65_w2_ratio', 'm_do_65_w3_ratio',
       'm_do_65_w4_ratio', 'm_do_65_w5_ratio', 'm_do_65_w6_ratio',
       'm_do_65_w7_ratio', 'm_do_65_w8_ratio', 'm_do_65_w9_ratio',
       'm_do_65_w10_ratio', 'm_do_65_w11_ratio', 'm_do_65_w12_ratio',
       'm_do_65_w13_ratio', 'm_do_65_w14_ratio', 'm_do_65_w15_ratio',
       'm_do_65_w16_ratio', 'm_do_65_w17_ratio', 'm_do_65_w18_ratio',
       'm_do_65_w19_ratio', 'z_do_65_w2_ratio', 'z_do_65_w3_ratio',
       'z_do_65_w4_ratio', 'z_do_65_w5_ratio', 'z_do_65_w6_ratio',
       'z_do_65_w7_ratio', 'z_do_65_w8_ratio', 'z_do_65_w9_ratio',
       'z_do_65_w10_ratio', 'z_do_65_w11_ratio', 'z_do_65_w12_ratio',
       'z_do_65_w13_ratio', 'z_do_65_w14_ratio', 'z_do_65_w15_ratio',
       'z_do_65_w16_ratio', 'z_do_65_w17_ratio', 'z_do_65_w18_ratio'

### df23 - small refugee (without ratios)

In [92]:
test_size = 14 #kraje (1 for previous results)
n_splits = int(len(df2) / test_size) - 1

y_col_name = "uchazeciOZamestnaniUoZZeny"
X = df23.drop(columns=y_col_name)
y = df23[y_col_name].to_numpy()

q = 0.95
exps = np.linspace(0, n_splits-1, num=n_splits)
weights = np.flip(np.power(q, exps))
weights

array([0.41812034, 0.44012667, 0.46329123, 0.48767498, 0.51334208,
       0.54036009, 0.56880009, 0.59873694, 0.63024941, 0.66342043,
       0.6983373 , 0.73509189, 0.77378094, 0.81450625, 0.857375  ,
       0.9025    , 0.95      , 1.        ])

In [93]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X.iloc[train_index, :])
        X_test = scaler.transform(X.iloc[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = Ridge(alpha=alpha, random_state=SEED)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x
print(res)


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )
 message: Solution found.
 success: True
  status: 0
     fun: 486.54027420426877
       x: 0.028236633770080587
     nit: 31
    nfev: 31


In [94]:
#all w (no ratios) 0.95
eval_tscv(tscv, best_alpha, X, y, weights=weights)

pred [ 4730.152346    5203.52774824 15189.28499721  3597.89816978
  4980.66324725 19500.82499854  6216.42989312  3690.37643535
 11661.76113177  4561.21267044 13126.65915459 15006.51845426
  4560.2699833   4817.19346658] test: [ 5178  5847 16277  4202  5945 20046  7026  4416 12774  5884 14967 16129
  4939  5083] rmse 941.8648631018657 mae 847.8733788275416
[ 0.00000000e+00  0.00000000e+00  3.85309262e+02  4.88621949e+02
 -5.91714806e+01 -1.59370680e+01 -5.41148269e+00 -1.59370680e+01
 -1.59370680e+01 -1.59370680e+01 -1.59370680e+01 -1.59370680e+01
 -1.59370680e+01 -1.59370680e+01 -1.59370680e+01 -1.59370680e+01
 -1.59370680e+01 -1.59370680e+01 -1.59370680e+01 -1.59370680e+01
 -1.59370680e+01 -1.59370680e+01 -1.59370680e+01 -1.59370680e+01
 -1.59370680e+01 -5.41148269e+00 -5.41148269e+00 -5.41148269e+00
 -5.41148269e+00 -5.41148269e+00 -5.41148269e+00 -5.41148269e+00
 -5.41148269e+00 -5.41148269e+00 -5.41148269e+00 -5.41148269e+00
 -5.41148269e+00 -5.41148269e+00 -5.41148269e+00 -5.41148

In [91]:
#all w (no ratios)
eval_tscv(tscv, best_alpha, X, y, weights=weights)

pred [ 4795.10274964  5346.79861902 15532.3421915   3636.20907273
  5094.56672261 19625.59311026  6284.06433856  3885.64843076
 12443.60437365  4944.36879798 13643.60162668 15152.94193093
  4679.70909682  4861.4496391 ] test: [ 5178  5847 16277  4202  5945 20046  7026  4416 12774  5884 14967 16129
  4939  5083] rmse 697.7322135349903 mae 627.6428071256965
[   0.            0.          352.09027384  453.34755209 -124.1961882
    0.            0.            0.            0.            0.
    0.            0.          866.25062676  237.36704405   80.15230835
  971.25280557 -125.86030017  335.18024822  158.90136997  866.44605376
  930.90510717    4.28657126   -5.3485984  -123.97914439  349.5818355
  -15.62805784 -101.25231851  788.95189922  -69.82963098 -194.84020347
 -165.35044197 -255.10012223  -46.19969469  566.00012459    8.18674905
  -46.84590346]
9369.0

pred [ 5103.22861019  5768.25030751 15633.05987157  4105.26073239
  5792.21664317 19110.92335031  6614.55031478  4212.08683508
 129

In [40]:
#only base and w19
eval_tscv(tscv, best_alpha, X, y, weights=weights)

pred [ 4845.3210258   5373.95014184 15549.89694436  3682.95122834
  5108.42600342 19644.97331181  6304.18104164  3876.20423449
 12498.04321212  4918.25558095 13647.56133886 15169.70820566
  4691.72825231  4896.95581041] test: [ 5178  5847 16277  4202  5945 20046  7026  4416 12774  5884 14967 16129
  4939  5083] rmse 685.1147870012549 mae 607.4888334281397
[   0.            0.          367.49820784  469.54830807  -93.91077896
  -53.37828316  -40.53773084  -53.37828316  -40.53773084    0.
    0.            0.            0.            0.            0.
    0.          864.20918494  251.89494472   87.93403184  963.64143315
  -95.99261264  354.24248168  181.59847625  866.37717468  925.98111554
   22.44317162    2.89036711 -123.14481305  360.16043288   -8.58723585
 -112.44881247  767.8580903   -80.36493902 -203.14228014 -138.98121414
 -258.18138253  -35.68412059  552.20202139    3.70153726  -52.50547442]
9369.0

pred [ 5082.56361266  5743.73454137 15549.77995403  4097.79874224
  5805.50095086

huber

In [43]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X.iloc[train_index, :])
        X_test = scaler.transform(X.iloc[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = HuberRegressor(alpha=alpha, max_iter=1000)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x
display(res)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )


 message: Solution found.
 success: True
  status: 0
     fun: 549.8080653778746
       x: 0.0002784521644540049
     nit: 38
    nfev: 38

In [44]:
eval_tscv_huber(tscv, best_alpha, X, y, weights)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

pred [ 4831.13978709  5368.28537376 15551.43560462  3679.61572856
  5102.4960357  19654.0464889   6299.36671556  3868.89401181
 12494.37919065  4908.19265406 13639.38135485 15174.5466973
  4689.34899054  4893.42655674] test: [ 5178  5847 16277  4202  5945 20046  7026  4416 12774  5884 14967 16129
  4939  5083] rmse 688.776034559493 mae 611.3174864184606
[ 0.00000000e+00  0.00000000e+00  3.68655350e+02  4.70663735e+02
 -9.48698680e+01 -5.38164279e+01 -4.10013075e+01 -5.38164279e+01
 -4.10013075e+01  0.00000000e+00  1.31326252e-11  1.05061002e-10
 -1.31326252e-11  2.10122003e-10 -1.64157815e-12  1.31326252e-11
  8.65097212e+02  2.53183384e+02  8.85308207e+01  9.64566422e+02
 -9.72403359e+01  3.54895930e+02  1.82410552e+02  8.67688918e+02
  9.27370995e+02  2.22692680e+01  3.04626679e+00 -1.21678737e+02
  3.60712531e+02 -5.91244765e+00 -1.10972550e+02  7.68793856e+02
 -7.90209678e+01 -2.01691520e+02 -1.38808425e+02 -2.57629322e+02
 -3.70175199e+01  5.53861711e+02  6.60951511e+00 -5.0351521

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


### df2

In [158]:
n_splits = len(df2) - 1
n_splits
y_col_name = "uchazeciOZamestnaniUoZZeny_ratio"
X = df2.drop(columns=y_col_name)
y = df2[y_col_name].to_numpy()

q = 1
exps = np.linspace(0, n_splits-1, num=n_splits)
weights = np.flip(np.power(q, exps))
weights

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [169]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=1)

# do only every 14 -> do not leak data from other kraj in the month


def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X.iloc[train_index, :])
        X_test = scaler.transform(X.iloc[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = Ridge(alpha=alpha, random_state=SEED)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )


In [167]:
display(res)

 message: Solution found.
 success: True
  status: 0
     fun: 0.021607479938802538
       x: 999.999970046681
     nit: 36
    nfev: 36

In [165]:
eval_tscv(tscv, best_alpha, X, y)

pred [0.50701135] test: [0.52069956] rmse 0.01368820895008993 mae 0.01368820895008993
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
pred [0.48674641] test: [0.51190695] rmse 0.025160547459305405 mae 0.025160547459305405
[ 0.          0.          0.00011098  0.00011098 -0.00011098  0.
  0.          0.          0.00011098  0.00011098  0.00011098  0.00011098
  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098
  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098
  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098
  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098
  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098
  0.00011098  0.00011098  0.00011098  0.00011098  0.          0.
  0.          0.          0.00011098  0.00011098  0.0

In [182]:
from sklearn.model_selection import train_test_split
import statsmodels.api as sma

X = df2.drop(columns=y_col_name).to_numpy(dtype=float)
X = sma.add_constant(X)
y = df2[y_col_name].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=84, shuffle=False)

#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)
X

array([[1.000e+00, 3.000e+00, 2.022e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+00, 3.000e+00, 2.022e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+00, 3.000e+00, 2.022e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [1.000e+00, 9.000e+00, 2.023e+03, ..., 1.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+00, 9.000e+00, 2.023e+03, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [1.000e+00, 9.000e+00, 2.023e+03, ..., 0.000e+00, 0.000e+00,
        1.000e+00]])

In [183]:
from sklearn.metrics import r2_score


#model = sma.OLS(endog=y_train, exog=X_train)
model = sma.RLM(endog=y_train, exog=X_train, M=sma.robust.norms.HuberT())
#fit = model.fit_regularized(alpha=best_alpha, L1_wt= 0)
fit = model.fit()
cmp = y_test, fit.predict(X_test)
mse = mean_squared_error(*cmp)
mae = mean_absolute_error(*cmp)
r2 = r2_score(*cmp)

print(mse, mae, r2)
fit.summary()

2.5723027266328016 0.6737336793644056 -6520.079052754033


0,1,2,3
Dep. Variable:,y,No. Observations:,182.0
Model:,RLM,Df Residuals:,127.0
Method:,IRLS,Df Model:,54.0
Norm:,HuberT,,
Scale Est.:,mad,,
Cov Type:,H1,,
Date:,"Tue, 26 Dec 2023",,
Time:,14:23:39,,
No. Iterations:,50,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,2.1e-07,1.29e-07,1.623,0.105,-4.37e-08,4.64e-07
x1,-0.0028,0.003,-0.822,0.411,-0.009,0.004
x2,0.0004,0.000,1.623,0.105,-8.76e-05,0.001
x3,1.625e-05,2.45e-05,0.664,0.507,-3.17e-05,6.43e-05
x4,1.422e-05,2.62e-05,0.544,0.587,-3.7e-05,6.55e-05
x5,2.022e-06,1.32e-06,1.526,0.127,-5.75e-07,4.62e-06
x6,5.545e-06,5.13e-05,0.108,0.914,-9.5e-05,0.000
x7,0.0053,0.004,1.208,0.227,-0.003,0.014
x8,-0.0065,0.006,-1.100,0.271,-0.018,0.005


In [173]:
df2.columns

Index(['month', 'year', 'general_thefts', 'break_in_thefts',
       'avg_monthly_salary', 'monthly_min_wage',
       'monthly_inflation_rate_wrt_last_year', 'reer', 'm_do_65_ratio',
       'z_do_65_ratio', 'm_do_65_w2_ratio', 'm_do_65_w3_ratio',
       'm_do_65_w4_ratio', 'm_do_65_w5_ratio', 'm_do_65_w6_ratio',
       'm_do_65_w7_ratio', 'm_do_65_w8_ratio', 'm_do_65_w9_ratio',
       'm_do_65_w10_ratio', 'm_do_65_w11_ratio', 'm_do_65_w12_ratio',
       'm_do_65_w13_ratio', 'm_do_65_w14_ratio', 'm_do_65_w15_ratio',
       'm_do_65_w16_ratio', 'm_do_65_w17_ratio', 'm_do_65_w18_ratio',
       'm_do_65_w19_ratio', 'z_do_65_w2_ratio', 'z_do_65_w3_ratio',
       'z_do_65_w4_ratio', 'z_do_65_w5_ratio', 'z_do_65_w6_ratio',
       'z_do_65_w7_ratio', 'z_do_65_w8_ratio', 'z_do_65_w9_ratio',
       'z_do_65_w10_ratio', 'z_do_65_w11_ratio', 'z_do_65_w12_ratio',
       'z_do_65_w13_ratio', 'z_do_65_w14_ratio', 'z_do_65_w15_ratio',
       'z_do_65_w16_ratio', 'z_do_65_w17_ratio', 'z_do_65_w18_ratio'

### Feature selection

In [224]:
from sklearn.feature_selection import SequentialFeatureSelector

In [278]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer


estimator = Ridge()
y_col_name = "uchazeciOZamestnaniUoZZeny"
X = df22.drop(columns=y_col_name)
y = df22[y_col_name].to_numpy()
scaler = StandardScaler()
X_trans = scaler.fit_transform(X)
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=1)
sfs = SequentialFeatureSelector(estimator=estimator, direction="forward", cv=tscv, scoring=make_scorer(mean_absolute_error))
sfs.fit(X_trans, y)

In [279]:
mask = sfs.get_support()
mask

array([ True,  True, False, False, False,  True, False,  True, False,
        True, False, False,  True,  True,  True, False, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
       False, False,  True,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False])

In [280]:
all_colls = sfs.feature_names_in_
all_colls

AttributeError: 'SequentialFeatureSelector' object has no attribute 'feature_names_in_'

In [281]:
#excluded columns
excluded_cols = all_colls[~mask]
excluded_cols

array(['general_thefts', 'break_in_thefts', 'avg_monthly_salary',
       'monthly_inflation_rate_wrt_last_year', 'm_do_65_ratio',
       'm_do_65_w2_ratio', 'm_do_65_w3_ratio', 'm_do_65_w7_ratio',
       'm_do_65_w8_ratio', 'm_do_65_w18_ratio', 'm_do_65_w19_ratio',
       'z_do_65_w2_ratio', 'z_do_65_w6_ratio', 'z_do_65_w7_ratio',
       'z_do_65_w8_ratio', 'avg_energy_price', 'avg_gasoline_price',
       'avg_natural_gas_price', 'noveHlaseniUchazeci',
       'noveHlasenaAUvolnenaVPM', 'obsazenaAZrusenaVPM',
       'absolventiSkolAMladistvi', 'kraj_JHM', 'kraj_KVK', 'kraj_LBK',
       'kraj_MSK', 'kraj_OLK', 'kraj_PAK', 'kraj_PHA', 'kraj_PLK',
       'kraj_STC', 'kraj_ULK', 'kraj_VYS', 'kraj_ZLK'], dtype=object)

In [282]:
features = sfs.get_feature_names_out()
features

array(['x0', 'x1', 'x5', 'x7', 'x9', 'x12', 'x13', 'x14', 'x17', 'x18',
       'x19', 'x20', 'x21', 'x22', 'x23', 'x24', 'x25', 'x29', 'x30',
       'x31', 'x35', 'x36', 'x37', 'x38', 'x39', 'x40', 'x41', 'x42',
       'x43', 'x44', 'x45', 'x46', 'x54'], dtype=object)

In [284]:
X_select = sfs.transform(X)



In [294]:
n_splits = len(df22) - 1
n_splits

q = 1
exps = np.linspace(0, n_splits-1, num=n_splits)
weights = np.flip(np.power(q, exps))
weights

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [295]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=1)

def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_select[train_index, :])
        X_test = scaler.transform(X_select[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = Ridge(alpha=alpha, random_state=SEED)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )


In [296]:
display(res)

 message: Solution found.
 success: True
  status: 0
     fun: 5038.862663229233
       x: 999.9999743212363
     nit: 36
    nfev: 36

In [297]:
eval_tscv(tscv, best_alpha, X_select, y, weights=weights)

pred [5315.] test: [6163] rmse 848.0 mae 848.0
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]
5315.0

pred [5771.55000087] test: [16681] rmse 10909.449999131135 mae 10909.449999131135
[0.         0.         0.         0.         0.80303032 0.80303032
 0.80303032 0.80303032 0.80303032 0.80303032 0.80303032 0.80303032
 0.80303032 0.80303032 0.80303032 0.80303032 0.80303032 0.80303032
 0.80303032 0.80303032 0.80303032 0.80303032 0.80303032 0.80303032
 0.80303032 0.80303032 0.80303032 0.80303032 0.80303032 0.80303032
 0.80303032 0.         0.80303032]
5739.0

pred [9317.5018309] test: [4409] rmse 4908.501830897749 mae 4908.501830897749
[ 0.          0.          0.          0.          3.1757762  13.43693166
 13.43693166 13.43693166 13.43693166 13.43693166 13.43693166 13.43693166
 13.43693166 13.43693166 13.43693166 13.43693166 13.43693166  3.1757762
  3.1757762   3.1757762   3.1757762   3.1757762   3.1757762   3.1757762
  3.1757762   3

decay changes nothing?

### df3 (big refugee)

In [95]:
test_size = 14 #kraje (1 for previous results)
n_splits = int(len(df2) / test_size) - 1

y_col_name = "uchazeciOZamestnaniUoZZeny"
X = df3.drop(columns=y_col_name)
y = df3[y_col_name].to_numpy()

q = 1
exps = np.linspace(0, n_splits-1, num=n_splits)
weights = np.flip(np.power(q, exps))
weights

NameError: name 'df3' is not defined

In [433]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X.iloc[train_index, :])
        X_test = scaler.transform(X.iloc[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = Ridge(alpha=alpha, random_state=SEED)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )


In [434]:
display(res)

 message: Solution found.
 success: True
  status: 0
     fun: 1842.4116588622321
       x: 999.9999686486047
     nit: 36
    nfev: 36

In [435]:
# + seasonal lagged + test size 14
eval_tscv(tscv, best_alpha, X, y, weights=weights)

pred [ 6307.38942391  5265.279855   15291.81647315  3508.12481425
  5568.02767292 19134.43070873  7345.32707161  4049.06286722
 11324.22721312  5388.94696039 13917.53307433 15644.75766385
  4676.71857849  5262.43853951] test: [ 5178  5847 16277  4202  5945 20046  7026  4416 12774  5884 14967 16129
  4939  5083] rmse 758.7314293548045 mae 663.2306538259129
[-2.52843083e+02 -1.64583165e-02 -8.62455959e+01  1.73800809e+02
  4.52572903e+01 -3.16402188e+02 -2.34379293e+02 -6.11111278e+02
  1.71872567e+00  2.89770833e+00  1.71872567e+00  1.71872567e+00
  1.71872567e+00  1.71872567e+00  1.71872567e+00  1.71872567e+00
  1.71872567e+00  1.71872567e+00  1.71872567e+00  1.71872567e+00
  1.71872567e+00  1.71872567e+00  1.71872567e+00  1.71872567e+00
  1.71872567e+00  1.71872567e+00  1.71872567e+00  1.71872567e+00
  2.89770833e+00  2.89770833e+00  2.89770833e+00  2.89770833e+00
  2.89770833e+00  2.89770833e+00  2.89770833e+00  2.89770833e+00
  2.89770833e+00  2.89770833e+00  2.89770833e+00  2.89770

In [349]:
#orig
eval_tscv(tscv, best_alpha, X, y, weights=weights)

pred [3656.3229979] test: [6163] rmse 2506.6770021036064 mae 2506.6770021036064
[-2.28481454e+02 -4.28272438e+01 -1.73285408e+02  1.89983516e+02
  1.09051818e+02 -5.17202293e+02 -2.54345566e+02 -9.39798682e+02
 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01
 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01
 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01
 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01
 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01
 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01
 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01
 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01
 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01
 -2.36042694e-01 -2.36042694e-01  1.51789748e+02  1.17233359e+01
  7.43377260e+02  2.45245387e+00  1.72809197e+03  1.45400619e+02
 -1.31393942e+02  2.91516131e+03 -4.44462207e+02  1.15115088e+03
 -6.334048

pred [7956.14164113] test: [7229] rmse 727.1416411263845 mae 727.1416411263845
[-2.27677037e+02 -4.40057636e+01 -1.72575016e+02  1.90030048e+02
  1.06393878e+02 -5.18412419e+02 -2.56267707e+02 -9.42686115e+02
  1.78738409e+00  1.25874886e+00  1.78738409e+00  1.78738409e+00
  1.78738409e+00  1.78738409e+00  1.78738409e+00  1.78738409e+00
  1.78738409e+00  1.78738409e+00  1.78738409e+00  1.78738409e+00
  1.78738409e+00  1.78738409e+00  1.78738409e+00  1.78738409e+00
  1.78738409e+00  1.78738409e+00  1.78738409e+00  1.78738409e+00
  1.25874886e+00  1.25874886e+00  1.25874886e+00  1.25874886e+00
  1.25874886e+00  1.25874886e+00  1.25874886e+00  1.25874886e+00
  1.25874886e+00  1.25874886e+00  1.25874886e+00  1.25874886e+00
  1.25874886e+00  1.25874886e+00  1.25874886e+00  1.25874886e+00
  1.25874886e+00  1.25874886e+00  1.52250529e+02  6.89041118e+00
  7.42061991e+02  3.33596860e+00  1.72876507e+03  1.45663009e+02
 -1.31010246e+02  2.91813261e+03 -4.43443855e+02  1.15134602e+03
 -6.3219519

In [404]:
# + lagged seasonal
eval_tscv(tscv, best_alpha, X, y, weights=weights)

pred [4244.47930914] test: [6163] rmse 1918.520690864263 mae 1918.520690864263
[-3.23055524e+02  4.21784620e+01 -1.46639672e+02  1.70275747e+02
  1.00624098e+02 -3.29454750e+02 -2.46435446e+02 -7.23873794e+02
 -1.02779897e-01 -1.02779897e-01 -1.02779897e-01 -1.02779897e-01
 -1.02779897e-01 -1.02779897e-01 -1.02779897e-01 -1.02779897e-01
 -1.02779897e-01 -1.02779897e-01 -1.02779897e-01 -1.02779897e-01
 -1.02779897e-01 -1.02779897e-01 -1.02779897e-01 -1.02779897e-01
 -1.02779897e-01 -1.02779897e-01 -1.02779897e-01 -1.02779897e-01
 -1.02779897e-01 -1.02779897e-01 -1.02779897e-01 -1.02779897e-01
 -1.02779897e-01 -1.02779897e-01 -1.02779897e-01 -1.02779897e-01
 -1.02779897e-01 -1.02779897e-01 -1.02779897e-01 -1.02779897e-01
 -1.02779897e-01 -1.02779897e-01 -1.02779897e-01 -1.02779897e-01
 -1.02779897e-01 -1.02779897e-01  1.51584499e+02 -4.47677542e+01
  6.25401308e+02  5.77359633e-02  9.43606646e+02  7.72520843e+01
 -1.39896191e+02  2.04534406e+03  2.88945644e+02  6.38192113e+01
  1.1901670

huber

In [454]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X.iloc[train_index, :])
        X_test = scaler.transform(X.iloc[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = HuberRegressor(alpha=alpha)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x
display(res)


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )


 message: Solution found.
 success: True
  status: 0
     fun: 2020.4286172905495
       x: 0.4523737203523583
     nit: 34
    nfev: 34

In [455]:
eval_tscv_huber(tscv, best_alpha, X, y, weights)

pred [ 6163.67453829  5390.22198611 15264.82216496  3519.69288374
  5778.45115839 18282.62426417  7451.79237884  4082.44645476
 11293.54507862  5570.27250241 14154.80608693 15218.01104191
  4611.67931586  5335.88838316] test: [ 5178  5847 16277  4202  5945 20046  7026  4416 12774  5884 14967 16129
  4939  5083] rmse 847.3007543138084 mae 708.8415973186635
[-2.65022901e+02 -6.78387949e+01 -4.33387628e+01  1.97536243e+02
  5.09551887e+00 -2.84002173e+02 -1.92328599e+02 -5.29335120e+02
 -5.16475081e-01  4.13152952e+00 -5.16475081e-01 -5.16475081e-01
 -5.16475081e-01 -5.16475081e-01 -5.16475081e-01 -5.16475081e-01
 -5.16475081e-01 -5.16475081e-01 -5.16475081e-01 -5.16475081e-01
 -5.16475081e-01 -5.16475081e-01 -5.16475081e-01 -5.16475081e-01
 -5.16475081e-01 -5.16475081e-01 -5.16475081e-01 -5.16475081e-01
  4.13152952e+00  4.13152952e+00  4.13152952e+00  4.13152952e+00
  4.13152952e+00  4.13152952e+00  4.13152952e+00  4.13152952e+00
  4.13152952e+00  4.13152952e+00  4.13152952e+00  4.13152

### df31 (big refugee without ratios)

In [73]:
test_size = 14 #kraje (1 for previous results)
n_splits = int(len(df2) / test_size) - 1

y_col_name = "uchazeciOZamestnaniUoZZeny"
X = df31.drop(columns=y_col_name)
y = df31[y_col_name].to_numpy()

q = 1
exps = np.linspace(0, n_splits-1, num=n_splits)
weights = np.flip(np.power(q, exps))
weights

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1.])

In [74]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X.iloc[train_index, :])
        X_test = scaler.transform(X.iloc[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = Ridge(alpha=alpha, random_state=SEED)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )


In [75]:
eval_tscv(tscv, best_alpha, X, y, weights=weights)

pred [ 5955.60605887  4968.63306202 15959.3130955   2961.87026093
  5083.33136657 19017.42953872  6732.83291086  3614.87716618
 14111.44150237  5577.07112529 15244.93045383 15499.68028604
  4143.31153863  4598.97831722] test: [ 5178  5847 16277  4202  5945 20046  7026  4416 12774  5884 14967 16129
  4939  5083] rmse 791.3307971434817 mae 716.403524793641
[-2.67566160e+02  1.24761591e+01 -9.53914738e+01  1.70742157e+02
  5.45332878e+01  3.78056408e+00  2.90961711e-01  3.78056408e+00
  3.78056408e+00  3.78056408e+00  3.78056408e+00  3.78056408e+00
  3.78056408e+00  3.78056408e+00  3.78056408e+00  3.78056408e+00
  3.78056408e+00  3.78056408e+00  3.78056408e+00  3.78056408e+00
  3.78056408e+00  3.78056408e+00  3.78056408e+00  3.78056408e+00
  3.78056408e+00  2.90961711e-01  2.90961711e-01  2.90961711e-01
  2.90961711e-01  2.90961711e-01  2.90961711e-01  2.90961711e-01
  2.90961711e-01  2.90961711e-01  2.90961711e-01  2.90961711e-01
  2.90961711e-01  2.90961711e-01  2.90961711e-01  2.909617

huber

In [76]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X.iloc[train_index, :])
        X_test = scaler.transform(X.iloc[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = HuberRegressor(alpha=alpha)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x
display(res)


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )


 message: Solution found.
 success: True
  status: 0
     fun: 958.6882077581862
       x: 0.19197540948359565
     nit: 36
    nfev: 36

In [77]:
eval_tscv_huber(tscv, best_alpha, X, y, weights)

pred [ 6053.59869162  5284.5313988  16901.01669571  3085.247188
  5418.89550315 19576.71802106  7113.27338194  3844.39852385
 14142.07947435  5915.48303997 16044.71399553 16397.94094465
  4434.56658014  4815.37627128] test: [ 5178  5847 16277  4202  5945 20046  7026  4416 12774  5884 14967 16129
  4939  5083] rmse 706.2698258602646 mae 596.5266241060991
[-406.80004057    3.22137753 -164.50897131  188.9808446    96.12877649
    5.75601117   -2.95625553    5.75601117    5.75601117    5.75601117
    5.75601117    5.75601117    5.75601117    5.75601117    5.75601117
    5.75601117    5.75601117    5.75601117    5.75601117    5.75601117
    5.75601117    5.75601117    5.75601117    5.75601117    5.75601117
   -2.95625553   -2.95625553   -2.95625553   -2.95625553   -2.95625553
   -2.95625553   -2.95625553   -2.95625553   -2.95625553   -2.95625553
   -2.95625553   -2.95625553   -2.95625553   -2.95625553   -2.95625553
   -2.95625553   -2.95625553   -2.95625553 -282.84160891 -236.26526032
 -780

### df4 (small economic)

In [105]:
test_size = 14 #kraje (1 for previous results)
n_splits = int(len(df2) / test_size) - 1

y_col_name = "uchazeciOZamestnaniUoZZeny"
X = df4.drop(columns=y_col_name)
y = df4[y_col_name].to_numpy()

q = 0.95
exps = np.linspace(0, n_splits-1, num=n_splits)
weights = np.flip(np.power(q, exps))
weights

array([0.41812034, 0.44012667, 0.46329123, 0.48767498, 0.51334208,
       0.54036009, 0.56880009, 0.59873694, 0.63024941, 0.66342043,
       0.6983373 , 0.73509189, 0.77378094, 0.81450625, 0.857375  ,
       0.9025    , 0.95      , 1.        ])

In [106]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X.iloc[train_index, :])
        X_test = scaler.transform(X.iloc[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = Ridge(alpha=alpha, random_state=SEED)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )


In [107]:
display(res)

 message: Solution found.
 success: True
  status: 0
     fun: 661.2217345282857
       x: 0.05448439239663425
     nit: 28
    nfev: 28

In [108]:
eval_tscv(tscv, best_alpha, X, y, weights=weights)

pred [ 4795.10274964  5346.79861902 15532.3421915   3636.20907273
  5094.56672261 19625.59311026  6284.06433856  3885.64843076
 12443.60437365  4944.36879798 13643.60162668 15152.94193093
  4679.70909682  4861.4496391 ] test: [ 5178  5847 16277  4202  5945 20046  7026  4416 12774  5884 14967 16129
  4939  5083] rmse 697.7322135349903 mae 627.6428071256965
[   0.            0.          352.09027384  453.34755209 -124.1961882
    0.            0.            0.            0.            0.
    0.            0.          866.25062676  237.36704405   80.15230835
  971.25280557 -125.86030017  335.18024822  158.90136997  866.44605376
  930.90510717    4.28657126   -5.3485984  -123.97914439  349.5818355
  -15.62805784 -101.25231851  788.95189922  -69.82963098 -194.84020347
 -165.35044197 -255.10012223  -46.19969469  566.00012459    8.18674905
  -46.84590346]
9369.0

pred [ 5103.22861019  5768.25030751 15633.05987157  4105.26073239
  5792.21664317 19110.92335031  6614.55031478  4212.08683508
 129

huber

In [457]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X.iloc[train_index, :])
        X_test = scaler.transform(X.iloc[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = HuberRegressor(alpha=alpha)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x
display(res)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

 message: Solution found.
 success: True
  status: 0
     fun: 655.1445566269994
       x: 0.00041669450020402203
     nit: 38
    nfev: 38

In [458]:
eval_tscv_huber(tscv, best_alpha, X, y, weights)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

pred [ 4782.03735847  5341.51057885 15533.11211245  3632.71536753
  5089.32533987 19633.29467306  6279.75703991  3879.36269429
 12439.37609713  4935.69879819 13635.9541332  15156.92589568
  4677.48956983  4858.06247513] test: [ 5178  5847 16277  4202  5945 20046  7026  4416 12774  5884 14967 16129
  4939  5083] rmse 701.221750099238 mae 631.3127047446485
[ 0.00000000e+00  0.00000000e+00  3.52982387e+02  4.54187271e+02
 -1.25323612e+02  0.00000000e+00  1.12984218e-11  9.03873748e-11
 -1.12984218e-11  1.80774750e-10 -1.41230273e-12  1.12984218e-11
  8.67045386e+02  2.38390623e+02  8.06290525e+01  9.72118017e+02
 -1.27239572e+02  3.35583874e+02  1.59419405e+02  8.67598481e+02
  9.32162531e+02  3.97537323e+00 -5.27266475e+00 -1.22678249e+02
  3.49938177e+02 -1.33568928e+01 -9.98063057e+01  7.89971105e+02
 -6.85014087e+01 -1.93441990e+02 -1.65429999e+02 -2.54544398e+02
 -4.74852923e+01  5.67594549e+02  1.08165392e+01 -4.48710883e+01]
9369.00000236863

pred [ 5036.166832    5579.62569445 152

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
