# Model Training and Evaluation

## Imports

In [373]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from scipy import optimize

SEED = 333

## Data Preparation

In [374]:
df = pd.read_csv('../dataset/team_A_dataset.csv')

seasonal_cols = ['avg_monthly_salary', 'general_thefts', 'break_in_thefts', 'noveHlaseniUchazeci',
                  'absolventiSkolAMladistvi', 'noveHlasenaAUvolnenaVPM', 'obsazenaAZrusenaVPM']

for col in seasonal_cols:
    df[col + '_prev_year'] = df[col].shift(168)

#fill previous year columns for 2009 with 2009 year values
for i in range(len(df)):
    for col in seasonal_cols:
        if np.isnan(df.loc[df.index[i], col + '_prev_year']):
            df.loc[df.index[i], col + '_prev_year'] = df.loc[df.index[i], col]

encoder = OneHotEncoder(handle_unknown="ignore", drop="first")

obj_cols = df.select_dtypes('object')
encoder.fit(obj_cols)
transformed_cols = encoder.transform(obj_cols).toarray()
feature_names = encoder.get_feature_names_out()
transformed_df = pd.DataFrame(
    transformed_cols, index=df.index, columns=feature_names).astype(bool)
df = pd.concat(
    [df.select_dtypes(exclude='object'), transformed_df], axis=1)

obj_cols = df.select_dtypes('bool').columns
df[obj_cols] = df[obj_cols].astype(int)

df = df.fillna(0)

#Since March 2022
war_df = df.iloc[(158*14):, :]

results = []

In [375]:
drop_cols_refugees =  ["celkem", "uchazeciOZamestnaniUoZ",	"uchazeciOZamestnaniUoZZeny_ratio", "uchazeciOZamestnaniUoZMuzi", "uchazeciOZamestnaniUoZMuzi_ratio"]
drop_cols_refugees += [f"celkem_w{w}" for w in range(2,20)]
drop_cols_refugees += [f"m_do_65_w{w}" for w in range(2,19)]
drop_cols_refugees += [f"z_do_65_w{w}" for w in range(2,19)]
drop_cols_refugees += [f"m_do_65_w{w}_ratio" for w in range(2,19)]
drop_cols_refugees += [f"z_do_65_w{w}_ratio" for w in range(2,19)]
 
drop_cols_no_refugees = drop_cols_refugees.copy()
drop_cols_no_refugees += ["m_do_65", "z_do_65","m_do_65_ratio", "z_do_65_ratio"]
drop_cols_no_refugees += [f"m_do_65_w{w}" for w in range(19,20)]
drop_cols_no_refugees += [f"z_do_65_w{w}" for w in range(19,20)]
drop_cols_no_refugees += [f"m_do_65_w{w}_ratio" for w in range(19,20)]
drop_cols_no_refugees += [f"z_do_65_w{w}_ratio" for w in range(19,20)]

## Model Training and Evaluation

### Evaluation Function 

In [401]:
def eval_tscv(tscv: TimeSeriesSplit, alpha: float, X, y, weights, model, verbose: bool = False,):
    rmses = []
    maes = []
    r2s = []
    adj_r2s = []
    for train_index, test_index in tscv.split(X):
        scaler = StandardScaler()
        if isinstance(X, pd.DataFrame):
            X_train = scaler.fit_transform(X.iloc[train_index, :])
            X_test = scaler.transform(X.iloc[test_index, :])
        else:
            X_train = scaler.fit_transform(X[train_index, :])
            X_test = scaler.transform(X[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        n = X.shape[0]
        k = X.shape[1]
        adj_r2 = 1 - (((1-r2)*(n-1))/(n-k-1))
        if verbose:
            print("pred", model.predict(X_test),
                  "test:", y_test, "rmse", rmse, "mae", mae)
            print(model.coef_)
            print(model.intercept_)
            print()
        rmses.append(rmse)
        maes.append(mae)
        r2s.append(r2)
        adj_r2s.append(adj_r2)

    weighted_rmse = np.average(rmses, weights=weights)
    weighted_mae = np.average(maes, weights=weights)
    weighted_r2 = np.average(r2s, weights=weights)
    weighted_adj_r2 = np.average(adj_r2s, weights=weights)
    sum_weighted_rmse = np.sum(rmses * weights)
    sum_weighted_mae = np.sum(maes * weights)
    print("rmse", weighted_rmse)
    print("mae", weighted_mae)
    print("r2", weighted_r2)
    print("adj r2", weighted_adj_r2)
    print("sum weighted rmse", sum_weighted_rmse)
    print("sum weighted mae", sum_weighted_mae)

    return (weighted_rmse, weighted_mae, weighted_r2, weighted_adj_r2, sum_weighted_rmse, sum_weighted_mae, model.coef_)

### Since start of the war refugees predictors present

In [377]:
df_war_refugees = war_df.drop(columns=drop_cols_refugees)
df_war_refugees.head()

Unnamed: 0,month,year,general_thefts,break_in_thefts,avg_monthly_salary,m_do_65,z_do_65,m_do_65_w19,z_do_65_w19,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,m_do_65_ratio,z_do_65_ratio,m_do_65_w19_ratio,z_do_65_w19_ratio,bilance,avg_energy_price,avg_gasoline_price,avg_natural_gas_price,uchazeciOZamestnaniUoZZeny,noveHlaseniUchazeci,noveHlasenaAUvolnenaVPM,obsazenaAZrusenaVPM,absolventiSkolAMladistvi,avg_monthly_salary_prev_year,general_thefts_prev_year,break_in_thefts_prev_year,noveHlaseniUchazeci_prev_year,absolventiSkolAMladistvi_prev_year,noveHlasenaAUvolnenaVPM_prev_year,obsazenaAZrusenaVPM_prev_year,kraj_JHC,kraj_JHM,kraj_KVK,kraj_LBK,kraj_MSK,kraj_OLK,kraj_PAK,kraj_PHA,kraj_PLK,kraj_STC,kraj_ULK,kraj_VYS,kraj_ZLK
2212,3,2022,60.0,53.0,34689.0,1280.0,4562.0,1280.0,4562.0,16200,12.7,107.78,0.117205,0.417727,0.117205,0.417727,-12.741,187.1,1.9464,4.9772,5315,1451,2010,2094,527,33032.0,53.0,32.0,1412.0,555.0,2091.0,1775.0,0,0,0,0,0,0,0,0,0,0,0,0,0
2213,3,2022,113.0,71.0,33998.0,1558.0,5449.0,1558.0,5449.0,16200,12.7,107.78,0.123191,0.430853,0.123191,0.430853,-12.741,187.1,1.9464,4.9772,6163,1616,3249,4076,521,32213.0,75.0,68.0,1852.0,727.0,3097.0,2820.0,1,0,0,0,0,0,0,0,0,0,0,0,0
2214,3,2022,224.0,327.0,37027.0,3138.0,10331.0,3138.0,10331.0,16200,12.7,107.78,0.129461,0.426214,0.129461,0.426214,-12.741,187.1,1.9464,4.9772,16681,3383,6761,6142,1624,34989.0,137.0,193.0,3620.0,1751.0,5790.0,5105.0,0,1,0,0,0,0,0,0,0,0,0,0,0
2215,3,2022,54.0,63.0,32424.0,1152.0,4292.0,1152.0,4292.0,16200,12.7,107.78,0.117503,0.43778,0.117503,0.43778,-12.741,187.1,1.9464,4.9772,4409,1027,2096,1588,372,30189.0,56.0,55.0,1166.0,457.0,1241.0,1148.0,0,0,1,0,0,0,0,0,0,0,0,0,0
2216,3,2022,129.0,116.0,33745.0,992.0,4541.0,992.0,4541.0,16200,12.7,107.78,0.095375,0.436593,0.095375,0.436593,-12.741,187.1,1.9464,4.9772,5931,1562,3192,3957,566,31956.0,72.0,55.0,1547.0,601.0,2858.0,2790.0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [378]:
n_kraje = 14
n_splits = int(len(df_war_refugees)/n_kraje - 1)

X = df_war_refugees.loc[:, df_war_refugees.columns != 'uchazeciOZamestnaniUoZZeny'].to_numpy()
y = df_war_refugees.loc[:, df_war_refugees.columns == 'uchazeciOZamestnaniUoZZeny'].to_numpy()

tscv = TimeSeriesSplit(n_splits=n_splits, test_size=n_kraje)

In [379]:
weights = np.ones((18,))

def optimize_alpha(alpha):
    rmses = []
    for train_index, test_index in tscv.split(X):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X[train_index, :])
        X_test = scaler.transform(X[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        model = Lasso(alpha=alpha, random_state=SEED, max_iter=100000)
        model.fit(X_train, y_train)

        rmse = mean_squared_error(y_test, model.predict(X_test), squared=False)
        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x
print(f"Best alpha {best_alpha}")


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )
Best alpha 1.527077431355105


In [402]:
weights = np.ones((n_splits,))

results.append(eval_tscv(tscv, best_alpha, X, y, weights, Lasso(), verbose=False))

rmse 1935.8304130635124
mae 1757.9022864777794
r2 0.8750078595622246
adj r2 0.8731644687159484
sum weighted rmse 34844.94743514322
sum weighted mae 31642.24115660003


### Since start of the war no refugee predictors present

In [381]:
df_war_no_refugees = war_df.drop(columns=drop_cols_no_refugees)

df_war_no_refugees.head()

Unnamed: 0,month,year,general_thefts,break_in_thefts,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,bilance,avg_energy_price,avg_gasoline_price,avg_natural_gas_price,uchazeciOZamestnaniUoZZeny,noveHlaseniUchazeci,noveHlasenaAUvolnenaVPM,obsazenaAZrusenaVPM,absolventiSkolAMladistvi,avg_monthly_salary_prev_year,general_thefts_prev_year,break_in_thefts_prev_year,noveHlaseniUchazeci_prev_year,absolventiSkolAMladistvi_prev_year,noveHlasenaAUvolnenaVPM_prev_year,obsazenaAZrusenaVPM_prev_year,kraj_JHC,kraj_JHM,kraj_KVK,kraj_LBK,kraj_MSK,kraj_OLK,kraj_PAK,kraj_PHA,kraj_PLK,kraj_STC,kraj_ULK,kraj_VYS,kraj_ZLK
2212,3,2022,60.0,53.0,34689.0,16200,12.7,107.78,-12.741,187.1,1.9464,4.9772,5315,1451,2010,2094,527,33032.0,53.0,32.0,1412.0,555.0,2091.0,1775.0,0,0,0,0,0,0,0,0,0,0,0,0,0
2213,3,2022,113.0,71.0,33998.0,16200,12.7,107.78,-12.741,187.1,1.9464,4.9772,6163,1616,3249,4076,521,32213.0,75.0,68.0,1852.0,727.0,3097.0,2820.0,1,0,0,0,0,0,0,0,0,0,0,0,0
2214,3,2022,224.0,327.0,37027.0,16200,12.7,107.78,-12.741,187.1,1.9464,4.9772,16681,3383,6761,6142,1624,34989.0,137.0,193.0,3620.0,1751.0,5790.0,5105.0,0,1,0,0,0,0,0,0,0,0,0,0,0
2215,3,2022,54.0,63.0,32424.0,16200,12.7,107.78,-12.741,187.1,1.9464,4.9772,4409,1027,2096,1588,372,30189.0,56.0,55.0,1166.0,457.0,1241.0,1148.0,0,0,1,0,0,0,0,0,0,0,0,0,0
2216,3,2022,129.0,116.0,33745.0,16200,12.7,107.78,-12.741,187.1,1.9464,4.9772,5931,1562,3192,3957,566,31956.0,72.0,55.0,1547.0,601.0,2858.0,2790.0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [382]:
n_kraje = 14
n_splits = int(len(war_df)/n_kraje - 1)

X = df_war_no_refugees.loc[:, df_war_no_refugees.columns != 'uchazeciOZamestnaniUoZZeny'].to_numpy()
y = df_war_no_refugees.loc[:, df_war_no_refugees.columns == 'uchazeciOZamestnaniUoZZeny'].to_numpy()

tscv = TimeSeriesSplit(n_splits=n_splits, test_size=n_kraje)

In [383]:
weights = np.ones((18,))

def optimize_alpha(alpha):
    rmses = []
    for train_index, test_index in tscv.split(X):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X[train_index, :])
        X_test = scaler.transform(X[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        model = Lasso(alpha=alpha, random_state=SEED, max_iter=1000000)
        model.fit(X_train, y_train)

        rmse = mean_squared_error(y_test, model.predict(X_test), squared=False)
        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x
print(f"Best alpha {best_alpha}")


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )
Best alpha 2.5822215535124333


In [384]:
weights = np.ones((n_splits,))

results.append(eval_tscv(tscv, best_alpha, X, y, weights,Lasso(), verbose=False))

rmse 633.5642306060388
mae 519.6986586135417
sum weighted rmse 11404.156150908699
sum weighted mae 9354.575855043751


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


### Entire period refugee predictors present

In [385]:
df_entire_refugees = df.drop(columns=drop_cols_refugees)
df_entire_refugees.head()

Unnamed: 0,month,year,general_thefts,break_in_thefts,avg_monthly_salary,m_do_65,z_do_65,m_do_65_w19,z_do_65_w19,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,m_do_65_ratio,z_do_65_ratio,m_do_65_w19_ratio,z_do_65_w19_ratio,bilance,avg_energy_price,avg_gasoline_price,avg_natural_gas_price,uchazeciOZamestnaniUoZZeny,noveHlaseniUchazeci,noveHlasenaAUvolnenaVPM,obsazenaAZrusenaVPM,absolventiSkolAMladistvi,avg_monthly_salary_prev_year,general_thefts_prev_year,break_in_thefts_prev_year,noveHlaseniUchazeci_prev_year,absolventiSkolAMladistvi_prev_year,noveHlasenaAUvolnenaVPM_prev_year,obsazenaAZrusenaVPM_prev_year,kraj_JHC,kraj_JHM,kraj_KVK,kraj_LBK,kraj_MSK,kraj_OLK,kraj_PAK,kraj_PHA,kraj_PLK,kraj_STC,kraj_ULK,kraj_VYS,kraj_ZLK
0,1,2009,271.0,174.0,19132.0,0.0,0.0,0.0,0.0,8000,2.2,89.45,0.0,0.0,0.0,0.0,-2.73,70.209,1.1317,5.0707,8445,4781,1417,2948,1221,19132.0,271.0,174.0,4781.0,1221.0,1417.0,2948.0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,2009,275.0,181.0,19576.0,0.0,0.0,0.0,0.0,8000,2.2,89.45,0.0,0.0,0.0,0.0,-2.73,70.209,1.1317,5.0707,10352,5538,1053,2289,1512,19576.0,275.0,181.0,5538.0,1512.0,1053.0,2289.0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,1,2009,909.0,286.0,21065.0,0.0,0.0,0.0,0.0,8000,2.2,89.45,0.0,0.0,0.0,0.0,-2.73,70.209,1.1317,5.0707,24333,9824,3140,4985,3078,21065.0,909.0,286.0,9824.0,3078.0,3140.0,4985.0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,1,2009,198.0,96.0,18652.0,0.0,0.0,0.0,0.0,8000,2.2,89.45,0.0,0.0,0.0,0.0,-2.73,70.209,1.1317,5.0707,7386,3325,496,1050,960,18652.0,198.0,96.0,3325.0,960.0,496.0,1050.0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,1,2009,343.0,130.0,19653.0,0.0,0.0,0.0,0.0,8000,2.2,89.45,0.0,0.0,0.0,0.0,-2.73,70.209,1.1317,5.0707,9563,4814,1134,1665,1126,19653.0,343.0,130.0,4814.0,1126.0,1134.0,1665.0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [386]:
n_kraje = 14
n_splits = int(len(war_df)/n_kraje - 1)

X = df_entire_refugees.loc[:, df_entire_refugees.columns != 'uchazeciOZamestnaniUoZZeny'].to_numpy()
y = df_entire_refugees.loc[:, df_entire_refugees.columns == 'uchazeciOZamestnaniUoZZeny'].to_numpy()

tscv = TimeSeriesSplit(n_splits=n_splits, test_size=n_kraje)

In [387]:
weights = np.ones((18,))

def optimize_alpha(alpha):
    rmses = []
    for train_index, test_index in tscv.split(X):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X[train_index, :])
        X_test = scaler.transform(X[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        model = Lasso(alpha=alpha, random_state=SEED, max_iter=1000000)
        model.fit(X_train, y_train)

        rmse = mean_squared_error(y_test, model.predict(X_test), squared=False)
        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x
print(f"Best alpha {best_alpha}")


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )
Best alpha 298.76972314537164


In [388]:
weights = np.ones((n_splits,))

results.append(eval_tscv(tscv, best_alpha, X, y, weights,Lasso(), False))

rmse 1410.799217899033
mae 1022.0928755084346
sum weighted rmse 25394.385922182595
sum weighted mae 18397.671759151825


### Entire period no refugee predictors present

In [389]:
df_entire_no_refugees = df.drop(columns=drop_cols_no_refugees)
df_entire_no_refugees.head()

Unnamed: 0,month,year,general_thefts,break_in_thefts,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,bilance,avg_energy_price,avg_gasoline_price,avg_natural_gas_price,uchazeciOZamestnaniUoZZeny,noveHlaseniUchazeci,noveHlasenaAUvolnenaVPM,obsazenaAZrusenaVPM,absolventiSkolAMladistvi,avg_monthly_salary_prev_year,general_thefts_prev_year,break_in_thefts_prev_year,noveHlaseniUchazeci_prev_year,absolventiSkolAMladistvi_prev_year,noveHlasenaAUvolnenaVPM_prev_year,obsazenaAZrusenaVPM_prev_year,kraj_JHC,kraj_JHM,kraj_KVK,kraj_LBK,kraj_MSK,kraj_OLK,kraj_PAK,kraj_PHA,kraj_PLK,kraj_STC,kraj_ULK,kraj_VYS,kraj_ZLK
0,1,2009,271.0,174.0,19132.0,8000,2.2,89.45,-2.73,70.209,1.1317,5.0707,8445,4781,1417,2948,1221,19132.0,271.0,174.0,4781.0,1221.0,1417.0,2948.0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,2009,275.0,181.0,19576.0,8000,2.2,89.45,-2.73,70.209,1.1317,5.0707,10352,5538,1053,2289,1512,19576.0,275.0,181.0,5538.0,1512.0,1053.0,2289.0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,1,2009,909.0,286.0,21065.0,8000,2.2,89.45,-2.73,70.209,1.1317,5.0707,24333,9824,3140,4985,3078,21065.0,909.0,286.0,9824.0,3078.0,3140.0,4985.0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,1,2009,198.0,96.0,18652.0,8000,2.2,89.45,-2.73,70.209,1.1317,5.0707,7386,3325,496,1050,960,18652.0,198.0,96.0,3325.0,960.0,496.0,1050.0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,1,2009,343.0,130.0,19653.0,8000,2.2,89.45,-2.73,70.209,1.1317,5.0707,9563,4814,1134,1665,1126,19653.0,343.0,130.0,4814.0,1126.0,1134.0,1665.0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [390]:
n_kraje = 14
n_splits = int(len(war_df)/n_kraje - 1)

X = df_entire_no_refugees.loc[:, df_entire_no_refugees.columns != 'uchazeciOZamestnaniUoZZeny'].to_numpy()
y = df_entire_no_refugees.loc[:, df_entire_no_refugees.columns == 'uchazeciOZamestnaniUoZZeny'].to_numpy()

tscv = TimeSeriesSplit(n_splits=n_splits, test_size=n_kraje)

In [391]:
weights = np.ones((18,))

def optimize_alpha(alpha):
    rmses = []
    for train_index, test_index in tscv.split(X):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X[train_index, :])
        X_test = scaler.transform(X[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        model = Lasso(alpha=alpha, random_state=SEED, max_iter=1000000)
        model.fit(X_train, y_train)

        rmse = mean_squared_error(y_test, model.predict(X_test), squared=False)
        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x
print(f"Best alpha {best_alpha}")


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )
Best alpha 298.76529218494517


In [392]:
weights = np.ones((n_splits,))

results.append(eval_tscv(tscv, best_alpha, X, y, weights,Lasso(), verbose=False))

rmse 1410.7995984470494
mae 1022.0958379928969
sum weighted rmse 25394.39277204689
sum weighted mae 18397.725083872145


## Summary

In [393]:
model_labels = ["Since start of the war with refugees (B1)",
                "Since start of the war no refugees (B2)",
                "Entire period with refugees (A1)", 
                "Entire period no refugees (A2)"]

dfs = [df_war_refugees.drop(columns="uchazeciOZamestnaniUoZZeny"),
       df_war_no_refugees.drop(columns="uchazeciOZamestnaniUoZZeny"),
       df_entire_refugees.drop(columns="uchazeciOZamestnaniUoZZeny"),
       df_war_no_refugees.drop(columns="uchazeciOZamestnaniUoZZeny")]

feature_sel_eps = 1e-6

for i in range(len(model_labels)):
    print(model_labels[i] + ":")
    weighted_rmse, weighted_mae, weighted_r2, weighted_adj_r2, sum_weighted_rmse, sum_weighted_mae, coefs = results[i]
    print("rmse", weighted_rmse)
    print("mae", weighted_mae)
    print("r2", weighted_r2)
    print("adj-r2", weighted_adj_r2)
    print("sum weighted rmse", sum_weighted_rmse)
    print("sum weighted mae", sum_weighted_mae)
    print("useful features", dfs[i].columns[np.abs(np.array(coefs)) > feature_sel_eps].to_list())
    print()

Since start of the war with refugees:
rmse 511.7960809135718
mae 406.6343926095869
sum weighted rmse 9212.329456444293
sum weighted mae 7319.419066972564
useful features ['year', 'general_thefts', 'avg_monthly_salary', 'm_do_65', 'z_do_65', 'm_do_65_w19', 'monthly_min_wage', 'monthly_inflation_rate_wrt_last_year', 'reer', 'm_do_65_ratio', 'z_do_65_ratio', 'm_do_65_w19_ratio', 'z_do_65_w19_ratio', 'bilance', 'avg_energy_price', 'avg_gasoline_price', 'avg_natural_gas_price', 'noveHlaseniUchazeci', 'noveHlasenaAUvolnenaVPM', 'obsazenaAZrusenaVPM', 'absolventiSkolAMladistvi', 'avg_monthly_salary_prev_year', 'general_thefts_prev_year', 'break_in_thefts_prev_year', 'noveHlaseniUchazeci_prev_year', 'absolventiSkolAMladistvi_prev_year', 'noveHlasenaAUvolnenaVPM_prev_year', 'obsazenaAZrusenaVPM_prev_year', 'kraj_JHC', 'kraj_JHM', 'kraj_KVK', 'kraj_LBK', 'kraj_MSK', 'kraj_OLK', 'kraj_PAK', 'kraj_PHA', 'kraj_PLK', 'kraj_STC', 'kraj_ULK', 'kraj_VYS', 'kraj_ZLK']

Since start of the war no refugees

Only window 19, no ratio cols, no celkem

In [394]:
drop_cols_refugees =  ["celkem", "m_do_65", "z_do_65","m_do_65_ratio", "z_do_65_ratio", "uchazeciOZamestnaniUoZ",	"uchazeciOZamestnaniUoZZeny_ratio", "uchazeciOZamestnaniUoZMuzi", "uchazeciOZamestnaniUoZMuzi_ratio"]
drop_cols_refugees += [f"celkem_w{w}" for w in range(2,20)]
drop_cols_refugees += [f"m_do_65_w{w}" for w in range(2,19)]
drop_cols_refugees += [f"z_do_65_w{w}" for w in range(2,19)]
drop_cols_refugees += [f"m_do_65_w{w}_ratio" for w in range(2,20)]
drop_cols_refugees += [f"z_do_65_w{w}_ratio" for w in range(2,20)]

drop_cols_no_refugees = drop_cols_refugees.copy()
drop_cols_no_refugees += [f"m_do_65_w{w}" for w in range(19,20)]
drop_cols_no_refugees += [f"z_do_65_w{w}" for w in range(19,20)]

Only window 19 with ratios, no celkem

In [395]:
drop_cols_refugees =  ["celkem", "m_do_65", "z_do_65","m_do_65_ratio", "z_do_65_ratio", "uchazeciOZamestnaniUoZ",	"uchazeciOZamestnaniUoZZeny_ratio", "uchazeciOZamestnaniUoZMuzi", "uchazeciOZamestnaniUoZMuzi_ratio"]
drop_cols_refugees += [f"celkem_w{w}" for w in range(2,20)]
drop_cols_refugees += [f"m_do_65_w{w}" for w in range(2,19)]
drop_cols_refugees += [f"z_do_65_w{w}" for w in range(2,19)]
drop_cols_refugees += [f"m_do_65_w{w}_ratio" for w in range(2,19)]
drop_cols_refugees += [f"z_do_65_w{w}_ratio" for w in range(2,19)]

drop_cols_no_refugees = drop_cols_refugees.copy()
drop_cols_no_refugees += [f"m_do_65_w{w}" for w in range(19,20)]
drop_cols_no_refugees += [f"z_do_65_w{w}" for w in range(19,20)]
drop_cols_no_refugees += [f"m_do_65_w{w}_ratio" for w in range(19,20)]
drop_cols_no_refugees += [f"z_do_65_w{w}_ratio" for w in range(19,20)]

Entire window, no ratios, no celkem

In [396]:
drop_cols_refugees =  ["celkem", "m_do_65", "z_do_65","m_do_65_ratio", "z_do_65_ratio", "uchazeciOZamestnaniUoZ",	"uchazeciOZamestnaniUoZZeny_ratio", "uchazeciOZamestnaniUoZMuzi", "uchazeciOZamestnaniUoZMuzi_ratio"]
drop_cols_refugees += [f"celkem_w{w}" for w in range(2,20)]
drop_cols_refugees += [f"m_do_65_w{w}_ratio" for w in range(2,20)]
drop_cols_refugees += [f"z_do_65_w{w}_ratio" for w in range(2,20)]

drop_cols_no_refugees = drop_cols_refugees.copy()
drop_cols_no_refugees += [f"m_do_65_w{w}" for w in range(2,20)]
drop_cols_no_refugees += [f"z_do_65_w{w}" for w in range(2,20)]

Entire window with ratios, no celkem

In [397]:
drop_cols_refugees =  ["celkem", "m_do_65", "z_do_65","m_do_65_ratio", "z_do_65_ratio", "uchazeciOZamestnaniUoZ",	"uchazeciOZamestnaniUoZZeny_ratio", "uchazeciOZamestnaniUoZMuzi", "uchazeciOZamestnaniUoZMuzi_ratio"]
drop_cols_refugees += [f"celkem_w{w}" for w in range(2,20)]

drop_cols_no_refugees = drop_cols_refugees.copy()
drop_cols_no_refugees += [f"m_do_65_w{w}" for w in range(2,20)]
drop_cols_no_refugees += [f"z_do_65_w{w}" for w in range(2,20)]
drop_cols_no_refugees += [f"m_do_65_w{w}_ratio" for w in range(2,20)]
drop_cols_no_refugees += [f"z_do_65_w{w}_ratio" for w in range(2,20)]

No lags at all

In [398]:
drop_cols_refugees =  ["celkem", "uchazeciOZamestnaniUoZ", "uchazeciOZamestnaniUoZZeny_ratio", "uchazeciOZamestnaniUoZMuzi", "uchazeciOZamestnaniUoZMuzi_ratio"]
drop_cols_refugees += [f"celkem_w{w}" for w in range(2,20)]
drop_cols_refugees += [f"m_do_65_w{w}" for w in range(2,20)]
drop_cols_refugees += [f"z_do_65_w{w}" for w in range(2,20)]
drop_cols_refugees += [f"m_do_65_w{w}_ratio" for w in range(2,20)]
drop_cols_refugees += [f"z_do_65_w{w}_ratio" for w in range(2,20)]

drop_cols_no_refugees = drop_cols_refugees.copy()
drop_cols_no_refugees += ["m_do_65", "z_do_65", "m_do_65_ratio", "z_do_65_ratio"]

No lags at all, no ratios

In [399]:
drop_cols_refugees =  ["celkem", "m_do_65_ratio", "z_do_65_ratio", "uchazeciOZamestnaniUoZ",	"uchazeciOZamestnaniUoZZeny_ratio", "uchazeciOZamestnaniUoZMuzi", "uchazeciOZamestnaniUoZMuzi_ratio"]
drop_cols_refugees += [f"celkem_w{w}" for w in range(2,20)]
drop_cols_refugees += [f"m_do_65_w{w}" for w in range(2,20)]
drop_cols_refugees += [f"z_do_65_w{w}" for w in range(2,20)]
drop_cols_refugees += [f"m_do_65_w{w}_ratio" for w in range(2,20)]
drop_cols_refugees += [f"z_do_65_w{w}_ratio" for w in range(2,20)]

drop_cols_no_refugees = drop_cols_refugees.copy()
drop_cols_no_refugees += ["m_do_65", "z_do_65"]