# Mlynatom experiments

TODO add seasonal

In [185]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from scipy import optimize

SEED = 333

In [336]:
def transform_df(df: pd.DataFrame, encoder: OneHotEncoder) -> pd.DataFrame:
    # select all object columns
    obj_cols = df.select_dtypes('object')
    # apply transformation by encoder fitted on the training split
    transformed_cols = encoder.transform(obj_cols)
    # get new feature names
    feature_names = encoder.get_feature_names_out()
    # create new dataframe
    transformed_df = pd.DataFrame(
        transformed_cols, index=df.index, columns=feature_names).astype(bool)
    new_df = pd.concat(
        [df.select_dtypes(exclude='object'), transformed_df], axis=1)
    return new_df

def eval_tscv(tscv: TimeSeriesSplit, alpha: float, X, y, weights, verbose: bool = True):
    rmses = []
    maes = []
    for train_index, test_index in tscv.split(X):
        scaler = StandardScaler()
        if isinstance(X, pd.DataFrame):
            X_train = scaler.fit_transform(X.iloc[train_index, :])
            X_test = scaler.transform(X.iloc[test_index, :])
        else:
            X_train = scaler.fit_transform(X[train_index, :])
            X_test = scaler.transform(X[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]
        tmp_ridge = Ridge(alpha=alpha, random_state=SEED)
        tmp_ridge.fit(X_train, y_train)
        y_pred = tmp_ridge.predict(X_test)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        mae = mean_absolute_error(y_test, y_pred)
        if verbose:
            print("pred", tmp_ridge.predict(X_test),
                  "test:", y_test, "rmse", rmse, "mae", mae)
            print(tmp_ridge.coef_)
            print(tmp_ridge.intercept_)
            print()
        rmses.append(rmse)
        maes.append(mae)

    weighted_rmse = np.average(rmses, weights=weights)
    weighted_mae = np.average(maes, weights=weights)
    sum_weighted_rmse = np.sum(rmses * weights)
    print("rmse", weighted_rmse)
    print("mae", weighted_mae)
    print("sum weighted rmse",sum_weighted_rmse)

## Prepare Data

In [2]:
orig_df = pd.read_csv('../dataset/team_A_dataset.csv')
orig_df.head

<bound method NDFrame.head of       month  year kraj  general_thefts  break_in_thefts  avg_monthly_salary  \
0         1  2009  HKK           271.0            174.0             19132.0   
1         1  2009  JHC           275.0            181.0             19576.0   
2         1  2009  JHM           909.0            286.0             21065.0   
3         1  2009  KVK           198.0             96.0             18652.0   
4         1  2009  LBK           343.0            130.0             19653.0   
...     ...   ...  ...             ...              ...                 ...   
2473      9  2023  PLK           145.0            130.0             41220.0   
2474      9  2023  STC           262.0            254.0             42990.0   
2475      9  2023  ULK           243.0            200.0             39664.0   
2476      9  2023  VYS            62.0             39.0             39315.0   
2477      9  2023  ZLK            67.0             57.0             38596.0   

      celkem  m_do_65

In [3]:
display(orig_df.isna().sum())

month                               0
year                                0
kraj                                0
general_thefts                      0
break_in_thefts                     0
                                   ..
noveHlasenaAUvolnenaVPM             0
obsazenaAZrusenaVPM                 0
absolventiSkolAMladistvi            0
uchazeciOZamestnaniUoZMuzi_ratio    0
uchazeciOZamestnaniUoZZeny_ratio    0
Length: 117, dtype: int64

In [4]:
orig_df.describe()

Unnamed: 0,month,year,general_thefts,break_in_thefts,avg_monthly_salary,celkem,m_do_65,z_do_65,celkem_w2,celkem_w3,...,avg_natural_gas_price,uchazeciOZamestnaniUoZ,uchazeciOZamestnaniUoZZeny,uchazeciOZamestnaniUoZMuzi,noveHlaseniUchazeci,noveHlasenaAUvolnenaVPM,obsazenaAZrusenaVPM,absolventiSkolAMladistvi,uchazeciOZamestnaniUoZMuzi_ratio,uchazeciOZamestnaniUoZZeny_ratio
count,2478.0,2478.0,2478.0,2478.0,2478.0,266.0,266.0,266.0,266.0,266.0,...,2478.0,2478.0,2478.0,2478.0,2478.0,2478.0,2478.0,2478.0,2478.0,2478.0
mean,6.423729,2015.881356,489.349475,233.924939,28453.470944,1394.827068,347.5,618.56015,2762.87218,4105.74812,...,3.499783,27967.878531,14174.182809,13793.695722,3371.029459,1975.188458,1899.223164,1480.798628,0.486333,0.513667
std,3.429731,4.259787,681.754931,214.294818,6730.770886,5885.779007,785.677495,2600.293961,8979.240558,11466.193698,...,1.23672,17887.5741,8725.67403,9234.474479,1770.41761,1716.588043,1706.252205,1062.353914,0.030833,0.030833
min,1.0,2009.0,19.0,19.0,17704.0,-26447.0,-3008.0,-12127.0,-24614.0,-22643.0,...,1.7147,5329.0,2808.0,2413.0,817.0,0.0,0.0,165.0,0.408138,0.431466
25%,3.0,2012.0,139.0,80.0,22815.0,276.25,104.25,125.25,543.25,801.25,...,2.664,13350.25,6925.75,6407.5,1965.0,883.25,893.25,633.0,0.464473,0.491328
50%,6.0,2016.0,252.0,162.0,26431.5,509.5,177.5,247.5,1072.5,1629.0,...,3.2143,23996.0,12384.5,11521.0,2892.0,1529.0,1450.0,1244.0,0.486783,0.513217
75%,9.0,2020.0,479.0,282.0,33427.0,1317.0,446.5,660.0,3221.25,5672.75,...,4.08,34709.0,17910.5,17183.5,4402.5,2536.25,2360.0,1895.0,0.508672,0.535527
max,12.0,2023.0,4599.0,1107.0,53070.0,60636.0,6854.0,27163.0,74113.0,83545.0,...,8.7705,93714.0,43201.0,50572.0,12418.0,12114.0,17285.0,6058.0,0.568534,0.591862


In [5]:
orig_df.columns

Index(['month', 'year', 'kraj', 'general_thefts', 'break_in_thefts',
       'avg_monthly_salary', 'celkem', 'm_do_65', 'z_do_65', 'celkem_w2',
       ...
       'avg_natural_gas_price', 'uchazeciOZamestnaniUoZ',
       'uchazeciOZamestnaniUoZZeny', 'uchazeciOZamestnaniUoZMuzi',
       'noveHlaseniUchazeci', 'noveHlasenaAUvolnenaVPM', 'obsazenaAZrusenaVPM',
       'absolventiSkolAMladistvi', 'uchazeciOZamestnaniUoZMuzi_ratio',
       'uchazeciOZamestnaniUoZZeny_ratio'],
      dtype='object', length=117)

### Dataset version 1 - predicting uchazeciOZamestnaniUoZZeny_ratio, all rows, without refugee info -> big (economic) model

In [35]:
drop_cols = [f"celkem_w{w}" for w in range(2,20)]
drop_cols += [f"m_do_65_w{w}" for w in range(2,20)]
drop_cols += [f"z_do_65_w{w}" for w in range(2,20)]
drop_cols += [f"m_do_65_w{w}_ratio" for w in range(2,20)]
drop_cols += [f"z_do_65_w{w}_ratio" for w in range(2,20)]
drop_cols += ["celkem", "m_do_65", "z_do_65","m_do_65_ratio", "z_do_65_ratio", "uchazeciOZamestnaniUoZ",	"uchazeciOZamestnaniUoZZeny", "uchazeciOZamestnaniUoZMuzi", "uchazeciOZamestnaniUoZMuzi_ratio"]


In [36]:
df1 = orig_df.drop(columns=drop_cols)
display(df1.columns)
display(df1.info())

Index(['month', 'year', 'kraj', 'general_thefts', 'break_in_thefts',
       'avg_monthly_salary', 'monthly_min_wage',
       'monthly_inflation_rate_wrt_last_year', 'reer', 'bilance',
       'avg_energy_price', 'avg_gasoline_price', 'avg_natural_gas_price',
       'noveHlaseniUchazeci', 'noveHlasenaAUvolnenaVPM', 'obsazenaAZrusenaVPM',
       'absolventiSkolAMladistvi', 'uchazeciOZamestnaniUoZZeny_ratio'],
      dtype='object')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2478 entries, 0 to 2477
Data columns (total 18 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   month                                 2478 non-null   int64  
 1   year                                  2478 non-null   int64  
 2   kraj                                  2478 non-null   object 
 3   general_thefts                        2478 non-null   float64
 4   break_in_thefts                       2478 non-null   float64
 5   avg_monthly_salary                    2478 non-null   float64
 6   monthly_min_wage                      2478 non-null   int64  
 7   monthly_inflation_rate_wrt_last_year  2478 non-null   float64
 8   reer                                  2478 non-null   float64
 9   bilance                               2478 non-null   float64
 10  avg_energy_price                      2478 non-null   float64
 11  avg_gasoline_pric

None

In [37]:
df1

Unnamed: 0,month,year,kraj,general_thefts,break_in_thefts,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,bilance,avg_energy_price,avg_gasoline_price,avg_natural_gas_price,noveHlaseniUchazeci,noveHlasenaAUvolnenaVPM,obsazenaAZrusenaVPM,absolventiSkolAMladistvi,uchazeciOZamestnaniUoZZeny_ratio
0,1,2009,HKK,271.0,174.0,19132.0,8000,2.2,89.45,-2.730,70.209,1.1317,5.0707,4781,1417,2948,1221,0.489849
1,1,2009,JHC,275.0,181.0,19576.0,8000,2.2,89.45,-2.730,70.209,1.1317,5.0707,5538,1053,2289,1512,0.502256
2,1,2009,JHM,909.0,286.0,21065.0,8000,2.2,89.45,-2.730,70.209,1.1317,5.0707,9824,3140,4985,3078,0.505306
3,1,2009,KVK,198.0,96.0,18652.0,8000,2.2,89.45,-2.730,70.209,1.1317,5.0707,3325,496,1050,960,0.479642
4,1,2009,LBK,343.0,130.0,19653.0,8000,2.2,89.45,-2.730,70.209,1.1317,5.0707,4814,1134,1665,1126,0.496367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,PLK,145.0,130.0,41220.0,17300,6.9,115.41,11.874,138.980,1.7623,2.7372,2360,1473,1221,809,0.577460
2474,9,2023,STC,262.0,254.0,42990.0,17300,6.9,115.41,11.874,138.980,1.7623,2.7372,5929,2502,1777,2149,0.578063
2475,9,2023,ULK,243.0,200.0,39664.0,17300,6.9,115.41,11.874,138.980,1.7623,2.7372,4871,1380,1217,2084,0.574391
2476,9,2023,VYS,62.0,39.0,39315.0,17300,6.9,115.41,11.874,138.980,1.7623,2.7372,2006,885,865,769,0.563609


categorical column is kraj (month and year are probably not - in terms of time series)

In [41]:
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")

obj_cols = df1.select_dtypes('object')
encoder.fit(obj_cols)

df1 = transform_df(df1, encoder=encoder)

In [42]:
df1

Unnamed: 0,month,year,general_thefts,break_in_thefts,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,bilance,avg_energy_price,...,kraj_LBK,kraj_MSK,kraj_OLK,kraj_PAK,kraj_PHA,kraj_PLK,kraj_STC,kraj_ULK,kraj_VYS,kraj_ZLK
0,1,2009,271.0,174.0,19132.0,8000,2.2,89.45,-2.730,70.209,...,False,False,False,False,False,False,False,False,False,False
1,1,2009,275.0,181.0,19576.0,8000,2.2,89.45,-2.730,70.209,...,False,False,False,False,False,False,False,False,False,False
2,1,2009,909.0,286.0,21065.0,8000,2.2,89.45,-2.730,70.209,...,False,False,False,False,False,False,False,False,False,False
3,1,2009,198.0,96.0,18652.0,8000,2.2,89.45,-2.730,70.209,...,False,False,False,False,False,False,False,False,False,False
4,1,2009,343.0,130.0,19653.0,8000,2.2,89.45,-2.730,70.209,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,145.0,130.0,41220.0,17300,6.9,115.41,11.874,138.980,...,False,False,False,False,False,True,False,False,False,False
2474,9,2023,262.0,254.0,42990.0,17300,6.9,115.41,11.874,138.980,...,False,False,False,False,False,False,True,False,False,False
2475,9,2023,243.0,200.0,39664.0,17300,6.9,115.41,11.874,138.980,...,False,False,False,False,False,False,False,True,False,False
2476,9,2023,62.0,39.0,39315.0,17300,6.9,115.41,11.874,138.980,...,False,False,False,False,False,False,False,False,True,False


In [43]:
df1.isna().sum()

month                                   0
year                                    0
general_thefts                          0
break_in_thefts                         0
avg_monthly_salary                      0
monthly_min_wage                        0
monthly_inflation_rate_wrt_last_year    0
reer                                    0
bilance                                 0
avg_energy_price                        0
avg_gasoline_price                      0
avg_natural_gas_price                   0
noveHlaseniUchazeci                     0
noveHlasenaAUvolnenaVPM                 0
obsazenaAZrusenaVPM                     0
absolventiSkolAMladistvi                0
uchazeciOZamestnaniUoZZeny_ratio        0
kraj_JHC                                0
kraj_JHM                                0
kraj_KVK                                0
kraj_LBK                                0
kraj_MSK                                0
kraj_OLK                                0
kraj_PAK                          

### Dataset version 2 - predicting uchazeciOZamestnaniUoZZeny_ratio, refugees rows -> refugee model

In [44]:
drop_cols2 = [f"celkem_w{w}" for w in range(2,20)]
drop_cols2 += [f"m_do_65_w{w}" for w in range(2,20)]
drop_cols2 += [f"z_do_65_w{w}" for w in range(2,20)]
drop_cols2 += ["celkem", "m_do_65", "z_do_65", "uchazeciOZamestnaniUoZ", "uchazeciOZamestnaniUoZZeny", "uchazeciOZamestnaniUoZMuzi", "uchazeciOZamestnaniUoZMuzi_ratio"]

In [51]:
df2 = orig_df.drop(columns=drop_cols2)

In [52]:
df2 = df2[(df2.year > 2022) | ((df2.year == 2022) & (df2.month > 2))]
df2

Unnamed: 0,month,year,kraj,general_thefts,break_in_thefts,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,m_do_65_ratio,...,z_do_65_w19_ratio,bilance,avg_energy_price,avg_gasoline_price,avg_natural_gas_price,noveHlaseniUchazeci,noveHlasenaAUvolnenaVPM,obsazenaAZrusenaVPM,absolventiSkolAMladistvi,uchazeciOZamestnaniUoZZeny_ratio
2212,3,2022,HKK,60.0,53.0,34689.0,16200,12.7,107.78,0.117205,...,0.417727,-12.741,187.10,1.9464,4.9772,1451,2010,2094,527,0.507011
2213,3,2022,JHC,113.0,71.0,33998.0,16200,12.7,107.78,0.123191,...,0.430853,-12.741,187.10,1.9464,4.9772,1616,3249,4076,521,0.520700
2214,3,2022,JHM,224.0,327.0,37027.0,16200,12.7,107.78,0.129461,...,0.426214,-12.741,187.10,1.9464,4.9772,3383,6761,6142,1624,0.511907
2215,3,2022,KVK,54.0,63.0,32424.0,16200,12.7,107.78,0.117503,...,0.437780,-12.741,187.10,1.9464,4.9772,1027,2096,1588,372,0.532359
2216,3,2022,LBK,129.0,116.0,33745.0,16200,12.7,107.78,0.095375,...,0.436593,-12.741,187.10,1.9464,4.9772,1562,3192,3957,566,0.532358
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,PLK,145.0,130.0,41220.0,17300,6.9,115.41,0.277383,...,0.489928,11.874,138.98,1.7623,2.7372,2360,1473,1221,809,0.577460
2474,9,2023,STC,262.0,254.0,42990.0,17300,6.9,115.41,0.338071,...,0.433166,11.874,138.98,1.7623,2.7372,5929,2502,1777,2149,0.578063
2475,9,2023,ULK,243.0,200.0,39664.0,17300,6.9,115.41,0.430108,...,0.408651,11.874,138.98,1.7623,2.7372,4871,1380,1217,2084,0.574391
2476,9,2023,VYS,62.0,39.0,39315.0,17300,6.9,115.41,0.350299,...,0.424424,11.874,138.98,1.7623,2.7372,2006,885,865,769,0.563609


In [53]:
from sklearn.preprocessing import OneHotEncoder


encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")

obj_cols = df2.select_dtypes('object')
encoder.fit(obj_cols)

df2 = transform_df(df2, encoder=encoder)

In [95]:
display(df2)

Unnamed: 0,month,year,general_thefts,break_in_thefts,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,m_do_65_ratio,z_do_65_ratio,...,kraj_LBK,kraj_MSK,kraj_OLK,kraj_PAK,kraj_PHA,kraj_PLK,kraj_STC,kraj_ULK,kraj_VYS,kraj_ZLK
2212,3,2022,60.0,53.0,34689.0,16200,12.7,107.78,0.117205,0.417727,...,False,False,False,False,False,False,False,False,False,False
2213,3,2022,113.0,71.0,33998.0,16200,12.7,107.78,0.123191,0.430853,...,False,False,False,False,False,False,False,False,False,False
2214,3,2022,224.0,327.0,37027.0,16200,12.7,107.78,0.129461,0.426214,...,False,False,False,False,False,False,False,False,False,False
2215,3,2022,54.0,63.0,32424.0,16200,12.7,107.78,0.117503,0.437780,...,False,False,False,False,False,False,False,False,False,False
2216,3,2022,129.0,116.0,33745.0,16200,12.7,107.78,0.095375,0.436593,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,145.0,130.0,41220.0,17300,6.9,115.41,0.277383,0.549075,...,False,False,False,False,False,True,False,False,False,False
2474,9,2023,262.0,254.0,42990.0,17300,6.9,115.41,0.338071,0.509645,...,False,False,False,False,False,False,True,False,False,False
2475,9,2023,243.0,200.0,39664.0,17300,6.9,115.41,0.430108,0.370968,...,False,False,False,False,False,False,False,True,False,False
2476,9,2023,62.0,39.0,39315.0,17300,6.9,115.41,0.350299,0.488024,...,False,False,False,False,False,False,False,False,True,False


### Dataset version 1.2 predicting uchazeciOZamestnaniUoZZeny, all rows, without refugee info -> big (economic) model

In [190]:
drop_cols = [f"celkem_w{w}" for w in range(2,20)]
drop_cols += [f"m_do_65_w{w}" for w in range(2,20)]
drop_cols += [f"z_do_65_w{w}" for w in range(2,20)]
drop_cols += [f"m_do_65_w{w}_ratio" for w in range(2,20)]
drop_cols += [f"z_do_65_w{w}_ratio" for w in range(2,20)]
drop_cols += ["celkem", "m_do_65", "z_do_65","m_do_65_ratio", "z_do_65_ratio", "uchazeciOZamestnaniUoZ",	"uchazeciOZamestnaniUoZZeny_ratio", "uchazeciOZamestnaniUoZMuzi", "uchazeciOZamestnaniUoZMuzi_ratio"]

In [191]:
df12 = orig_df.drop(columns=drop_cols)
display(df12.columns)
display(df12.info())

Index(['month', 'year', 'kraj', 'general_thefts', 'break_in_thefts',
       'avg_monthly_salary', 'monthly_min_wage',
       'monthly_inflation_rate_wrt_last_year', 'reer', 'bilance',
       'avg_energy_price', 'avg_gasoline_price', 'avg_natural_gas_price',
       'uchazeciOZamestnaniUoZZeny', 'noveHlaseniUchazeci',
       'noveHlasenaAUvolnenaVPM', 'obsazenaAZrusenaVPM',
       'absolventiSkolAMladistvi'],
      dtype='object')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2478 entries, 0 to 2477
Data columns (total 18 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   month                                 2478 non-null   int64  
 1   year                                  2478 non-null   int64  
 2   kraj                                  2478 non-null   object 
 3   general_thefts                        2478 non-null   float64
 4   break_in_thefts                       2478 non-null   float64
 5   avg_monthly_salary                    2478 non-null   float64
 6   monthly_min_wage                      2478 non-null   int64  
 7   monthly_inflation_rate_wrt_last_year  2478 non-null   float64
 8   reer                                  2478 non-null   float64
 9   bilance                               2478 non-null   float64
 10  avg_energy_price                      2478 non-null   float64
 11  avg_gasoline_pric

None

In [192]:
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")

obj_cols = df12.select_dtypes('object')
encoder.fit(obj_cols)

df12 = transform_df(df12, encoder=encoder)

### Dataset version 2.2 - predicting uchazeciOZamestnaniUoZZeny, refugees rows -> refugee model

In [200]:
drop_cols2 = [f"celkem_w{w}" for w in range(2,20)]
drop_cols2 += [f"m_do_65_w{w}" for w in range(2,20)]
drop_cols2 += [f"z_do_65_w{w}" for w in range(2,20)]
drop_cols2 += ["celkem", "m_do_65", "z_do_65", "uchazeciOZamestnaniUoZ", "uchazeciOZamestnaniUoZZeny_ratio", "uchazeciOZamestnaniUoZMuzi", "uchazeciOZamestnaniUoZMuzi_ratio"]

In [201]:
df22 = orig_df.drop(columns=drop_cols2)
df22 = df22[(df22.year > 2022) | ((df22.year == 2022) & (df22.month > 2))]
df22

Unnamed: 0,month,year,kraj,general_thefts,break_in_thefts,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,m_do_65_ratio,...,z_do_65_w19_ratio,bilance,avg_energy_price,avg_gasoline_price,avg_natural_gas_price,uchazeciOZamestnaniUoZZeny,noveHlaseniUchazeci,noveHlasenaAUvolnenaVPM,obsazenaAZrusenaVPM,absolventiSkolAMladistvi
2212,3,2022,HKK,60.0,53.0,34689.0,16200,12.7,107.78,0.117205,...,0.417727,-12.741,187.10,1.9464,4.9772,5315,1451,2010,2094,527
2213,3,2022,JHC,113.0,71.0,33998.0,16200,12.7,107.78,0.123191,...,0.430853,-12.741,187.10,1.9464,4.9772,6163,1616,3249,4076,521
2214,3,2022,JHM,224.0,327.0,37027.0,16200,12.7,107.78,0.129461,...,0.426214,-12.741,187.10,1.9464,4.9772,16681,3383,6761,6142,1624
2215,3,2022,KVK,54.0,63.0,32424.0,16200,12.7,107.78,0.117503,...,0.437780,-12.741,187.10,1.9464,4.9772,4409,1027,2096,1588,372
2216,3,2022,LBK,129.0,116.0,33745.0,16200,12.7,107.78,0.095375,...,0.436593,-12.741,187.10,1.9464,4.9772,5931,1562,3192,3957,566
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,PLK,145.0,130.0,41220.0,17300,6.9,115.41,0.277383,...,0.489928,11.874,138.98,1.7623,2.7372,6538,2360,1473,1221,809
2474,9,2023,STC,262.0,254.0,42990.0,17300,6.9,115.41,0.338071,...,0.433166,11.874,138.98,1.7623,2.7372,17887,5929,2502,1777,2149
2475,9,2023,ULK,243.0,200.0,39664.0,17300,6.9,115.41,0.430108,...,0.408651,11.874,138.98,1.7623,2.7372,17728,4871,1380,1217,2084
2476,9,2023,VYS,62.0,39.0,39315.0,17300,6.9,115.41,0.350299,...,0.424424,11.874,138.98,1.7623,2.7372,5303,2006,885,865,769


In [202]:
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")

obj_cols = df22.select_dtypes('object')
encoder.fit(obj_cols)

df22 = transform_df(df22, encoder=encoder)

In [203]:
display(df22)

Unnamed: 0,month,year,general_thefts,break_in_thefts,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,m_do_65_ratio,z_do_65_ratio,...,kraj_LBK,kraj_MSK,kraj_OLK,kraj_PAK,kraj_PHA,kraj_PLK,kraj_STC,kraj_ULK,kraj_VYS,kraj_ZLK
2212,3,2022,60.0,53.0,34689.0,16200,12.7,107.78,0.117205,0.417727,...,False,False,False,False,False,False,False,False,False,False
2213,3,2022,113.0,71.0,33998.0,16200,12.7,107.78,0.123191,0.430853,...,False,False,False,False,False,False,False,False,False,False
2214,3,2022,224.0,327.0,37027.0,16200,12.7,107.78,0.129461,0.426214,...,False,False,False,False,False,False,False,False,False,False
2215,3,2022,54.0,63.0,32424.0,16200,12.7,107.78,0.117503,0.437780,...,False,False,False,False,False,False,False,False,False,False
2216,3,2022,129.0,116.0,33745.0,16200,12.7,107.78,0.095375,0.436593,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,145.0,130.0,41220.0,17300,6.9,115.41,0.277383,0.549075,...,False,False,False,False,False,True,False,False,False,False
2474,9,2023,262.0,254.0,42990.0,17300,6.9,115.41,0.338071,0.509645,...,False,False,False,False,False,False,True,False,False,False
2475,9,2023,243.0,200.0,39664.0,17300,6.9,115.41,0.430108,0.370968,...,False,False,False,False,False,False,False,True,False,False
2476,9,2023,62.0,39.0,39315.0,17300,6.9,115.41,0.350299,0.488024,...,False,False,False,False,False,False,False,False,True,False


### Dataset version 3 - predicting uchazeciOZamestnaniUoZZeny, all rows, imputation on refugee rows in years before -> refugee model

In [298]:
drop_cols2 = [f"celkem_w{w}" for w in range(2,20)]
drop_cols2 += [f"m_do_65_w{w}" for w in range(2,20)]
drop_cols2 += [f"z_do_65_w{w}" for w in range(2,20)]
drop_cols2 += ["celkem", "m_do_65", "z_do_65", "uchazeciOZamestnaniUoZ", "uchazeciOZamestnaniUoZZeny_ratio", "uchazeciOZamestnaniUoZMuzi", "uchazeciOZamestnaniUoZMuzi_ratio"]

In [299]:
df3 = orig_df.drop(columns=drop_cols2)
df3

Unnamed: 0,month,year,kraj,general_thefts,break_in_thefts,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,m_do_65_ratio,...,z_do_65_w19_ratio,bilance,avg_energy_price,avg_gasoline_price,avg_natural_gas_price,uchazeciOZamestnaniUoZZeny,noveHlaseniUchazeci,noveHlasenaAUvolnenaVPM,obsazenaAZrusenaVPM,absolventiSkolAMladistvi
0,1,2009,HKK,271.0,174.0,19132.0,8000,2.2,89.45,,...,,-2.730,70.209,1.1317,5.0707,8445,4781,1417,2948,1221
1,1,2009,JHC,275.0,181.0,19576.0,8000,2.2,89.45,,...,,-2.730,70.209,1.1317,5.0707,10352,5538,1053,2289,1512
2,1,2009,JHM,909.0,286.0,21065.0,8000,2.2,89.45,,...,,-2.730,70.209,1.1317,5.0707,24333,9824,3140,4985,3078
3,1,2009,KVK,198.0,96.0,18652.0,8000,2.2,89.45,,...,,-2.730,70.209,1.1317,5.0707,7386,3325,496,1050,960
4,1,2009,LBK,343.0,130.0,19653.0,8000,2.2,89.45,,...,,-2.730,70.209,1.1317,5.0707,9563,4814,1134,1665,1126
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,PLK,145.0,130.0,41220.0,17300,6.9,115.41,0.277383,...,0.489928,11.874,138.980,1.7623,2.7372,6538,2360,1473,1221,809
2474,9,2023,STC,262.0,254.0,42990.0,17300,6.9,115.41,0.338071,...,0.433166,11.874,138.980,1.7623,2.7372,17887,5929,2502,1777,2149
2475,9,2023,ULK,243.0,200.0,39664.0,17300,6.9,115.41,0.430108,...,0.408651,11.874,138.980,1.7623,2.7372,17728,4871,1380,1217,2084
2476,9,2023,VYS,62.0,39.0,39315.0,17300,6.9,115.41,0.350299,...,0.424424,11.874,138.980,1.7623,2.7372,5303,2006,885,865,769


In [301]:
df3 = df3.fillna(0)
df3

Unnamed: 0,month,year,kraj,general_thefts,break_in_thefts,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,m_do_65_ratio,...,z_do_65_w19_ratio,bilance,avg_energy_price,avg_gasoline_price,avg_natural_gas_price,uchazeciOZamestnaniUoZZeny,noveHlaseniUchazeci,noveHlasenaAUvolnenaVPM,obsazenaAZrusenaVPM,absolventiSkolAMladistvi
0,1,2009,HKK,271.0,174.0,19132.0,8000,2.2,89.45,0.000000,...,0.000000,-2.730,70.209,1.1317,5.0707,8445,4781,1417,2948,1221
1,1,2009,JHC,275.0,181.0,19576.0,8000,2.2,89.45,0.000000,...,0.000000,-2.730,70.209,1.1317,5.0707,10352,5538,1053,2289,1512
2,1,2009,JHM,909.0,286.0,21065.0,8000,2.2,89.45,0.000000,...,0.000000,-2.730,70.209,1.1317,5.0707,24333,9824,3140,4985,3078
3,1,2009,KVK,198.0,96.0,18652.0,8000,2.2,89.45,0.000000,...,0.000000,-2.730,70.209,1.1317,5.0707,7386,3325,496,1050,960
4,1,2009,LBK,343.0,130.0,19653.0,8000,2.2,89.45,0.000000,...,0.000000,-2.730,70.209,1.1317,5.0707,9563,4814,1134,1665,1126
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,PLK,145.0,130.0,41220.0,17300,6.9,115.41,0.277383,...,0.489928,11.874,138.980,1.7623,2.7372,6538,2360,1473,1221,809
2474,9,2023,STC,262.0,254.0,42990.0,17300,6.9,115.41,0.338071,...,0.433166,11.874,138.980,1.7623,2.7372,17887,5929,2502,1777,2149
2475,9,2023,ULK,243.0,200.0,39664.0,17300,6.9,115.41,0.430108,...,0.408651,11.874,138.980,1.7623,2.7372,17728,4871,1380,1217,2084
2476,9,2023,VYS,62.0,39.0,39315.0,17300,6.9,115.41,0.350299,...,0.424424,11.874,138.980,1.7623,2.7372,5303,2006,885,865,769


In [302]:
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")

obj_cols = df3.select_dtypes('object')
encoder.fit(obj_cols)

df3 = transform_df(df3, encoder=encoder)

### Dataset version 4 - predicting uchazeciOZamestnaniUoZZeny, refugee rows -> small economic model

In [319]:
drop_cols = [f"celkem_w{w}" for w in range(2,20)]
drop_cols += [f"m_do_65_w{w}" for w in range(2,20)]
drop_cols += [f"z_do_65_w{w}" for w in range(2,20)]
drop_cols += [f"m_do_65_w{w}_ratio" for w in range(2,20)]
drop_cols += [f"z_do_65_w{w}_ratio" for w in range(2,20)]
drop_cols += ["celkem", "m_do_65", "z_do_65","m_do_65_ratio", "z_do_65_ratio", "uchazeciOZamestnaniUoZ",	"uchazeciOZamestnaniUoZZeny_ratio", "uchazeciOZamestnaniUoZMuzi", "uchazeciOZamestnaniUoZMuzi_ratio"]

In [320]:
df4 = orig_df.drop(columns=drop_cols)
df4 = df4[(df4.year > 2022) | ((df4.year == 2022) & (df4.month > 2))]
df4

Unnamed: 0,month,year,kraj,general_thefts,break_in_thefts,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,bilance,avg_energy_price,avg_gasoline_price,avg_natural_gas_price,uchazeciOZamestnaniUoZZeny,noveHlaseniUchazeci,noveHlasenaAUvolnenaVPM,obsazenaAZrusenaVPM,absolventiSkolAMladistvi
2212,3,2022,HKK,60.0,53.0,34689.0,16200,12.7,107.78,-12.741,187.10,1.9464,4.9772,5315,1451,2010,2094,527
2213,3,2022,JHC,113.0,71.0,33998.0,16200,12.7,107.78,-12.741,187.10,1.9464,4.9772,6163,1616,3249,4076,521
2214,3,2022,JHM,224.0,327.0,37027.0,16200,12.7,107.78,-12.741,187.10,1.9464,4.9772,16681,3383,6761,6142,1624
2215,3,2022,KVK,54.0,63.0,32424.0,16200,12.7,107.78,-12.741,187.10,1.9464,4.9772,4409,1027,2096,1588,372
2216,3,2022,LBK,129.0,116.0,33745.0,16200,12.7,107.78,-12.741,187.10,1.9464,4.9772,5931,1562,3192,3957,566
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,9,2023,PLK,145.0,130.0,41220.0,17300,6.9,115.41,11.874,138.98,1.7623,2.7372,6538,2360,1473,1221,809
2474,9,2023,STC,262.0,254.0,42990.0,17300,6.9,115.41,11.874,138.98,1.7623,2.7372,17887,5929,2502,1777,2149
2475,9,2023,ULK,243.0,200.0,39664.0,17300,6.9,115.41,11.874,138.98,1.7623,2.7372,17728,4871,1380,1217,2084
2476,9,2023,VYS,62.0,39.0,39315.0,17300,6.9,115.41,11.874,138.98,1.7623,2.7372,5303,2006,885,865,769


In [321]:
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")

obj_cols = df4.select_dtypes('object')
encoder.fit(obj_cols)

df4 = transform_df(df4, encoder=encoder)

## Ridge Regression

### df1

In [97]:
n_splits = len(df2) - 1
n_splits
y_col_name = "uchazeciOZamestnaniUoZZeny_ratio"
X = df1.drop(columns=y_col_name).to_numpy()
y = df1[y_col_name].to_numpy()

q = 1
exps = np.linspace(0, n_splits-1, num=n_splits)
weights = np.flip(np.power(q, exps))
weights

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [98]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=1)

def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X[train_index, :])
        X_test = scaler.transform(X[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = Ridge(alpha=alpha, random_state=SEED)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x
display(res)


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )


In [104]:
rmses = []
maes = []
for train_index, test_index in tscv.split(X):
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X[train_index, :])
    X_test = scaler.transform(X[test_index, :])
    y_train = y[train_index]
    y_test = y[test_index]
    tmp_ridge = Ridge(alpha=best_alpha, random_state=SEED)
    tmp_ridge.fit(X_train, y_train)
    y_pred = tmp_ridge.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mae = mean_absolute_error(y_test, y_pred)
    print("pred", tmp_ridge.predict(X_test),
          "test:", y_test, "rmse", rmse, "mae", mae)
    rmses.append(rmse)
    maes.append(mae)

weighted_rmse = np.average(rmses, weights=weights)
weighted_mae = np.average(maes, weights=weights)
print(weighted_rmse)
print(weighted_mae)

pred [0.52503995] test: [0.52069956] rmse 0.004340394110931012 mae 0.004340394110931012
pred [0.5162329] test: [0.51190695] rmse 0.004325946848066309 mae 0.004325946848066309
pred [0.49292837] test: [0.53235933] rmse 0.03943095884244707 mae 0.03943095884244707
pred [0.52931933] test: [0.53235796] rmse 0.003038627351448664 mae 0.003038627351448664
pred [0.49777629] test: [0.48715286] rmse 0.010623423629164519 mae 0.010623423629164519
pred [0.50681871] test: [0.50655175] rmse 0.00026696223861355506 mae 0.00026696223861355506
pred [0.51373343] test: [0.51663405] rmse 0.0029006161104331296 mae 0.0029006161104331296
pred [0.50905451] test: [0.53051357] rmse 0.021459057489093714 mae 0.021459057489093714
pred [0.53307562] test: [0.53394841] rmse 0.0008727860771845508 mae 0.0008727860771845508
pred [0.54000142] test: [0.54000142] rmse 2.5654922630735655e-11 mae 2.5654922630735655e-11
pred [0.52075008] test: [0.54799567] rmse 0.027245591060119367 mae 0.027245591060119367
pred [0.51932986] test:

In [78]:
res

 message: Solution found.
 success: True
  status: 0
     fun: [ 5.378e+00]
       x: [ 4.239e+01]
     nit: 33
    nfev: 33

In [146]:
from sklearn.model_selection import train_test_split
import statsmodels.api as sma

X = df1.drop(columns=y_col_name).to_numpy()
X = sma.add_constant(X)
y = df1[y_col_name].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=84, shuffle=False)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [147]:
y_train

array([0.48984919, 0.50225608, 0.50530578, ..., 0.56236532, 0.51740295,
       0.50510294])

In [148]:
X_train

array([[ 0.        , -1.56048665, -1.61150444, ..., -0.2773501 ,
        -0.2773501 , -0.2773501 ],
       [ 0.        , -1.56048665, -1.61150444, ..., -0.2773501 ,
        -0.2773501 , -0.2773501 ],
       [ 0.        , -1.56048665, -1.61150444, ..., -0.2773501 ,
        -0.2773501 , -0.2773501 ],
       ...,
       [ 0.        , -0.98477313,  1.79056049, ...,  3.60555128,
        -0.2773501 , -0.2773501 ],
       [ 0.        , -0.98477313,  1.79056049, ..., -0.2773501 ,
         3.60555128, -0.2773501 ],
       [ 0.        , -0.98477313,  1.79056049, ..., -0.2773501 ,
        -0.2773501 ,  3.60555128]])

In [149]:
best_alpha

31.333883214358604

In [150]:
from sklearn.metrics import r2_score


model = sma.OLS(endog=y_train, exog=X_train)
#fit = model.fit_regularized(alpha=best_alpha, L1_wt= 0)
fit = model.fit()
cmp = y_test, fit.predict(X_test)
mse = mean_squared_error(*cmp)
mae = mean_absolute_error(*cmp)
r2 = r2_score(*cmp)

print(mse, mae, r2)
fit.summary()

0.306420352563224 0.553398020485247 -775.8103348602441


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.002
Model:,OLS,Adj. R-squared (uncentered):,-0.01
Method:,Least Squares,F-statistic:,0.1635
Date:,"Tue, 26 Dec 2023",Prob (F-statistic):,1.0
Time:,13:29:40,Log-Likelihood:,-1795.9
No. Observations:,2394,AIC:,3650.0
Df Residuals:,2365,BIC:,3818.0
Df Model:,29,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.914e-19,2.43e-18,-0.120,0.905,-5.07e-18,4.48e-18
x1,0.0156,0.018,0.862,0.389,-0.020,0.051
x2,0.0144,0.069,0.210,0.834,-0.120,0.149
x3,-0.0020,0.030,-0.066,0.947,-0.061,0.057
x4,-0.0049,0.025,-0.192,0.848,-0.055,0.045
x5,-0.0203,0.085,-0.240,0.811,-0.187,0.146
x6,-0.0035,0.094,-0.038,0.970,-0.187,0.180
x7,0.0084,0.030,0.275,0.783,-0.051,0.068
x8,0.0004,0.045,0.008,0.994,-0.087,0.088

0,1,2,3
Omnibus:,11.072,Durbin-Watson:,0.001
Prob(Omnibus):,0.004,Jarque-Bera (JB):,11.196
Skew:,-0.167,Prob(JB):,0.00371
Kurtosis:,2.965,Cond. No.,1.5e+16


### df12

In [341]:
n_splits = len(df2) - 1
n_splits
y_col_name = "uchazeciOZamestnaniUoZZeny"
X = df12.drop(columns=y_col_name).to_numpy()
y = df12[y_col_name].to_numpy()

q = 1
exps = np.linspace(0, n_splits-1, num=n_splits)
weights = np.flip(np.power(q, exps))
weights

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [342]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=1)

def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X[train_index, :])
        X_test = scaler.transform(X[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = Ridge(alpha=alpha, random_state=SEED)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x
display(res)


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )


 message: Solution found.
 success: True
  status: 0
     fun: 1489.241882698458
       x: 157.51893148599052
     nit: 34
    nfev: 34

In [345]:
eval_tscv(tscv, best_alpha, X, y, weights)

pred [4318.12112924] test: [6163] rmse 1844.8788707649655 mae 1844.8788707649655
[ -354.0232617     55.12018157  -531.05074921   125.72496106
   372.1335161   -543.52013927  -244.7116453  -1370.36712532
   177.41447709   208.12460256   896.10297077  -110.88255759
  1079.77117507    72.35219627  -262.45326212  3591.70905819
  -271.23894394  1656.89260384  -530.95900102  -128.46683345
  2767.3601654    256.33167428  -510.94112719  1238.67338213
  -234.15852396  1458.74858806  1913.29153155  -432.93319705
  -183.85775562]
14668.425666516066

pred [16650.19766437] test: [16681] rmse 30.802335627558932 mae 30.802335627558932
[ -354.24455149    55.7318608   -529.58565493   124.31901909
   369.78989968  -542.23723453  -239.22923111 -1375.4007729
   177.7284007    209.05276782   897.25457221  -109.09079557
  1078.94815367    71.17327067  -260.82185415  3592.03798755
  -269.1606981   1656.82721364  -530.95008837  -128.44802676
  2767.05765865   256.21499876  -511.00886235  1239.34484479
  -234.

### Ridge Regression df22

In [337]:
n_splits = len(df22) - 1
n_splits
y_col_name = "uchazeciOZamestnaniUoZZeny"
X = df22.drop(columns=y_col_name)
y = df22[y_col_name].to_numpy()

q = 0.95
exps = np.linspace(0, n_splits-1, num=n_splits)
weights = np.flip(np.power(q, exps))
weights

array([1.31532113e-06, 1.38454856e-06, 1.45741953e-06, 1.53412582e-06,
       1.61486929e-06, 1.69986241e-06, 1.78932885e-06, 1.88350405e-06,
       1.98263585e-06, 2.08698510e-06, 2.19682642e-06, 2.31244887e-06,
       2.43415670e-06, 2.56227021e-06, 2.69712654e-06, 2.83908057e-06,
       2.98850586e-06, 3.14579564e-06, 3.31136383e-06, 3.48564614e-06,
       3.66910120e-06, 3.86221179e-06, 4.06548609e-06, 4.27945905e-06,
       4.50469373e-06, 4.74178288e-06, 4.99135040e-06, 5.25405305e-06,
       5.53058216e-06, 5.82166543e-06, 6.12806887e-06, 6.45059881e-06,
       6.79010401e-06, 7.14747791e-06, 7.52366096e-06, 7.91964311e-06,
       8.33646643e-06, 8.77522783e-06, 9.23708192e-06, 9.72324413e-06,
       1.02349938e-05, 1.07736777e-05, 1.13407134e-05, 1.19375930e-05,
       1.25658874e-05, 1.32272499e-05, 1.39234209e-05, 1.46562326e-05,
       1.54276132e-05, 1.62395929e-05, 1.70943083e-05, 1.79940087e-05,
       1.89410618e-05, 1.99379598e-05, 2.09873261e-05, 2.20919222e-05,
      

In [338]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=1)

# do only every 14 -> do not leak data from other kraj in the month


def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X.iloc[train_index, :])
        X_test = scaler.transform(X.iloc[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = Ridge(alpha=alpha, random_state=SEED)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )


In [339]:
display(res)

 message: Solution found.
 success: True
  status: 0
     fun: 1097.7972674692373
       x: 12.734703799149864
     nit: 40
    nfev: 40

In [340]:
eval_tscv(tscv, best_alpha, X, y, weights=weights)

pred [5315.] test: [6163] rmse 848.0 mae 848.0
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
5315.0

pred [3761.32320664] test: [16681] rmse 12919.676793361654 mae 12919.676793361654
[ 0.          0.          8.09664771  8.09664771 -8.09664771  0.
  0.          0.          8.09664771  8.09664771  8.09664771  8.09664771
  8.09664771  8.09664771  8.09664771  8.09664771  8.09664771  8.09664771
  8.09664771  8.09664771  8.09664771  8.09664771  8.09664771  8.09664771
  8.09664771  8.09664771  8.09664771  8.09664771  8.09664771  8.09664771
  8.09664771  8.09664771  8.09664771  8.09664771  8.09664771  8.09664771
  8.09664771  8.09664771  8.09664771  8.09664771  8.09664771  8.09664771
  8.09664771  8.09664771  8.09664771  8.09664771  0.          0.
  0.          0.          8.09664771  8.09664771  8.09664771 -8.09664771
  8.09664771  0. 

pred [12401.8786165] test: [16213] rmse 3811.121383502701 mae 3811.121383502701
[   0.            0.          432.85398561  598.4082683   119.14456925
    0.            0.            0.          -36.80398827  -15.83933754
  -36.80398827  -36.80398827  -36.80398827  -36.80398827  -36.80398827
  -36.80398827  -36.80398827  -36.80398827  -36.80398827  -36.80398827
  -36.80398827  -36.80398827  -36.80398827  -36.80398827  -36.80398827
  -36.80398827  -36.80398827  -36.80398827  -15.83933754  -15.83933754
  -15.83933754  -15.83933754  -15.83933754  -15.83933754  -15.83933754
  -15.83933754  -15.83933754  -15.83933754  -15.83933754  -15.83933754
  -15.83933754  -15.83933754  -15.83933754  -15.83933754  -15.83933754
  -15.83933754    0.            0.            0.            0.
  911.87494575  524.03751179  242.35973008  986.97308379  -75.47382117
  589.87738767 -134.95012929 -354.29435632  790.18949252 -118.03342713
 -247.14176597  -11.44515946 -131.74823387   59.61349172    0.
    0.       

In [216]:
df22.columns

Index(['month', 'year', 'general_thefts', 'break_in_thefts',
       'avg_monthly_salary', 'monthly_min_wage',
       'monthly_inflation_rate_wrt_last_year', 'reer', 'm_do_65_ratio',
       'z_do_65_ratio', 'm_do_65_w2_ratio', 'm_do_65_w3_ratio',
       'm_do_65_w4_ratio', 'm_do_65_w5_ratio', 'm_do_65_w6_ratio',
       'm_do_65_w7_ratio', 'm_do_65_w8_ratio', 'm_do_65_w9_ratio',
       'm_do_65_w10_ratio', 'm_do_65_w11_ratio', 'm_do_65_w12_ratio',
       'm_do_65_w13_ratio', 'm_do_65_w14_ratio', 'm_do_65_w15_ratio',
       'm_do_65_w16_ratio', 'm_do_65_w17_ratio', 'm_do_65_w18_ratio',
       'm_do_65_w19_ratio', 'z_do_65_w2_ratio', 'z_do_65_w3_ratio',
       'z_do_65_w4_ratio', 'z_do_65_w5_ratio', 'z_do_65_w6_ratio',
       'z_do_65_w7_ratio', 'z_do_65_w8_ratio', 'z_do_65_w9_ratio',
       'z_do_65_w10_ratio', 'z_do_65_w11_ratio', 'z_do_65_w12_ratio',
       'z_do_65_w13_ratio', 'z_do_65_w14_ratio', 'z_do_65_w15_ratio',
       'z_do_65_w16_ratio', 'z_do_65_w17_ratio', 'z_do_65_w18_ratio'

In [214]:
from sklearn.model_selection import train_test_split
import statsmodels.api as sma

X = df22.drop(columns=y_col_name).to_numpy(dtype=float)
X = sma.add_constant(X)
y = df22[y_col_name].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=84, shuffle=False)

#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)
X

array([[1.000e+00, 3.000e+00, 2.022e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+00, 3.000e+00, 2.022e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+00, 3.000e+00, 2.022e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [1.000e+00, 9.000e+00, 2.023e+03, ..., 1.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+00, 9.000e+00, 2.023e+03, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [1.000e+00, 9.000e+00, 2.023e+03, ..., 0.000e+00, 0.000e+00,
        1.000e+00]])

In [215]:
from sklearn.metrics import r2_score


#model = sma.OLS(endog=y_train, exog=X_train)
model = sma.RLM(endog=y_train, exog=X_train, M=sma.robust.norms.HuberT())
#fit = model.fit_regularized(alpha=best_alpha, L1_wt= 0)
fit = model.fit()
cmp = y_test, fit.predict(X_test)
mse = mean_squared_error(*cmp)
mae = mean_absolute_error(*cmp)
r2 = r2_score(*cmp)

print(mse, mae, r2)
fit.summary()

754650964.9366258 14904.609519244133 -21.388132825907917


0,1,2,3
Dep. Variable:,y,No. Observations:,182.0
Model:,RLM,Df Residuals:,127.0
Method:,IRLS,Df Model:,54.0
Norm:,HuberT,,
Scale Est.:,mad,,
Cov Type:,H1,,
Date:,"Tue, 26 Dec 2023",,
Time:,15:04:43,,
No. Iterations:,43,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.0008,0.004,-0.212,0.832,-0.008,0.007
x1,-45.3331,97.774,-0.464,0.643,-236.966,146.300
x2,-1.5983,7.537,-0.212,0.832,-16.371,13.175
x3,0.5706,0.711,0.803,0.422,-0.822,1.964
x4,-0.0924,0.759,-0.122,0.903,-1.580,1.395
x5,-0.0349,0.038,-0.908,0.364,-0.110,0.040
x6,-0.2526,1.488,-0.170,0.865,-3.169,2.664
x7,-130.5758,127.972,-1.020,0.308,-381.397,120.245
x8,190.6420,171.402,1.112,0.266,-145.300,526.584


In [None]:
df2.columns

Index(['month', 'year', 'general_thefts', 'break_in_thefts',
       'avg_monthly_salary', 'monthly_min_wage',
       'monthly_inflation_rate_wrt_last_year', 'reer', 'm_do_65_ratio',
       'z_do_65_ratio', 'm_do_65_w2_ratio', 'm_do_65_w3_ratio',
       'm_do_65_w4_ratio', 'm_do_65_w5_ratio', 'm_do_65_w6_ratio',
       'm_do_65_w7_ratio', 'm_do_65_w8_ratio', 'm_do_65_w9_ratio',
       'm_do_65_w10_ratio', 'm_do_65_w11_ratio', 'm_do_65_w12_ratio',
       'm_do_65_w13_ratio', 'm_do_65_w14_ratio', 'm_do_65_w15_ratio',
       'm_do_65_w16_ratio', 'm_do_65_w17_ratio', 'm_do_65_w18_ratio',
       'm_do_65_w19_ratio', 'z_do_65_w2_ratio', 'z_do_65_w3_ratio',
       'z_do_65_w4_ratio', 'z_do_65_w5_ratio', 'z_do_65_w6_ratio',
       'z_do_65_w7_ratio', 'z_do_65_w8_ratio', 'z_do_65_w9_ratio',
       'z_do_65_w10_ratio', 'z_do_65_w11_ratio', 'z_do_65_w12_ratio',
       'z_do_65_w13_ratio', 'z_do_65_w14_ratio', 'z_do_65_w15_ratio',
       'z_do_65_w16_ratio', 'z_do_65_w17_ratio', 'z_do_65_w18_ratio'

### df2

In [158]:
n_splits = len(df2) - 1
n_splits
y_col_name = "uchazeciOZamestnaniUoZZeny_ratio"
X = df2.drop(columns=y_col_name)
y = df2[y_col_name].to_numpy()

q = 1
exps = np.linspace(0, n_splits-1, num=n_splits)
weights = np.flip(np.power(q, exps))
weights

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [169]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=1)

# do only every 14 -> do not leak data from other kraj in the month


def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X.iloc[train_index, :])
        X_test = scaler.transform(X.iloc[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = Ridge(alpha=alpha, random_state=SEED)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )


In [167]:
display(res)

 message: Solution found.
 success: True
  status: 0
     fun: 0.021607479938802538
       x: 999.999970046681
     nit: 36
    nfev: 36

In [165]:
eval_tscv(tscv, best_alpha, X, y)

pred [0.50701135] test: [0.52069956] rmse 0.01368820895008993 mae 0.01368820895008993
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
pred [0.48674641] test: [0.51190695] rmse 0.025160547459305405 mae 0.025160547459305405
[ 0.          0.          0.00011098  0.00011098 -0.00011098  0.
  0.          0.          0.00011098  0.00011098  0.00011098  0.00011098
  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098
  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098
  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098
  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098
  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098  0.00011098
  0.00011098  0.00011098  0.00011098  0.00011098  0.          0.
  0.          0.          0.00011098  0.00011098  0.0

In [182]:
from sklearn.model_selection import train_test_split
import statsmodels.api as sma

X = df2.drop(columns=y_col_name).to_numpy(dtype=float)
X = sma.add_constant(X)
y = df2[y_col_name].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=84, shuffle=False)

#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)
X

array([[1.000e+00, 3.000e+00, 2.022e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+00, 3.000e+00, 2.022e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+00, 3.000e+00, 2.022e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [1.000e+00, 9.000e+00, 2.023e+03, ..., 1.000e+00, 0.000e+00,
        0.000e+00],
       [1.000e+00, 9.000e+00, 2.023e+03, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [1.000e+00, 9.000e+00, 2.023e+03, ..., 0.000e+00, 0.000e+00,
        1.000e+00]])

In [183]:
from sklearn.metrics import r2_score


#model = sma.OLS(endog=y_train, exog=X_train)
model = sma.RLM(endog=y_train, exog=X_train, M=sma.robust.norms.HuberT())
#fit = model.fit_regularized(alpha=best_alpha, L1_wt= 0)
fit = model.fit()
cmp = y_test, fit.predict(X_test)
mse = mean_squared_error(*cmp)
mae = mean_absolute_error(*cmp)
r2 = r2_score(*cmp)

print(mse, mae, r2)
fit.summary()

2.5723027266328016 0.6737336793644056 -6520.079052754033


0,1,2,3
Dep. Variable:,y,No. Observations:,182.0
Model:,RLM,Df Residuals:,127.0
Method:,IRLS,Df Model:,54.0
Norm:,HuberT,,
Scale Est.:,mad,,
Cov Type:,H1,,
Date:,"Tue, 26 Dec 2023",,
Time:,14:23:39,,
No. Iterations:,50,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,2.1e-07,1.29e-07,1.623,0.105,-4.37e-08,4.64e-07
x1,-0.0028,0.003,-0.822,0.411,-0.009,0.004
x2,0.0004,0.000,1.623,0.105,-8.76e-05,0.001
x3,1.625e-05,2.45e-05,0.664,0.507,-3.17e-05,6.43e-05
x4,1.422e-05,2.62e-05,0.544,0.587,-3.7e-05,6.55e-05
x5,2.022e-06,1.32e-06,1.526,0.127,-5.75e-07,4.62e-06
x6,5.545e-06,5.13e-05,0.108,0.914,-9.5e-05,0.000
x7,0.0053,0.004,1.208,0.227,-0.003,0.014
x8,-0.0065,0.006,-1.100,0.271,-0.018,0.005


In [173]:
df2.columns

Index(['month', 'year', 'general_thefts', 'break_in_thefts',
       'avg_monthly_salary', 'monthly_min_wage',
       'monthly_inflation_rate_wrt_last_year', 'reer', 'm_do_65_ratio',
       'z_do_65_ratio', 'm_do_65_w2_ratio', 'm_do_65_w3_ratio',
       'm_do_65_w4_ratio', 'm_do_65_w5_ratio', 'm_do_65_w6_ratio',
       'm_do_65_w7_ratio', 'm_do_65_w8_ratio', 'm_do_65_w9_ratio',
       'm_do_65_w10_ratio', 'm_do_65_w11_ratio', 'm_do_65_w12_ratio',
       'm_do_65_w13_ratio', 'm_do_65_w14_ratio', 'm_do_65_w15_ratio',
       'm_do_65_w16_ratio', 'm_do_65_w17_ratio', 'm_do_65_w18_ratio',
       'm_do_65_w19_ratio', 'z_do_65_w2_ratio', 'z_do_65_w3_ratio',
       'z_do_65_w4_ratio', 'z_do_65_w5_ratio', 'z_do_65_w6_ratio',
       'z_do_65_w7_ratio', 'z_do_65_w8_ratio', 'z_do_65_w9_ratio',
       'z_do_65_w10_ratio', 'z_do_65_w11_ratio', 'z_do_65_w12_ratio',
       'z_do_65_w13_ratio', 'z_do_65_w14_ratio', 'z_do_65_w15_ratio',
       'z_do_65_w16_ratio', 'z_do_65_w17_ratio', 'z_do_65_w18_ratio'

### Feature selection

In [224]:
from sklearn.feature_selection import SequentialFeatureSelector

In [278]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer


estimator = Ridge()
y_col_name = "uchazeciOZamestnaniUoZZeny"
X = df22.drop(columns=y_col_name)
y = df22[y_col_name].to_numpy()
scaler = StandardScaler()
X_trans = scaler.fit_transform(X)
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=1)
sfs = SequentialFeatureSelector(estimator=estimator, direction="forward", cv=tscv, scoring=make_scorer(mean_absolute_error))
sfs.fit(X_trans, y)

In [279]:
mask = sfs.get_support()
mask

array([ True,  True, False, False, False,  True, False,  True, False,
        True, False, False,  True,  True,  True, False, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
       False, False,  True,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False])

In [280]:
all_colls = sfs.feature_names_in_
all_colls

AttributeError: 'SequentialFeatureSelector' object has no attribute 'feature_names_in_'

In [281]:
#excluded columns
excluded_cols = all_colls[~mask]
excluded_cols

array(['general_thefts', 'break_in_thefts', 'avg_monthly_salary',
       'monthly_inflation_rate_wrt_last_year', 'm_do_65_ratio',
       'm_do_65_w2_ratio', 'm_do_65_w3_ratio', 'm_do_65_w7_ratio',
       'm_do_65_w8_ratio', 'm_do_65_w18_ratio', 'm_do_65_w19_ratio',
       'z_do_65_w2_ratio', 'z_do_65_w6_ratio', 'z_do_65_w7_ratio',
       'z_do_65_w8_ratio', 'avg_energy_price', 'avg_gasoline_price',
       'avg_natural_gas_price', 'noveHlaseniUchazeci',
       'noveHlasenaAUvolnenaVPM', 'obsazenaAZrusenaVPM',
       'absolventiSkolAMladistvi', 'kraj_JHM', 'kraj_KVK', 'kraj_LBK',
       'kraj_MSK', 'kraj_OLK', 'kraj_PAK', 'kraj_PHA', 'kraj_PLK',
       'kraj_STC', 'kraj_ULK', 'kraj_VYS', 'kraj_ZLK'], dtype=object)

In [282]:
features = sfs.get_feature_names_out()
features

array(['x0', 'x1', 'x5', 'x7', 'x9', 'x12', 'x13', 'x14', 'x17', 'x18',
       'x19', 'x20', 'x21', 'x22', 'x23', 'x24', 'x25', 'x29', 'x30',
       'x31', 'x35', 'x36', 'x37', 'x38', 'x39', 'x40', 'x41', 'x42',
       'x43', 'x44', 'x45', 'x46', 'x54'], dtype=object)

In [284]:
X_select = sfs.transform(X)



In [294]:
n_splits = len(df22) - 1
n_splits

q = 1
exps = np.linspace(0, n_splits-1, num=n_splits)
weights = np.flip(np.power(q, exps))
weights

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [295]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=1)

def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_select[train_index, :])
        X_test = scaler.transform(X_select[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = Ridge(alpha=alpha, random_state=SEED)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )


In [296]:
display(res)

 message: Solution found.
 success: True
  status: 0
     fun: 5038.862663229233
       x: 999.9999743212363
     nit: 36
    nfev: 36

In [297]:
eval_tscv(tscv, best_alpha, X_select, y, weights=weights)

pred [5315.] test: [6163] rmse 848.0 mae 848.0
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]
5315.0

pred [5771.55000087] test: [16681] rmse 10909.449999131135 mae 10909.449999131135
[0.         0.         0.         0.         0.80303032 0.80303032
 0.80303032 0.80303032 0.80303032 0.80303032 0.80303032 0.80303032
 0.80303032 0.80303032 0.80303032 0.80303032 0.80303032 0.80303032
 0.80303032 0.80303032 0.80303032 0.80303032 0.80303032 0.80303032
 0.80303032 0.80303032 0.80303032 0.80303032 0.80303032 0.80303032
 0.80303032 0.         0.80303032]
5739.0

pred [9317.5018309] test: [4409] rmse 4908.501830897749 mae 4908.501830897749
[ 0.          0.          0.          0.          3.1757762  13.43693166
 13.43693166 13.43693166 13.43693166 13.43693166 13.43693166 13.43693166
 13.43693166 13.43693166 13.43693166 13.43693166 13.43693166  3.1757762
  3.1757762   3.1757762   3.1757762   3.1757762   3.1757762   3.1757762
  3.1757762   3

decay changes nothing?

### df3

In [346]:
n_splits = len(df2) - 1
n_splits
y_col_name = "uchazeciOZamestnaniUoZZeny"
X = df3.drop(columns=y_col_name)
y = df3[y_col_name].to_numpy()

q = 1
exps = np.linspace(0, n_splits-1, num=n_splits)
weights = np.flip(np.power(q, exps))
weights

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [347]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=1)

def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X.iloc[train_index, :])
        X_test = scaler.transform(X.iloc[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = Ridge(alpha=alpha, random_state=SEED)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )


In [348]:
display(res)

 message: Solution found.
 success: True
  status: 0
     fun: 1090.445700974699
       x: 661.6935209330768
     nit: 27
    nfev: 27

In [349]:
eval_tscv(tscv, best_alpha, X, y, weights=weights)

pred [3656.3229979] test: [6163] rmse 2506.6770021036064 mae 2506.6770021036064
[-2.28481454e+02 -4.28272438e+01 -1.73285408e+02  1.89983516e+02
  1.09051818e+02 -5.17202293e+02 -2.54345566e+02 -9.39798682e+02
 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01
 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01
 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01
 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01
 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01
 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01
 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01
 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01
 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01 -2.36042694e-01
 -2.36042694e-01 -2.36042694e-01  1.51789748e+02  1.17233359e+01
  7.43377260e+02  2.45245387e+00  1.72809197e+03  1.45400619e+02
 -1.31393942e+02  2.91516131e+03 -4.44462207e+02  1.15115088e+03
 -6.334048

pred [7956.14164113] test: [7229] rmse 727.1416411263845 mae 727.1416411263845
[-2.27677037e+02 -4.40057636e+01 -1.72575016e+02  1.90030048e+02
  1.06393878e+02 -5.18412419e+02 -2.56267707e+02 -9.42686115e+02
  1.78738409e+00  1.25874886e+00  1.78738409e+00  1.78738409e+00
  1.78738409e+00  1.78738409e+00  1.78738409e+00  1.78738409e+00
  1.78738409e+00  1.78738409e+00  1.78738409e+00  1.78738409e+00
  1.78738409e+00  1.78738409e+00  1.78738409e+00  1.78738409e+00
  1.78738409e+00  1.78738409e+00  1.78738409e+00  1.78738409e+00
  1.25874886e+00  1.25874886e+00  1.25874886e+00  1.25874886e+00
  1.25874886e+00  1.25874886e+00  1.25874886e+00  1.25874886e+00
  1.25874886e+00  1.25874886e+00  1.25874886e+00  1.25874886e+00
  1.25874886e+00  1.25874886e+00  1.25874886e+00  1.25874886e+00
  1.25874886e+00  1.25874886e+00  1.52250529e+02  6.89041118e+00
  7.42061991e+02  3.33596860e+00  1.72876507e+03  1.45663009e+02
 -1.31010246e+02  2.91813261e+03 -4.43443855e+02  1.15134602e+03
 -6.3219519

### df4

In [350]:
n_splits = len(df2) - 1
n_splits
y_col_name = "uchazeciOZamestnaniUoZZeny"
X = df4.drop(columns=y_col_name)
y = df4[y_col_name].to_numpy()

q = 0.95
exps = np.linspace(0, n_splits-1, num=n_splits)
weights = np.flip(np.power(q, exps))
weights

array([1.31532113e-06, 1.38454856e-06, 1.45741953e-06, 1.53412582e-06,
       1.61486929e-06, 1.69986241e-06, 1.78932885e-06, 1.88350405e-06,
       1.98263585e-06, 2.08698510e-06, 2.19682642e-06, 2.31244887e-06,
       2.43415670e-06, 2.56227021e-06, 2.69712654e-06, 2.83908057e-06,
       2.98850586e-06, 3.14579564e-06, 3.31136383e-06, 3.48564614e-06,
       3.66910120e-06, 3.86221179e-06, 4.06548609e-06, 4.27945905e-06,
       4.50469373e-06, 4.74178288e-06, 4.99135040e-06, 5.25405305e-06,
       5.53058216e-06, 5.82166543e-06, 6.12806887e-06, 6.45059881e-06,
       6.79010401e-06, 7.14747791e-06, 7.52366096e-06, 7.91964311e-06,
       8.33646643e-06, 8.77522783e-06, 9.23708192e-06, 9.72324413e-06,
       1.02349938e-05, 1.07736777e-05, 1.13407134e-05, 1.19375930e-05,
       1.25658874e-05, 1.32272499e-05, 1.39234209e-05, 1.46562326e-05,
       1.54276132e-05, 1.62395929e-05, 1.70943083e-05, 1.79940087e-05,
       1.89410618e-05, 1.99379598e-05, 2.09873261e-05, 2.20919222e-05,
      

In [351]:
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=1)

def optimize_alpha(alpha):
    rmses = []
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X.iloc[train_index, :])
        X_test = scaler.transform(X.iloc[test_index, :])
        y_train = y[train_index]
        y_test = y[test_index]

        tmp_ridge = Ridge(alpha=alpha, random_state=SEED)
        tmp_ridge.fit(X_train, y_train)

        rmse = mean_squared_error(
            y_test, tmp_ridge.predict(X_test), squared=False)

        rmses.append(rmse)

    return np.average(rmses, weights=weights)


res = optimize.minimize_scalar(
    optimize_alpha, bounds=(0, 1000), options={"disp": True})
best_alpha = res.x


Optimization terminated successfully;
The returned value satisfies the termination criteria
(using xtol =  1e-05 )


In [352]:
display(res)

 message: Solution found.
 success: True
  status: 0
     fun: 266.74292541986154
       x: 6.638538074770203e-06
     nit: 39
    nfev: 39

In [353]:
eval_tscv(tscv, best_alpha, X, y, weights=weights)

pred [5315.] test: [6163] rmse 848.0 mae 848.0
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0.]
5315.0

pred [-10618.30449021] test: [16681] rmse 27299.30449021052 mae 27299.30449021052
[  0.           0.          52.99997801  52.99997801 -52.99997801
   0.           0.           0.           0.           0.
   0.           0.          52.99997801  52.99997801  52.99997801
 -52.99997801  52.99997801   0.           0.           0.
   0.           0.           0.           0.           0.
   0.           0.           0.           0.        ]
5739.0

pred [3528.6950233] test: [4409] rmse 880.3049767029606 mae 880.3049767029606
[   0.            0.          637.86631343  661.00733499  635.9554334
    0.            0.            0.            0.            0.
    0.            0.          660.87217243  648.18333241  593.80779534
  659.75843035 -297.47995705  659.9430179     0.            0.
    0.            0.            0.            0.            