In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso

In [117]:
orig_df = pd.read_csv('../dataset/team_A_dataset.csv')

#all columns with NANs
print(orig_df.columns[orig_df.isna().any()].tolist())

#replacing NANs with zeros
df = orig_df.fillna(0)

avg_cols = ['avg_monthly_salary', 'monthly_min_wage', 'monthly_inflation_rate_wrt_last_year', 'reer', 'bilance',
            'avg_energy_price', 'avg_gasoline_price', 'avg_natural_gas_price']

avg_df = orig_df.groupby(['month', 'year'], as_index=False)[avg_cols].mean()

#sum_cols = ['general_thefts', 'break_in_thefts', 'uchazeciOZamestnaniUoZ', 'uchazeciOZamestnaniUoZZeny',
#            'noveHlaseniUchazeci', 'noveHlasenaAUvolnenaVPM', 'obsazenaAZrusenaVPM', 'absolventiSkolAMladistvi',
#            'z_do_65_w19', 'm_do_65', 'celkem']

#ignoring 'celkem' and 'uchazeciOZamestnaniUoZ'
sum_cols = ['general_thefts', 'break_in_thefts', 'uchazeciOZamestnaniUoZZeny',
           'noveHlaseniUchazeci', 'noveHlasenaAUvolnenaVPM', 'obsazenaAZrusenaVPM', 'absolventiSkolAMladistvi',
           'z_do_65_w19', 'm_do_65_w19']

sum_df = orig_df.groupby(['month', 'year'], as_index=False)[sum_cols].sum()

df = pd.merge(avg_df, sum_df, on=['month', 'year'])

df['date'] = pd.to_datetime(df['year'].astype(str) + df['month'].astype(str), format='%Y%m')

df = df.set_index('date')
df = df.sort_index()

seasonal_cols = ['avg_monthly_salary', 'general_thefts', 'break_in_thefts', 'noveHlaseniUchazeci',
                  'absolventiSkolAMladistvi', 'noveHlasenaAUvolnenaVPM', 'obsazenaAZrusenaVPM']

for col in seasonal_cols:
    df[col + '_prev_year'] = df[col].shift(11)

#fill previous year columns for 2009 with 2009 year values
for i in range(len(df)):
    for col in seasonal_cols:
        if np.isnan(df.loc[df.index[i], col + '_prev_year']):
            df.loc[df.index[i], col + '_prev_year'] = df.loc[df.index[i], col]

#df = df.drop(columns=['month', 'year'])
            
war_df = df.iloc[158:, :] #Since March 2022
war_df.tail(5)

['celkem', 'm_do_65', 'z_do_65', 'celkem_w2', 'celkem_w3', 'celkem_w4', 'celkem_w5', 'celkem_w6', 'celkem_w7', 'celkem_w8', 'celkem_w9', 'celkem_w10', 'celkem_w11', 'celkem_w12', 'celkem_w13', 'celkem_w14', 'celkem_w15', 'celkem_w16', 'celkem_w17', 'celkem_w18', 'celkem_w19', 'm_do_65_w2', 'm_do_65_w3', 'm_do_65_w4', 'm_do_65_w5', 'm_do_65_w6', 'm_do_65_w7', 'm_do_65_w8', 'm_do_65_w9', 'm_do_65_w10', 'm_do_65_w11', 'm_do_65_w12', 'm_do_65_w13', 'm_do_65_w14', 'm_do_65_w15', 'm_do_65_w16', 'm_do_65_w17', 'm_do_65_w18', 'm_do_65_w19', 'z_do_65_w2', 'z_do_65_w3', 'z_do_65_w4', 'z_do_65_w5', 'z_do_65_w6', 'z_do_65_w7', 'z_do_65_w8', 'z_do_65_w9', 'z_do_65_w10', 'z_do_65_w11', 'z_do_65_w12', 'z_do_65_w13', 'z_do_65_w14', 'z_do_65_w15', 'z_do_65_w16', 'z_do_65_w17', 'z_do_65_w18', 'z_do_65_w19', 'm_do_65_ratio', 'z_do_65_ratio', 'm_do_65_w2_ratio', 'm_do_65_w3_ratio', 'm_do_65_w4_ratio', 'm_do_65_w5_ratio', 'm_do_65_w6_ratio', 'm_do_65_w7_ratio', 'm_do_65_w8_ratio', 'm_do_65_w9_ratio', 'm_do

Unnamed: 0_level_0,month,year,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,bilance,avg_energy_price,avg_gasoline_price,avg_natural_gas_price,...,absolventiSkolAMladistvi,z_do_65_w19,m_do_65_w19,avg_monthly_salary_prev_year,general_thefts_prev_year,break_in_thefts_prev_year,noveHlaseniUchazeci_prev_year,absolventiSkolAMladistvi_prev_year,noveHlasenaAUvolnenaVPM_prev_year,obsazenaAZrusenaVPM_prev_year
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-05-01,5,2023,40930.142857,17300.0,11.1,117.36,9.37,150.59,1.6911,2.3395,...,11499,149692.0,82353.0,37914.357143,2927.0,2690.0,30851.0,8999.0,23787.0,41710.0
2023-06-01,6,2023,40930.142857,17300.0,9.7,117.36,19.236,150.19,1.6982,2.4959,...,9512,153834.0,84692.0,37790.571429,3201.0,2894.0,34846.0,9446.0,17232.0,23390.0
2023-07-01,7,2023,40473.285714,17300.0,8.8,115.41,-6.421,156.26,1.7325,2.6531,...,9928,158260.0,87720.0,37790.571429,3325.0,2912.0,42324.0,10420.0,22812.0,23735.0
2023-08-01,8,2023,40473.285714,17300.0,8.5,115.41,-5.628,152.26,1.7791,2.6613,...,10795,161187.0,89998.0,37790.571429,2969.0,2763.0,49340.0,15490.0,20650.0,26879.0
2023-09-01,9,2023,40473.285714,17300.0,6.9,115.41,11.874,138.98,1.7623,2.7372,...,16894,164537.0,92435.0,41179.142857,3075.0,3114.0,38161.0,15117.0,19413.0,28151.0


## Since start of the war refugees predictor present

In [114]:
n_splits = len(war_df) - 1
X = war_df.loc[:, war_df.columns != 'uchazeciOZamestnaniUoZZeny'].to_numpy()
y = war_df.loc[:, war_df.columns == 'uchazeciOZamestnaniUoZZeny'].to_numpy()

q = 0.95
exps = np.linspace(0, n_splits-1, num=n_splits)
weights = np.flip(np.power(q, exps))

best_err = [[10e9]]
best_alpha = -1
for alpha in np.logspace(-3, 3, num=7, base=10):
    weighted_error = 0
    tscv = TimeSeriesSplit(n_splits=n_splits, test_size=1)
    for train_index, test_index in tscv.split(X):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X[train_index, :])
        X_test = scaler.transform(X[test_index, :])

        model = Lasso(alpha=alpha, max_iter=10000000)
        model.fit(X_train, y[train_index])

        y_pred = model.predict(X_test)
        y_target = y[test_index]
        weighted_error += abs(y_target - y_pred) * weights[test_index-1]

    if weighted_error < best_err:
        best_err = weighted_error
        best_alpha = alpha

print("Best absolute weighted_error " + str(best_err[0][0]) + " for alpha: " + str(alpha))
    

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best absolute weighted_error 38187.93032146147 for alpha: 1000.0


  model = cd_fast.enet_coordinate_descent(


## Entire period without refugees

In [116]:
n_splits = len(war_df) - 1
X = df.loc[:, np.invert(df.columns.isin(['uchazeciOZamestnaniUoZZeny', 'z_do_65_w19', 'm_do_65_w19']))].to_numpy()
y = df.loc[:, df.columns == 'uchazeciOZamestnaniUoZZeny'].to_numpy()

q = 0.95
exps = np.linspace(0, n_splits-1, num=n_splits)
weights = np.flip(np.power(q, exps))

best_err = [[10e9]]
best_alpha = -1
for alpha in np.logspace(-3, 3, num=7, base=10):
    weighted_error = 0
    tscv = TimeSeriesSplit(n_splits=n_splits, test_size=1)
    for train_index, test_index in tscv.split(X):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X[train_index, :])
        X_test = scaler.transform(X[test_index, :])

        model = Lasso(alpha=alpha, max_iter=100000000)
        model.fit(X_train, y[train_index])

        y_pred = model.predict(X_test)
        y_target = y[test_index]
        weighted_error += abs(y_target - y_pred) * weights[test_index-1-len(df)+len(war_df)]

    if weighted_error < best_err:
        best_err = weighted_error
        best_alpha = alpha

print("Best absolute weighted_error " + str(best_err[0][0]) + " for alpha: " + str(alpha))

Best absolute weighted_error 60078.304464707944 for alpha: 1000.0
