In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso

In [8]:
orig_df = pd.read_csv('../dataset/team_A_dataset.csv')

#all columns with NANs
print(orig_df.columns[orig_df.isna().any()].tolist())

#replacing NANs with zeros
df = orig_df.fillna(0)

avg_cols = ['avg_monthly_salary', 'monthly_min_wage', 'monthly_inflation_rate_wrt_last_year', 'reer', 'bilance',
            'avg_energy_price', 'avg_gasoline_price', 'avg_natural_gas_price']

avg_df = orig_df.groupby(['month', 'year'], as_index=False)[avg_cols].mean()

#sum_cols = ['general_thefts', 'break_in_thefts', 'uchazeciOZamestnaniUoZ', 'uchazeciOZamestnaniUoZZeny',
#            'noveHlaseniUchazeci', 'noveHlasenaAUvolnenaVPM', 'obsazenaAZrusenaVPM', 'absolventiSkolAMladistvi',
#            'z_do_65_w19', 'm_do_65', 'celkem']

sum_cols = ['general_thefts', 'break_in_thefts', 'z_do_65_w19']

sum_df = orig_df.groupby(['month', 'year'], as_index=False)[sum_cols].sum()

df = pd.merge(avg_df, sum_df, on=['month', 'year'])

df['date'] = pd.to_datetime(df['year'].astype(str) + df['month'].astype(str), format='%Y%m')

df = df.set_index('date')
df = df.sort_index()

#df = df.drop(columns=['month', 'year'])

war_df = df.iloc[158:, :] #Since March 2022
war_df.tail(5)

['celkem', 'm_do_65', 'z_do_65', 'celkem_w2', 'celkem_w3', 'celkem_w4', 'celkem_w5', 'celkem_w6', 'celkem_w7', 'celkem_w8', 'celkem_w9', 'celkem_w10', 'celkem_w11', 'celkem_w12', 'celkem_w13', 'celkem_w14', 'celkem_w15', 'celkem_w16', 'celkem_w17', 'celkem_w18', 'celkem_w19', 'm_do_65_w2', 'm_do_65_w3', 'm_do_65_w4', 'm_do_65_w5', 'm_do_65_w6', 'm_do_65_w7', 'm_do_65_w8', 'm_do_65_w9', 'm_do_65_w10', 'm_do_65_w11', 'm_do_65_w12', 'm_do_65_w13', 'm_do_65_w14', 'm_do_65_w15', 'm_do_65_w16', 'm_do_65_w17', 'm_do_65_w18', 'm_do_65_w19', 'z_do_65_w2', 'z_do_65_w3', 'z_do_65_w4', 'z_do_65_w5', 'z_do_65_w6', 'z_do_65_w7', 'z_do_65_w8', 'z_do_65_w9', 'z_do_65_w10', 'z_do_65_w11', 'z_do_65_w12', 'z_do_65_w13', 'z_do_65_w14', 'z_do_65_w15', 'z_do_65_w16', 'z_do_65_w17', 'z_do_65_w18', 'z_do_65_w19', 'm_do_65_ratio', 'z_do_65_ratio', 'm_do_65_w2_ratio', 'm_do_65_w3_ratio', 'm_do_65_w4_ratio', 'm_do_65_w5_ratio', 'm_do_65_w6_ratio', 'm_do_65_w7_ratio', 'm_do_65_w8_ratio', 'm_do_65_w9_ratio', 'm_do

Unnamed: 0_level_0,month,year,avg_monthly_salary,monthly_min_wage,monthly_inflation_rate_wrt_last_year,reer,bilance,avg_energy_price,avg_gasoline_price,avg_natural_gas_price,general_thefts,break_in_thefts,z_do_65_w19
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2023-05-01,5,2023,40930.142857,17300.0,11.1,117.36,9.37,150.59,1.6911,2.3395,3017.0,2748.0,149692.0
2023-06-01,6,2023,40930.142857,17300.0,9.7,117.36,19.236,150.19,1.6982,2.4959,2915.0,2467.0,153834.0
2023-07-01,7,2023,40473.285714,17300.0,8.8,115.41,-6.421,156.26,1.7325,2.6531,3114.0,2513.0,158260.0
2023-08-01,8,2023,40473.285714,17300.0,8.5,115.41,-5.628,152.26,1.7791,2.6613,3263.0,2463.0,161187.0
2023-09-01,9,2023,40473.285714,17300.0,6.9,115.41,11.874,138.98,1.7623,2.7372,2956.0,2294.0,164537.0


In [27]:
n_splits = len(war_df) - 1
X = war_df.loc[:, df.columns != 'z_do_65_w19'].to_numpy()
y = war_df.loc[:, df.columns == 'z_do_65_w19'].to_numpy()

q = 0.95
exps = np.linspace(0, n_splits-1, num=n_splits)
weights = np.power(q, exps)

weighted_error = 0
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=1)
for train_index, test_index in tscv.split(X):
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X[train_index, :])
    X_test = scaler.transform(X[test_index, :])

    model = Lasso(alpha=1000, fit_intercept=False,max_iter=100000)
    model.fit(X_train, y[train_index])

    y_pred = model.predict(X_test)
    y_target = y[test_index]
    weighted_error += abs(y_target - y_pred) * weights[test_index-1]

print("Absolute weighted_error " + str(weighted_error[0][0]))
    

Absolute weighted_error 1837077.6537656875
