In [None]:
import pandas as pd
import numpy as np
from dsipts import TimeSeries, RNN, Attention,read_public_dataset, LinearTS, Persistent
import matplotlib.pyplot as plt
from datetime import timedelta

In [None]:
## Read the 'weather' public dataset.
## In `data` the time column is the time, y is the target, the others are covariates (in the past);
## `columns` is used further down as the list of past covariates.
## NOTE(review): the dataset folder is a hard-coded absolute path on the original author's
## machine; adapt it to your environment before running.
data, columns = read_public_dataset('/home/agobbi/Projects/ExpTS/data','weather')

In [None]:
## display the dataset that has just been read
data

In [None]:
model_to_use = 'linear' #one of: 'linear', 'attention', 'rnn'
use_covariates = False  #if False use only y, otherwise `columns` is passed as past variables

In [None]:
## Load the time series into the data structure, enriching it with the 'hour' categorical column.
## The covariates are passed as past variables only when use_covariates is True.
ts = TimeSeries('weather')
signal_options = dict(
    enrich_cat=['hour'],
    target_variables=['y'],
    past_variables=columns if use_covariates else [],
)
if model_to_use=='attention':
    ## attention also uses y in the future, relying on a masking mechanism
    signal_options['future_variables'] = ['y']
ts.load_signal(data,**signal_options)

In [None]:
## inspect the past variables stored in the time series object after loading
ts.past_variables


In [None]:
## plot the time series object and keep the value returned by ts.plot() in `fig`
fig = ts.plot()

In [None]:
#Let us now prepare a model predicting the next 16 steps using the past 16 steps
past_steps = 16
future_steps = 16


In [None]:
#RNN
## Build the RNN only when it is the selected model: the three model cells all assign
## `config` and `model_sum`, so without this guard a "Run All" always ends up with the
## model defined in the last cell, whatever `model_to_use` says.
if model_to_use == 'rnn':
    config = dict(
        model_configs = dict(
            cat_emb_dim = 16,
            hidden_RNN = 256,
            num_layers_RNN = 2,
            sum_emb = True,                                        #not influent here, there is only 'hour' as categorical variable
            kind = 'lstm',
            kernel_size_encoder = 7,
            past_steps = past_steps,
            future_steps = future_steps,
            past_channels = len(ts.num_var),                       #parameter that depends on the ts dataset
            future_channels = len(ts.future_variables),            #parameter that depends on the ts dataset
            embs = [ts.dataset[c].nunique() for c in ts.cat_var],  #parameter that depends on the ts dataset
            quantiles = [0.1,0.5,0.9],                             #use quantile loss
            out_channels = len(ts.target_variables),               #parameter that depends on the ts dataset
        ),
        scheduler_config = dict(gamma=0.1,step_size=100),
        optim_config = dict(lr = 0.0005,weight_decay=0.01),
    )
    model_sum = RNN(**config['model_configs'],optim_config = config['optim_config'],scheduler_config =config['scheduler_config'] )


In [None]:

#Attention
## Build the attention model only when it is the selected model: the three model cells all
## assign `config` and `model_sum`, so without this guard a "Run All" always ends up with the
## model defined in the last cell, whatever `model_to_use` says.
if model_to_use == 'attention':
    config = dict(
        model_configs = dict(
            past_channels = len(ts.num_var),                       #parameter that depends on the ts dataset
            future_channels = len(ts.future_variables),            #parameter that depends on the ts dataset
            d_model = 128,
            cat_emb_dim = 16,
            num_heads = 8,
            dropout = 0.5,
            n_layer_encoder = 6,
            n_layer_decoder = 3,
            past_steps = past_steps,
            future_steps = future_steps,
            embs = [ts.dataset[c].nunique() for c in ts.cat_var],  #parameter that depends on the ts dataset
            quantiles = [0.1,0.5,0.9],
            out_channels = len(ts.target_variables),               #parameter that depends on the ts dataset
        ),
        scheduler_config = dict(gamma=0.1,step_size=100),
        optim_config = dict(lr = 0.0005,weight_decay=0.2),
    )
    model_sum = Attention(**config['model_configs'],optim_config = config['optim_config'],scheduler_config =config['scheduler_config'] )


In [None]:


#Linear
## Build the linear model only when it is the selected model: the three model cells all
## assign `config` and `model_sum`, so without this guard a "Run All" always ends up with the
## model defined in the last cell, whatever `model_to_use` says.
if model_to_use == 'linear':
    config = dict(
        model_configs = dict(
            past_steps = past_steps,
            future_steps = future_steps,
            past_channels = len(ts.num_var),                       #parameter that depends on the ts dataset
            future_channels = len(ts.future_variables),            #parameter that depends on the ts dataset
            embs = [ts.dataset[c].nunique() for c in ts.cat_var],  #parameter that depends on the ts dataset
            cat_emb_dim = 8,
            kernel_size_encoder = 7,
            sum_emb = True,
            out_channels = len(ts.target_variables),               #parameter that depends on the ts dataset
            hidden_size = 256,
            kind = 'nlinear',
            ## Same quantiles as the other two models: the evaluation cells below read
            ## y_median, y_low and y_high, which an empty quantile list cannot meaningfully provide.
            ## NOTE(review): presumably those columns are produced only when quantiles are requested — confirm.
            quantiles = [0.1,0.5,0.9],
        ),
        scheduler_config = dict(gamma=0.1,step_size=24),
        optim_config = dict(lr = 0.0005,weight_decay=0.01),
    )
    model_sum = LinearTS(**config['model_configs'],optim_config = config['optim_config'],scheduler_config =config['scheduler_config'] )


In [None]:
#set the desired model, passing along the configuration it was built with
ts.set_model(model_sum,config=config )

In [None]:
## Parameters driving the train / validation / test split and the sample construction
split_params = dict(
    perc_train = 0.7,                                   ## if the percentages are not None the split is 70% 10% 20%
    perc_valid = 0.1,
    range_train = None,                                 ## alternatively split using ranges, for example range_train=['2021-02-03','2022-04-08']
    range_validation = None,
    range_test = None,
    past_steps = past_steps,
    future_steps = future_steps,
    shift = 1 if model_to_use=='attention' else 0,      ## shift in the dataset, usually needed for attention models
    starting_point = None,                              ## do not skip samples
    skip_step = 1,                                      ## distance between two consecutive samples
)

In [None]:
#train the model for 50 epochs with auto_lr_find
#NOTE(review): dirpath is a hard-coded absolute path on the original author's machine
#(one sub-folder per model_to_use); adapt it to your environment before running.
ts.train_model(dirpath=f"/home/agobbi/Projects/ExpTS/tmp/{model_to_use}",
               split_params=split_params,
               batch_size=128,
               num_workers=4,
               max_epochs=50,
               auto_lr_find=True)

In [None]:
#Plot the losses stored in ts.losses, to check for overfitting
ts.losses.plot()

In [None]:
#make inferences on the test set (set='test'), with rescaling enabled
#NOTE(review): 200 and 4 are presumably the batch size and the number of workers — confirm against inference_on_set
res = ts.inference_on_set(200,4,set='test',rescaling=True)

In [None]:
## MSE between the real y and the median prediction (y_median), computed for each lag.
## numpy is already imported as np in the first cell, so it is not re-imported here.
res.groupby('lag').apply(lambda x: np.nanmean((x.y-x.y_median)**2)).reset_index().rename(columns={0:'error'})

In [None]:
#save model 
ts.save(f"{model_to_use}_test")

In [None]:
## load the model and check if we obtain the same result

In [None]:
## Reload the saved model and recompute the per-lag error on the test set, to check that
## the result matches the one obtained before saving.
## The class passed to ts.load must be the one selected by model_to_use (it used to be
## hard-coded to LinearTS, which is wrong for the 'rnn' and 'attention' options).
model_classes = {'linear': LinearTS, 'rnn': RNN, 'attention': Attention}
ts.load(model_classes[model_to_use],f"{model_to_use}_test",load_last=False)
res = ts.inference_on_set(200,4,set='test',rescaling=True)
## MSE between the real y and the median prediction, for each lag
error = res.groupby('lag').apply(lambda x: np.nanmean((x.y-x.y_median)**2)).reset_index().rename(columns={0:'error'})
error

In [None]:
##plot the MSE of the median prediction as a function of the lag
plt.plot(error.lag,error.error)

In [None]:
%matplotlib inline
lag = 7
try:
    %matplotlib qast
    to_plot = res
except:
    print('better to have qt, i will reduce the dataset')
    plt.figure(figsize=(15,7))
    to_plot = res[res.time>pd.to_datetime('2020-12-28')]
plt.plot(to_plot[to_plot.lag==lag].time, to_plot[to_plot.lag==lag].y,label='real',alpha=0.5)
plt.plot(to_plot[to_plot.lag==lag].time, to_plot[to_plot.lag==lag].y_median,label='median',alpha=0.5)
plt.fill_between(to_plot[to_plot.lag==lag].time, to_plot[to_plot.lag==lag].y_low , to_plot[to_plot.lag==lag].y_high, alpha=0.2,label='error band')

plt.title('Prediction on test for lag=7')
plt.legend()

In [None]:
##check for persistence effects
## For every prediction, go back `lag` steps of 10 minutes to get the time of the last
## observed sample. The subtraction is vectorized instead of building one timedelta per row.
## NOTE(review): the 10-minute step is hard-coded — presumably the sampling step of the weather dataset, confirm.
res['time_lagged'] = res.time - pd.to_timedelta(10*res.lag, unit='min')
## attach to each row the real value observed at time_lagged, as column 'last'
res_pers = pd.merge(res,res[['time','y']].rename(columns={'time': 'time_lagged', 'y':'last'}).drop_duplicates(),on='time_lagged')
## Mean squared difference, for each lag, between the last observed value and the median prediction.
## NOTE(review): this measures how close the model is to a persistence forecast; if the error of the
## persistence forecast itself was intended, the difference should be x.y - x['last'] — confirm.
error_persistent = res_pers.groupby('lag').apply(lambda x: np.nanmean((x['last']-x.y_median)**2)).reset_index().rename(columns={0:'error'})

In [None]:
## compare, lag by lag, the error of the model with the persistence curve computed above
plt.plot(error.lag,error.error,label='model')
plt.plot(error_persistent.lag,error_persistent.error,label = 'persistent')
plt.legend()