In [1]:
import pandas as pd
import numpy as np
from dsipts import TimeSeries, RNN,read_public_dataset, LinearTS, Persistent, TFT
import matplotlib.pyplot as plt
from datetime import timedelta
import logging
import sys
import random

# Send every log record both to a file on disk and to the notebook's stdout.
file_handler = logging.FileHandler('tmp.log')
stdout_handler = logging.StreamHandler(sys.stdout)
handlers = [file_handler, stdout_handler]

# Root logger: INFO and above, each record tagged with timestamp and call site.
logging.basicConfig(
    handlers=handlers,
    format='[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
    level=logging.INFO,
)


In [2]:
def random_walk(n):
    """Simulate a +/-1 random walk of ``n`` points starting at 0.

    One uniform number is drawn from the stdlib ``random`` generator per
    step; a draw below 0.5 moves the walk up by one, any other draw moves
    it down by one.

    Parameters
    ----------
    n : int
        Number of points in the walk (``n - 1`` steps are drawn).

    Returns
    -------
    tot : numpy.ndarray
        Float array of length ``n`` holding the walk positions; ``tot[0]`` is 0.
    probs : numpy.ndarray
        Float array of length ``n``; ``probs[k]`` is the draw that produced
        the step into position ``k`` and ``probs[0]`` is 0.
    """
    # Draw all steps up front, in the same order a step-by-step loop would.
    draws = [random.random() for _ in range(n - 1)]
    steps = np.where(np.asarray(draws) < 0.5, 1, -1)

    tot = np.zeros(n)
    probs = np.zeros(n)
    # Position k is the running sum of the first k steps.
    tot[1:] = np.cumsum(steps)
    probs[1:] = draws
    return tot, probs
N = 20000
random.seed(6)  # seed the stdlib generator so the walk is reproducible
x, p = random_walk(N)

# Assemble the input frame: the walk scaled by its maximum, the per-step draws,
# and an integer time index.
# NOTE(review): x / x.max() assumes the walk reaches a positive maximum - confirm
# before changing the seed.
data = pd.DataFrame(
    {
        'y': x / x.max(),
        'p': p,
        'time': range(N),
    }
)

In [3]:
# Load the synthetic series into the TimeSeries container: 'y' is the target,
# 'p' is passed as a future variable, and no categorical enrichment is requested.
ts = TimeSeries('weather')
ts.load_signal(
    data,
    enrich_cat=[],
    target_variables=['y'],
    past_variables=[],
    future_variables=['p'],
)
ts

[2023-10-13 15:37:15,009] {utils.py:24} INFO - 

[2023-10-13 15:37:15,010] {utils.py:25} INFO - ######################################################################################################################################################
[2023-10-13 15:37:15,011] {utils.py:26} INFO -                                                        I will drop duplicates, I dont like them                                                       
[2023-10-13 15:37:15,012] {utils.py:27} INFO - ######################################################################################################################################################
[2023-10-13 15:37:15,016] {utils.py:29} INFO -                I will update past column adding all target columns, if you want to avoid this beahviour please use check_pass as false                


Timeseries named weather of length 20000.
 Categorical variable: [],
 Future variables: ['p'],
 Past variables: ['y'],
 Target variables: ['y']
 With no group

In [4]:
# Window sizes: the model predicts the next 64 steps from the past 64 steps.
past_steps = future_steps = 64


In [5]:
# Show which columns were registered as future variables by load_signal.
ts.future_variables

['p']

In [6]:


# Hyper-parameters for the model, the LR scheduler and the optimizer.
config = {
    'model_configs': {
        # Window lengths and channel counts come from the loaded TimeSeries.
        'past_steps': past_steps,
        'future_steps': future_steps,
        'past_channels': len(ts.past_variables),
        'future_channels': len(ts.future_variables),
        # Number of distinct values of each categorical variable.
        'embs': [ts.dataset[col].nunique() for col in ts.cat_var],
        'd_model': 128,
        'd_head': 32,
        'n_head': 8,
        'dropout_rate': 0.01,
        'num_layers_RNN': 8,
        'optim': 'torch.optim.SGD',
        'out_channels': len(ts.target_variables),
        # No quantiles requested (presumably a point forecast - confirm).
        'quantiles': [],
        'loss_type': 'exponential_penalization',
        'persistence_weight': 0,
    },
    # NOTE(review): step_size is so large that the scheduler presumably never
    # steps during training - confirm this is intentional.
    'scheduler_config': {'gamma': 0.1, 'step_size': 2400000000000},
    'optim_config': {'lr': 0.0005, 'weight_decay': 0.0},
}

# NOTE(review): the variable is called model_linear but holds a TFT instance;
# the name is kept because the set_model cell refers to it.
model_linear = TFT(
    **config['model_configs'],
    optim_config=config['optim_config'],
    scheduler_config=config['scheduler_config'],
    verbose=False,
)


In [7]:
# Attach the desired model and its configuration to the TimeSeries object.
ts.set_model(model_linear,config=config )

[2023-10-13 15:37:16,802] {utils.py:17} INFO - 

[2023-10-13 15:37:16,802] {utils.py:18} INFO - ######################################################################################################################################################
[2023-10-13 15:37:16,804] {utils.py:19} INFO - ####################################                                                                              ####################################
[2023-10-13 15:37:16,804] {utils.py:20} INFO -                                                                   Setting the model                                                                   
[2023-10-13 15:37:16,805] {utils.py:21} INFO - ####################################                                                                              ####################################
[2023-10-13 15:37:16,806] {utils.py:22} INFO - ################################################################################################################

In [8]:
# Parameters for the train/validation/test split and for window sampling.
split_params = dict(
    # 60% train, 20% validation (the remaining 20% is presumably test - confirm).
    perc_train=0.6,
    perc_valid=0.2,
    # Alternative to percentages: explicit ranges,
    # e.g. range_train=['2021-02-03', '2022-04-08'].
    range_train=None,
    range_validation=None,
    range_test=None,
    past_steps=past_steps,
    future_steps=future_steps,
    shift=0,
    starting_point=None,  # do not skip samples
    skip_step=1,          # distance between two consecutive samples
)

In [None]:
# Train the model for at most 30 epochs with auto_lr_find enabled.
# NOTE(review): in the saved output the LR finder stopped early on a diverging
# loss and set the learning rate to ~7.6e-08 - check that this is acceptable.
# NOTE(review): dirpath is an absolute, machine-specific path.
ts.train_model(
    dirpath="/home/agobbi/Projects/ExpTS/rf/tft",
    split_params=split_params,
    batch_size=32,
    num_workers=2,
    max_epochs=30,
    auto_lr_find=True,
)

[2023-10-13 15:37:19,036] {utils.py:17} INFO - 

[2023-10-13 15:37:19,037] {utils.py:18} INFO - ######################################################################################################################################################
[2023-10-13 15:37:19,038] {utils.py:19} INFO - ####################################                                                                              ####################################
[2023-10-13 15:37:19,038] {utils.py:20} INFO -                                                                   Training the model                                                                  
[2023-10-13 15:37:19,039] {utils.py:21} INFO - ####################################                                                                              ####################################
[2023-10-13 15:37:19,039] {utils.py:22} INFO - ################################################################################################################

Global seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

LR finder stopped early after 98 steps due to diverging loss.
Learning rate set to 7.585775750291837e-08
Restoring states from the checkpoint path at /home/agobbi/Projects/ExpTS/rf/tft/.lr_find_11bbc504-b140-4a27-9585-496ac5731ac6.ckpt
Restored all states from the checkpoint file at /home/agobbi/Projects/ExpTS/rf/tft/.lr_find_11bbc504-b140-4a27-9585-496ac5731ac6.ckpt

   | Name                   | Type                    | Params
--------------------------------------------------------------------
0  | target_linear          | Linear                  | 256   
1  | linear_aux_past        | ModuleList              | 0     
2  | linear_aux_fut         | ModuleList              | 256   
3  | emb_cat_var            | embedding_cat_variables | 25.0 K
4  | rnn                    | LSTM_Model              | 1.1 M 
5  | res_conn1_past         | ResidualConnection      | 33.0 K
6  | res_conn1_fut          | ResidualConnection      | 33.0 K
7  | grn1_past              | GRN                     | 

In [157]:
# Plot the losses to check for overfitting.
# NOTE(review): the saved output of this cell is
# "AttributeError: 'TimeSeries' object has no attribute 'losses'"; presumably
# training had not completed on this `ts` object - re-run top to bottom and confirm.
ts.losses.plot()

AttributeError: 'TimeSeries' object has no attribute 'losses'

In [158]:
# Run inference on the test set with rescaling enabled.
# NOTE(review): the positional arguments (200, 4) are presumably the batch size
# and the number of workers - confirm against inference_on_set's signature.
# NOTE(review): the saved output of this cell is an AttributeError about a
# missing 'split_params' attribute; presumably `ts` had not been trained - confirm.
res = ts.inference_on_set(200,4,set='test',rescaling=True)

[2023-10-13 15:36:38,178] {utils.py:17} INFO - 

[2023-10-13 15:36:38,179] {utils.py:18} INFO - ######################################################################################################################################################
[2023-10-13 15:36:38,180] {utils.py:19} INFO - ######################                                                                                                          ######################
[2023-10-13 15:36:38,181] {utils.py:20} INFO -                                                     Inference on a set (train, validation o test)                                                     
[2023-10-13 15:36:38,181] {utils.py:21} INFO - ######################                                                                                                          ######################
[2023-10-13 15:36:38,181] {utils.py:22} INFO - ################################################################################################################

AttributeError: 'TimeSeries' object has no attribute 'split_params'

In [146]:
%matplotlib qt
lag = 15

plt.plot(res[res.lag==lag].time, res[res.lag==lag].y,label='real',alpha=0.5)
plt.plot(res[res.lag==lag].time, res[res.lag==lag].y_pred,label='pred',alpha=0.5)

plt.title('Prediction on test for lag=7')
plt.legend()

<matplotlib.legend.Legend at 0x7fe8bc4e8970>

In [147]:
# prediction_time = target time minus lag, truncated to an integer.
# Column arithmetic gives the same values as a row-wise apply(..., axis=1)
# without a Python call per row.
res['prediction_time'] = (res['time'] - res['lag']).astype(int)

In [148]:
# Display the inference results, now including the prediction_time column.
res

Unnamed: 0,lag,time,y,y_pred,prediction_time
0,1,16064,0.377953,0.399360,16063
1,1,16065,0.370079,0.389656,16064
2,1,16066,0.377953,0.395594,16065
3,1,16067,0.385827,0.403635,16066
4,1,16068,0.377953,0.401778,16067
...,...,...,...,...,...
247803,64,19994,0.771654,0.673848,19930
247804,64,19995,0.763780,0.676096,19931
247805,64,19996,0.755906,0.673301,19932
247806,64,19997,0.748031,0.675759,19933


In [150]:
%matplotlib qt
date = 19932

plt.plot(res[res.prediction_time==date].time, res[res.prediction_time==date].y,label='real',alpha=0.5)
plt.plot(res[res.prediction_time==date].time, res[res.prediction_time==date].y_pred,label='pred',alpha=0.5)
#plt.ylim(res.y.min(),res.y.max())
plt.title('Prediction on test for lag=7')
plt.legend()

<matplotlib.legend.Legend at 0x7fe87c310670>

In [None]:
## Mean squared error of the forecast for each lag
# Use 'y_median' when the result has it, otherwise fall back to 'y_pred'
# (the column present in the result displayed earlier in this notebook).
pred_col = 'y_median' if 'y_median' in res.columns else 'y_pred'
(
    res.assign(error=(res['y'] - res[pred_col]) ** 2)
       .groupby('lag', as_index=False)['error']
       .mean()  # skips NaN, matching the NaN-aware mean
)

In [None]:
# Save the model.
# NOTE(review): `model_to_use` is not assigned in any cell visible in this
# notebook; unless it is defined elsewhere this raises NameError on a fresh
# kernel - confirm.
ts.save(f"{model_to_use}_test")

In [None]:
## load the model and check if we obtain the same result

In [None]:
# Reload the saved model, rerun inference on the test set, and recompute the
# per-lag mean squared error.
# NOTE(review): `model_to_use` is not assigned in any visible cell, and the model
# built above is a TFT while LinearTS is passed here - confirm both.
ts.load(LinearTS,f"{model_to_use}_test",load_last=False)
res = ts.inference_on_set(200,4,set='test',rescaling=True)

# Use 'y_median' when the result has it, otherwise fall back to 'y_pred'.
pred_col = 'y_median' if 'y_median' in res.columns else 'y_pred'
error = (
    res.assign(error=(res['y'] - res[pred_col]) ** 2)
       .groupby('lag', as_index=False)['error']
       .mean()  # skips NaN, matching the NaN-aware mean
)
error

In [None]:
## Plot the MSE as a function of the lag
plt.plot(error['lag'], error['error'])
# Label the figure so it can be read on its own.
plt.xlabel('lag')
plt.ylabel('MSE')
plt.title('MSE on test by lag')

In [None]:
%matplotlib inline
lag = 7
try:
    %matplotlib qast
    to_plot = res
except:
    print('better to have qt, i will reduce the dataset')
    plt.figure(figsize=(15,7))
    to_plot = res[res.time>pd.to_datetime('2020-12-28')]
plt.plot(to_plot[to_plot.lag==lag].time, to_plot[to_plot.lag==lag].y,label='real',alpha=0.5)
plt.plot(to_plot[to_plot.lag==lag].time, to_plot[to_plot.lag==lag].y_median,label='median',alpha=0.5)
plt.fill_between(to_plot[to_plot.lag==lag].time, to_plot[to_plot.lag==lag].y_low , to_plot[to_plot.lag==lag].y_high, alpha=0.2,label='error band')

plt.title('Prediction on test for lag=7')
plt.legend()

In [None]:
# Read the predictions CSV and convert its 'time' column to datetimes.
# NOTE(review): the path is absolute and machine-specific.
tot = pd.read_csv(
    '/home/agobbi/Projects/ExpTS/csv/prova_test_tot_predictions.csv'
).assign(time=lambda frame: pd.to_datetime(frame['time']))

In [None]:
# Keep only the rows whose `model` column is 'persistent_weather_1'.
pers = tot.loc[tot['model'] == 'persistent_weather_1']

In [None]:
%matplotlib inline
lag = 7
try:
    %matplotlib qast
    to_plot = pers
except:
    print('better to have qt, i will reduce the dataset')
    plt.figure(figsize=(15,7))
    to_plot = pers[pers.time>pd.to_datetime('2020-12-28')]
plt.plot(to_plot[to_plot.lag==lag].time, to_plot[to_plot.lag==lag].y,label='real',alpha=0.5)
plt.plot(to_plot[to_plot.lag==lag].time, to_plot[to_plot.lag==lag].y_pred,label='median',alpha=0.5)
plt.fill_between(to_plot[to_plot.lag==lag].time, to_plot[to_plot.lag==lag].y_low , to_plot[to_plot.lag==lag].y_high, alpha=0.2,label='error band')

plt.title('Prediction on test for lag=7')
plt.legend()