In [1]:
import pandas as pd
import numpy as np
from dsipts import TimeSeries, RNN,read_public_dataset, LinearTS, Persistent, TFT
import matplotlib.pyplot as plt
from datetime import timedelta
import logging
import sys
import random

file_handler = logging.FileHandler(filename='tmp.log')
stdout_handler = logging.StreamHandler(stream=sys.stdout)
handlers = [file_handler, stdout_handler]

logging.basicConfig(
    level=logging.INFO, 
    format='[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
    handlers=handlers
)


In [244]:
def random_walk(n):
    tot = np.zeros(n)
    probs = np.zeros(n)
    for i in range(n-1):
        prob = random.random()
        if prob<0.5:
            delta = 1
        else:
            delta = -1
        tot[i+1] = tot[i] + delta
        probs[i+1]= prob
    return tot, probs

In [254]:
data

Unnamed: 0,y,p,time
0,0.0,0.000000,0
1,-1.0,0.793340,1
2,-2.0,0.821954,2
3,-1.0,0.485035,3
4,0.0,0.261621,4
...,...,...,...
19995,97.0,0.785012,19995
19996,96.0,0.908452,19996
19997,95.0,0.918215,19997
19998,96.0,0.488358,19998


In [378]:
N = 20000
random.seed(6)
x, p = random_walk(N)
data = pd.DataFrame({'y':x/x.max(),'p':p,'time':range(N)})

In [379]:
plt.plot(x)

[<matplotlib.lines.Line2D at 0x7fab1c304c10>]

In [380]:
##load the timeseries to the datastructure, adding the hour column and use all the covariates
ts = TimeSeries('weather')
ts.load_signal(data,enrich_cat=[],target_variables=['y'],past_variables= [],future_variables=['p'])
ts

[2023-10-13 11:15:30,423] {utils.py:24} INFO - 

[2023-10-13 11:15:30,424] {utils.py:25} INFO - ######################################################################################################################################################
[2023-10-13 11:15:30,425] {utils.py:26} INFO -                                                        I will drop duplicates, I dont like them                                                       
[2023-10-13 11:15:30,426] {utils.py:27} INFO - ######################################################################################################################################################
[2023-10-13 11:15:30,429] {utils.py:29} INFO -                I will update past column adding all target columns, if you want to avoid this beahviour please use check_pass as false                


Timeseries named weather of length 20000.
 Categorical variable: [],
 Future variables: ['p'],
 Past variables: ['y'],
 Target variables: ['y']
 With no group

In [381]:
#Let now prepare a model predictin the next 16 step using the past 16 steps 
past_steps = 64
future_steps = 64


In [382]:
ts.past_variables

['y']

In [451]:


config = dict(model_configs =dict(
                                    past_steps = past_steps,
                                    future_steps = future_steps,
                                    past_channels = len(ts.past_variables),
                                    future_channels = len(ts.future_variables),
                                    embs = [ts.dataset[c].nunique() for c in ts.cat_var],
                                    cat_emb_dim = 8,
                                    kernel_size = 3,
                                     use_bn = False,
                                    dropout_rate=0.0,
                                      optim='torch.optim.Adam',
                                      activation= 'torch.nn.PReLU',
                                     sum_emb = True,
                                     out_channels = len(ts.target_variables),
                                    hidden_size = 16,
                                    kind='linear',
                                    quantiles= [],
                                    persistence_weight = 1,
                                    simple=False,loss_type='high_order'
                                    ),
                scheduler_config = dict(gamma=0.1,step_size=24000000000000000),
                optim_config = dict(lr = 0.0005,weight_decay=0.00))
model_linear = LinearTS(**config['model_configs'],optim_config = config['optim_config'],scheduler_config =config['scheduler_config'],verbose=False )


In [452]:
#set the desirere model
ts.set_model(model_linear,config=config )

[2023-10-13 12:22:43,153] {utils.py:17} INFO - 

[2023-10-13 12:22:43,154] {utils.py:18} INFO - ######################################################################################################################################################
[2023-10-13 12:22:43,155] {utils.py:19} INFO - ####################################                                                                              ####################################
[2023-10-13 12:22:43,155] {utils.py:20} INFO -                                                                   Setting the model                                                                   
[2023-10-13 12:22:43,156] {utils.py:21} INFO - ####################################                                                                              ####################################
[2023-10-13 12:22:43,157] {utils.py:22} INFO - ################################################################################################################

In [453]:
##splitting parameters
split_params = {'perc_train':0.6,'perc_valid':0.2,                             ##if not None it will split 70% 10% 20%
               'range_train':None, 'range_validation':None, 'range_test':None, ## or we can split using ranges for example range_train=['2021-02-03','2022-04-08']
               'past_steps':past_steps,
               'future_steps':future_steps,
               'shift':0,
               'starting_point':None,                                          ## do not skip samples
               'skip_step' : 1                                                 ## distance between two consecutive samples
                             }

In [None]:
#train the model for 50 epochs with auto_lr_find 
ts.train_model(dirpath=f"/home/agobbi/Projects/ExpTS/rf/linear",
               split_params=split_params,
               batch_size=32,
               num_workers=2,
               max_epochs=100,
               auto_lr_find=True)

[2023-10-13 12:22:44,776] {utils.py:17} INFO - 

[2023-10-13 12:22:44,777] {utils.py:18} INFO - ######################################################################################################################################################
[2023-10-13 12:22:44,778] {utils.py:19} INFO - ####################################                                                                              ####################################
[2023-10-13 12:22:44,779] {utils.py:20} INFO -                                                                   Training the model                                                                  
[2023-10-13 12:22:44,780] {utils.py:21} INFO - ####################################                                                                              ####################################
[2023-10-13 12:22:44,781] {utils.py:22} INFO - ################################################################################################################

Global seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

LR finder stopped early after 99 steps due to diverging loss.
Learning rate set to 1.9054607179632475e-07
Restoring states from the checkpoint path at /home/agobbi/Projects/ExpTS/rf/linear/.lr_find_497103a2-b7d9-4c33-a3b4-9d8d17f16619.ckpt
Restored all states from the checkpoint file at /home/agobbi/Projects/ExpTS/rf/linear/.lr_find_497103a2-b7d9-4c33-a3b4-9d8d17f16619.ckpt

  | Name   | Type       | Params
--------------------------------------
0 | embs   | ModuleList | 0     
1 | loss   | L1Loss     | 0     
2 | linear | ModuleList | 2.4 K 
--------------------------------------
2.4 K     Trainable params
0         Non-trainable params
2.4 K     Total params
0.010     Total estimated model params size (MB)
  rank_zero_warn(
Epoch 0, global step 371: 'val_loss' reached 1.30809 (best 1.30809), saving model to '/home/agobbi/Projects/ExpTS/rf/linear/checkpoint-v43.ckpt' as top 1
Epoch 1, global step 742: 'val_loss' reached 1.19119 (best 1.19119), saving model to '/home/agobbi/Projects/Ex

In [446]:
#Print the losses, check overfitting
ts.losses.plot()

<Axes: >

In [447]:
#make inferences on 
res = ts.inference_on_set(200,4,set='test',rescaling=True)

[2023-10-13 12:22:02,763] {utils.py:17} INFO - 

[2023-10-13 12:22:02,768] {utils.py:18} INFO - ######################################################################################################################################################
[2023-10-13 12:22:02,769] {utils.py:19} INFO - ######################                                                                                                          ######################
[2023-10-13 12:22:02,770] {utils.py:20} INFO -                                                     Inference on a set (train, validation o test)                                                     
[2023-10-13 12:22:02,772] {utils.py:21} INFO - ######################                                                                                                          ######################
[2023-10-13 12:22:02,774] {utils.py:22} INFO - ################################################################################################################

In [448]:
%matplotlib qt
lag = 10

plt.plot(res[res.lag==lag].time, res[res.lag==lag].y,label='real',alpha=0.5)
plt.plot(res[res.lag==lag].time, res[res.lag==lag].y_pred,label='pred',alpha=0.5)

plt.title('Prediction on test for lag=7')
plt.legend()

<matplotlib.legend.Legend at 0x7fab15dc4580>

In [439]:
res['prediction_time'] = res.apply(lambda x: int(x.time-x.lag), axis=1)

In [449]:
error = res.groupby('lag').apply(lambda x: np.sqrt(np.mean((x.y_pred-x.y)**2))).reset_index()
plt.plot(error.lag,error[0])

[<matplotlib.lines.Line2D at 0x7fab15b03b50>]

In [450]:
error[0].mean() 

0.026814144

In [429]:
##0 -- 0.026814144
##2 -- 0.026652765
##5 -- 0.026685152

0.026652765

In [341]:
error

Unnamed: 0,lag,0
0,1,2.188419
1,2,2.188246
2,3,2.109994
3,4,2.150867
4,5,2.118228
...,...,...
59,60,4.406476
60,61,4.446941
61,62,4.653346
62,63,4.655902


In [369]:
%matplotlib qt
date = 16066

plt.plot(res[res.prediction_time==date].time, res[res.prediction_time==date].y,label='real',alpha=0.5)
plt.plot(res[res.prediction_time==date].time, res[res.prediction_time==date].y_pred,label='pred',alpha=0.5)
#plt.ylim(res.y.min(),res.y.max())
plt.title('Prediction on test for lag=7')
plt.legend()

<matplotlib.legend.Legend at 0x7fab1edc9210>

In [74]:
## get the median MSE for each lag
import numpy as np
res.groupby('lag').apply(lambda x: np.nanmean((x.y-x.y_median)**2)).reset_index().rename(columns={0:'error'}) 

AttributeError: 'DataFrame' object has no attribute 'y_median'

In [None]:
#save model 
ts.save(f"{model_to_use}_test")

In [None]:
## load the model and check if we obtain the same result

In [None]:
ts.load(LinearTS,f"{model_to_use}_test",load_last=False)
res = ts.inference_on_set(200,4,set='test',rescaling=True)
error = res.groupby('lag').apply(lambda x: np.nanmean((x.y-x.y_median)**2)).reset_index().rename(columns={0:'error'}) 
error

In [None]:
##print the mean MSE along the lag steps
plt.plot(error.lag,error.error)

In [None]:
%matplotlib inline
lag = 7
try:
    %matplotlib qast
    to_plot = res
except:
    print('better to have qt, i will reduce the dataset')
    plt.figure(figsize=(15,7))
    to_plot = res[res.time>pd.to_datetime('2020-12-28')]
plt.plot(to_plot[to_plot.lag==lag].time, to_plot[to_plot.lag==lag].y,label='real',alpha=0.5)
plt.plot(to_plot[to_plot.lag==lag].time, to_plot[to_plot.lag==lag].y_median,label='median',alpha=0.5)
plt.fill_between(to_plot[to_plot.lag==lag].time, to_plot[to_plot.lag==lag].y_low , to_plot[to_plot.lag==lag].y_high, alpha=0.2,label='error band')

plt.title('Prediction on test for lag=7')
plt.legend()

In [None]:
tot = pd.read_csv('/home/agobbi/Projects/ExpTS/csv/prova_test_tot_predictions.csv')
tot.time = pd.to_datetime(tot.time)

In [None]:
pers = tot[(tot.model=='persistent_weather_1')]

In [None]:
%matplotlib inline
lag = 7
try:
    %matplotlib qast
    to_plot = pers
except:
    print('better to have qt, i will reduce the dataset')
    plt.figure(figsize=(15,7))
    to_plot = pers[pers.time>pd.to_datetime('2020-12-28')]
plt.plot(to_plot[to_plot.lag==lag].time, to_plot[to_plot.lag==lag].y,label='real',alpha=0.5)
plt.plot(to_plot[to_plot.lag==lag].time, to_plot[to_plot.lag==lag].y_pred,label='median',alpha=0.5)
plt.fill_between(to_plot[to_plot.lag==lag].time, to_plot[to_plot.lag==lag].y_low , to_plot[to_plot.lag==lag].y_high, alpha=0.2,label='error band')

plt.title('Prediction on test for lag=7')
plt.legend()