## Leveraging a Temporal AutoEncoder for forecasting


In this notebook we will use a temporal autoencoder (a modified `CustomTransformerDecoder` model), first to create representations of temporal data and then to forecast time series based on those representations. We will train the model primarily on data from the `Virgin River`, which is a good example of anomalous event forecasting. We will then reuse the trained weights for forecasting: the pre-trained layers are frozen and new output layers are trained on top of them.

In [None]:
# Colab-only setup: authenticate with Google and fetch the flow-forecast sources.
from google.colab import auth
from datetime import datetime
import os
# Interactive Google sign-in; later cells copy data from gs:// buckets with gsutil.
auth.authenticate_user()
# Shell escape: clone the repository into ./flow-forecast.
!git clone https://github.com/AIStream-Peelout/flow-forecast.git

Cloning into 'flow-forecast'...
remote: Enumerating objects: 17363, done.[K
remote: Counting objects: 100% (4220/4220), done.[K
remote: Compressing objects: 100% (1362/1362), done.[K
remote: Total 17363 (delta 2986), reused 4023 (delta 2831), pack-reused 13143[K
Receiving objects: 100% (17363/17363), 4.54 MiB | 6.27 MiB/s, done.
Resolving deltas: 100% (12537/12537), done.


In [None]:
import os
os.chdir('flow-forecast')
!pip install shortuuid==1.0.1
!pip install -r  requirements.txt
!python setup.py develop
!mkdir data

Collecting shortuuid==1.0.1
  Downloading shortuuid-1.0.1-py3-none-any.whl (7.5 kB)
Installing collected packages: shortuuid
Successfully installed shortuuid-1.0.1
Collecting shap==0.40.0
  Downloading shap-0.40.0-cp37-cp37m-manylinux2010_x86_64.whl (564 kB)
[K     |████████████████████████████████| 564 kB 14.1 MB/s 
[?25hCollecting scikit-learn==0.24.2
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 1.5 MB/s 
Collecting tb-nightly
  Downloading tb_nightly-2.9.0a20220213-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 56.2 MB/s 
Collecting wandb==0.12.10
  Downloading wandb-0.12.10-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 20.3 MB/s 
[?25hCollecting google-cloud
  Downloading google_cloud-0.34.0-py2.py3-none-any.whl (1.8 kB)
Collecting plotly~=5.6.0
  Downloading plotly-5.6.0-py2.py3-none-any.whl (27.7 MB)
[K     |█████████████████████

In [None]:
# Set the three environment variables in a single call
# (presumably read by flow-forecast's Google Cloud helpers — confirm).
os.environ.update(
    {
        "MODEL_BUCKET": "coronaviruspublicdata",
        "ENVIRONMENT_GCP": "Colab",
        "GCP_PROJECT": "gmap-997",
    }
)

In [None]:
!mkdir joined_final_3
import pandas as pd
!gsutil cp gs://aistream-datasets/flowdb/01010500FVE_flow.csv .
!gsutil cp gs://aistream-datasets/flowdb/09405500AZC_flow.csv .
df = pd.read_csv("09405500AZC_flow.csv")
df = df.dropna(subset=["hour_updated", "cfs", "p01m", "tmpf", "dwpf"])
df.to_csv("joined_final_3/09405500AZC_flow.csv")

Copying gs://aistream-datasets/flowdb/01010500FVE_flow.csv...
- [1 files][ 26.5 MiB/ 26.5 MiB]                                                
Operation completed over 1 objects/26.5 MiB.                                     
Copying gs://aistream-datasets/flowdb/09405500AZC_flow.csv...
- [1 files][ 19.6 MiB/ 19.6 MiB]                                                
Operation completed over 1 objects/19.6 MiB.                                     


  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
 def make_config_file():
     """Build the flow-forecast configuration for one run of the W&B sweep.

     Starts a W&B run (project ``auto_flow``, entity ``autoencoder``) and reads the
     hyper-parameters selected for that run from ``wandb.config``:
     ``forecast_history``, ``number_encoder_layers``, ``use_mask``,
     ``learning_rate`` and ``batch_size``. They are inserted into the
     configuration dictionary, which is then written back to ``wandb.config``.

     Returns:
         dict: the configuration dictionary.
     """
     # The run handle returned by wandb.init is not needed here; the sweep values
     # are read through wandb.config.
     wandb.init(project="auto_flow", entity="autoencoder")
     wandb_sweep_config = wandb.config
     # One sweep value drives the input length, the output length and the dataset
     # history/forecast lengths, so read it once.
     window_length = wandb_sweep_config["forecast_history"]
     # The same four columns are used as inputs, targets and interpolation columns.
     series_columns = ["cfs", "p01m", "tmpf", "dwpf"]
     # A single CSV is used for the training, validation and test splits; the
     # splits are selected by the row indices in dataset_params.
     data_path = "/content/flow-forecast/joined_final_3/09405500AZC_flow.csv"
     the_config = {
         "model_name": "CustomTransformerDecoder",
         "model_type": "PyTorch",
         "model_params": {
             "n_time_series": len(series_columns),
             "d_model": 32,
             "seq_length": window_length,
             "output_seq_length": window_length,
             "n_layers_encoder": wandb_sweep_config["number_encoder_layers"],
             "output_dim": len(series_columns),
             "squashed_embedding": True,
             "use_mask": wandb_sweep_config["use_mask"],
         },
         "early_stopping": {
             "patience": 3,
         },
         "n_targets": len(series_columns),
         "dataset_params": {
             "class": "AutoEncoder",
             "training_path": data_path,
             "validation_path": data_path,
             "test_path": data_path,
             "forecast_history": window_length,
             "sort_column": "datetime",
             "forecast_length": window_length,
             "train_end": 30000,
             "valid_start": 30001,
             "valid_end": 32000,
             "test_start": 32001,
             "test_end": 34000,
             # Separate list objects so a consumer mutating one cannot affect the others.
             "target_col": list(series_columns),
             "relevant_cols": list(series_columns),
             "scaler": "StandardScaler",
             "interpolate": {
                 "method": "back_forward_generic",
                 "params": {
                     "relevant_columns": list(series_columns),
                 },
             },
         },
         "training_params": {
             "criterion": "MSE",
             "optimizer": "Adam",
             "optim_params": {
                 "lr": wandb_sweep_config["learning_rate"],
             },
             "epochs": 10,
             "batch_size": wandb_sweep_config["batch_size"],
         },
         "GCS": True,
         "wandb": False,
         "sweep": True,
         "metrics": ["MSE"],
     }
     wandb.config.update(the_config)
     return the_config

In [None]:
# Search space for the W&B sweep ("grid" method) over five hyper-parameters.
wandb_sweep_config = {
    "name": "Default sweep",
    "method": "grid",
    # Each hyper-parameter maps to {"values": [...]}, the layout W&B expects.
    "parameters": {
        hyper_parameter: {"values": candidates}
        for hyper_parameter, candidates in {
            "batch_size": [100, 200],
            "learning_rate": [0.001, 0.0001, .01, .00001],
            "forecast_history": [10, 24, 100],
            "number_encoder_layers": [1, 3, 6],
            "use_mask": [True, False],
        }.items()
    },
}

In [None]:
# Register the grid sweep with W&B and start an agent that executes its runs.
import wandb
# NOTE(review): the saved output of this cell is "ModuleNotFoundError"; a later cell imports
# train_function from flood_forecast.trainer instead — confirm which module is intended.
from flood_forecast.meta_train import train_function
# NOTE(review): make_config_file() calls wandb.init(...) itself for every sweep run, so this
# extra call looks redundant — confirm whether it is needed.
wandb.init()
sweep_full = wandb.sweep(wandb_sweep_config, project="auto_flow")
# For each sweep run the agent builds a fresh config from the sweep values and trains on it.
wandb.agent(sweep_full, lambda: train_function("PyTorch", make_config_file()))


ModuleNotFoundError: ignored

In [None]:
!pip install wandb --upgrade

## Part II Examining Quality of Representations 

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import pytz
# NOTE(review): `trained_model` is not assigned in any cell visible here; it presumably has to be
# the trained flow-forecast model object — make sure it is defined before running Part II.
# trained_model.test_data["datetime"] = trained_model.test_data["datetime"].astype("datetime64[ns]")
trained_model.test_data.get_from_start_date(datetime(2020, 3, 13))
# Last expression of the cell (the displayed value): the same date converted to UTC.
datetime(2020, 3, 13).astimezone(pytz.utc)

In [None]:
trained_model.test_data.original_df["datetime"]

In [None]:
trained_model.model._modules.keys()

In [None]:
dt_row = trained_model.test_data.original_df[trained_model.test_data.original_df["datetime"] == datetime(2020, 3, 13)]
dt_row.index[0]

In [None]:
# Take four items from the test dataset and compute an embedding for each of them.
# NOTE(review): the names suggest that items 0/200/250 are ordinary flow and item 39689 is a
# rain event; the index 39689 is hard-coded — confirm how it was obtained.
normal_river_data = trained_model.test_data[0]
normal_river_data1 = trained_model.test_data[200]
normal_river_data2 = trained_model.test_data[250]
rain_event_data = trained_model.test_data[39689]
# [0] selects the first element of each dataset item; unsqueeze(0) adds a leading dimension
# (presumably the batch dimension expected by make_embedding — confirm).
tn = trained_model.model.make_embedding(rain_event_data[0].unsqueeze(0))
t1 = trained_model.model.make_embedding(normal_river_data[0].unsqueeze(0))
t2 = trained_model.model.make_embedding(normal_river_data1[0].unsqueeze(0))
t3 = trained_model.model.make_embedding(normal_river_data2[0].unsqueeze(0))

In [None]:
cosine_similarity(t2[:, :, 0].detach(), t3[:, :, 0].detach())

In [None]:
trained_model.test_data.original_df[trained_model.test_data.original_df["datetime"].astype("datetime64[ns]") > datetime(2020, 3, 10)]

In [None]:
datetime(2020, 3, 13).replace(tzinfo=None)

In [None]:
trained_model.model.eval()

In [None]:
import torch
trained_model.training.df

In [None]:
torch.rand(2, 5, 4) 

## Part III Forecasting with trained auto-encoder model
Here we will forecast with the auto-encoder model trained in the previous steps.

In [None]:
# Forecasting configuration: loads saved auto-encoder weights, skips the two output
# layers when loading, freezes the listed layers and trains on the single target "cfs".
the_config = {
    "model_name": "CustomTransformerDecoder",
    "model_type": "PyTorch",
    "model_params": {
        "n_time_series": 4,
        "d_model": 32,
        "seq_length": 5,
        "output_seq_length": 3,
        "n_layers_encoder": 6,
        "output_dim": 1,
        "squashed_embedding": True
    },
    "weight_path": "/content/flow-forecast/model_save/08_November_202109_00PM_model.pth",
    "weight_path_add": {
        "excluded_layers": ["output_dim_layer.weight", "output_dim_layer.bias", "out_length_lay.weight", "out_length_lay.bias"],
        "frozen_layers": ["dense_shape", "pe", "transformer_enc", "squashed", "unsquashed"]
    },
    "dataset_params": {
        "class": "default",
        "training_path": "/content/flow-forecast/joined_final_3/09405500AZC_flow.csv",
        "validation_path": "/content/flow-forecast/joined_final_3/09405500AZC_flow.csv",
        "test_path": "/content/flow-forecast/joined_final_3/09405500AZC_flow.csv",
        "forecast_history": 5,
        "sort_column": "datetime",
        "forecast_length": 3,
        "train_end": 19000,
        "valid_start": 20000,
        "valid_end": 21000,
        "test_start": 30000,
        # NOTE(review): 310000 is ten times larger than the neighbouring indices; possibly a
        # typo for 31000 — confirm against the length of the CSV.
        "test_end": 310000,
        "target_col": ["cfs"],
        "relevant_cols": ["cfs", "p01m", "tmpf", "dwpf"],
        "scaler": "StandardScaler",
        "interpolate": {
            "method": "back_forward_generic",
            "params": {
                "relevant_columns": ["cfs", "p01m", "tmpf", "dwpf"]
            }
        }
    },
    "training_params": {
        "criterion": "DilateLoss",
        "optimizer": "SGD",
        "optim_params": {
            "lr": 0.0001
        },
        # The value was missing (a syntax error). 10 mirrors the epoch count used by the
        # sweep configuration in this notebook; adjust as needed.
        "epochs": 10,
        "batch_size": 100
    },
    "inference_params": {
        "datetime_start": "2016-05-31",
        "hours_to_forecast": 336,
        "test_csv_path": "/content/flow-forecast/joined_final_3/09405500AZC_flow.csv",
        "decoder_params": {
            "decoder_function": "simple_decode",
            "unsqueeze_dim": 1
        },
    },
    "GCS": False,
    "wandb": {
        "name": "flood_forecast_auto",
        "project": "auto_flow",
        "tags": ["autoencoder", "test"]
    },
    "metrics": ["MSE"]
}

In [None]:
from flood_forecast.trainer import train_function

In [None]:
t = train_function("PyTorch", the_config)

In [None]:
from flood_forecast.deployment.inference import InferenceMode

In [None]:
f

In [None]:
from flood_forecast.plot_functions import plot_df_test_with_confidence_interval

In [None]:
new_shit.model

In [None]:
import plotly.graph_objects as go

In [None]:
f1 = go.Figure()

In [None]:
import wandb
import plotly
import kaleido
import numpy as np
!pip install kaleido
wandb.Image(np.asarray(plotly.io.to_image(f1)))

In [None]:
import json
# The file handle gets its own name so that the notebook-level `f1` (the Plotly figure
# created in an earlier cell) is not rebound to a closed file object.
with open("/content/flow-forecast/model_save/04_November_202108_45PM.json") as config_file:
  data = json.load(config_file)
# NOTE(review): this CSV path points at the raw download, not at the cleaned copy under
# joined_final_3/ that the training configs use — confirm that this is intended.
a = InferenceMode(336, 20,  data, "/content/flow-forecast/09405500AZC_flow.csv", "/content/flow-forecast/model_save/04_November_202108_45PM_model.pth" )

In [None]:
a.make_plots(datetime(2020, 3, 13), wandb_plot_id="core1")

In [None]:
import torch
from torch.optim import Adam
from flood_forecast.custom.dilate_loss import DilateLoss

In [None]:
from flood_forecast.time_model import PyTorchForecast
# The same CSV path is passed three times (presumably as training, validation and test
# data — confirm against the PyTorchForecast signature).
p1 = the_config["dataset_params"]["training_path"] 
# NOTE(review): the variable name is unprofessional; renaming it requires updating every
# later cell that references it.
new_shit = PyTorchForecast("CustomTransformerDecoder", p1, p1, p1, the_config)

In [None]:
# NOTE(review): SimpleLinearModel is defined in a later cell of this notebook; on a fresh
# kernel that cell has to run first, otherwise this line raises NameError.
# Positional arguments: seq_length=1, n_time_series=32, output_seq_len=3.
s = SimpleLinearModel(1, 32, 3)

In [None]:
from torch.utils.data import DataLoader
# NOTE(review): `laoder` is a misspelling of "loader"; the training-loop cell iterates over
# it under this name, so both places have to be renamed together.
laoder = DataLoader(new_shit.training, batch_size=100)
# NOTE(review): `crit` is not referenced by the cells visible here (the loop uses `crit2`).
crit = DilateLoss()

In [None]:
opt = Adam(s.parameters(), 0.01)

In [None]:
new_shit.model.out_length_lay.weight.data.uniform_(0.0, 1.0)
new_shit.model.out_length_lay.bias.data.fill_(0)

In [None]:
class SimpleLinearModel(torch.nn.Module):
    """
    Minimal two-layer linear baseline, handy for debugging the
    train/validation loops without a complicated model in the way.

    The first linear layer maps the ``n_time_series`` features of every time step
    to one value; the second maps the ``seq_length`` values to the output length.
    """

    def __init__(self, seq_length: int, n_time_series: int, output_seq_len=1, probabilistic: bool = False):
        """
        seq_length: number of time steps in each input sequence
        n_time_series: number of features per time step
        output_seq_len: size of the output (ignored when probabilistic is set)
        probabilistic: when true, the output layer has size 2 (mean and std)
        """
        super().__init__()
        self.forecast_history = seq_length
        self.n_time_series = n_time_series
        self.initial_layer = torch.nn.Linear(n_time_series, 1)
        self.probabilistic = probabilistic
        # A probabilistic model always emits exactly two values: mean and std.
        self.output_len = 2 if self.probabilistic else output_seq_len
        self.output_layer = torch.nn.Linear(seq_length, self.output_len)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: A tensor of dimension (B, L, M) where
        B is the batch size, L is the length of the sequence
        and M is the number of features consumed by the first layer.

        Returns a (-1, output_len) tensor, or a Normal distribution when
        the model is probabilistic.
        """
        per_step = self.initial_layer(x)
        # Move the sequence axis last so the output layer acts across time steps.
        projected = self.output_layer(per_step.permute(0, 2, 1))
        if not self.probabilistic:
            return projected.view(-1, self.output_len)
        mean = projected[..., 0].unsqueeze(-1)
        # Clamp so the standard deviation handed to Normal is at least 0.01.
        std = torch.clamp(projected[..., 1].unsqueeze(-1), min=0.01)
        return torch.distributions.Normal(mean, std)

In [None]:
# Print the name and current values of every parameter that still requires gradients.
for name, param in new_shit.model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)

In [None]:
s(o)

In [None]:
# Debugging loop: feed embeddings from the transformer into the small linear model `s`
# and optimise `s` with MSE, while checking whether a transformer parameter changes.
from flood_forecast.pytorch_training import compute_loss
crit2 = torch.nn.MSELoss()
# Snapshot (clone) of the sixth-from-last transformer parameter taken before the loop.
a = list(new_shit.model.parameters())[-6].clone()
new_shit.model.eval()
for src, trg in laoder:
  opt.zero_grad()
  o = new_shit.model.make_embedding(src)
  print(o)
  # Swap the last two axes of the embedding before handing it to `s`.
  o = s(o.permute(0, 2, 1))
  # print(o)
  # Keep only index 0 of the last axis of the target.
  d = trg[:, :, 0]
  # NOTE(review): the third argument is a random tensor, presumably a placeholder for an
  # input that compute_loss does not use in this configuration — confirm.
  l = compute_loss(d, o, torch.rand(2, 3, 4), crit2, False)
  l.backward()
  print(l.item())
  # `opt` was built from s.parameters(), so this step updates only `s`.
  opt.step()
  print(new_shit.model.out_length_lay.weight.grad)
  # Not cloned: `b` refers to the live parameter, for comparison with the snapshot `a`.
  b = list(new_shit.model.parameters())[-6]

In [None]:
# Module.parameters() returns an iterator, which cannot be indexed, so it is turned
# into a list first.
# NOTE(review): both operands are the same tensor, so the result is always True —
# presumably a before/after comparison was intended.
torch.equal(list(new_shit.model.parameters())[-1], list(new_shit.model.parameters())[-1])

In [None]:
torch.equal(a, b)

In [None]:
a

In [None]:
b

In [None]:
a.data

In [None]:
b.data

In [None]:
list(new_shit.model.named_parameters())[-6]

In [None]:
list(new_shit.model.named_parameters())[-4]

In [None]:
new_shit.model(torch.rand(4, 5, 4))

In [None]:

x = new_shit.model.output_dim_layer(torch.rand(5, 4, 32))
new_shit.model.out_length_lay(x.permute(1, 2, 0))

In [None]:
x = new_shit.model.output_dim_layer(torch.rand(5, 4, 32))
x.permute(1, 2, 0)

In [None]:
new_shit.model.make_embedding(new_shit.training[200][0].unsqueeze(0)).shape

In [None]:
new_shit.model(new_shit.training[200][0].unsqueeze(0))

In [None]:
compute_loss(torch.ones(2, 4), new_shit.training[0][1], torch.rand(2, 3), torch.nn.MSELoss(), None)

In [None]:
new_shit.training[200][1][:, 0]

In [None]:
o.shape

In [None]:
new_shit.training.df["cfs"].describe()

In [None]:
import torch
trained_model.model(torch.rand(2, 5, 4))

In [None]:
t.model(torch.rand(2, 5, 4))

In [None]:
new_shit.model.make_embedding(torch.rand(2, 5, 4))

In [None]:
trained_model.model.make_embedding(torch.rand(4, 5, 4))

NameError: ignored