In [1]:
from datetime import datetime
import pandas as pd
import os 
import sys 
from torch.optim import SGD 
import torch.nn as nn
import matplotlib.pyplot as plt 
import numpy as np

# Load dataset
from load_DataSet_subway_15 import replace_negative, load_data_and_pivot, load_normalized_dataset,load_subway_shp

# Data Loader
from DL_utilities import DictDataLoader,Trainer
from utilities import evaluate_metrics

# Load Model
# Put the parent of the current working directory on sys.path (once) so the
# model packages imported below can be resolved from there.
notebook_dir = os.getcwd()
code_dir = os.path.abspath(os.path.join(notebook_dir, '../'))
if code_dir not in sys.path:
    sys.path.insert(0,code_dir)

# The models are importable either under `Ray.Ray_tune` or directly under `Ray_tune`.
# Only a failed import triggers the fallback: any other error raised while loading
# the modules (syntax error, interruption, ...) must surface instead of being hidden.
try:
    from Ray.Ray_tune.dl_models.CNN_based_model import cnn_perso
    from Ray.Ray_tune.dl_models.RNN_based_model import rnn_perso
    from Ray.Ray_tune.dl_models.GCN_based_model import graphconv
except ImportError:
    from Ray_tune.dl_models.CNN_based_model import cnn_perso
    from Ray_tune.dl_models.RNN_based_model import rnn_perso
    from Ray_tune.dl_models.GCN_based_model import graphconv

In [2]:
from load_DataSet_subway_15 import load_adjacency_matrix

In [3]:
# Init: paths, time resolution and window configuration used by the cells below.
folder_path = 'data/'
file_name = 'Metro_15min_mar2019_mai2019.csv'
station_location_name = 'ref_subway.csv'
time_step_per_hour=4
# H, W, D are forwarded to load_normalized_dataset and summed into L in a later cell.
H,W,D = 6,1,1
step_ahead = 1
train_prop = 0.6
start,end = datetime(2019,3,16),datetime(2019,6,1)
# Regular time index from start to end, one entry every 60/time_step_per_hour minutes.
reindex = pd.date_range(start,end,freq = f'{60/time_step_per_hour}min')
# Derive the count from time_step_per_hour so it stays consistent with the index above.
print(f'Number of time-slot: {time_step_per_hour*24*(end-start).days}')

Number of time-slot: 7392


## Load DataSet, Feature Vector, Adjacency Matrix, Normalize

In [4]:
# Load the pivoted subway_in / subway_out frames and the station locations
subway_in, subway_out = load_data_and_pivot(folder_path, file_name, reindex)
df_locations = load_subway_shp(folder_path, station_location_name)

# Pre-processing: apply replace_negative (method 'linear') to both frames, in-flows first
subway_in, subway_out = (
    replace_negative(flows, method='linear') for flows in (subway_in, subway_out)
)

# Forbidden dates: observations in this range should not be taken into account.
# NOTE(review): this period was previously documented as 23_03_2019 14:00:00 to
# 28_04_2019 12:00:00 (included), whereas the range built here runs from
# 2019-04-23 14:00 to 2019-04-28 14:00 -- confirm which bounds are intended.
invalid_dates = pd.date_range(
    datetime(2019, 4, 23, 14),
    datetime(2019, 4, 28, 14),
    freq=f'{60/time_step_per_hour}min',
)

# NOTE(review): the windows are passed in the order H, D, W -- verify against the
# signature of load_normalized_dataset.
dataset_in, U_in, Utarget_in = load_normalized_dataset(
    subway_in, time_step_per_hour, train_prop, step_ahead, H, D, W, invalid_dates
)
dataset_out, U_out, Utarget_out = load_normalized_dataset(
    subway_out, time_step_per_hour, train_prop, step_ahead, H, D, W, invalid_dates
)

# colname2indx keeps track of the position of a station ('Ampere', 'Brotteaux', ...) within the tensor
colname2indx_in, indx2colname_in = dataset_in.bijection_name_indx()
colname2indx_out, indx2colname_out = dataset_out.bijection_name_indx()

# Adjacency matrices, one per construction type
A_dist = load_adjacency_matrix(
    dataset_in,
    type='distance',
    df_locations=df_locations,
    treshold=0.3,
)
A_neighbor = load_adjacency_matrix(dataset_in, type='adjacent')
A_corr = load_adjacency_matrix(dataset_in, type='correlation')

print(f'U shape: {U_in.shape} and Target shape: {Utarget_in.shape}')

U shape: torch.Size([5663, 40, 8]) and Target shape: torch.Size([5663, 40, 1])


### Problematic outliers

In [4]:
# Disabled diagnostic: for every column of subway_in, gather the rows surrounding
# each value above 1200 whose timestamp is not in invalid_dates.
# Result: dic_maxi maps a column name to the concatenated windows (empty frame if none).
if False:
    dic_maxi = {}
    for c in subway_in:
        d = subway_in[[c]]
        # Vectorised selection instead of a row-by-row iterrows() scan
        is_outlier = (d[c] > 1200).to_numpy() & ~d.index.isin(invalid_dates)
        # Clamp the window start at 0: a negative start would be interpreted by iloc
        # as an offset from the end of the frame and select the wrong rows.
        windows = [d.iloc[max(k-3, 0):k+3, :] for k in np.flatnonzero(is_outlier)]
        # Concatenate once; pd.concat raises on an empty list, so fall back to an empty frame.
        dic_maxi[c] = pd.concat(windows) if windows else pd.DataFrame()
    

## DataLoader, train,valid,test split

In [5]:
# Split / batching parameters
valid_prop = 0.2
batch_size = 32

# The following cells work on the objects built from subway_in
dataset = dataset_in
U = U_in
Utarget = Utarget_in

In [6]:
# Build a DictDataLoader object, which offers several validation schemes
# (classic, K-fold cross-validation, ...).
data_loader_obj = DictDataLoader(
    U,
    Utarget,
    train_prop,
    valid_prop,
    validation='classic',
    shuffle=True,
)
# data_loader is a dictionary containing the train, valid and test datasets
data_loader = data_loader_obj.get_dictdataloader(batch_size)

## Load and Train Model 

In [7]:
# Dataset parameter: L is the sum of the three window sizes H, W and D
L = sum((H, W, D))

# Hyperparameters
epochs = 10
lr = 1e-5
momentum = 0.99  # alternative tried: 0.9
h_dim = 64

# Model name
model_name = 'cnn'

In [8]:
# Model
model = cnn_perso(
    c_in=1,
    h_dim=h_dim,
    c_out=1,
    kernel_size=(2,),
    L=L,
    padding=0,
)

# Loss, optimizer (no scheduler is used)
loss_function = nn.MSELoss()
optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)

# Run the training / validation loop
trainer = Trainer(model, data_loader, epochs, optimizer, loss_function, scheduler=None)
trainer.train_and_valid()

# The loss histories are available afterwards as:
# trainer.train_loss, trainer.valid_loss

## Testing

In [10]:
# Predictions and targets returned by the trainer for the test set (normalized values)
(test_pred,Y_true) = trainer.test()
# Metrics on the normalized values, kept for reference:
#df_metrics = evaluate_metrics(test_pred,Y_true,metrics= ['mse','mae'])
#df_metrics    #{'mse': tensor(0.0119), 'mae': tensor(0.0778)}  with non-interpolated data

# Map both tensors back through the dataset's unormalize_tensor
test_pred = dataset.unormalize_tensor(test_pred)
Y_true = dataset.unormalize_tensor(Y_true)

# Evaluate on the tensors computed just above, so the cell runs on a fresh kernel
df_metrics = evaluate_metrics(test_pred,Y_true,metrics= ['mse','mae'])
df_metrics    # {'mse': tensor(31837.8555), 'mae': tensor(95.1038)}  with non-interpolated data

{'mse': tensor(49133.3281), 'mae': tensor(100.6426)}

## HyperParameter Tuning
- Add checkpointing
- K-fold cross-validation
- Classic tuning or Ray Tune

## Benchmark Model

## Add (simple) prediction intervals

## Baseline for prediction intervals