In [1]:

from DL_class import InvalidDatesCleaner,DatesVerifFeatureVect
from dataset import TrainValidTest_Split_Normalize

import os
import pandas as pd 
import torch
import pickle 
import numpy as np 

from paths import folder_path,file_name
from config import get_args
from utilities_DL import get_DataSet_and_invalid_dates, match_period_coverage_with_netmob


# ==================== Load the subway-in dataset ====================
# Configuration for the selected model ('CNN' is the alternative noted here).
model_name = 'STGCN'
netmob = True
args = get_args(model_name)
# MTGNN variant: get_args(model_name = model_name, learn_graph_structure = True)

# Notebook-specific overrides of the loaded configuration.
args.K_fold = 1
args.ray = False

# Positional arguments shared by the two dataset loads below.
_loader_args = (
    args.abs_path, folder_path, file_name,
    args.W, args.D, args.H, args.step_ahead,
)

# First load: no coverage restriction; only used to compute `coverage`.
dataset, invalid_dates = get_DataSet_and_invalid_dates(
    *_loader_args, single_station=False, coverage_period=None
)
coverage = match_period_coverage_with_netmob(dataset)

# Second load: same arguments, restricted to the coverage period returned above.
dataset, invalid_dates = get_DataSet_and_invalid_dates(
    *_loader_args, single_station=False, coverage_period=coverage
)
# ==================== ........................ ====================


# The data root differs between the CUDA machine and the CPU-only machine.
if torch.cuda.is_available():
    data_folder_path = '../../../data/'
else:
    data_folder_path = '../../Data/'


# === INIT ====
save_folder = f"{data_folder_path}NetMob_tensor/"
netmob_data_folder_path = f"{data_folder_path}NetMob/"
step_south_north = 287  # Ids increase by 287 when moving from south to north.
epsilon = 1000  # Radius, in meters (1000 m).
# W,H = 2*(epsilon//100 + 1), 2*(epsilon//100 + 1)

# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
os.makedirs(save_folder, exist_ok=True)
# === .... ===




# ===== Load NetMob data =====
# NetMob tensor layout: [T,N,C,H,W]
# dims [0,-2,-1] -> dimensions over which statistics are retrieved.
# NOTE(review): `ref_subway` is not defined in the visible cells; unless it comes
# from elsewhere, the load raises NameError and the random fallback is used.
try:
    # Stacking puts the station axis first; the permute swaps the first two axes
    # so the stacked (station) axis ends up second.
    netmob_T = torch.stack([torch.load(f"{save_folder}station_{station}.pt") for station in ref_subway.COD_TRG])
    netmob_T = netmob_T.permute(1,0,*range(2, netmob_T.dim()))

except Exception as load_error:
    # Best-effort fallback kept on purpose, but only for regular exceptions
    # (KeyboardInterrupt/SystemExit still propagate) and with the cause reported.
    netmob_T = torch.randn(7300,40,2,22,22)
    print("Load impossible. Création d'un random Tensor")
    print(f"Reason: {type(load_error).__name__}: {load_error}")

print('Init NetMob Dataset: ', netmob_T.size())
print('Number of Nan Value: ',torch.isnan(netmob_T).sum())
print('Total Number of Elements: ', netmob_T.numel() )
# ===== ....... =====
T = netmob_T.size(0)

# Tackle a specific fold: keep only the first 100 entries along the first axis.
netmob_T1 = netmob_T[:100]

# Init: dimensions and normalization flags passed to the splitter below.
dims = [0,-2,-1]
minmaxnorm = True
standardize = False

# ============ Load Train/Valid/Test indices and remove forbidden dates ============
# TODO: derive the invalid indices from the real invalid dates, e.g.
#   invalid_indices = get_indices_from_dates(invalid)
# Until then, 100 indices out of range(T) are drawn at random as stand-ins.
# A locally seeded generator makes the draw reproducible across kernel restarts
# without touching NumPy's global random state.
rng = np.random.default_rng(0)
indices = rng.permutation(T)
invalid_indices = indices[:100]

# Split indices:
train_indices = np.arange(50)
valid_indices = np.arange(60,70)
test_indices = np.arange(80,100)

# Remove the invalid indices from each split:
cleaner = InvalidDatesCleaner(invalid_indices = invalid_indices)

train_indices = cleaner.clean_indices(train_indices)
valid_indices = cleaner.clean_indices(valid_indices)
test_indices = cleaner.clean_indices(test_indices)
# ============ .......................................................... ============

# Build the splitter object
splitter = TrainValidTest_Split_Normalize(netmob_T1,dims,train_indices, valid_indices, test_indices,minmaxnorm=minmaxnorm,standardize=standardize)


# Split the dataset and normalize it (presumably with training-set statistics — confirm in TrainValidTest_Split_Normalize)
train_dataset,valid_dataset,test_dataset = splitter.load_normalize_tensor_datasets()

# Define DictDataLoader :



# ========== Comparison with df_verif on the timestamp dates ==========
# TODO: find a way to make 'time_step_per_hour' uniform across all data sources (e.g. a common denominator).
dates_verif_obj = DatesVerifFeatureVect(
    dataset.df_dates,
    Weeks=args.W,
    Days=args.D,
    historical_len=args.H,
    step_ahead=args.step_ahead,
    time_step_per_hour=dataset.time_step_per_hour,
)
dataset.get_feature_vect(invalid_dates)
# ========== .... ==========

Time-step per hour: 4.0
coverage period: 2019-01-01 00:00:00 - 2020-01-01 00:00:00
Time-step per hour: 4.0
coverage period: 2019-03-16 00:00:00 - 2019-05-31 23:45:00
Load impossible. Création d'un random Tensor
Init NetMob Dataset:  torch.Size([7300, 40, 2, 22, 22])
Number of Nan Value:  tensor(0)
Total Number of Elements:  282656000
Tackling Training Set
Values with issues:  0.000%
Regular Values that we have to set to 0:  0.000%
Tackling Validation Set
Tackling Testing Set
Values with issues:  0.000%
Regular Values that we have to set to 0:  0.000%
Values with issues:  0.000%
Regular Values that we have to set to 0:  0.000%


In [2]:
# NOTE(review): the recorded run of this cell raised a TypeError because the
# required positional argument 'tuple_agg' is not supplied here.
dataset_no_norm = dataset.clean_dataset_get_tensor_and_train_valid_test_split(
    dataset.df,
    invalid_dates,
    args.train_prop,
    args.valid_prop,
    args.test_prop,
    normalize=False,
)

TypeError: clean_dataset_get_tensor_and_train_valid_test_split() missing 1 required positional argument: 'tuple_agg'

In [38]:
import torch


def repeat_mini(X,S, dims):
    '''
    Repeat a statistic tensor so that it has the same size as ``X``.

    args
    ----
    X : tensor whose size must be matched. Either the train input or the
        feature vector built from it (same leading dimensions, extra trailing ones)
                                    >>>> torch.randn(T,N,C,H,W) or torch.randn(T,N,C,H,W,L)
    S : statistics (mini,mean...). Its dimensions are, in order, the input
        dimensions that were NOT aggregated
                                    >>>> torch.randn(N,H)
    dims : dimensions of the input that were aggregated to obtain S.
        Negative values count from the end of the input dimensions
                                    >>>> [0,2,4] or [0,-3,-1]

    returns
    -------
    Tensor of size ``X.size()`` in which S is repeated along every dimension
    it does not own.

    raises
    ------
    ValueError if ``dims`` is out of range or repeated, or if X has fewer
    dimensions than S and dims imply.
    '''
    # The input had one dimension per aggregated dim plus one per dim kept in S.
    # Deriving it from the arguments removes any dependency on notebook globals.
    n_input_dims = S.dim() + len(dims)
    if n_input_dims > X.dim():
        raise ValueError(
            f"X has {X.dim()} dimensions but S and dims imply at least {n_input_dims}"
        )

    aggregated = set()
    for d in dims:
        if not -n_input_dims <= d < n_input_dims:
            raise ValueError(f"dimension {d} is out of range for {n_input_dims} input dimensions")
        aggregated.add(d % n_input_dims)  # normalize negative dims
    if len(aggregated) != len(dims):
        raise ValueError(f"dims contains repeated dimensions: {list(dims)}")

    # Dimensions of the input that S still carries, in order.
    kept_dims = [d for d in range(n_input_dims) if d not in aggregated]

    # S is first reshaped with singleton axes everywhere it lacks a dimension...
    reshaped_vector = [1]*X.dim()
    for k,c in enumerate(kept_dims):
        reshaped_vector[c] = S.size(k)

    # ...then repeated along those singleton axes up to the size of X.
    repeat_vector = [size if reshaped_vector[k] == 1 else 1 for k,size in enumerate(X.size())]

    S = S.reshape(tuple(reshaped_vector))
    S = S.repeat(tuple(repeat_vector))
    return(S)


T,N,C,H,W,L = 100,40,2,20,20,6

# Test with subway-in: the feature vector [T,N,L] is the tensor to match.
feature_vect = torch.randn(T,N,L)
train_input = torch.randn(T,N)
dims = [0]  # aggregated over T -> statistic of size [N]
mini = torch.randn(N)

repeated_mini = repeat_mini(feature_vect,mini, dims)
print('Subway-in:', repeated_mini.size())
# ...


# Test with NetMob images: the train input [T,N,C,H,W] is the tensor to match.
train_input = torch.randn(T,N,C,H,W)
feature_vect = torch.randn(T,N,C,H,W,L)
dims = [0,3,4]  # aggregated over T,H,W -> statistic of size [N,C]

mini = torch.randn(N,C)

repeated_mini = repeat_mini(train_input,mini, dims)
print('NetMob1:', repeated_mini.size())
# ...



# Test with NetMob under another form of aggregation.
train_input = torch.randn(T,N,C,H,W)
feature_vect = torch.randn(T,N,C,H,W,L)
dims = [0,2,4]  # aggregated over T,C,W -> statistic of size [N,H]

mini = torch.randn(N,H)

repeated_mini = repeat_mini(train_input,mini, dims)
print('NetMob2:',repeated_mini.size())
# ...


# Test with NetMob aggregated over the time dimension only.
train_input = torch.randn(T,N,C,H,W)
feature_vect = torch.randn(T,N,C,H,W,L)
dims = [0]  # aggregated over T -> statistic of size [N,C,H,W]

mini = torch.randn(N,C,H,W)

repeated_mini = repeat_mini(train_input,mini, dims)
print('NetMob3:',repeated_mini.size())
# ...

Subway-in: torch.Size([100, 40, 6])
NetMob1: torch.Size([100, 40, 2, 20, 20])
NetMob2: torch.Size([100, 40, 2, 20, 20])
NetMob3: torch.Size([100, 40, 2, 20, 20])


In [31]:
from dataset import TensorDataset
# Wrap the un-normalized training input; all statistics are left unset here.
tensor = dataset_no_norm.train_input
tensor_ds = TensorDataset(
    tensor,
    mini=None,
    maxi=None,
    mean=None,
    std=None,
    normalized=False,
)


In [52]:
# Statistics over the last dimension of reshaped_inputs.
# min/max along a dim return a (values, indices) pair, so `.values` is required.
tensor_ds.mini = reshaped_inputs.min(-1).values
tensor_ds.maxi = reshaped_inputs.max(-1).values
# mean/std along a dim return the tensor itself; accessing `.values` on it
# would store the bound `Tensor.values` method instead of the statistic.
tensor_ds.mean = reshaped_inputs.mean(-1)
tensor_ds.std = reshaped_inputs.std(-1)

In [55]:
# Forward transform (reverse=False) using the flags set earlier in the notebook.
normalized_tensor_1 = tensor_ds.transform(
    reshaped_inputs,
    minmaxnorm,
    standardize,
    reverse=False,
)

# Undo the reshape / permutation applied before normalization.
normalized_tensor_2 = tensor_ds.inverse_reshape_permute(normalized_tensor_1)

Values with issues:  0.000%
Regular Values that we have to set to 0:  0.000%


In [58]:
# Size of the normalized tensor before the inverse reshape/permute.
normalized_tensor_1.shape

torch.Size([40, 3606])

In [59]:
# Size after the inverse reshape/permute (the two axes come back swapped).
normalized_tensor_2.shape

torch.Size([3606, 40])

In [43]:
# Min-max normalize along dimension 0 and display the resulting tensor.
# NOTE(review): the recorded run of this cell raised a RuntimeError on a size
# mismatch (40 vs 3606) at dimension 0 — the stored statistics presumably do
# not line up with the tensor layout; confirm against normalize_tensor.
normalized_tensor_ds = tensor_ds.normalize_tensor(dims = [0], minmaxnorm=True)
normalized_tensor_ds.tensor

RuntimeError: The size of tensor a (40) must match the size of tensor b (3606) at non-singleton dimension 0

In [42]:
# Display the minimum statistic stored on the normalized dataset object.
normalized_tensor_ds.mini

tensor([6., 2., 0.,  ..., 7., 3., 6.], dtype=torch.float64)

In [31]:
# Random [100, 40] sample, its transposed view, and a random vector of length 40.
values = torch.randn(100, 40)
values_bis = values.t()
mini = torch.randn(40)

In [6]:
# Keep a handle on the raw frame and take ten of the unique dates in df_verif.
df = dataset.df
_unique_verif_dates = dataset.df_verif.stack().unique()
reindex = _unique_verif_dates[100:110]

In [9]:
# Signature reminder: dataset.train_valid_test_split_indices(self,train_prop,valid_prop,test_prop)
# NOTE(review): the recorded run raised AttributeError — the DataSet object has
# no 'train_input' attribute at this point.
dataset.train_input


AttributeError: 'DataSet' object has no attribute 'train_input'

In [15]:
# FIXME: the right-hand side of this assignment is missing, so the cell is not valid Python as written.
re_indices = 

array(['2019-03-22T07:45:00.000000000', '2019-03-23T07:45:00.000000000',
       '2019-03-16T08:00:00.000000000', '2019-03-22T08:00:00.000000000',
       '2019-03-23T08:00:00.000000000', '2019-03-16T08:15:00.000000000',
       '2019-03-22T08:15:00.000000000', '2019-03-23T08:15:00.000000000',
       '2019-03-16T08:30:00.000000000', '2019-03-22T08:30:00.000000000'],
      dtype='datetime64[ns]')

In [5]:
# NOTE(review): looks like a fragment pasted from a method body — it references
# `self`, and the two assignments below are indented under a plain expression
# statement, so the cell cannot run as written.
df.reindex(self.df_verif_train.stack().unique())
        self.df_valid = self.df.reindex(self.df_verif_valid.stack().unique()) if valid_prop > 1e-3 else None
        self.df_test = self.df.reindex(self.df_verif_test.stack().unique()) if test_prop > 1e-3 else None

Station,Ampère Victor Hugo,Bellecour,Brotteaux,Charpennes,Cordeliers,Croix Paquet,Croix-Rousse,Cuire,Cusset,Debourg,...,Part-Dieu,Perrache,Place Guichard,Place Jean Jaurès,République Villeurbanne,Sans Souci,Saxe - Gambetta,Stade de Gerland,Valmy,Vieux Lyon
2019-03-16 00:00:00,34.0,396.0,37.0,164.0,143.0,6.0,33.0,17.0,24.0,40.0,...,68.0,144.0,26.0,48.0,53.0,55.0,112.0,59.0,71.0,155.0
2019-03-16 00:15:00,40.0,298.0,56.0,164.0,139.0,3.0,46.0,7.0,20.0,46.0,...,77.0,145.0,31.0,32.0,45.0,55.0,159.0,56.0,41.0,149.0
2019-03-16 00:30:00,17.0,258.0,24.0,74.0,100.0,19.0,6.0,0.0,4.0,13.0,...,43.0,55.0,15.0,19.0,4.0,45.0,98.0,4.0,72.0,127.0
2019-03-16 00:45:00,1.0,44.0,3.0,13.0,13.0,1.0,6.0,3.0,2.0,2.0,...,6.0,5.0,3.0,4.0,0.0,13.0,17.0,4.0,3.0,12.0
2019-03-16 01:00:00,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,4.0,4.0,2.0,2.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-05-31 22:45:00,28.0,355.0,37.0,163.0,133.0,1.0,44.0,10.0,42.0,79.0,...,92.0,224.0,129.0,34.0,32.0,58.0,152.0,83.0,49.0,86.0
2019-05-31 23:00:00,34.0,262.0,37.0,157.0,155.0,9.0,51.0,14.0,30.0,66.0,...,74.0,261.0,124.0,42.0,24.0,27.0,127.0,52.0,53.0,98.0
2019-05-31 23:15:00,28.0,325.0,37.0,154.0,126.0,13.0,45.0,8.0,28.0,44.0,...,104.0,140.0,66.0,33.0,14.0,45.0,140.0,46.0,57.0,110.0
2019-05-31 23:30:00,26.0,342.0,42.0,132.0,101.0,5.0,49.0,16.0,14.0,58.0,...,135.0,185.0,57.0,23.0,16.0,25.0,198.0,49.0,53.0,119.0


In [14]:
# Column-wise maximum of the training frame (values only, indices dropped).
torch.tensor(dataset_no_norm.df_train.to_numpy()).max(dim=0).values

(tensor([  268.,  1342.,   255.,  1329.,   974.,    47.,   298.,   377.,   320.,
           504.,   406.,   414.,   885.,  1186.,   804.,   561.,  1510.,  1062.,
           587., 10798.,   433.,  1036.,  1072.,   751.,   545.,   528.,   419.,
           480.,   480.,   334.,  1366.,  1057.,   396.,   575.,   400.,   816.,
           771.,   987.,   454.,   601.], dtype=torch.float64),
 tensor(52.6216, dtype=torch.float64))

In [16]:
# Column-wise standard deviation of the training frame.
torch.tensor(dataset_no_norm.df_train.to_numpy()).std(dim=0)

tensor([ 52.6216, 293.6610,  52.4313, 254.9218, 139.7208,   8.3684,  53.1447,
         46.3802,  59.7228,  83.3747,  69.9714,  92.0631, 140.9368, 239.9919,
        130.1312, 117.5363, 255.4895, 211.9190, 105.1930, 267.2897,  48.4344,
        210.3718, 170.5582, 158.2803, 101.5777,  42.1111,  86.7357,  88.9991,
         86.1424,  58.0196, 286.4249, 218.4956,  76.9825, 108.5609,  71.4679,
        133.8276, 149.7332,  71.5599,  90.7328,  84.7020], dtype=torch.float64)

In [17]:
# Column-wise mean of the training frame.
torch.tensor(dataset_no_norm.df_train.to_numpy()).mean(dim=0)

tensor([ 54.3095, 308.4409,  48.4077, 271.0086, 136.6765,   8.9445,  60.1783,
         37.6090,  63.1620,  82.7944,  70.6380,  91.0107, 119.5044, 201.6920,
        117.0514, 115.0198, 230.7813, 208.1015, 111.5652, 156.0616,  42.3608,
        251.7402, 167.9920, 149.7220, 108.6614,  38.6057,  91.7052,  87.2338,
         83.1197,  55.9168, 295.6901, 246.4996,  70.7958,  98.2704,  70.5975,
        123.9393, 177.1622,  54.1563,  97.9735,  90.2060], dtype=torch.float64)

In [7]:
# Per-station maximum of the training frame, computed with pandas.
dataset_no_norm.df_train.max()

Station
Ampère Victor Hugo                 268.0
Bellecour                         1342.0
Brotteaux                          255.0
Charpennes                        1329.0
Cordeliers                         974.0
Croix Paquet                        47.0
Croix-Rousse                       298.0
Cuire                              377.0
Cusset                             320.0
Debourg                            504.0
Flachet                            406.0
Foch                               414.0
Gare d'Oullins                     885.0
Gare de Vaise                     1186.0
Gare de Vénissieux                 804.0
Garibaldi                          561.0
Gorge de Loup                     1510.0
Grange Blanche                    1062.0
Gratte Ciel                        587.0
Guillotière                      10798.0
Hénon                              433.0
Hôtel de ville - Louis Pradel     1036.0
Jean Macé                         1072.0
La soie                            751.0
Laurent 

In [3]:
# NOTE(review): the recorded run of this cell raised
# "TypeError: expected Tensor as element 0 in argument 0, but got numpy.float64"
# while tackling the training set.
dataset_norm = dataset.clean_dataset_get_tensor_and_train_valid_test_split(
    dataset.df,
    invalid_dates,
    args.train_prop,
    args.valid_prop,
    args.test_prop,
    normalize=True,
)

Tackling Training Set


TypeError: expected Tensor as element 0 in argument 0, but got numpy.float64

In [4]:
# NOTE(review): the recorded run of this cell raised
# "TypeError: expected Tensor as element 0 in argument 0, but got numpy.float64".
dataset.split_normalize_load_feature_vect(
    args,
    invalid_dates,
    args.train_prop,
    args.valid_prop,
    args.test_prop,
)

Tackling Training Set


TypeError: expected Tensor as element 0 in argument 0, but got numpy.float64

## Pour le cas des NetMob Data : 

In [13]:
# Display the table of timestamps held by the dataset.
dataset.df_dates

Unnamed: 0,date
0,2019-03-16 00:00:00
1,2019-03-16 00:15:00
2,2019-03-16 00:30:00
3,2019-03-16 00:45:00
4,2019-03-16 01:00:00
...,...
7387,2019-05-31 22:45:00
7388,2019-05-31 23:00:00
7389,2019-05-31 23:15:00
7390,2019-05-31 23:30:00


In [14]:
# Display the verification table: one row per target index, one column per
# lagged timestamp (t-672, t-96, t-6 ... t+0 in the recorded output).
dataset.df_verif

Unnamed: 0,t-672,t-96,t-6,t-5,t-4,t-3,t-2,t-1,t+0
672,2019-03-16 00:00:00,2019-03-22 00:00:00,2019-03-22 22:30:00,2019-03-22 22:45:00,2019-03-22 23:00:00,2019-03-22 23:15:00,2019-03-22 23:30:00,2019-03-22 23:45:00,2019-03-23 00:00:00
673,2019-03-16 00:15:00,2019-03-22 00:15:00,2019-03-22 22:45:00,2019-03-22 23:00:00,2019-03-22 23:15:00,2019-03-22 23:30:00,2019-03-22 23:45:00,2019-03-23 00:00:00,2019-03-23 00:15:00
674,2019-03-16 00:30:00,2019-03-22 00:30:00,2019-03-22 23:00:00,2019-03-22 23:15:00,2019-03-22 23:30:00,2019-03-22 23:45:00,2019-03-23 00:00:00,2019-03-23 00:15:00,2019-03-23 00:30:00
675,2019-03-16 00:45:00,2019-03-22 00:45:00,2019-03-22 23:15:00,2019-03-22 23:30:00,2019-03-22 23:45:00,2019-03-23 00:00:00,2019-03-23 00:15:00,2019-03-23 00:30:00,2019-03-23 00:45:00
676,2019-03-16 01:00:00,2019-03-22 01:00:00,2019-03-22 23:30:00,2019-03-22 23:45:00,2019-03-23 00:00:00,2019-03-23 00:15:00,2019-03-23 00:30:00,2019-03-23 00:45:00,2019-03-23 01:00:00
...,...,...,...,...,...,...,...,...,...
7387,2019-05-24 22:45:00,2019-05-30 22:45:00,2019-05-31 21:15:00,2019-05-31 21:30:00,2019-05-31 21:45:00,2019-05-31 22:00:00,2019-05-31 22:15:00,2019-05-31 22:30:00,2019-05-31 22:45:00
7388,2019-05-24 23:00:00,2019-05-30 23:00:00,2019-05-31 21:30:00,2019-05-31 21:45:00,2019-05-31 22:00:00,2019-05-31 22:15:00,2019-05-31 22:30:00,2019-05-31 22:45:00,2019-05-31 23:00:00
7389,2019-05-24 23:15:00,2019-05-30 23:15:00,2019-05-31 21:45:00,2019-05-31 22:00:00,2019-05-31 22:15:00,2019-05-31 22:30:00,2019-05-31 22:45:00,2019-05-31 23:00:00,2019-05-31 23:15:00
7390,2019-05-24 23:30:00,2019-05-30 23:30:00,2019-05-31 22:00:00,2019-05-31 22:15:00,2019-05-31 22:30:00,2019-05-31 22:45:00,2019-05-31 23:00:00,2019-05-31 23:15:00,2019-05-31 23:30:00
