In [1]:
import sys 
import os 
import pandas as pd
import numpy as np
import json
from shapely.geometry import Point
import geopandas as gpd 

# Put the parent of the current working directory at the front of sys.path
# (once), presumably so the project-local imports that follow can resolve.
current_file_path = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_file_path, os.pardir))
already_registered = sys.path.count(parent_dir) > 0
if not already_registered:
    sys.path.insert(0, parent_dir)

from dataset import DataSet
from datetime import datetime 
from utils.utilities import filter_args,get_time_step_per_hour
from constants.paths import FOLDER_PATH
from examples.train_model_on_k_fold_validation import train_model_on_k_fold_validation,load_configuration


''' This file has to:
 - return a DataSet object built from the specified data and spatial units.
 - add the arguments 'n_vertex' and 'C' to the Namespace; both are specific to this dataset.
 - specify 'INVALID_DATE' and the 'coverage' period of the dataset.
'''

# Path prefix (relative to FOLDER_PATH) of the monthly CSV files; load_data
# appends "_<month>.csv" to it.
FILE_NAME = 'CRITER_3lanes/CRITER_3lanes'
# Bounds of the dataset period.
# NOTE(review): presumably MM/DD/YYYY (March 1st to June 1st 2019, which would
# match the Mars/Avril/Mai files read by load_data) - confirm the date format.
START = '03/01/2019'
END = '06/01/2019'
# Time-step frequency string.
# NOTE(review): presumably the value expected in args.freq by load_data - confirm.
FREQ = '30min'
# Periods to treat as invalid; currently empty (the example below is disabled).
list_of_invalid_period = []
# list_of_invalid_period.append([datetime(2019,1,10,15,30),datetime(2019,1,14,15,30)])

# Dataset-specific arguments, not set yet:
# C = 1
# n_vertex =


def load_data(args,coverage_period=None,months=('Mars','Avril','Mai'),limit_max_nan=200):
    """Load the monthly CSV files and build the occupancy-rate and flow tables.

    For every month, ``{parent_dir}/{FOLDER_PATH}/{FILE_NAME}_{month}.csv`` is read,
    ``HORODATE`` is parsed as datetime, and ``TAUX_HEURE`` / ``DEBIT_HEURE`` are
    averaged per ``ID_POINT_MESURE`` and per ``args.freq`` time bin. Only the
    measurement points present in every monthly file are kept, and sensors with
    too many missing values are then dropped with ``remove_sparse_sensor``.

    Parameters
    ----------
    args : object
        Must expose a ``freq`` attribute; it is passed to ``pd.Grouper`` as the
        aggregation frequency.
    coverage_period : optional
        Object exposing ``min()`` and ``max()``. When given, only the rows with
        ``min() <= HORODATE <= max()`` are kept.
    months : iterable of str, optional
        Month suffixes of the CSV files to read. Defaults to the three files
        originally hard-coded here.
    limit_max_nan : int, optional
        Forwarded to ``remove_sparse_sensor`` for both tables (default 200, the
        value originally hard-coded here).

    Returns
    -------
    tuple
        ``(df_loop_occupancy_rate_full, df_flow_full, idptm_list)``: the two pivoted
        tables (index ``HORODATE``, one column per ``ID_POINT_MESURE``) without
        their sparse columns, and the list of measurement points present in every
        month. ``idptm_list`` is computed before the sparse-sensor filtering, so
        it may contain identifiers that were dropped from the tables.

    Raises
    ------
    ValueError
        If ``months`` is empty.
    """
    months = tuple(months)
    if not months:
        raise ValueError("`months` must contain at least one month name.")

    # Only these two columns are used downstream, so restrict the aggregation to
    # them: it avoids averaging unrelated (possibly non-numeric) CSV columns.
    value_columns = ['TAUX_HEURE','DEBIT_HEURE']

    monthly_frames = []
    sensors_per_month = []
    for month_name in months:
        df_i = pd.read_csv(f"{parent_dir}/{FOLDER_PATH}/{FILE_NAME}_{month_name}.csv",index_col = 0)
        df_i['HORODATE'] = pd.to_datetime(df_i['HORODATE'])
        sensors_per_month.append(set(df_i['ID_POINT_MESURE'].unique()))
        grouped = df_i.groupby(['ID_POINT_MESURE',pd.Grouper(key = 'HORODATE',freq=args.freq)])
        monthly_frames.append(grouped[value_columns].mean())

    # Concatenate once instead of growing a DataFrame inside the loop.
    df = pd.concat(monthly_frames).reset_index()

    # Keep the sensors that have data in every monthly file.
    idptm_list = list(set.intersection(*sensors_per_month))
    df = df[df.ID_POINT_MESURE.isin(idptm_list)]
    if coverage_period is not None:
        df = df[(df.HORODATE <= coverage_period.max())&(df.HORODATE >= coverage_period.min()) ]

    df_loop_occupancy_rate = df.pivot_table(index = 'HORODATE',columns = 'ID_POINT_MESURE',values = 'TAUX_HEURE').sort_index()
    df_flow = df.pivot_table(index = 'HORODATE',columns = 'ID_POINT_MESURE',values = 'DEBIT_HEURE').sort_index()

    # Only the filtered table (first returned element) is needed here.
    df_loop_occupancy_rate_full = remove_sparse_sensor(df_loop_occupancy_rate,limit_max_nan = limit_max_nan)[0]
    df_flow_full = remove_sparse_sensor(df_flow,limit_max_nan = limit_max_nan)[0]

    return df_loop_occupancy_rate_full,df_flow_full,idptm_list


def remove_sparse_sensor(df,limit_max_nan = 200):
    """Split the columns of ``df`` according to their number of missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one column per sensor. It is not modified.
    limit_max_nan : int, optional
        A column holding strictly more than this many NaN values is considered
        sparse and removed (default 200).

    Returns
    -------
    tuple
        ``(df, df_with_nan, nan_too_empty, sparse_columns)`` where

        - ``df`` is the input without its sparse columns,
        - ``df_with_nan`` holds the kept columns containing at least one NaN,
        - ``nan_too_empty`` holds the removed (sparse) columns,
        - ``sparse_columns`` is the Index of the removed column labels.

        All returned frames keep the row index of the input, including when no
        column contains any NaN.
    """
    # Count the NaN once per column instead of copying columns one by one.
    nan_counts = df.isna().sum()
    has_nan = (nan_counts > 0).to_numpy()
    # A sparse column is a column with NaN whose NaN count exceeds the limit.
    is_sparse = has_nan & (nan_counts > limit_max_nan).to_numpy()

    sparse_columns = nan_counts.index[is_sparse]
    # Positional boolean masks keep the selection unambiguous; .copy() returns
    # independent frames.
    df_with_nan = df.loc[:, has_nan & ~is_sparse].copy()
    nan_too_empty = df.loc[:, is_sparse].copy()
    df = df.drop(columns = sparse_columns)
    return df,df_with_nan,nan_too_empty,sparse_columns


def load_gdf_criter(json_folder_path,json_name):
    """Read a JSON file and return its ``"values"`` records as a GeoDataFrame.

    Parameters
    ----------
    json_folder_path : str
        Folder containing the JSON file.
    json_name : str
        Name of the JSON file. Its top-level ``"values"`` entry must be a list of
        dicts, each with ``'lon'`` and ``'lat'`` keys.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per record, with a ``geometry`` column holding
        ``Point(lon, lat)`` and the CRS set to ``'EPSG:4326'``.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(f"{json_folder_path}/{json_name}",'r') as json_file:
        criter_json = json.load(json_file)

    # Convert the JSON records to a GeoDataFrame:
    records = []
    for record in criter_json["values"]:
        record['geometry'] = Point(record['lon'],record['lat'])
        records.append(record)
    gdf = gpd.GeoDataFrame(records)
    gdf.crs = 'EPSG:4326'
    return gdf

def filter_idptm(x,idptm_list,sparse_columns,columns_with_nan,limit_max_nan):
    """Return the status label of measurement point ``x``.

    The checks are applied in this order:

    1. ``x`` not in ``idptm_list``    -> 'Value Missing for at least one month'
    2. ``x`` in ``sparse_columns``    -> 'More than <limit_max_nan> NaN Values'
    3. ``x`` in ``columns_with_nan``  -> 'Less than <limit_max_nan>  NaN Values'
    4. otherwise                      -> 'Full Data'
    """
    if x not in idptm_list:
        return 'Value Missing for at least one month'
    if x in sparse_columns:
        return f'More than {limit_max_nan} NaN Values'
    if x in columns_with_nan:
        return f'Less than {limit_max_nan}  NaN Values'
    return 'Full Data'


Training and Hyper-parameter tuning with Ray is not possible


In [7]:
pois_data_path = f"{parent_dir}/{FOLDER_PATH}/POIs/netmob_POI_Lyon"
# Per-POI CSV files available for one (tag, app) pair.
print(os.listdir(f"{pois_data_path}/stadium/Instagram"))
# List the VEN input folder itself: os.listdir raises NotADirectoryError when
# it is given the path of a file such as data.npy.
print(os.listdir(f"{pois_data_path}/Inputs/VEN"))

['df_4585967_0_DL.csv', 'df_85200814_0_DL.csv', 'df_353267337_0_DL.csv', 'df_4585967_0_UL.csv', 'df_85200814_0_UL.csv', 'df_353267337_0_UL.csv']
['data.npy', 'metadata.pkl']


In [39]:
import glob
import torch 
import numpy as np 

# For every (dataset variant, app, tag type, transfer mode), sum each POI CSV
# across its columns and save the stacked series as
# <pois_data_path>/Inputs/agg_TS/<tag_type>/<app>/<transfer_mode>/data.npy.
# These grids do not depend on `expanded`, so they are defined once.
apps = ['Instagram','Facebook','Uber','Google_Maps','Waze','Spotify','Deezer','Telegram','Facebook_Messenger','Snapchat','WhatsApp','Twitter','Pinterest']
tag_types = ['park','stadium','university','shop','nightclub','station','parkings','theatre','iris','transit','public_transport']
transfer_modes = ['DL','UL']

for expanded in ['','_expanded']:
    pois_data_path = f"{parent_dir}/{FOLDER_PATH}/POIs/netmob_POI_Lyon{expanded}"

    for app in apps:
        for tag_type in tag_types:
            for transfer_mode in transfer_modes:
                folder_path_to_save_agg_data = f"{pois_data_path}/Inputs/agg_TS/{tag_type}/{app}/{transfer_mode}"
                # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
                os.makedirs(folder_path_to_save_agg_data, exist_ok=True)

                # NOTE(review): glob returns files in arbitrary order, so the row
                # order of the saved array is not guaranteed - consider sorting.
                list_of_csv = glob.glob(f"{pois_data_path}/{tag_type}/{app}/*_{transfer_mode}.csv")
                # One row per CSV: sum over the columns, ordered by ascending index.
                List_time_series = [
                    pd.read_csv(csv_path,index_col = 0).sum(axis=1).sort_index().values
                    for csv_path in list_of_csv
                ]
                agg_ts = torch.Tensor(np.array(List_time_series))
                # Context manager closes the file handle after each save.
                with open(f"{folder_path_to_save_agg_data}/data.npy","wb") as npy_file:
                    np.save(npy_file,agg_ts.numpy())

In [41]:
import torch
# Relies on `app`, `tag_type`, `transfer_mode` and `List_time_series` left over
# from the last iteration of the aggregation loop (hidden kernel state).
print(app,tag_type,transfer_mode)
stacked_series = np.asarray(List_time_series)
NetMob_Tensor = torch.Tensor(stacked_series)
NetMob_Tensor.shape

Pinterest public_transport UL


torch.Size([1977, 7392])

In [47]:
# Average over the first 30 rows of NetMob_Tensor, one value per column.
NetMob_Tensor[:30].mean(dim=0)

tensor([2593.1333, 1017.7000, 1019.2333,  ..., 1845.0333, 1976.4667,
         119.6667])

In [11]:
csv_0 = os.listdir(f"{pois_data_path}/stadium/Instagram")[0]

apps = ['Instagram']
transfer_modes = ['DL']

# NOTE(review): `list_assigned_osmid`, `list_tag_osmid` and `tag_type` are not
# defined in this cell; they must already exist in the kernel. `tag` is unpacked
# but unused while the path uses `tag_type` - confirm which one is intended.

# Start from an empty list so re-running the cell does not accumulate duplicates.
List_tensor_apps = []
for app in apps:
    # Tackle Tile-ids around station:
    List_tensor_osmid = []
    for osmid,tag in zip(list_assigned_osmid,list_tag_osmid):
        List_time_series = []
        ## Tackle transfer modes:
        for transfer_mode in transfer_modes:
            # Use the loop's transfer mode in the file name (it was hard-coded to
            # "_DL", which would read the DL file for every mode).
            time_serie = pd.read_csv(f"{pois_data_path}/{tag_type}/{app}/{osmid}_{transfer_mode}.csv",index_col = 0).sum(axis=1).sort_index()  # Be sure index are ordered in an ascending manner
            List_time_series.append(time_serie.values)
        # ---> torch.Tensor of size [len(transfer_modes), len(dates)]
        T_transfer = torch.Tensor(np.array(List_time_series))
        List_tensor_osmid.append(T_transfer)

    # ---> torch.Tensor of size [number of osmids, len(transfer_modes), len(dates)]
    T_osmids_around_station = torch.stack(List_tensor_osmid)
    List_tensor_apps.append(T_osmids_around_station)

2019-03-16 00:00:00     822005.0
2019-03-16 00:15:00     721026.0
2019-03-16 00:30:00     563336.0
2019-03-16 00:45:00    1054455.0
2019-03-16 01:00:00     681556.0
                         ...    
2019-05-31 22:45:00    1664501.0
2019-05-31 23:00:00    1855780.0
2019-05-31 23:15:00    1807695.0
2019-05-31 23:30:00    1587906.0
2019-05-31 23:45:00      91637.0
Length: 7392, dtype: float64

In [8]:
saved_data = f"{pois_data_path}/Inputs/VEN/data.npy"
# Pass the path so NumPy opens and closes the file itself (no leaked handle).
np.load(saved_data).shape  #[Apps,POIs,Transfer-modes,T]

(13, 179, 2, 7392)

In [11]:
save_folder = 'K_fold_validation/training_with_HP_tuning/re_validation'
trial_id = 'subway_in_STGCN_MSELoss_2025_01_20_14_27_20569'
epochs_validation = 1
args,folds = load_configuration(trial_id,True)
# Reuse the module-level FREQ constant ('30min') instead of repeating the
# literal, so the aggregation frequency is defined in a single place.
args.freq = FREQ
df_loop_occupancy_rate,df_flow,idptm_list = load_data(args)


>>>> Load best CONFIG


In [12]:
limit_max_nan = 200
# NOTE(review): load_data already ran remove_sparse_sensor on this table with a
# limit of 200, so with the same limit `sparse_columns` is always empty here.
(df_loop_occupancy_rate_full,
 df_with_nan,
 nan_too_empty,
 sparse_columns) = remove_sparse_sensor(df_loop_occupancy_rate,limit_max_nan = limit_max_nan)

# Sensor descriptions (with coordinates) from the CRITER JSON export.
json_name = 'pvo_patrimoine_voirie.pvocomptagecriter.json'
json_folder_path = f"{parent_dir}/{FOLDER_PATH}/../raw_data/Comptages_Velo_Routier/CRITER"
gdf_init = load_gdf_criter(json_folder_path,json_name)

# Keep the 3-lane sensors, one row per measurement point.
is_three_lanes = gdf_init.nbvoies == 3
gdf = gdf_init[is_three_lanes].drop_duplicates('identifiantptm')
nan_columns = list(df_with_nan.columns)

print(f'Numer of init sensor with 3 lanes: {len(gdf)}')
print(f'Number of sensor with data for each months: {len(idptm_list)}')
print(f'Among them, number of sensor with more than {limit_max_nan} missing values: {len(sparse_columns)} ')
print(f'Among them, number of sensor with less than {limit_max_nan} missing values:  {len(nan_columns)} ')


def _removal_status(ptm):
    # Status label of one measurement point, see filter_idptm.
    return filter_idptm(ptm,idptm_list,sparse_columns,nan_columns,limit_max_nan)


gdf['removed'] = gdf['identifiantptm'].apply(_removal_status)
# Interactive map, one colour per status.
gdf[['gid','identifiantptm','geometry','removed']].explore(
    'removed',
    style_kwds={"style_function":lambda x: {"radius":7},'color':'black'},
)

Numer of init sensor with 3 lanes: 144
Number of sensor with data for each months: 69
Among them, number of sensor with more than 200 missing values: 0 
Among them, number of sensor with less than 200 missing values:  13 


In [13]:
# Interactive map restricted to the sensors labelled 'Full Data'.
has_full_data = gdf['removed'] == 'Full Data'
gdf[has_full_data][['gid','identifiantptm','geometry']].explore(
    style_kwds={"style_function":lambda x: {"radius":7},'color':'black'}
)

In [68]:


# Plot the time series with the project's plot_TS helper.
# NOTE(review): `df_nan` is not defined in any cell visible in this notebook -
# possibly `df_with_nan` from the sensor-filtering cell; confirm before running
# on a fresh kernel.
from plotting.TS_analysis import plot_TS
plot_TS(df_nan,width=1400,height=600,bool_show=True,title=f"Time Serie Loop Occupation Rate")