In [3]:
import sys 
import os 
import pandas as pd
import numpy as np
current_file_path = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_file_path,'..'))
if parent_dir not in sys.path:
    sys.path.insert(0,parent_dir)

from dataset import DataSet
from datetime import datetime 
from utils.utilities import filter_args
from constants.paths import USELESS_DATES,FOLDER_PATH
''' This file has to :
 - return a DataSet object, with specified data, and spatial_units.
 - add argument 'n_vertex', 'C' to the NameSpace. These are specific to this data
 - Detail 'INVALID_DATE' and the 'coverage' period of the dataset.
'''

# Path stem of the CRITER 3-lanes CSV exports; a '_<month>.csv' suffix is
# appended to it where the files are read.
FILE_NAME = 'CRITER_3lanes/CRITER_3lanes'

# [start, end] datetime pairs delimiting periods that must be flagged invalid.
list_of_invalid_period = []
# Example: list_of_invalid_period.append([datetime(2019,1,10,15,30),datetime(2019,1,14,15,30)])

# Every 15-minute timestamp covered by one of the invalid periods, flattened
# into a single list (empty while no period is declared above).
INVALID_DATES = [
    timestamp
    for start, end in list_of_invalid_period
    for timestamp in pd.date_range(start, end, freq='15min')
]

# Dataset-specific values meant to be added to the argument namespace.
C = 1  # presumably the number of channels -- TODO confirm
n_vertex = 40  # presumably the number of spatial units -- TODO confirm

# 15-minute timestamps starting at 2019-01-01 00:00; the trailing [:-1] drops
# the final 2020-01-01 00:00 stamp so the range stops at 2019-12-31 23:45.
COVERAGE = pd.date_range(start='01/01/2019', end='01/01/2020', freq='15min')[:-1]

In [11]:
# Build one DataSet per CRITER feature (loop occupancy rate and flow) from the
# monthly CSV exports.
#
# NOTE: `args` (the run's argument namespace) is not created in this cell; it
# must already exist in the kernel when the cell is executed.

# --- Configuration ---------------------------------------------------------
coverage_period = None    # optional; only its min()/max() are used to bound the data
freq = '30min'            # temporal aggregation step applied to the raw records
time_step_per_hour = 2    # number of `freq` steps per hour, must agree with `freq`
assert pd.Timedelta(hours=1) == time_step_per_hour * pd.Timedelta(freq), \
    f'time_step_per_hour={time_step_per_hour} is inconsistent with freq={freq}'

# --- Load the monthly files and aggregate them at `freq` --------------------
# Frames are collected in a list and concatenated once, instead of growing a
# DataFrame inside the loop.
monthly_frames = []
for month_name in ['Mars','Avril','Mai']:
    df_month = pd.read_csv(f"{parent_dir}/{FOLDER_PATH}/{FILE_NAME}_{month_name}.csv",index_col = 0)
    df_month['HORODATE'] = pd.to_datetime(df_month['HORODATE'])
    # numeric_only=True keeps the historical behaviour (non-numeric columns are
    # dropped) instead of raising on them with recent pandas versions.
    monthly_frames.append(
        df_month.groupby(['ID_POINT_MESURE', pd.Grouper(key='HORODATE', freq=freq)]).mean(numeric_only=True)
    )

# The steps below run once, after every month has been loaded: resetting the
# index inside the loop would break the next concatenation.
df = pd.concat(monthly_frames).reset_index()
if coverage_period is not None:
    df = df[(df.HORODATE <= coverage_period.max()) & (df.HORODATE >= coverage_period.min())]

# --- One wide table per feature: rows = timestamps, columns = sensors -------
# pivot_table takes `columns=` / `values=` (plural).
df_loop_occupancy_rate = df.pivot_table(index='HORODATE', columns='ID_POINT_MESURE', values='TAUX_HEURE')
df_flow = df.pivot_table(index='HORODATE', columns='ID_POINT_MESURE', values='DEBIT_HEURE')

# --- Settings shared by both DataSet objects --------------------------------
# Number of retained time steps in a week / a day once the hours and weekdays
# listed in USELESS_DATES are discounted.
weekly_period = int((24-len(USELESS_DATES['hour']))*(7-len(USELESS_DATES['weekday']))*time_step_per_hour)
daily_period = int((24-len(USELESS_DATES['hour']))*time_step_per_hour)
periods = [weekly_period, daily_period]

args_DataSet = filter_args(DataSet, args)

# --- Build the DataSet objects ----------------------------------------------
datasets = {}
for name_i, df_feature in zip(['loop_occupancy_rate','flow'], [df_loop_occupancy_rate, df_flow]):
    df_feature.columns.name = 'sensor'

    if (hasattr(args,'set_spatial_units')) and (args.set_spatial_units is not None) :
        # Restrict the table to the requested sensors, remembering their
        # positions in the full table.
        print('Considered Spatial-Unit: ',args.set_spatial_units)
        spatial_unit = args.set_spatial_units
        indices_spatial_unit = [list(df_feature.columns).index(station_i) for station_i in spatial_unit]
        df_feature = df_feature[spatial_unit]
    else:
        spatial_unit = df_feature.columns
        indices_spatial_unit = np.arange(len(df_feature.columns))

    datasets[name_i] = DataSet(df_feature,
                               time_step_per_hour=time_step_per_hour,
                               spatial_unit = spatial_unit,
                               indices_spatial_unit = indices_spatial_unit,
                               dims = [0],
                               city = 'Lyon',
                               periods = periods,
                               **args_DataSet)

# Results of the cell, exposed under explicit names (a top-level `return` is a
# syntax error in a notebook cell).
dataset_loop_occupancy_rate = datasets['loop_occupancy_rate']
dataset_flow = datasets['flow']
        
    

In [None]:

    df.columns.name = 'Station'
    df.index = pd.to_datetime(df.index)

    df = restrain_df_to_specific_period(df,coverage_period)
    time_step_per_hour = (60*60)/(df.iloc[1].name - df.iloc[0].name).seconds
    assert time_step_per_hour == 4, 'TIME STEP PER HOUR = {time_step_per_hour} ALORS QU ON VEUT =4 '

    df_correspondance = get_trigram_correspondance()
    df_correspondance.set_index('Station').reindex(df.columns)
    df.columns = df_correspondance.COD_TRG

    # Remove ouliers
    df = remove_outliers(df)

    if (hasattr(args,'set_spatial_units')) and (args.set_spatial_units is not None) :
        print('Considered Spatial-Unit: ',args.set_spatial_units)
        spatial_unit = args.set_spatial_units
        indices_spatial_unit = [list(df.columns).index(station_i) for station_i in  spatial_unit]
        df = df[spatial_unit]
    else:
        spatial_unit = df.columns
        indices_spatial_unit = np.arange(len(df.columns))

    weekly_period =  int((24-len(USELESS_DATES['hour']))*(7-len(USELESS_DATES['weekday']))*time_step_per_hour)
    daily_period =  int((24-len(USELESS_DATES['hour']))*time_step_per_hour)
    periods = [weekly_period,daily_period]  

    args_DataSet = filter_args(DataSet, args)

    dataset = DataSet(df,
                      time_step_per_hour=time_step_per_hour, 
                      spatial_unit = spatial_unit,
                      indices_spatial_unit = indices_spatial_unit,
                      dims = [0],
                      city = 'Lyon',
                      periods = periods,
                      **args_DataSet)

    return(dataset)
    

def remove_outliers(df):
    '''
    Blank out implausibly high values column by column, then fill the gaps by
    linear interpolation.

    Each column is compared with a ceiling: a specific one for 'BEL', 'CHA' and
    'GOR', and 1500 for every other column. According to the original author,
    these ceilings are the maximum flows recorded during the 'light festival'
    in Lyon, an atypical event reaching the highest possible flow.

    Entries that do not satisfy `value <= ceiling` (which includes missing
    values) are blanked before the interpolation.

    The frame is modified in place and the same object is returned.
    '''
    station_ceilings = {'BEL': 2700, 'CHA': 1700, 'GOR': 1700}
    fallback_ceiling = 1500

    # Step 1: discard everything above the ceiling of its column.
    for station in df.columns:
        ceiling = station_ceilings.get(station, fallback_ceiling)
        flows = df[station]
        df[station] = flows.where(flows <= ceiling, None)

    # Step 2: write the linearly interpolated values back into the same frame.
    df.update(df.interpolate(method='linear'))
    return df

def restrain_df_to_specific_period(df,coverage_period):
    '''
    Return `df` sorted by its index, optionally restricted to `coverage_period`.

    When `coverage_period` is not None it is used as a set of row labels
    (`df.loc[coverage_period]`); otherwise every row is kept. The input frame
    itself is not reordered.
    '''
    selected = df if coverage_period is None else df.loc[coverage_period]
    return selected.sort_index()


def get_trigram_correspondance():
    '''
    Return the station-name / trigram-code lookup table.

    The result is a DataFrame with the columns 'Station' and 'COD_TRG', one
    row per station (40 rows).

    Two codes are not the obvious abbreviation of the station name:
        'Vieux Lyon'     -> 'JEA'
        "Gare d'Oullins" -> 'OGA'
    '''
    station_trigram_pairs = [
        ('Ampère Victor Hugo', 'AMP'),
        ('Bellecour', 'BEL'),
        ('Brotteaux', 'BRO'),
        ('Cordeliers', 'COR'),
        ('Cuire', 'CUI'),
        ('Cusset', 'CUS'),
        ('Flachet', 'FLA'),
        ('Gorge de Loup', 'GOR'),
        ('Grange Blanche', 'BLA'),
        ('Gratte Ciel', 'GRA'),
        ('Place Guichard', 'GUI'),
        ('Guillotière', 'GIL'),
        ('Hénon', 'HEN'),
        ('Hôtel de ville - Louis Pradel', 'HOT'),
        ('Laënnec', 'LAE'),
        ('Masséna', 'MAS'),
        ('Mermoz - Pinel', 'MER'),
        ('Monplaisir Lumière', 'LUM'),
        ('Parilly', 'PRY'),
        ('Perrache', 'PER'),
        ('Sans Souci', 'SAN'),
        ('Saxe - Gambetta', 'SAX'),
        ('Valmy', 'VMY'),
        ('Vieux Lyon', 'JEA'),
        ('Laurent Bonnevay', 'BON'),
        ('Charpennes', 'CHA'),
        ('Gare de Vaise', 'VAI'),
        ('Gare de Vénissieux', 'VEN'),
        ('Jean Macé', 'MAC'),
        ('Garibaldi', 'GAR'),
        ('Foch', 'FOC'),
        ('République Villeurbanne', 'REP'),
        ('Stade de Gerland', 'GER'),
        ('Debourg', 'DEB'),
        ('Place Jean Jaurès', 'JAU'),
        ('Croix Paquet', 'CPA'),
        ('Croix-Rousse', 'CRO'),
        ('Part-Dieu', 'PAR'),
        ('La soie', 'SOI'),
        ("Gare d'Oullins", 'OGA'),
    ]
    return pd.DataFrame(station_trigram_pairs, columns=['Station', 'COD_TRG'])