## Imports Python

In [44]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import re

## Classe AnalyzerGTFS

Cette classe a pour but de fouiller des données GTFS et d'en retourner les destinations possible en partant d'une ville, avec les différents trajets réalisables

In [66]:
# Version Stable de l'AnalyzerGTFS

SEUIL_DISTANCE = 0.03

# Exemple de ville : StopArea:OCE87586545

class AnalyzerGTFS:
    stop_time_trip = [] # Contient StopPoints, Trajets et horaires des trajets
    trip = [] # Contient les trip_id, route_id et les services_id
    stops_trip_id = [] # Contient tous les stops des trajets corrects
    date = [] # Contient tous les services et toutes les dates comprises entre date_min et date_max, (Utile pour afficher à l'utilisateur toutes les dates possibles pour le trajet)
    destinations = [] # Contient les stops après la ville (StopSequence > StopSequence de la ville)
    stops = []
    calendar_dates = []
    routes = []
    stop_times = []
    trips = []
    lat = 0
    lon = 0
    date_min = None
    date_max = None

    def __init__(self, path ='TER'):
        self.calendar_dates = pd.read_csv('Data/'+ path +'/calendar_dates.txt')
        self.stop_times = pd.read_csv('Data/'+ path +'/stop_times.txt')[['trip_id', 'stop_id', 'departure_time']]
        self.stops = pd.read_csv('Data/'+ path +'/stops.txt')[['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'parent_station']]
        self.trips = pd.read_csv('Data/'+ path +'/trips.txt')[['service_id', 'trip_id', 'route_id']]
        self.calendar_dates.date = pd.to_datetime(self.calendar_dates["date"], format='%Y%m%d').dt.date
        self.calendar_dates['date'] = self.calendar_dates.apply(lambda x : datetime.combine(x['date'], datetime.min.time()), axis=1)
        self.calendar_dates = self.calendar_dates.drop_duplicates(subset='service_id')
        self.stop_times['departure_time'] = self.stop_times.apply(lambda x : pd.Timedelta(x['departure_time']), axis=1)
        #self.stop_times.departure_time = self.stop_times.apply(lambda x : date_to_timestamp(x['departure_time']), axis=1)
        self.stops['parent_station'] = self.stops['parent_station'].fillna('')
    
    # Etape 1 : Récupérer les stops de la ville / StopArea
    def get_stops(self):
        self.id_ville = self.villes_proches(self.lat, self.lon)
        ville_Set = set(self.id_ville['stop_id'].array)
        return self.stops[[set([l]).issubset(ville_Set) for l in self.stops.parent_station.values.tolist()]]['stop_id']
    
    # Etape 2 : Récupérer tous les trajets des différents StopPoints, peut importe la date
    def get_trips(self):
        stops_ids = self.get_stops()
        self.stop_time_trip = pd.merge(self.stop_times, stops_ids, on='stop_id')
        self.stop_time_trip = self.stop_time_trip.drop_duplicates(subset='trip_id') #Doublons possibles si deux StopPoints différents sont sur un même trajet
        return self.stop_time_trip['trip_id']

    # Etape 3 : Récupérer les services_id des Trajets
    def get_service_id(self):
        trip_ids = self.get_trips()
        self.trip = pd.merge(trip_ids, self.trips, on='trip_id')
        return self.trip['service_id']
    
    # Etape 4 : Récupérer les services fonctionnant sur les dates données
    def get_dates(self):
        service_id = self.get_service_id()
        dates = pd.merge(service_id, self.calendar_dates, on='service_id')
        return dates[(dates['date'] >= self.date_min) & (dates['date'] <= self.date_max)]
    
    # Etape 5 : Récupérer les trajets des services fonctionnant sur les dates données
    def get_trajets(self):
        services_uniques = self.get_dates().drop_duplicates(subset='service_id')
        return pd.merge(services_uniques, self.trip, on='service_id')
    
    # Etape 6 : Récupérer les stops des trajets corrects
    def get_stops_trajets(self):
        trajets = self.get_trajets()
        return pd.merge(trajets, self.stop_times, on='trip_id')
    
    # Etape 7 : Récupérer les ids des destinations, c'est à dire les stops après la ville (StopSequence > StopSequence de la ville)
    def get_stops_destinations(self):
        stops_trip_id = self.get_stops_trajets().assign(temps_ville = "", stop_id_ville = "")

        stop_ids_ville = self.stop_time_trip['stop_id']
        stop_ids_ville.index = self.stop_time_trip['trip_id']
        stop_id_ville_origine = stop_ids_ville.loc[stops_trip_id['trip_id']]

        stops_trip_id['stop_id_ville'] = stop_id_ville_origine.array
        time = self.stop_time_trip['departure_time'] #StopTime trip
        time.index = self.stop_time_trip['trip_id']
        value = time.loc[stops_trip_id['trip_id']]
        stops_trip_id['temps_ville'] = value.array
        self.destinations = stops_trip_id[stops_trip_id['departure_time'] > stops_trip_id['temps_ville']]
        return self.destinations['stop_id']
    
    # Etape 8 : Récupérer les destinations
    def get_destinations(self, lat, lon, date_min = datetime.today, date_max = datetime.today):
        self.load_search(lat, lon, date_min, date_max)
        destinations_uniques = self.get_stops_destinations()
        destinations_StopPoint = pd.merge(destinations_uniques, self.stops, on='stop_id')
        destinationSet = set(destinations_StopPoint['parent_station'].array)
        destinations_StopArea = self.stops[[set([l]).issubset(destinationSet) for l in self.stops.stop_id.values.tolist()]]
        destinations_StopArea.drop_duplicates(subset='stop_id')
        return destinations_StopArea
    
    # Retourne les stopArea proche du point de départ
    def villes_proches(self,lat,long):
        return self.stops[(self.stops['stop_lat'] > lat-SEUIL_DISTANCE) & (self.stops['stop_lat'] < lat+SEUIL_DISTANCE) & (self.stops['stop_lon'] > long-SEUIL_DISTANCE) & (self.stops['stop_lon'] < long+SEUIL_DISTANCE) & self.stops['stop_id'].str.contains('StopArea')]
    
    # Pas très efficace, à améliorer -> Trouver la source et enlever les doublons auparavant
    def trajet_destination(self,villeDestination):
        destinationsParentStation = pd.merge(self.destinations, self.stops, on='stop_id') # Merge entre les destinations et les stops pour récupérer les parent_station
        StopPoints = destinationsParentStation[villeDestination == destinationsParentStation['parent_station']] # Récupérer les StopPoints de la ville de destination
        trajets = pd.merge(StopPoints, self.destinations, on=['stop_id','service_id','temps_ville','departure_time','trip_id','stop_id_ville','route_id']) # Merge entre StopPoints de la ville et les destinations pour récupérer les trajets faisant le lien entre les deux
        trajets_services = pd.merge(trajets.drop_duplicates(subset='service_id'), self.get_dates(), on='service_id') 
        trajets_services.assign(jour_suivant_depart = 0, jour_suivant_arrivee = 0)
        # Rectification des temps de trajet si > 24h
        trajets_services['jour_suivant_depart'] = trajets_services.apply(lambda x : x['temps_ville'] > 24*3600, axis=1)
        trajets_services['temps_ville'] = trajets_services.apply(lambda x : x['temps_ville']-24*3600 if x['jour_suivant_depart'] else x['temps_ville'] , axis=1)
        trajets_services['jour_suivant_arrivee'] = trajets_services.apply(lambda x : x['departure_time'] > 24*3600, axis=1)
        trajets_services['departure_time'] = trajets_services.apply(lambda x : x['departure_time'] - 24*3600 if x['jour_suivant_arrivee'] else x['departure_time'], axis=1)
        trajets_to_destinations = trajets_services.drop_duplicates(subset=['temps_ville','departure_time','date'])
        return trajets_to_destinations # Retourne tous les trajets faisant le lien entre la ville et la destination marchant sur les critères sélectionnés

    @staticmethod
    def list_of_cities(path):
        stops = pd.read_csv('Data/'+ path +'/stops.txt')[['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'parent_station']]
        return stops[stops['stop_id'].str.contains('StopArea')][['stop_name','stop_lat','stop_lon','stop_id']]
    
    def load_search(self, lat, lon ,date_min= datetime.today,date_max = datetime.today):
        self.lat = lat
        self.lon = lon
        self.date_min = datetime.strptime(date_min, '%Y%m%d')
        self.date_max = datetime.strptime(date_max, '%Y%m%d')

In [64]:
def date_to_timestamp(date):
    pattern = re.compile(r'(\d{2}):(\d{2}):(\d{2})')
    if pattern.match(date) is None:
        return None
    else:
        match = pattern.search(date)
        return int(match[1])*3600 + int(match[2])*60 + int(match[3])

date_to_timestamp('23:59:120')

86352

In [70]:
analyzer = AnalyzerGTFS('INTERCITE')
#analyzer.load_search(42.4265,3.15805,'20240715','20240715')
villeDestination = 'StopArea:OCE87611004'
analyzer.get_destinations(47.931295,1.902569,'20240801','20240816')

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,parent_station
35,StopArea:OCE87545244,Juvisy,48.68917,2.38267,
37,StopArea:OCE87547000,Paris Austerlitz,48.842285,2.364891,
77,StopArea:OCE87611004,Toulouse Matabiau,43.611206,1.453616,
84,StopArea:OCE87611244,Montauban Ville Bourbon,44.01444,1.341999,
87,StopArea:OCE87611343,Pamiers,43.116207,1.619451,
89,StopArea:OCE87611368,Saverdun,43.239335,1.570636,
91,StopArea:OCE87611384,Auterive,43.348993,1.468831,
93,StopArea:OCE87611483,Latour-de-Carol - Enveitg,42.458991,1.904433,
95,StopArea:OCE87611509,Porté-Puymorens,42.538781,1.824852,
97,StopArea:OCE87611517,Andorre l'Hospitalet,42.589453,1.800264,


In [71]:

destinationsParentStation = pd.merge(analyzer.destinations, analyzer.stops, on='stop_id') # Merge entre les destinations et les stops pour récupérer les parent_station
StopPoints = destinationsParentStation[villeDestination == destinationsParentStation['parent_station']] # Récupérer les StopPoints de la ville de destination
trajets = pd.merge(StopPoints, analyzer.destinations, on=['stop_id','service_id','temps_ville','departure_time','trip_id','stop_id_ville','route_id']) # Merge entre StopPoints de la ville et les destinations pour récupérer les trajets faisant le lien entre les deux
trajets_services = pd.merge(trajets.drop_duplicates(subset='service_id'), analyzer.get_dates(), on='service_id') 
trajets_services.assign(horaire_depart = "", horaire_arrivee = "")
#trajets_services['temps_ville'] = trajets_services.apply(lambda x : pd.Timedelta(x['temps_ville']), axis=1)
#trajets_services['departure_time'] = trajets_services.apply(lambda x : pd.Timedelta(x['departure_time']), axis=1)
#trajets_services['date'] = trajets_services.apply(lambda x : datetime.combine(x['date'], datetime.min.time()), axis=1)
trajets_services['horaire_depart'] = trajets_services.apply(lambda x : x['temps_ville']+x['date'], axis=1)
trajets_services['horaire_arrivee'] = trajets_services.apply(lambda x : x['departure_time']+x['date'], axis=1)
trajets_to_destinations = trajets_services.drop_duplicates(subset=['temps_ville','departure_time','date'])
trajets_to_destinations

Unnamed: 0,service_id,date_x,exception_type_x,trip_id,route_id,stop_id,departure_time,temps_ville,stop_id_ville,stop_name,stop_lat,stop_lon,parent_station,date_y,exception_type_y,date,exception_type,horaire_depart,horaire_arrivee
0,121,2024-08-12,1,OCESN3751F3190798:2024-06-14T00:38:46Z,FR:Line::7c00cf1f-74fd-4fda-a4e5-c2b1acb937a9:,StopPoint:OCEINTERCITES de nuit-87611004,1 days 06:30:00,0 days 23:15:00,StopPoint:OCEINTERCITES de nuit-87543017,Toulouse Matabiau,43.611206,1.453616,StopArea:OCE87611004,2024-08-12,1,2024-08-12,1,2024-08-12 23:15:00,2024-08-13 06:30:00
