In [2]:
import pandas as pd
from datetime import datetime
import os
# Half-width of the latitude/longitude box used to select the stops "near" a
# point. It is compared directly against stop_lat / stop_lon
# (presumably decimal degrees — TODO confirm).
MARGE_DISTANCE: float = 0.03
# A stop area is kept as a connection candidate only when its
# number_of_appearance is strictly greater than this value.
THRESHOLD_CONNECTION: int = 100

# New, optimized version of the AnalyzerGTFS class.


class AnalyzerGTFS:
    """Query helper over the GTFS tables stored under ``Data/<transport_type>/``.

    The constructor loads ``calendar_dates.txt``, ``stop_times.txt``,
    ``stops.txt`` and ``trips.txt``. Stops are split into two groups: rows that
    have a ``parent_station`` (``stops_id``) and rows that do not
    (``stops_area``).

    NOTE(review): every ``calendar_dates`` row is treated as a day on which the
    service runs; an ``exception_type`` column, if present, is never consulted.
    Confirm that the feed only lists added service days.
    """

    # Class-level defaults. __init__ and the query methods rebind most of
    # these on the instance; they are never mutated in place.
    arrets_depart: pd.DataFrame = pd.DataFrame()
    list_cities: pd.DataFrame = pd.DataFrame()
    stops: pd.DataFrame = pd.DataFrame()
    calendar_dates: pd.DataFrame = pd.DataFrame()
    routes: pd.DataFrame = pd.DataFrame()
    stop_times: pd.DataFrame = pd.DataFrame()
    trips: pd.DataFrame = pd.DataFrame()
    lat: float = 0.0
    lon: float = 0.0
    stops_id: pd.DataFrame = pd.DataFrame()
    stops_area: pd.DataFrame = pd.DataFrame()
    stop_times_trips: pd.DataFrame = pd.DataFrame()
    nearby_stops: pd.DataFrame = pd.DataFrame()

    def __init__(self, transport_type="TER"):
        """Load the feed found in ``Data/<transport_type>/``.

        ``calendar_dates["date"]`` is parsed from ``YYYYMMDD`` into datetimes
        and ``stop_times["departure_time"]`` is converted to timedeltas.
        """
        self.calendar_dates = pd.read_csv(os.path.join("Data", transport_type, "calendar_dates.txt"))
        self.stop_times = self._read_table(
            transport_type, "stop_times.txt", ["trip_id", "stop_id", "departure_time"]
        )
        self.stops = self._read_table(
            transport_type, "stops.txt", ["stop_id", "stop_name", "stop_lat", "stop_lon", "parent_station"]
        )
        self.trips = self._read_table(transport_type, "trips.txt", ["service_id", "trip_id", "route_id"])

        self.calendar_dates["date"] = pd.to_datetime(self.calendar_dates["date"], format="%Y%m%d")
        self.stop_times["departure_time"] = pd.to_timedelta(self.stop_times["departure_time"])

        sans_parent = self.stops["parent_station"].isna()
        self.stops_id = self.stops[~sans_parent]
        self.stops_area = self.stops[sans_parent]

    @staticmethod
    def _read_table(transport_type: str, filename: str, columns: list) -> pd.DataFrame:
        """Read ``Data/<transport_type>/<filename>`` and keep ``columns`` in the given order."""
        return pd.read_csv(os.path.join("Data", transport_type, filename))[columns]

    @staticmethod
    def _stop_area_appearances(stops: pd.DataFrame, stop_times: pd.DataFrame) -> pd.DataFrame:
        """Count, per stop area, how many ``stop_times`` rows reference one of its stops.

        Returns the rows of ``stops`` whose ``stop_id`` contains ``"StopArea"``,
        indexed by ``stop_id``, with an extra ``number_of_appearance`` column.
        """
        # Aggregate only the column that is needed instead of every column.
        appearance_count = stop_times.groupby("stop_id")["trip_id"].count().reset_index()
        appearance_stop_id = pd.merge(appearance_count, stops, on="stop_id")
        appearance_stop_area = appearance_stop_id.groupby("parent_station")["trip_id"].sum()
        df_stop_area = stops[stops["stop_id"].str.contains("StopArea")][
            ["stop_name", "stop_lat", "stop_lon", "stop_id"]
        ].set_index("stop_id")
        # assign() aligns on the index: stop_id here, parent_station in the counts.
        return df_stop_area.assign(number_of_appearance=appearance_stop_area)

    def _stops_dans_zone(self, lat: float, lon: float, marge: float) -> pd.DataFrame:
        """Return the ``stops_id`` rows strictly inside the box ``(lat ± marge, lon ± marge)``."""
        return self.stops_id[
            (self.stops_id["stop_lat"] > lat - marge)
            & (self.stops_id["stop_lat"] < lat + marge)
            & (self.stops_id["stop_lon"] > lon - marge)
            & (self.stops_id["stop_lon"] < lon + marge)
        ]

    def set_search_destinations(self, lat: float, lon: float, date_min: str, date_max: str) -> None:
        """Store a search origin and a date window given as ``YYYYMMDD`` strings."""
        self.lat = lat
        self.lon = lon
        self.date_min = pd.to_datetime(date_min, format="%Y%m%d")
        self.date_max = pd.to_datetime(date_max, format="%Y%m%d")

    def stops_proches(self, lat: float, lon: float) -> pd.DataFrame:
        """Return the stops with a ``parent_station`` lying within ``MARGE_DISTANCE`` of the point."""
        return self._stops_dans_zone(lat, lon, MARGE_DISTANCE)

    def trips_stops_proches(self, lat: float, lon: float) -> pd.Series:
        """Return the ids of the trips that call at a stop near ``(lat, lon)``.

        Side effects: ``self.nearby_stops`` is set to the nearby stops and
        ``self.arrets_depart`` to one ``stop_times`` row per trip (the first
        matching row in ``stop_times`` order).
        """
        self.nearby_stops = self.stops_proches(lat, lon)
        trips: pd.DataFrame = self.stop_times[self.stop_times["stop_id"].isin(self.nearby_stops["stop_id"])]
        self.arrets_depart = trips.drop_duplicates(subset="trip_id")
        return self.arrets_depart["trip_id"]

    def trips_dans_periode(self, lat: float, lon: float, date_min: datetime, date_max: datetime) -> pd.Series:
        """Return the ids of the trips near ``(lat, lon)`` whose service has a date in the window.

        Both bounds are inclusive. Calls ``trips_stops_proches`` and therefore
        refreshes ``self.nearby_stops`` and ``self.arrets_depart``.
        """
        date_min = pd.to_datetime(date_min)
        date_max = pd.to_datetime(date_max)
        trips_ids: pd.Series = self.trips_stops_proches(lat, lon)
        trips: pd.DataFrame = self.trips[self.trips["trip_id"].isin(trips_ids)]
        services: pd.DataFrame = self.calendar_dates[self.calendar_dates["service_id"].isin(trips["service_id"])]
        services_dans_periode: pd.DataFrame = services[services["date"].between(date_min, date_max)]
        # No de-duplication needed: the service ids only feed an isin() test.
        return trips.loc[trips["service_id"].isin(services_dans_periode["service_id"]), "trip_id"]

    def get_set_destinations(self, lat: float, lon: float, date_min: datetime, date_max: datetime) -> pd.DataFrame:
        """Return the stop areas reachable without a change from the stops near ``(lat, lon)``.

        A stop counts as reachable when, on a trip running in the window, its
        ``departure_time`` is later than the trip's departure from the nearby
        stop. The stop areas of the nearby stops themselves are excluded. The
        result is a subset of ``self.stops_area`` with one row per stop area.
        """
        trips_ids_dans_periode: pd.Series = self.trips_dans_periode(lat, lon, date_min, date_max)
        candidats: pd.DataFrame = self.stop_times[self.stop_times["trip_id"].isin(trips_ids_dans_periode)]

        # Look the boarding time up through a re-indexed copy so that the
        # columns of self.arrets_depart are left untouched.
        temps_depart: pd.Series = self.arrets_depart.set_index("trip_id")["departure_time"]
        candidats = candidats.assign(temps_ville=temps_depart.loc[candidats["trip_id"]].array)
        arrets_suivants: pd.DataFrame = candidats[candidats["departure_time"] > candidats["temps_ville"]]

        stop_points: pd.DataFrame = self.stops_id[self.stops_id["stop_id"].isin(arrets_suivants["stop_id"])]
        stop_areas: pd.DataFrame = self.stops_area[
            self.stops_area["stop_id"].isin(stop_points["parent_station"])
        ].drop_duplicates(subset="stop_id")
        return stop_areas[~stop_areas["stop_id"].isin(self.nearby_stops["parent_station"])]

    def get_trajets(
        self,
        departure_lat,
        departure_lon,
        arrival_lat: float,
        arrival_lon: float,
        date_min: datetime,
        date_max: datetime,
        departure_time: pd.Timedelta,
    ) -> pd.DataFrame:
        """List the direct journeys between two points inside a date window.

        Departure stops are searched within ``MARGE_DISTANCE`` of the departure
        point and arrival stops within half that margin of the arrival point.
        Only departures strictly after ``departure_time`` are kept. The result
        has one row per (trip, departure stop, arrival stop, service date);
        ``*_x`` columns describe the departure stop, ``*_y`` the arrival stop,
        and ``horaire_depart`` / ``horaire_arrivee`` hold ``date`` plus the
        corresponding time offset.
        """
        date_min = pd.to_datetime(date_min)
        date_max = pd.to_datetime(date_max)
        stops_depart: pd.DataFrame = self._stops_dans_zone(departure_lat, departure_lon, MARGE_DISTANCE)
        stops_arrivee: pd.DataFrame = self._stops_dans_zone(arrival_lat, arrival_lon, MARGE_DISTANCE / 2)

        passages_depart: pd.DataFrame = self.stop_times[
            self.stop_times["stop_id"].isin(stops_depart["stop_id"])
            & (self.stop_times["departure_time"] > departure_time)
        ]
        passages_arrivee: pd.DataFrame = self.stop_times[self.stop_times["stop_id"].isin(stops_arrivee["stop_id"])]

        paires: pd.DataFrame = pd.merge(passages_depart, passages_arrivee, on="trip_id")
        # Keep the pairs where the trip reaches the arrival stop after the departure stop.
        paires = paires[paires["departure_time_x"] < paires["departure_time_y"]]

        trips_et_jours: pd.DataFrame = pd.merge(
            pd.merge(paires, self.trips, on="trip_id"), self.calendar_dates, on="service_id"
        )
        trajets: pd.DataFrame = trips_et_jours[trips_et_jours["date"].between(date_min, date_max)]
        return trajets.assign(
            horaire_depart=trajets["date"] + trajets["departure_time_x"],
            horaire_arrivee=trajets["date"] + trajets["departure_time_y"],
        )

    @staticmethod
    def list_of_cities_static(path: str) -> pd.DataFrame:
        """Build the stop-area table straight from ``Data/<path>/`` without creating an instance.

        Returns the same table as ``list_of_cities``.
        """
        stops: pd.DataFrame = AnalyzerGTFS._read_table(
            path, "stops.txt", ["stop_id", "stop_name", "stop_lat", "stop_lon", "parent_station"]
        )
        stop_times: pd.DataFrame = AnalyzerGTFS._read_table(
            path, "stop_times.txt", ["trip_id", "stop_id", "departure_time"]
        )
        return AnalyzerGTFS._stop_area_appearances(stops, stop_times)

    def list_of_cities(self) -> pd.DataFrame:
        """Return the stop areas with their ``number_of_appearance`` and cache them in ``self.list_cities``."""
        df_stop_area = self._stop_area_appearances(self.stops, self.stop_times)
        self.list_cities = df_stop_area
        return df_stop_area

In [3]:
# Build the analyzer with its default transport type ("TER"); the constructor
# reads the GTFS text files from the Data/ directory.
analyzer = AnalyzerGTFS()
# Called for its side effect: the resulting table is stored in
# analyzer.list_cities, which a later cell filters. The returned frame is not
# displayed because this is not the last expression of the cell.
analyzer.list_of_cities()
# Shortcuts to two of the loaded tables.
stop_times = analyzer.stop_times
stops = analyzer.stops

In [9]:
# Stop areas reachable without a change from the stops around
# (48.842285, 2.364891), for services dated 2024-07-01 to 2024-08-01 inclusive.
direct_cities_list = analyzer.get_set_destinations(48.8422850,2.36489100, datetime(year = 2024, month = 7, day = 1), datetime(year = 2024, month = 8, day = 1))
print(direct_cities_list)
# Stop areas whose number_of_appearance is strictly above THRESHOLD_CONNECTION.
# Relies on analyzer.list_of_cities() having been run, since it fills analyzer.list_cities.
connections_cities = analyzer.list_cities[analyzer.list_cities['number_of_appearance'] > THRESHOLD_CONNECTION]

                   stop_id         stop_name   stop_lat  stop_lon  \
4221  StopArea:OCE87543009           Orléans  47.907891  1.904242   
4223  StopArea:OCE87543017       Les Aubrais  47.926801  1.907129   
4226  StopArea:OCE87543033         Cercottes  47.986155  1.884981   
4229  StopArea:OCE87543041          Chevilly  48.026672  1.879113   
4232  StopArea:OCE87543058           Artenay  48.081057  1.883226   
...                    ...               ...        ...       ...   
7317  StopArea:OCE87722025     Lyon Perrache  45.748508  4.825777   
7367  StopArea:OCE87723197    Lyon Part Dieu  45.760596  4.859409   
7433  StopArea:OCE87725002  Chalon-sur-Saône  46.781622  4.843479   
7448  StopArea:OCE87725622           Tournus  46.566724  4.906562   
7454  StopArea:OCE87725689             Mâcon  46.302572  4.824887   

     parent_station  
4221            NaN  
4223            NaN  
4226            NaN  
4229            NaN  
4232            NaN  
...             ...  
7317            N

In [8]:
import time

# Direct destinations that are also connection candidates (their stop_id is in
# the index of connections_cities).
connections_cities_connected_to_start = direct_cities_list[
    direct_cities_list['stop_id'].isin(connections_cities.index)
]
# NOTE(review): head(10) keeps the first ten rows in their current order; they
# are not sorted by number_of_appearance. Confirm this is the intended ranking.
connections_most_importants = connections_cities_connected_to_start.head(10)

# Same search window for every connection; built once outside the loop.
date_debut = datetime(year=2024, month=7, day=1)
date_fin = datetime(year=2024, month=8, day=1)

# For each retained connection, compute the destinations reachable from it and
# print how long the lookup took (one line per connection).
dict_df_connections = {}
for row in connections_most_importants.itertuples():
    time_start = time.time()
    dict_df_connections[row.stop_id] = analyzer.get_set_destinations(
        row.stop_lat, row.stop_lon, date_debut, date_fin
    )
    print(time.time() - time_start)


def get_connections(stop_id):
    """Placeholder: no body was ever written for this function.

    The timing print that had ended up as its body belongs to the loop above
    and has been moved back there.
    """
    # TODO: implement; the intended behavior is not specified anywhere in the notebook.
    raise NotImplementedError("get_connections is not implemented yet")

0.05205798149108887
0.05033373832702637
0.030308246612548828
0.03362870216369629
0.0350337028503418
0.049442291259765625
0.044123172760009766
0.06069159507751465
0.029655933380126953
0.02963542938232422


In [83]:
# Date window of the journey search (both bounds are passed to get_trajets).
date_v_min = datetime(2024, 7, 1)
date_v_max = datetime(2024, 8, 1)

# Coordinates of the departure and arrival points.
departure_lat = 47.907891
departure_lon = 1.904242
arrival_lat = 48.842285
arrival_lon = 2.364891

# Only departures later than this time offset are considered.
departure_time = pd.Timedelta(8, unit="h")

In [89]:
# Relies on `time` having been imported by an earlier cell.
# NOTE(review): nothing runs between these two timestamps, so this first
# measurement only reports the overhead of the timer itself — presumably a
# call that used to be timed here was removed; confirm or delete.
time_start_1 = time.time()
time_end_1 = time.time()
print(time_end_1 - time_start_1)

# Wall-clock time of one get_trajets call; its returned frame is discarded.
time_start_2 = time.time()
analyzer.get_trajets(departure_lat, departure_lon, arrival_lat, arrival_lon, date_v_min, date_v_max, departure_time)
time_end_2 = time.time()
print(time_end_2 - time_start_2)

0.0
0.11269783973693848
