In [22]:
import pandas as pd
from datetime import datetime
import os
import numpy as np
#from Analyzer import Analyzer

# Width of the square search window used by the stop lookups in this cell: a stop is kept
# when its stop_lat and stop_lon each differ from the target by less than DISTANCE_MARGIN / 2.
# NOTE(review): presumably expressed in decimal degrees, like the coordinates - confirm.
DISTANCE_MARGIN: float = 0.1

def find_best_name(names):
    """Return the shortest entry of ``names``.

    Used as an aggregation function to pick one representative name per group.
    When several entries share the minimal length, the first one encountered wins.
    """
    shortest = min(names, key=lambda candidate: len(candidate))
    return shortest

def round_to_precision(value: float, precision: float = 0.03) -> float:
    """Snap ``value`` to the nearest multiple of ``precision``.

    Args:
        value: Number to round (a coordinate in this notebook).
        precision: Grid step; defaults to 0.03, the step previously hard-coded here.

    Returns:
        The multiple of ``precision`` closest to ``value``. NaN and infinite inputs are
        returned unchanged, because ``round()`` cannot convert them to an integer.

    Raises:
        ValueError: If ``precision`` is not strictly positive.
    """
    import math

    if precision <= 0:
        raise ValueError("precision must be a positive number")
    if not math.isfinite(value):
        return value
    return round(value / precision) * precision


class AnalyzerCalendarDates():
    """Query helper for a GTFS-style feed whose service days are listed in calendar_dates.txt.

    The feed tables are read from ``Data/<transport_type>/``. Stops are split into stop
    points (rows with a ``parent_station``) and stop areas (rows without one): searches
    run on stop points, destinations and city lists are reported as stop areas.
    """

    # Class-level defaults; __init__ replaces the feed tables on every instance.
    unique_departures: pd.DataFrame = pd.DataFrame()
    city_list: pd.DataFrame = pd.DataFrame()
    stops: pd.DataFrame = pd.DataFrame()
    calendar_dates: pd.DataFrame = pd.DataFrame()
    routes: pd.DataFrame = pd.DataFrame()
    stop_times: pd.DataFrame = pd.DataFrame()
    trips: pd.DataFrame = pd.DataFrame()
    stops_id: pd.DataFrame = pd.DataFrame()
    stops_area: pd.DataFrame = pd.DataFrame()
    nearby_stops: pd.DataFrame = pd.DataFrame()

    def __init__(self, transport_type="TER"):
        """Load the feed tables stored under ``Data/<transport_type>/``.

        Args:
            transport_type: Name of the sub-directory of ``Data`` holding the feed files.
        """
        feed_dir = os.path.join("Data", transport_type)
        self.calendar_dates = pd.read_csv(os.path.join(feed_dir, "calendar_dates.txt"))
        self.stop_times = pd.read_csv(os.path.join(feed_dir, "stop_times.txt"))[
            ["trip_id", "stop_id", "departure_time"]
        ]
        self.stops = pd.read_csv(os.path.join(feed_dir, "stops.txt"))[
            ["stop_id", "stop_name", "stop_lat", "stop_lon", "parent_station"]
        ]
        self.trips = pd.read_csv(os.path.join(feed_dir, "trips.txt"))[
            ["service_id", "trip_id", "route_id"]
        ]
        # Column access by name: attribute assignment would silently create a plain
        # attribute if the column were ever missing.
        self.calendar_dates["date"] = pd.to_datetime(self.calendar_dates["date"], format="%Y%m%d")
        # Timedeltas keep times past midnight (e.g. "26:20:00") comparable and addable to dates.
        self.stop_times["departure_time"] = pd.to_timedelta(self.stop_times["departure_time"])
        has_parent = ~self.stops["parent_station"].isna()
        self.stops_id = self.stops[has_parent]
        self.stops_area = self.stops[~has_parent]

    def find_nearby_stops(self, lat: float, lon: float):
        """Return the stop points located in the search window centred on (lat, lon).

        A stop point is kept when its latitude and longitude are each strictly less than
        ``DISTANCE_MARGIN / 2`` away from the target.
        """
        half_margin = DISTANCE_MARGIN / 2
        points = self.stops_id
        lat_ok = (points["stop_lat"] > lat - half_margin) & (points["stop_lat"] < lat + half_margin)
        lon_ok = (points["stop_lon"] > lon - half_margin) & (points["stop_lon"] < lon + half_margin)
        return points[lat_ok & lon_ok]

    def get_trips_nearby_location(self, lat: float, lon: float) -> pd.Series:
        """Return the ids of the trips calling at a stop point near (lat, lon).

        Side effects: stores the nearby stop points in ``self.nearby_stops`` and the first
        matching stop_times row of every trip in ``self.unique_departures``.
        """
        self.nearby_stops = self.find_nearby_stops(lat, lon)
        calls_nearby = self.stop_times[self.stop_times["stop_id"].isin(self.nearby_stops["stop_id"])]
        self.unique_departures = calls_nearby.drop_duplicates(subset="trip_id")
        return self.unique_departures["trip_id"]

    def _active_service_dates(self) -> pd.DataFrame:
        """Return the calendar_dates rows that add service on a date.

        In GTFS, ``exception_type`` 2 marks a date on which the service is removed, so
        those rows must not count as operating days. A table without that column is
        returned unchanged.
        """
        dates = self.calendar_dates
        if "exception_type" in dates.columns:
            dates = dates[dates["exception_type"] == 1]
        return dates

    def filter_trips_within_period(self, lat: float, lon: float, start_date: datetime, end_date: datetime) -> pd.Series:
        """Return the ids of trips near (lat, lon) that operate between the two dates (inclusive)."""
        start_date = pd.to_datetime(start_date)
        end_date = pd.to_datetime(end_date)
        trip_ids = self.get_trips_nearby_location(lat, lon)
        relevant_trips = self.trips[self.trips["trip_id"].isin(trip_ids)]
        service_dates = self._active_service_dates()
        in_period = service_dates[(service_dates["date"] >= start_date) & (service_dates["date"] <= end_date)]
        # isin() ignores duplicates, so the service ids need no de-duplication first.
        trips_within_period = relevant_trips[relevant_trips["service_id"].isin(in_period["service_id"])]
        return trips_within_period["trip_id"]

    def find_destinations_from_location(
        self, lat: float, lon: float, start_date: datetime, end_date: datetime
    ) -> pd.DataFrame:
        """Return the stop areas reachable from (lat, lon) within the period.

        A stop area is reachable when one of its stop points is served, by a trip running
        in the period, later than that trip's call at the nearby stop. Stop areas of the
        nearby stops themselves are excluded.
        """
        trip_ids_within_period = self.filter_trips_within_period(lat, lon, start_date, end_date)
        calls_of_trips = self.stop_times[self.stop_times["trip_id"].isin(trip_ids_within_period)]

        # Join each call with the time its trip leaves the nearby stop. Working on a renamed
        # copy leaves self.unique_departures and its columns untouched.
        boarding = self.unique_departures[["trip_id", "departure_time"]].rename(
            columns={"departure_time": "city_departure_time"}
        )
        calls_with_boarding = calls_of_trips.merge(boarding, on="trip_id")
        later_calls = calls_with_boarding[
            calls_with_boarding["departure_time"] > calls_with_boarding["city_departure_time"]
        ]

        reached_points = self.stops_id[self.stops_id["stop_id"].isin(later_calls["stop_id"])]
        reached_areas = self.stops_area[self.stops_area["stop_id"].isin(reached_points["parent_station"])]
        reached_areas = reached_areas.drop_duplicates(subset="stop_id")
        return reached_areas[~reached_areas["stop_id"].isin(self.nearby_stops["parent_station"])]

    def find_trips_between_locations(
        self,
        departure_lat,
        departure_lon,
        arrival_lat: float,
        arrival_lon: float,
        start_date: datetime,
        end_date: datetime,
        departure_time: pd.Timedelta,
    ) -> pd.DataFrame:
        """Return the dated trips going from the departure window to the arrival window.

        Args:
            departure_lat, departure_lon: Centre of the departure search window.
            arrival_lat, arrival_lon: Centre of the arrival search window.
            start_date, end_date: Inclusive range of service dates.
            departure_time: Only calls leaving the departure stop strictly after this
                time of day are considered.

        Returns:
            One row per (trip, arrival stop, service date). ``_x`` columns describe the
            departure call, ``_y`` columns the arrival call; ``dep_time`` / ``arr_time``
            are the service date plus the respective time of day.
        """
        start_date = pd.to_datetime(start_date)
        end_date = pd.to_datetime(end_date)
        departure_stops = self.find_nearby_stops(departure_lat, departure_lon)
        arrival_stops = self.find_nearby_stops(arrival_lat, arrival_lon)

        departure_calls = self.stop_times[self.stop_times["stop_id"].isin(departure_stops["stop_id"])]
        departure_calls = departure_calls[departure_calls["departure_time"] > departure_time]
        arrival_calls = self.stop_times[self.stop_times["stop_id"].isin(arrival_stops["stop_id"])]

        calls_on_same_trip = pd.merge(departure_calls, arrival_calls, on="trip_id")
        # Keep only trips that reach the arrival stop after leaving the departure stop.
        forward_only = calls_on_same_trip[
            calls_on_same_trip["departure_time_x"] < calls_on_same_trip["departure_time_y"]
        ]
        with_service = pd.merge(forward_only, self.trips, on="trip_id")
        with_dates = pd.merge(with_service, self._active_service_dates(), on="service_id")
        valid_trips = with_dates[(with_dates["date"] >= start_date) & (with_dates["date"] <= end_date)]
        # horaire_depart / horaire_arrivee are kept (empty) so the returned columns do not change.
        return valid_trips.assign(
            horaire_depart="",
            horaire_arrivee="",
            dep_time=valid_trips["date"] + valid_trips["departure_time_x"],
            arr_time=valid_trips["date"] + valid_trips["departure_time_y"],
        )

    def get_list_of_cities(self) -> pd.DataFrame:
        """Return one row per coordinate grid cell with its name, position and call count.

        Calls are counted per stop point, summed per stop area, then stop areas whose
        rounded coordinates coincide are merged: counts are added, the name is chosen by
        ``find_best_name`` and the coordinates of the first stop area are kept.
        """
        calls_per_stop = self.stop_times.groupby("stop_id").count()["trip_id"]
        calls_with_parent = pd.merge(calls_per_stop, self.stops, on="stop_id")
        # Sum only the counter column; summing the whole frame would also add up the text columns.
        calls_per_area = calls_with_parent.groupby("parent_station")["trip_id"].sum()
        areas = self.stops_area[["stop_id", "stop_name", "stop_lat", "stop_lon"]].set_index("stop_id")
        areas = areas.assign(number_of_appearance=calls_per_area)
        # otypes lets np.vectorize cope with an empty table (it cannot infer the type otherwise).
        snap_to_grid = np.vectorize(round_to_precision, otypes=[float])
        cities = areas.assign(
            lat_round=snap_to_grid(areas["stop_lat"]),
            lon_round=snap_to_grid(areas["stop_lon"]),
        )
        merged_cells = cities.groupby(['lat_round', 'lon_round']).agg({
            'number_of_appearance': 'sum',
            'stop_name': find_best_name,
            'stop_lat': 'first',
            'stop_lon': 'first'
        }).reset_index(drop=True)
        return merged_cells

In [35]:
import pandas as pd
import os
from datetime import datetime
from datetime import timezone
transport_type = "FLIXBUS"
# Read the five feed tables from Data/<transport_type>/ in one pass; the generator is
# consumed in order, so the files are opened in the same sequence as the names below.
calendar_dates, calendar, trips, stops, stop_times = (
    pd.read_csv(os.path.join("Data", transport_type, table_name + ".txt"))
    for table_name in ("calendar_dates", "calendar", "trips", "stops", "stop_times")
)
# Timedeltas can represent times of day past 24:00:00.
stop_times["departure_time"] = pd.to_timedelta(stop_times["departure_time"])

In [36]:
#timezone = pd.to_datetime("2021-01-01 00:00:00").tz_localize(stops["stop_timezone"][0])
#print(timezone)
#timezone = timezone.tz_convert("UTC")
#print(timezone)
#
#datetime.astimezone(pd.to_datetime("2021-01-01 00:00:00"), stops["stop_timezone"][0])
import pytz
# A pytz zone must be attached with localize(): handing it to the datetime constructor
# through tzinfo= applies the zone's first recorded offset (local mean time) instead of
# the offset that is actually in force on that date.
pytz.timezone(stops["stop_timezone"][40]).localize(datetime(year=2024, month=8, day=31, hour=17))

stop_times
pytz.timezone(stops["stop_timezone"][40]).localize(datetime(year=2024, month=8, day=31, hour=17))

Unnamed: 0,trip_id,stop_id,arrival_time,departure_time,timepoint,stop_sequence,stop_headsign,route_short_name,pickup_type,drop_off_type,shape_dist_traveled
0,N1700-1-2145082024-DPC#LVC-00,4aee50c3-ad4b-4c61-8572-896e31e78b5f,21:45:00,0 days 21:45:00,,1,,,0,0,
1,N1700-1-2145082024-DPC#LVC-00,dcc5426b-9603-11e6-9066-549f350fcb0c,22:30:00,0 days 22:45:00,,2,,,0,0,
2,N1700-1-2145082024-DPC#LVC-00,eeea5a9b-cb45-4cd7-93cc-48fc00af6a74,26:20:00,1 days 04:05:00,,3,,,0,0,
3,N1700-1-2145082024-DPC#LVC-00,14aa766b-48d0-4d84-ab4c-c2e610d731a4,29:35:00,1 days 05:40:00,,4,,,0,0,
4,N1700-1-2145082024-DPC#LVC-00,dcc0f769-9603-11e6-9066-549f350fcb0c,31:55:00,1 days 07:55:00,,5,,,0,0,
...,...,...,...,...,...,...,...,...,...,...,...
262407,095-41-0320082024-BZM#MUC-00,dcbcb138-9603-11e6-9066-549f350fcb0c,04:29:00,0 days 04:29:00,,6,,,0,0,
262408,095-41-0320082024-BZM#MUC-00,dcbcb2d7-9603-11e6-9066-549f350fcb0c,04:52:00,0 days 04:52:00,,7,,,0,0,
262409,095-41-0320082024-BZM#MUC-00,dcbcb402-9603-11e6-9066-549f350fcb0c,05:40:00,0 days 05:45:00,,8,,,0,0,
262410,095-41-0320082024-BZM#MUC-00,dcbabbfa-9603-11e6-9066-549f350fcb0c,08:00:00,0 days 08:10:00,,9,,,0,0,


In [3]:
import pandas as pd
from datetime import datetime
import os
import numpy as np
from Analyzer import Analyzer
import pytz

# Width of the square search window used by the stop lookups in this cell: a stop is kept
# when its stop_lat and stop_lon each differ from the target by less than DISTANCE_MARGIN / 2.
# NOTE(review): presumably expressed in decimal degrees, like the coordinates - confirm.
DISTANCE_MARGIN: float = 0.05


class AnalyzerCalendar(Analyzer):
    """Query helper for a GTFS-style feed whose service days are described by calendar.txt.

    The feed tables are read from ``Data/<transport_type>/``. Service days are derived
    from the start/end dates and weekday flags of calendar.txt; calendar_dates.txt is
    loaded but not consulted by the methods of this class.
    """

    # Class-level defaults; __init__ replaces the feed tables on every instance.
    unique_departures: pd.DataFrame = pd.DataFrame()
    city_list: pd.DataFrame = pd.DataFrame()
    stops: pd.DataFrame = pd.DataFrame()
    calendar_dates: pd.DataFrame = pd.DataFrame()
    calendar: pd.DataFrame = pd.DataFrame()
    routes: pd.DataFrame = pd.DataFrame()
    stop_times: pd.DataFrame = pd.DataFrame()
    trips: pd.DataFrame = pd.DataFrame()
    lat: float = 0.0
    lon: float = 0.0
    nearby_stops: pd.DataFrame = pd.DataFrame()
    monday_integer_index: int = 0
    timezone : str = "UTC"
    # Weekday flag columns of calendar.txt, ordered like datetime.weekday() (Monday == 0).
    _WEEKDAY_COLUMNS = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday")

    def __init__(self, transport_type="FLIXBUS"):
        """Load the feed tables stored under ``Data/<transport_type>/``.

        Args:
            transport_type: Name of the sub-directory of ``Data`` holding the feed files.
        """
        feed_dir = os.path.join("Data", transport_type)
        self.calendar_dates = pd.read_csv(os.path.join(feed_dir, "calendar_dates.txt"))
        self.calendar = pd.read_csv(os.path.join(feed_dir, "calendar.txt"))
        self.stop_times = pd.read_csv(os.path.join(feed_dir, "stop_times.txt"))[
            ["trip_id", "stop_id", "departure_time"]
        ]
        agency = pd.read_csv(os.path.join(feed_dir, "agency.txt"))
        self.timezone = agency["agency_timezone"].iloc[0]

        stops = pd.read_csv(os.path.join(feed_dir, "stops.txt"))
        # Work on a copy so that adding the timezone column never writes into a slice.
        self.stops = stops[["stop_id", "stop_name", "stop_lat", "stop_lon"]].copy()
        # A missing stop_timezone (absent column or empty cell) falls back to the agency
        # timezone, which is what GTFS specifies for stops without their own timezone.
        if "stop_timezone" in stops.columns:
            self.stops["stop_timezone"] = stops["stop_timezone"].fillna(self.timezone)
        else:
            self.stops["stop_timezone"] = self.timezone

        self.trips = pd.read_csv(os.path.join(feed_dir, "trips.txt"))[
            ["service_id", "trip_id", "route_id"]
        ]
        self.calendar_dates["date"] = pd.to_datetime(self.calendar_dates["date"], format="%Y%m%d")
        self.calendar["start_date"] = pd.to_datetime(self.calendar["start_date"], format="%Y%m%d")
        self.calendar["end_date"] = pd.to_datetime(self.calendar["end_date"], format="%Y%m%d")
        # Timedeltas keep times past midnight (e.g. "26:20:00") comparable and addable to dates.
        self.stop_times["departure_time"] = pd.to_timedelta(self.stop_times["departure_time"])
        # Kept for callers that read it; weekday flags are now looked up by column name.
        self.monday_integer_index = self.calendar.columns.get_loc("monday")

    def find_nearby_stops(self, lat: float, lon: float):
        """Return the stops located in the search window centred on (lat, lon).

        A stop is kept when its latitude and longitude are each strictly less than
        ``DISTANCE_MARGIN / 2`` away from the target.
        """
        half_margin = DISTANCE_MARGIN / 2
        stops = self.stops
        lat_ok = (stops["stop_lat"] > lat - half_margin) & (stops["stop_lat"] < lat + half_margin)
        lon_ok = (stops["stop_lon"] > lon - half_margin) & (stops["stop_lon"] < lon + half_margin)
        return stops[lat_ok & lon_ok]

    def get_trips_nearby_location(self, lat: float, lon: float) -> pd.Series:
        """Return the ids of the trips calling at a stop near (lat, lon).

        Side effects: stores the nearby stops in ``self.nearby_stops`` and the first
        matching stop_times row of every trip in ``self.unique_departures``.
        """
        self.nearby_stops = self.find_nearby_stops(lat, lon)
        calls_nearby = self.stop_times[self.stop_times["stop_id"].isin(self.nearby_stops["stop_id"])]
        self.unique_departures = calls_nearby.drop_duplicates(subset="trip_id")
        return self.unique_departures["trip_id"]

    @staticmethod
    def _clip_services_to_period(services: pd.DataFrame, start_date, end_date) -> pd.DataFrame:
        """Keep the services whose validity range overlaps [start_date, end_date].

        Two ranges overlap when each one starts before the other ends; this also keeps
        services that begin before and finish after the requested period. The result
        carries ``days_ok_start`` / ``days_ok_end``, the overlap of both ranges.
        """
        overlapping = services[(services["start_date"] <= end_date) & (services["end_date"] >= start_date)]
        first_day = overlapping["start_date"].where(overlapping["start_date"] >= start_date, start_date)
        last_day = overlapping["end_date"].where(overlapping["end_date"] <= end_date, end_date)
        return overlapping.assign(days_ok_start=first_day, days_ok_end=last_day)

    def filter_trips_within_period(self, lat: float, lon: float, start_date: datetime, end_date: datetime) -> pd.Series:
        """Return the ids of trips near (lat, lon) that run on at least one day of the period."""
        start_date = pd.to_datetime(start_date)
        end_date = pd.to_datetime(end_date)
        trip_ids = self.get_trips_nearby_location(lat, lon)
        relevant_trips = self.trips[self.trips["trip_id"].isin(trip_ids)]
        relevant_services = self.calendar[self.calendar["service_id"].isin(relevant_trips["service_id"])]
        services_within_period = self._clip_services_to_period(relevant_services, start_date, end_date)
        if services_within_period.empty:
            # Empty selection that keeps the name and dtype of the regular result.
            return relevant_trips["trip_id"].iloc[0:0]
        services_within_period["days_ok"] = (
            services_within_period["days_ok_end"] - services_within_period["days_ok_start"]
        )
        runs_in_period = services_within_period.apply(self.is_service_in_dates, axis=1)
        services_in_dates = services_within_period[runs_in_period]
        # isin() ignores duplicates, so the service ids need no de-duplication first.
        trips_in_period = relevant_trips[relevant_trips["service_id"].isin(services_in_dates["service_id"])]
        return trips_in_period["trip_id"]

    def is_service_in_dates(self, services: pd.Series) -> bool:
        """Tell whether a service runs on at least one day of its clipped date range.

        Args:
            services: One calendar row carrying the weekday flags plus ``days_ok_start``
                (first day to test) and ``days_ok`` (length of the range as a Timedelta).

        Returns:
            True when a weekday flagged with 1 falls inside the range.
        """
        # Seven consecutive days already contain every weekday, so longer ranges need no
        # further checks; a service with no weekday flag set is never reported as running.
        days_to_check = min(services.days_ok.days, 6) + 1
        for day_offset in range(days_to_check):
            weekday = (services.days_ok_start + pd.Timedelta(days=day_offset)).weekday()
            if services[self._WEEKDAY_COLUMNS[weekday]] == 1:
                return True
        return False

    def find_destinations_from_location(
        self, lat: float, lon: float, date_min: datetime, date_max: datetime
    ) -> pd.DataFrame:
        """Return the stops reachable from (lat, lon) within the period.

        A stop is reachable when a trip running in the period serves it later than that
        trip's call at the nearby stop. The nearby stops themselves are excluded.
        """
        trip_ids_within_period = self.filter_trips_within_period(lat, lon, date_min, date_max)
        calls_of_trips = self.stop_times[self.stop_times["trip_id"].isin(trip_ids_within_period)]

        # Join each call with the time its trip leaves the nearby stop. Working on a renamed
        # copy leaves self.unique_departures and its columns untouched.
        boarding = self.unique_departures[["trip_id", "departure_time"]].rename(
            columns={"departure_time": "city_departure_time"}
        )
        calls_with_boarding = calls_of_trips.merge(boarding, on="trip_id")
        later_calls = calls_with_boarding[
            calls_with_boarding["departure_time"] > calls_with_boarding["city_departure_time"]
        ]

        destinations = self.stops[self.stops["stop_id"].isin(later_calls["stop_id"])]
        destinations = destinations.drop_duplicates(subset="stop_id")
        return destinations[~destinations["stop_id"].isin(self.nearby_stops["stop_id"])]

    def find_trips_between_locations(
        self,
        departure_lat: float,
        departure_lon: float,
        arrival_lat: float,
        arrival_lon: float,
        start_date: datetime,
        end_date: datetime,
        departure_time: pd.Timedelta,
    ) -> pd.DataFrame:
        """Return the dated trips going from the departure window to the arrival window.

        Args:
            departure_lat, departure_lon: Centre of the departure search window.
            arrival_lat, arrival_lon: Centre of the arrival search window.
            start_date, end_date: Inclusive range of service dates.
            departure_time: Only calls leaving the departure stop strictly after this
                time of day are considered.

        Returns:
            The frame built by ``dates_from_trips``: ``_x`` columns describe the departure
            call, ``_y`` columns the arrival call. Empty when nothing matches.
        """
        departure_stops = self.find_nearby_stops(departure_lat, departure_lon)
        arrival_stops = self.find_nearby_stops(arrival_lat, arrival_lon)

        departure_calls = pd.merge(departure_stops, self.stop_times, on="stop_id")
        departure_calls = departure_calls[departure_calls["departure_time"] > departure_time]
        # One departure call per trip, even when several nearby stops are served.
        departure_calls = departure_calls.drop_duplicates(subset="trip_id")
        arrival_calls = pd.merge(arrival_stops, self.stop_times, on="stop_id")

        calls_on_same_trip = pd.merge(departure_calls, arrival_calls, on="trip_id")
        # Keep only trips that reach the arrival stop after leaving the departure stop.
        forward_only = calls_on_same_trip[
            calls_on_same_trip["departure_time_x"] < calls_on_same_trip["departure_time_y"]
        ]
        relevant_trips = pd.merge(forward_only, self.trips, on="trip_id")
        return self.dates_from_trips(relevant_trips, start_date, end_date)

    def dates_from_trips(self, trips: pd.DataFrame, start_date: datetime, end_date: datetime) -> pd.DataFrame:
        """Expand trips into one row per operating day between the two dates.

        Args:
            trips: Frame with ``service_id``, ``departure_time_x``, ``departure_time_y``,
                ``stop_timezone_x`` and ``stop_timezone_y`` columns.
            start_date, end_date: Inclusive range of service dates.

        Returns:
            ``trips`` joined with every operating date of its service, plus ``dep_time`` /
            ``arr_time``: the date and time of day interpreted in the agency timezone and
            converted to the timezone of the respective stop. An empty DataFrame when no
            service runs in the period.
        """
        start_date = pd.to_datetime(start_date)
        end_date = pd.to_datetime(end_date)
        services = self.calendar[self.calendar["service_id"].isin(trips["service_id"])]
        services_within_period = self._clip_services_to_period(services, start_date, end_date)
        if services_within_period.empty:
            return pd.DataFrame()

        # One small frame per service, concatenated once at the end.
        date_frames = []
        for service in services_within_period.to_dict("records"):
            days = pd.date_range(start=service["days_ok_start"], end=service["days_ok_end"])
            weekday_flags = np.array([service[column] for column in self._WEEKDAY_COLUMNS])
            operating_days = days[weekday_flags[np.asarray(days.weekday)] == 1]
            if len(operating_days) == 0:
                continue
            columns = {"date": operating_days, "service_id": service["service_id"]}
            columns.update({column: service[column] for column in self._WEEKDAY_COLUMNS})
            date_frames.append(pd.DataFrame(columns))
        if not date_frames:
            return pd.DataFrame()

        dated_trips = pd.merge(trips, pd.concat(date_frames, ignore_index=True), on="service_id")
        if dated_trips.empty:
            return pd.DataFrame()

        agency_tz = pytz.timezone(self.timezone)
        naive_departures = dated_trips["date"] + dated_trips["departure_time_x"]
        naive_arrivals = dated_trips["date"] + dated_trips["departure_time_y"]
        # Timezones differ per row, so the conversion is done value by value.
        dated_trips["dep_time"] = [
            moment.replace(tzinfo=agency_tz).astimezone(tz=stop_tz)
            for moment, stop_tz in zip(naive_departures, dated_trips["stop_timezone_x"])
        ]
        dated_trips["arr_time"] = [
            moment.replace(tzinfo=agency_tz).astimezone(tz=stop_tz)
            for moment, stop_tz in zip(naive_arrivals, dated_trips["stop_timezone_y"])
        ]
        return dated_trips

    def get_list_of_cities(self) -> pd.DataFrame:
        """Return the stops indexed by ``stop_id`` with the number of stop_times rows of each."""
        calls_per_stop = self.stop_times.groupby("stop_id").count()["trip_id"]
        stop_cities = self.stops.set_index("stop_id")
        return stop_cities.assign(number_of_appearance=calls_per_stop)

In [4]:
# Trips from the stop at (48.835318, 2.380519) to the stop at (51.492186, -0.149079)
# on 2024-08-29, leaving after 08:00.
analyzerCalendar = AnalyzerCalendar("FLIXBUS")
analyzerCalendar.find_trips_between_locations(
    departure_lat=48.835318,
    departure_lon=2.380519,
    arrival_lat=51.492186,
    arrival_lon=-0.149079,
    start_date=datetime(2024, 8, 29),
    end_date=datetime(2024, 8, 29),
    departure_time=pd.Timedelta("8:00:00"),
)

Unnamed: 0,stop_id_x,stop_name_x,stop_lat_x,stop_lon_x,stop_timezone_x,trip_id,departure_time_x,stop_id_y,stop_name_y,stop_lat_y,...,date,monday,tuesday,wednesday,thursday,friday,saturday,sunday,dep_time,arr_time
0,dcc5426b-9603-11e6-9066-549f350fcb0c,Paris (Bercy Seine),48.835318,2.380519,Europe/Paris,700-5-1200082024-PQB#LVC-00,0 days 12:00:00,dcc0f769-9603-11e6-9066-549f350fcb0c,London Victoria Coach Station,51.492186,...,2024-08-29,1,1,1,1,0,0,0,2024-08-29 14:00:00+02:00,2024-08-29 22:30:00+01:00
1,dcc5426b-9603-11e6-9066-549f350fcb0c,Paris (Bercy Seine),48.835318,2.380519,Europe/Paris,N1710-1-2230082024-PQB#LVC-00,0 days 22:30:00,dcc0f769-9603-11e6-9066-549f350fcb0c,London Victoria Coach Station,51.492186,...,2024-08-29,1,1,1,1,1,1,1,2024-08-30 00:30:00+02:00,2024-08-30 09:10:00+01:00
2,dcc5426b-9603-11e6-9066-549f350fcb0c,Paris (Bercy Seine),48.835318,2.380519,Europe/Paris,N700-3-2000082024-PQB#LVC-00,0 days 20:00:00,dcc0f769-9603-11e6-9066-549f350fcb0c,London Victoria Coach Station,51.492186,...,2024-08-29,0,0,0,1,1,0,0,2024-08-29 22:00:00+02:00,2024-08-30 06:30:00+01:00


In [1]:
import pandas as pd
from datetime import datetime
import os
from Analyzer import Analyzer
import pytz

# Size of the coordinate search window used by the stop lookups in this cell; it is
# compared against differences of stop_lat / stop_lon values.
# NOTE(review): presumably expressed in decimal degrees, like the coordinates - confirm.
DISTANCE_MARGIN: float = 0.05


class AnalyzerCalendarDates(Analyzer):
    """Query helper for a GTFS-style feed whose service days are listed in calendar_dates.txt.

    The feed tables are read from ``Data/<transport_type>/``. Stops are split into stop
    points (rows with a ``parent_station``) and stop areas (rows without one): searches
    run on stop points, destinations and city lists are reported as stop areas. Times are
    interpreted in the timezone of the first agency of agency.txt.
    """

    # Class-level defaults; __init__ replaces the feed tables on every instance.
    unique_departures: pd.DataFrame = pd.DataFrame()
    city_list: pd.DataFrame = pd.DataFrame()
    stops: pd.DataFrame = pd.DataFrame()
    calendar_dates: pd.DataFrame = pd.DataFrame()
    routes: pd.DataFrame = pd.DataFrame()
    stop_times: pd.DataFrame = pd.DataFrame()
    trips: pd.DataFrame = pd.DataFrame()
    stops_id: pd.DataFrame = pd.DataFrame()
    stops_area: pd.DataFrame = pd.DataFrame()
    nearby_stops: pd.DataFrame = pd.DataFrame()
    timezone: str = "UTC"

    def __init__(self, transport_type="TER"):
        """Load the feed tables stored under ``Data/<transport_type>/``.

        Args:
            transport_type: Name of the sub-directory of ``Data`` holding the feed files.
        """
        feed_dir = os.path.join("Data", transport_type)
        self.calendar_dates = pd.read_csv(os.path.join(feed_dir, "calendar_dates.txt"))
        self.stop_times = pd.read_csv(os.path.join(feed_dir, "stop_times.txt"))[
            ["trip_id", "stop_id", "departure_time"]
        ]
        agency = pd.read_csv(os.path.join(feed_dir, "agency.txt"))
        self.timezone = agency["agency_timezone"].iloc[0]
        self.stops = pd.read_csv(os.path.join(feed_dir, "stops.txt"))[
            ["stop_id", "stop_name", "stop_lat", "stop_lon", "parent_station"]
        ]
        self.trips = pd.read_csv(os.path.join(feed_dir, "trips.txt"))[
            ["service_id", "trip_id", "route_id"]
        ]
        # Column access by name: attribute assignment would silently create a plain
        # attribute if the column were ever missing.
        self.calendar_dates["date"] = pd.to_datetime(self.calendar_dates["date"], format="%Y%m%d")
        # Timedeltas keep times past midnight comparable and addable to dates.
        self.stop_times["departure_time"] = pd.to_timedelta(self.stop_times["departure_time"])
        has_parent = ~self.stops["parent_station"].isna()
        self.stops_id = self.stops[has_parent]
        self.stops_area = self.stops[~has_parent]

    def find_nearby_stops(self, lat: float, lon: float):
        """Return the stop points located in the search window centred on (lat, lon).

        A stop point is kept when its latitude and longitude are each strictly less than
        ``DISTANCE_MARGIN / 2`` away from the target.
        """
        half_margin = DISTANCE_MARGIN / 2
        points = self.stops_id
        lat_ok = (points["stop_lat"] > lat - half_margin) & (points["stop_lat"] < lat + half_margin)
        lon_ok = (points["stop_lon"] > lon - half_margin) & (points["stop_lon"] < lon + half_margin)
        return points[lat_ok & lon_ok]

    def get_trips_nearby_location(self, lat: float, lon: float) -> pd.Series:
        """Return the ids of the trips calling at a stop point near (lat, lon).

        Side effects: stores the nearby stop points in ``self.nearby_stops`` and the first
        matching stop_times row of every trip in ``self.unique_departures``.
        """
        self.nearby_stops = self.find_nearby_stops(lat, lon)
        calls_nearby = self.stop_times[self.stop_times["stop_id"].isin(self.nearby_stops["stop_id"])]
        self.unique_departures = calls_nearby.drop_duplicates(subset="trip_id")
        return self.unique_departures["trip_id"]

    def _active_service_dates(self) -> pd.DataFrame:
        """Return the calendar_dates rows that add service on a date.

        In GTFS, ``exception_type`` 2 marks a date on which the service is removed, so
        those rows must not count as operating days. A table without that column is
        returned unchanged.
        """
        dates = self.calendar_dates
        if "exception_type" in dates.columns:
            dates = dates[dates["exception_type"] == 1]
        return dates

    def filter_trips_within_period(self, lat: float, lon: float, start_date: datetime, end_date: datetime) -> pd.Series:
        """Return the ids of trips near (lat, lon) that operate between the two dates (inclusive)."""
        start_date = pd.to_datetime(start_date)
        end_date = pd.to_datetime(end_date)
        trip_ids = self.get_trips_nearby_location(lat, lon)
        relevant_trips = self.trips[self.trips["trip_id"].isin(trip_ids)]
        service_dates = self._active_service_dates()
        in_period = service_dates[(service_dates["date"] >= start_date) & (service_dates["date"] <= end_date)]
        # isin() ignores duplicates, so the service ids need no de-duplication first.
        trips_within_period = relevant_trips[relevant_trips["service_id"].isin(in_period["service_id"])]
        return trips_within_period["trip_id"]

    def find_destinations_from_location(
        self, lat: float, lon: float, start_date: datetime, end_date: datetime
    ) -> pd.DataFrame:
        """Return the stop areas reachable from (lat, lon) within the period.

        A stop area is reachable when one of its stop points is served, by a trip running
        in the period, later than that trip's call at the nearby stop. Stop areas of the
        nearby stops themselves are excluded.
        """
        trip_ids_within_period = self.filter_trips_within_period(lat, lon, start_date, end_date)
        calls_of_trips = self.stop_times[self.stop_times["trip_id"].isin(trip_ids_within_period)]

        # Join each call with the time its trip leaves the nearby stop. Working on a renamed
        # copy leaves self.unique_departures and its columns untouched.
        boarding = self.unique_departures[["trip_id", "departure_time"]].rename(
            columns={"departure_time": "city_departure_time"}
        )
        calls_with_boarding = calls_of_trips.merge(boarding, on="trip_id")
        later_calls = calls_with_boarding[
            calls_with_boarding["departure_time"] > calls_with_boarding["city_departure_time"]
        ]

        reached_points = self.stops_id[self.stops_id["stop_id"].isin(later_calls["stop_id"])]
        reached_areas = self.stops_area[self.stops_area["stop_id"].isin(reached_points["parent_station"])]
        reached_areas = reached_areas.drop_duplicates(subset="stop_id")
        return reached_areas[~reached_areas["stop_id"].isin(self.nearby_stops["parent_station"])]

    def find_trips_between_locations(
        self,
        departure_lat,
        departure_lon,
        arrival_lat: float,
        arrival_lon: float,
        start_date: datetime,
        end_date: datetime,
        departure_time: pd.Timedelta,
    ) -> pd.DataFrame:
        """Return the dated trips going from the departure window to the arrival window.

        Both windows use the same size as ``find_nearby_stops``; the departure window
        previously used twice that size, unlike every other lookup of this class.

        Args:
            departure_lat, departure_lon: Centre of the departure search window.
            arrival_lat, arrival_lon: Centre of the arrival search window.
            start_date, end_date: Inclusive range of service dates.
            departure_time: Only calls leaving the departure stop strictly after this
                time of day are considered.

        Returns:
            One row per (trip, arrival stop, service date). ``_x`` columns describe the
            departure call, ``_y`` columns the arrival call; ``dep_time`` / ``arr_time``
            are the service date plus the time of day, localized to the agency timezone,
            which is also reported in ``stop_timezone_x`` / ``stop_timezone_y``. An empty
            DataFrame when nothing matches.
        """
        start_date = pd.to_datetime(start_date)
        end_date = pd.to_datetime(end_date)
        departure_stops = self.find_nearby_stops(departure_lat, departure_lon)
        arrival_stops = self.find_nearby_stops(arrival_lat, arrival_lon)

        departure_calls = pd.merge(self.stop_times, departure_stops, on="stop_id")
        departure_calls = departure_calls[departure_calls["departure_time"] > departure_time]
        arrival_calls = pd.merge(self.stop_times, arrival_stops, on="stop_id")

        calls_on_same_trip = pd.merge(departure_calls, arrival_calls, on="trip_id")
        if calls_on_same_trip.empty:
            return pd.DataFrame()
        # Keep only trips that reach the arrival stop after leaving the departure stop.
        forward_only = calls_on_same_trip[
            calls_on_same_trip["departure_time_x"] < calls_on_same_trip["departure_time_y"]
        ]
        with_service = pd.merge(forward_only, self.trips, on="trip_id")
        with_dates = pd.merge(with_service, self._active_service_dates(), on="service_id")
        valid_trips = with_dates[(with_dates["date"] >= start_date) & (with_dates["date"] <= end_date)]
        if valid_trips.empty:
            return pd.DataFrame()

        agency_tz = pytz.timezone(self.timezone)
        naive_departures = valid_trips["date"] + valid_trips["departure_time_x"]
        naive_arrivals = valid_trips["date"] + valid_trips["departure_time_y"]
        # assign() builds a new frame, so nothing is written into the filtered slice.
        return valid_trips.assign(
            dep_time=[moment.replace(tzinfo=agency_tz) for moment in naive_departures],
            arr_time=[moment.replace(tzinfo=agency_tz) for moment in naive_arrivals],
            stop_timezone_x=self.timezone,
            stop_timezone_y=self.timezone,
        )

    def get_list_of_cities(self) -> pd.DataFrame:
        """Return the stop areas indexed by ``stop_id`` with the call count of their stop points."""
        calls_per_stop = self.stop_times.groupby("stop_id").count()["trip_id"]
        calls_with_parent = pd.merge(calls_per_stop, self.stops, on="stop_id")
        # Sum only the counter column; summing the whole frame would also add up the text columns.
        calls_per_area = calls_with_parent.groupby("parent_station")["trip_id"].sum()
        areas = self.stops_area[["stop_id", "stop_name", "stop_lat", "stop_lon"]].set_index("stop_id")
        return areas.assign(number_of_appearance=calls_per_area)


In [2]:
# Trips from the stops around (48.835318, 2.380519) to the stops around
# (47.3781700, 8.54019000) on 2024-08-29, leaving after 08:00.
analyzer = AnalyzerCalendarDates("TGV")
analyzer.find_trips_between_locations(
    departure_lat=48.835318,
    departure_lon=2.380519,
    arrival_lat=47.3781700,
    arrival_lon=8.54019000,
    start_date=datetime(2024, 8, 29),
    end_date=datetime(2024, 8, 29),
    departure_time=pd.Timedelta("8:00:00"),
)

Unnamed: 0,trip_id,stop_id_x,departure_time_x,stop_name_x,stop_lat_x,stop_lon_x,parent_station_x,stop_id_y,departure_time_y,stop_name_y,...,stop_lon_y,parent_station_y,service_id,route_id,date,exception_type,dep_time,arr_time,stop_timezone_x,stop_timezone_y
30,OCESN9213F3156979:2024-08-28T00:25:25Z,StopPoint:OCELyria-87686006,0 days 10:52:00,Paris Gare de Lyon Hall 1 - 2,48.844945,2.373481,StopArea:OCE87686006,StopPoint:OCELyria-85030007,0 days 16:26:00,Zuerich HB,...,8.54019,StopArea:OCE85030007,666,FR:Line::68841c81-3ddf-4568-b525-6e7110d66727:,2024-08-29,1,2024-08-29 10:52:00+02:00,2024-08-29 16:26:00+02:00,Europe/Paris,Europe/Paris
62,OCESN9219F3245450:2024-08-28T00:25:25Z,StopPoint:OCELyria-87686006,0 days 16:22:00,Paris Gare de Lyon Hall 1 - 2,48.844945,2.373481,StopArea:OCE87686006,StopPoint:OCELyria-85030007,0 days 20:26:00,Zuerich HB,...,8.54019,StopArea:OCE85030007,668,FR:Line::68841c81-3ddf-4568-b525-6e7110d66727:,2024-08-29,1,2024-08-29 16:22:00+02:00,2024-08-29 20:26:00+02:00,Europe/Paris,Europe/Paris
84,OCESN9223F3156970:2024-08-29T00:25:22Z,StopPoint:OCELyria-87686006,0 days 18:22:00,Paris Gare de Lyon Hall 1 - 2,48.844945,2.373481,StopArea:OCE87686006,StopPoint:OCELyria-85030007,0 days 22:26:00,Zuerich HB,...,8.54019,StopArea:OCE85030007,670,FR:Line::68841c81-3ddf-4568-b525-6e7110d66727:,2024-08-29,1,2024-08-29 18:22:00+02:00,2024-08-29 22:26:00+02:00,Europe/Paris,Europe/Paris
