In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd

### Trip information

In [2]:
# Load the cleaned trips of the survey
trips_path = "../../results/surveys/egt_2010/cleaned/trips.parquet"
df_trips = pd.read_parquet(trips_path)

# Derived attributes: distance scaled by 1e-3 and home flags for both trip ends
df_trips["euclidean_distance_km"] = df_trips["euclidean_distance"].mul(1e-3)
df_trips["origin_home"] = df_trips["origin_activity_type"].eq("home")
df_trips["destination_home"] = df_trips["destination_activity_type"].eq("home")

# Markers for the first / last trip of a person: a row is first (last) when the
# previous (next) row has a different person_id, so this relies on the rows
# being grouped by person
previous_person = df_trips["person_id"].shift(1)
next_person = df_trips["person_id"].shift(-1)
df_trips["is_first"] = previous_person.ne(df_trips["person_id"])
df_trips["is_last"] = next_person.ne(df_trips["person_id"])

# Relabel the "pt" mode as "transit"
df_trips["mode"] = df_trips["mode"].replace({ "pt": "transit" })

### Person information

In [3]:
# Person: attach the person-level attributes to every trip
persons_path = "../../results/surveys/egt_2010/cleaned/persons.parquet"
df_persons = pd.read_parquet(persons_path)

person_columns = [
    "person_id", "weight", "age",
    "has_pt_subscription", "has_driving_permit", "has_motorbike_permit"]

# No explicit key: the join uses the columns both frames have in common
df_trips = df_trips.merge(df_persons[person_columns])

In [4]:
# Switch for the income attributes; using income will remove about 30% of the households
use_income = False

if use_income:
    households_path = "../../results/surveys/egt_2010/cleaned/households.parquet"
    df_households = pd.read_parquet(households_path)

    income_columns = ["household_id", "income_per_person_EUR", "income_EUR"]
    df_trips = df_trips.merge(df_households[income_columns])

    # Keep only the trips for which an income is known
    has_income = df_trips["income_EUR"].notna()
    df_trips = df_trips[has_income].copy()

### Spatial information

In [5]:
# Spatial: load the trip geometries and the IRIS zones; they are used below to
# attach the IRIS of both trip ends and whether a trip starts/ends in Paris
spatial_path = "../../results/surveys/egt_2010/spatial.parquet"
iris_path = "../../results/spatial/iris.parquet"

df_spatial = gpd.read_parquet(spatial_path)

# The zones are reprojected to EPSG:2154
# (presumably the CRS of the trip geometries - TODO confirm)
df_iris = gpd.read_parquet(iris_path)
df_iris = df_iris.to_crs("EPSG:2154")

In [6]:
def _locate_iris(geometry_column, iris_column):
    """Return trip_id and the IRIS that contains the given trip geometry.

    geometry_column: column of df_spatial used as active geometry for the join.
    iris_column: name given to the joined "iris" column in the result.
    """
    df_located = gpd.sjoin(
        df_spatial.set_geometry(geometry_column), df_iris, predicate = "within")
    df_located = df_located.rename(columns = { "iris": iris_column })
    return df_located[["trip_id", iris_column]]

df_origin = _locate_iris("origin_geometry", "origin_iris")
df_destination = _locate_iris("destination_geometry", "destination_iris")

In [7]:
# Attach the IRIS of both trip ends; trips without a match are dropped (inner join)
for df_zone in (df_origin, df_destination):
    df_trips = pd.merge(df_trips, df_zone, on = "trip_id")

# A trip end is flagged as Paris when its IRIS identifier starts with "75"
for prefix in ("origin", "destination"):
    df_trips["{}_paris".format(prefix)] = df_trips["{}_iris".format(prefix)].str[:2] == "75"

In [8]:
# Extract the x/y coordinates of both trip ends and attach them to the trips
df_coordinates = df_spatial[["trip_id", "origin_geometry", "destination_geometry"]].copy()

for prefix in ("origin", "destination"):
    points = df_spatial["{}_geometry".format(prefix)]
    df_coordinates["{}_x".format(prefix)] = points.apply(lambda point: point.x)
    df_coordinates["{}_y".format(prefix)] = points.apply(lambda point: point.y)

df_trips = df_trips.merge(df_coordinates)

### Road trip information

In [9]:
# Road routing results per trip
road_path = "../../results/road/routing.parquet"

road_columns = [
    "trip_id", "in_vehicle_time_min", "in_vehicle_distance_km",
    "access_time_min", "egress_time_min"
]
df_road = pd.read_parquet(road_path)[road_columns]

# Total walking time of a road trip: access plus egress
df_road["walk_time_min"] = df_road["access_time_min"].add(df_road["egress_time_min"])

In [10]:
car_cost_per_km = 0.2 # EUR

# Prefix every routing column except the key with "car_"
df_car = df_road.rename(
    columns = lambda name: name if name == "trip_id" else "car_{}".format(name))

# Distance-based monetary cost of driving
df_car["car_cost_EUR"] = df_car["car_in_vehicle_distance_km"] * car_cost_per_km

df_trips = df_trips.merge(df_car)

In [11]:
# Motorbikes are priced at half the per-kilometre cost of a car
motorbike_cost_per_km = car_cost_per_km * 0.5 # EUR

# Same road routing as for the car, prefixed with "motorbike_"
df_motorbike = df_road.rename(
    columns = lambda name: name if name == "trip_id" else "motorbike_{}".format(name))

df_motorbike["motorbike_cost_EUR"] = df_motorbike["motorbike_in_vehicle_distance_km"] * motorbike_cost_per_km

df_trips = df_trips.merge(df_motorbike)

In [12]:
# Car passengers reuse the road routing, prefixed with "car_passenger_"
df_car_passenger = df_road.rename(
    columns = lambda name: name if name == "trip_id" else "car_passenger_{}".format(name))
df_trips = df_trips.merge(df_car_passenger)

### Parking information

In [13]:
# Parking pressure per IRIS, attached via the destination IRIS of each trip
parking_path = "../../results/parking/parking_pressure.parquet"

df_pressure = gpd.read_parquet(parking_path)
df_pressure = df_pressure[["iris", "parking_pressure"]]
df_pressure = df_pressure.rename(columns = { "iris": "destination_iris" })

# Inner join: trips whose destination IRIS has no pressure value are dropped
df_trips = df_trips.merge(df_pressure, on = "destination_iris")

In [14]:
# Parking duration at the destination: time until the next departure, minus the
# car in-vehicle and access time of the current trip
# (departure_time is divided by 60 to obtain minutes - presumably seconds, TODO confirm)
parking_duration_min = (df_trips["departure_time"].shift(-1) - df_trips["departure_time"]) / 60.0 - df_trips["car_in_vehicle_time_min"] - df_trips["car_access_time_min"]

# For the last trip of a person there is no next departure, a fixed duration of
# eight hours is used. The marker is refreshed on the current frame: the inner
# merges since "is_last" was computed may have dropped the original last trip
# of a person, in which case shift(-1) would read the next person's departure.
is_last_trip = df_trips["is_last"] | (df_trips["person_id"].shift(-1) != df_trips["person_id"])
parking_duration_min[is_last_trip.values] = 8 * 60.0

# Residents: persons with at least one trip that starts or ends at a home
# located in Paris. They are exempt from the parking cost.
is_resident = df_trips["origin_paris"] & df_trips["origin_home"]
is_resident |= df_trips["destination_paris"] & df_trips["destination_home"]
residents = df_trips.loc[is_resident, "person_id"].unique()

# Parking is charged for trips ending in Paris that are made by non-residents
is_relevant = df_trips["destination_paris"] & ~df_trips["person_id"].isin(residents)

# Hourly tariff, charged per started hour with a minimum of one hour
parking_cost_per_hour_EUR = 3.0
parking_cost_EUR = parking_cost_per_hour_EUR * np.maximum(1.0, np.ceil(parking_duration_min / 60.0))

df_trips["parking_cost_EUR"] = 0.0
df_trips.loc[is_relevant, "parking_cost_EUR"] = parking_cost_EUR[is_relevant]

### Transit trip information

In [15]:
# Transit routing results per trip
transit_path = "../../results/transit/routing.parquet"

transit_columns = [
    "trip_id", "is_only_walk", "transfers",
    "access_walk_time_min", "egress_walk_time_min", "transfer_walk_time_min",
    "transfer_wait_time_min", "initial_wait_time_min",
    "in_vehicle_travel_time", "in_vehicle_travel_time_by_mode_min"
]

df_transit = pd.read_parquet(transit_path)[transit_columns]

In [16]:
df_transit["in_vehicle_travel_time_min"] = df_transit["in_vehicle_travel_time"]

# Split the per-mode in-vehicle times into one column per mode, missing values count as zero
time_by_mode = df_transit["in_vehicle_travel_time_by_mode_min"]

for mode in ("rail", "tram", "subway", "bus"):
    mode_time = time_by_mode.apply(lambda item: item[mode])
    df_transit["in_vehicle_time_{}_min".format(mode)] = mode_time.fillna(0.0)

# only_bus: some time spent in a bus and none in rail, subway or tram
no_rail = df_transit["in_vehicle_time_rail_min"].eq(0.0)
no_subway = df_transit["in_vehicle_time_subway_min"].eq(0.0)
no_tram = df_transit["in_vehicle_time_tram_min"].eq(0.0)
some_bus = df_transit["in_vehicle_time_bus_min"].gt(0.0)
df_transit["only_bus"] = no_rail & no_subway & no_tram & some_bus

# without_rail: no time spent in rail at all
df_transit["without_rail"] = df_transit["in_vehicle_time_rail_min"].eq(0.0)

df_transit["total_in_vehicle_time_min"] = df_transit["in_vehicle_travel_time"]

# Total walking time: access, egress and transfer walks
df_transit["total_walk_time_min"] = (
    df_transit["access_walk_time_min"]
    + df_transit["egress_walk_time_min"]
    + df_transit["transfer_walk_time_min"])

In [17]:
# Prefix every transit column except the key with "transit_" and attach to the trips
df_transit = df_transit.rename(
    columns = lambda name: name if name == "trip_id" else "transit_{}".format(name))
df_trips = df_trips.merge(df_transit)

In [18]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x))."""
    return 1.0 / (1.0 + np.exp(-x))

# Regression coefficients of the price model and the reference point
# (presumably the centre of Paris in the CRS of the coordinates - TODO confirm)
a, b, c, d = 0.098, 0.006, 0.006, -0.77
center_x, center_y = 651726, 6862287

# All distances are scaled by 1e-3 before entering the regression
distance_origin_destination = df_trips["euclidean_distance"] * 1e-3

origin_offset_x = df_trips["origin_x"] - center_x
origin_offset_y = df_trips["origin_y"] - center_y
distance_origin_center = np.sqrt(origin_offset_x**2 + origin_offset_y**2) * 1e-3

destination_offset_x = df_trips["destination_x"] - center_x
destination_offset_y = df_trips["destination_y"] - center_y
distance_destination_center = np.sqrt(destination_offset_x**2 + destination_offset_y**2) * 1e-3

# base price according to regression, bounded below by 1.9
regression_term = (
    a * distance_origin_destination + b * distance_origin_center + c * distance_destination_center + d)
cost = np.maximum(1.9, 5.5 * sigmoid(regression_term))

# special case: trips within Paris, or trips that do not use rail, cost 1.8
f_paris = df_trips["origin_paris"] & df_trips["destination_paris"]
f_without_rail = df_trips["transit_without_rail"]
cost.loc[f_paris | f_without_rail] = 1.8

# special case: holders of a subscription travel at no cost
f_subscription = df_trips["has_pt_subscription"]
cost.loc[f_subscription] = 0.0

# write back
df_trips["transit_cost_EUR"] = cost

### Walk trip information

In [19]:
# Walking: travel time from the euclidean distance, inflated by a detour factor
walk_factor = 1.3
walk_speed = 4.5 # km/h

df_walk = df_trips[["trip_id", "euclidean_distance_km"]].copy()

walk_distance_km = walk_factor * df_walk["euclidean_distance_km"]
df_walk["travel_time_min"] = (walk_distance_km / walk_speed) * 60

In [20]:
# Walking is available up to this travel time (inclusive)
maximum_walk_time_min = 120
df_walk["availability"] = df_walk["travel_time_min"].le(maximum_walk_time_min)

In [21]:
# Keep the walk attributes, prefix them with "walk_" and attach them to the trips
df_walk = df_walk[["trip_id", "availability", "travel_time_min"]].rename(
    columns = lambda name: name if name == "trip_id" else "walk_{}".format(name))
df_trips = df_trips.merge(df_walk)

### Bicycle trip information

In [22]:
# Cycling: travel time from the euclidean distance, inflated by a detour factor
bicycle_factor = 1.3
bicycle_speed = 15 # km/h

df_bicycle = df_trips[["trip_id", "euclidean_distance_km"]].copy()

bicycle_distance_km = bicycle_factor * df_bicycle["euclidean_distance_km"]
df_bicycle["travel_time_min"] = (bicycle_distance_km / bicycle_speed) * 60

In [23]:
# Keep the bicycle travel time, prefix it with "bicycle_" and attach it to the trips
df_bicycle = df_bicycle[["trip_id", "travel_time_min"]].rename(
    columns = lambda name: name if name == "trip_id" else "bicycle_{}".format(name))
df_trips = df_trips.merge(df_bicycle)

### Availabilities

In [24]:
# Vehicle availabilities of the survey
availabilities_path = "../../results/surveys/egt_2010/availabilities.parquet"
df_availabilities = pd.read_parquet(path = availabilities_path)

In [25]:
# A vehicle type counts as available unless its "by_vehicles" value is "none"
df_partial = df_availabilities.copy()

for vehicle in ("car", "bicycle", "motorbike"):
    by_vehicles = df_partial["{}_availability_by_vehicles".format(vehicle)]
    df_partial["{}_availability".format(vehicle)] = by_vehicles.ne("none")

availability_columns = [
    "person_id", "car_availability", "bicycle_availability", "motorbike_availability"]
df_trips = df_trips.merge(df_partial[availability_columns])

In [26]:
# Without the respective permit a vehicle is never available
df_trips["car_availability"] = df_trips["car_availability"].where(
    df_trips["has_driving_permit"], False)
df_trips["motorbike_availability"] = df_trips["motorbike_availability"].where(
    df_trips["has_motorbike_permit"], False)

In [27]:
# Transit is available unless the routed transit connection consists of walking only
transit_is_only_walk = df_trips["transit_is_only_walk"].astype(bool)
df_trips["transit_availability"] = np.logical_not(transit_is_only_walk)

In [28]:
# passenger availability depends on vehicle availability in the household!
# NOTE(review): at this point car_availability and motorbike_availability have
# already been set to False for persons without the respective permit, so a
# person without any permit is never available as a car passenger here.
# Confirm whether the unrestricted availabilities were intended instead.
df_trips["car_passenger_availability"] = df_trips["car_availability"] | df_trips["motorbike_availability"]

### Validity checks

In [29]:
# Count the missing values of every column once
missing_counts = {
    column: np.count_nonzero(df_trips[column].isna())
    for column in df_trips.columns
}

# Report the share of missing values for every affected column ...
for column, missing in missing_counts.items():
    if missing > 0:
        print(column, missing / len(df_trips))

# ... and stop if any column is incomplete
for column, missing in missing_counts.items():
    assert missing == 0

In [30]:
# Drop trips without any distance, then show the weighted mean trip distance
has_distance = df_trips["euclidean_distance_km"].gt(0.0)
df_trips = df_trips[has_distance].copy()

weighted_distance_km = (df_trips["euclidean_distance_km"] * df_trips["weight"]).sum()
weighted_distance_km / df_trips["weight"].sum()

np.float64(4.418787040979056)

In [31]:
# Weighted mean income per person, only available when income was merged in.
# The value is bound to a name and evaluated as the last top-level expression
# of the cell: an expression nested inside the `if` body is never displayed by
# the notebook, so the result would otherwise be computed and thrown away.
weighted_income_per_person_EUR = None

if "income_per_person_EUR" in df_trips:
    weighted_income_per_person_EUR = (
        (df_trips["income_per_person_EUR"] * df_trips["weight"]).sum() / df_trips["weight"].sum())

weighted_income_per_person_EUR

In [32]:
# Switch for treating motorbike as a mode of its own
use_motorbike = False

if not use_motorbike:
    # Motorbike trips are counted as car trips
    is_motorbike_trip = df_trips["mode"] == "motorbike"
    df_trips.loc[is_motorbike_trip, "mode"] = "car"

    # A motorbike permit / available motorbike counts for the car as well
    df_trips["has_driving_permit"] = df_trips["has_driving_permit"] | df_trips["has_motorbike_permit"]
    df_trips["car_availability"] = df_trips["car_availability"] | df_trips["motorbike_availability"]

    # Drop every column whose name starts with "motorbike_"
    motorbike_columns = [name for name in df_trips.columns if name.startswith("motorbike_")]
    df_trips = df_trips.drop(columns = motorbike_columns)

### Cleanup

In [33]:
# Modes for which an availability flag exists
modes = ["car", "car_passenger", "transit", "bicycle", "walk"]
if use_motorbike:
    modes.append("motorbike")

# Collect all trips whose chosen mode is flagged as not available
f_remove = np.zeros((len(df_trips),), dtype = bool)

for mode in modes:
    is_unavailable = ~df_trips["{}_availability".format(mode)]
    is_chosen = df_trips["mode"] == mode
    f_mode = is_unavailable & is_chosen

    print("Removing {} invalid trips for {}".format(np.count_nonzero(f_mode), mode))
    f_remove = f_remove | f_mode.values

df_trips = df_trips[~f_remove].copy()

Removing 485 invalid trips for car
Removing 7616 invalid trips for car_passenger
Removing 506 invalid trips for transit
Removing 117 invalid trips for bicycle
Removing 6 invalid trips for walk


### Output

In [34]:
output_path = "../../results/choice_model/input.parquet"

# The output columns are assembled group by group; the order is kept as is
columns = ["trip_id", "weight", "mode", "euclidean_distance_km"]

# person
columns += ["age"]

# car
columns += [
    "car_availability", "has_driving_permit",
    "car_in_vehicle_time_min", "car_cost_EUR",
    "car_access_time_min", "car_egress_time_min", "car_walk_time_min",
]

# parking
columns += ["parking_cost_EUR", "parking_pressure"]

# car passenger
columns += [
    "car_passenger_availability",
    "car_passenger_in_vehicle_time_min",
    "car_passenger_access_time_min", "car_passenger_egress_time_min", "car_passenger_walk_time_min",
]

# transit
columns += [
    "has_pt_subscription",
    "transit_availability", "transit_transfers", "transit_only_bus",
    "transit_total_in_vehicle_time_min", "transit_total_walk_time_min",
    "transit_transfer_wait_time_min", "transit_initial_wait_time_min",
    "transit_cost_EUR",
    "transit_in_vehicle_time_rail_min", "transit_in_vehicle_time_tram_min",
    "transit_in_vehicle_time_subway_min", "transit_in_vehicle_time_bus_min",
]

# bicycle
columns += ["bicycle_availability", "bicycle_travel_time_min"]

# walking
columns += ["walk_availability", "walk_travel_time_min"]

# for spatial analysis
columns += ["origin_iris", "destination_iris"]

# motorbike (only when it is kept as a mode of its own)
if use_motorbike:
    columns += [
        "motorbike_availability", "has_motorbike_permit",
        "motorbike_in_vehicle_time_min", "motorbike_cost_EUR",
        "motorbike_access_time_min", "motorbike_egress_time_min", "motorbike_walk_time_min",
    ]

# income (only when it has been merged in)
if "income_EUR" in df_trips:
    columns += ["income_per_person_EUR", "income_EUR"]

df_trips[columns].to_parquet(output_path)