In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import time
import geopandas as gpd

from shapely.geometry import Point, LineString, shape

## Load Data

In [35]:
# Trip table: the two clock columns are kept as strings and the start/end
# timestamps are parsed into datetimes while reading.
df = pd.read_csv(
    r'..\data\processed\trips_custom_variables.csv',
    dtype={'VORIHORAINI': str, 'VDESHORAFIN': str},
    parse_dates=['start_time', 'end_time'],
)
# Raw "ETAPAS" workbook (presumably one row per trip stage -- TODO confirm).
etap = pd.read_excel(r'..\data\raw\EDM2018XETAPAS.xlsx')

In [36]:
# Put both tables on the same three-level key so they can be joined index-to-index.
trip_key = ["ID_HOGAR", "ID_IND", "ID_VIAJE"]
for frame in (df, etap):
    frame.set_index(trip_key, inplace=True)

# Columns that exist in both tables keep their name from `df`; the copy coming
# from `etap` gets the "_etap" suffix.
legs = df.join(etap, rsuffix="_etap")

In [37]:
# Keep only the rows whose simplified mode is public transport.
is_public_transport = legs["mode_simple"] == "public transport"
legs = legs[is_public_transport]

In [38]:
# Table of translated codes; the CODE column is read as float.
codes = pd.read_csv(
    r'..\data\processed\codes_translated.csv',
    dtype={'CODE': float},
)

In [39]:
# Stop locations read from the stops shapefile.
stops = gpd.read_file(
    r'..\data\raw\public_transport_madrid\madrid_crtm_stops.shp'
)

In [40]:
# Collapse the rows of each trip into a single row. Rows are ordered by ID_ETAPA
# first, so "first" picks values from the lowest stage id (departure stop) and
# "last" from the highest one (arrival stop).
trip_level_aggregation = {
    "C2SEXO": "first",
    "ESUBIDA": "first",
    "ESUBIDA_cod": "first",
    "EBAJADA": "last",
    "EBAJADA_cod": "last",
    "N_ETAPAS_POR_VIAJE": "first",
    "VORIHORAINI": "first",
    "duration": "first",
    "DANNO": "first",
    "DMES": "first",
    "DDIA": "first",
}
legs_start_end = (
    legs
    .sort_values("ID_ETAPA")
    .groupby(["ID_HOGAR", "ID_IND", "ID_VIAJE"])
    .agg(trip_level_aggregation)
)

In [41]:
# Discard trips that lack a code for either the departure or the arrival stop.
legs_start_end = legs_start_end.dropna(subset=["ESUBIDA_cod", "EBAJADA_cod"])

### Preprocessing

In [42]:
# Earlier attempt: match on the numeric tail of stop_id instead of the name.
# stops["id_custom"] = stops.stop_id.str.split("_").apply(lambda x: x[len(x)-1])
# s = stops.reset_index().set_index(["id_custom", "stop_name"])[["geometry"]]

# Known problem: the match is imperfect. id_custom occurs several times within
# df_stations, and matching by name does not find a stop for every start / end.
# Stops are therefore reduced to one row per name and matched by name; the inner
# joins drop every trip whose departure or arrival name has no match.
stops_unique_name = stops.drop_duplicates(subset="stop_name").set_index("stop_name")

# The first join attaches the departure stop; in the second join the clashing
# stop columns are told apart by the "_dep" / "_arrival" suffixes.
df_stations = (
    legs_start_end
    .join(stops_unique_name, on="ESUBIDA", how="inner")
    .join(stops_unique_name, on="EBAJADA", how="inner", lsuffix="_dep", rsuffix="_arrival")
)

#df_stations["line"] = df_stations.apply(lambda x: LineString([x.geometry_dep, x.geometry_arrival]), axis = 1)
#df_stations = gpd.GeoDataFrame(df_stations, geometry = df_stations.line)

In [43]:
# df_stations[["VORIHORAINI", "VDESHORAFIN", "start_time", "end_time", "duration", "DANNO", "DMES", "DDIA", "activity_simple", "motive_simple", "daytime", "speed", "C2SEXO", "EDAD_FIN", "ESUBIDA", "ESUBIDA_cod", "EBAJADA", "EBAJADA_cod", "geometry_dep", "geometry_arrival"]].to_csv(
#    r'..\data\processed\public_transport_georeferenced.csv')

In [44]:
#df_stations[["activity_simple", "motive_simple", "daytime", "speed", "C2SEXO", "EDAD_FIN", "ESUBIDA", "ESUBIDA_cod", "EBAJADA", "EBAJADA_cod", "geometry"]].to_file(
#    r'..\data\processed\public_transport_georeferenced.geojson', driver = "GeoJSON")

### (use preprocessed data)

In [45]:
# df_stations = pd.read_csv(r'..\data\processed\public_transport_georeferenced.csv', dtype = {'VORIHORAINI':str, 'VDESHORAFIN':str, 'geometry_dep':'geometry'})

### counts for Flowmap

In [None]:
# Count trips and sum the ELE_G_POND_ESC2 weights per (departure stop, arrival
# stop, activity, C2SEXO) combination, keeping the first geometry of each group.
# TODO: add the departure->arrival LineString again so the flow map can draw lines.
# NOTE(review): this cell has no execution count and reads ID_ETAPA,
# ELE_G_POND_ESC2, activity_simple and geometry, but df_stations as built in the
# preprocessing cells only carries the aggregated trip columns plus the suffixed
# geometry_dep / geometry_arrival columns. It presumably ran against an earlier
# version of df_stations -- confirm before relying on it after a kernel restart.
counts = df_stations.groupby(["ESUBIDA", "EBAJADA", "activity_simple", "C2SEXO"]).agg({"ID_ETAPA": "count", "ELE_G_POND_ESC2" : "sum", "geometry": "first"})

In [None]:
# Give the summed weight column a descriptive name (modifies `counts` in place).
counts.rename(columns={"ELE_G_POND_ESC2": "weighted_count"}, inplace=True)

In [None]:
# Wrap the aggregated table as a GeoDataFrame that uses its "geometry" column.
df_counts = gpd.GeoDataFrame(
    counts,
    geometry="geometry",
)

In [None]:
# Export the per-route counts as GeoJSON.
counts_geojson_path = r'..\data\processed\trip_counts_georef.geojson'
df_counts.to_file(counts_geojson_path, driver="GeoJSON")

In [298]:
# Size check of the aggregated table: (number of groups, number of columns).
counts.shape

(17659, 3)

In [None]:
# Trip counts and summed weights per (departure stop, arrival stop, C2SEXO),
# exported as GeoJSON.
# NOTE(review): reads ID_ETAPA, ELE_G_POND_ESC2 and geometry, which the
# df_stations built in the preprocessing cells does not carry -- confirm which
# version of df_stations this cell is meant to run against.
counts_gender = (
    df_stations
    .groupby(["ESUBIDA", "EBAJADA", "C2SEXO"])
    .agg({"ID_ETAPA": "count", "ELE_G_POND_ESC2": "sum", "geometry": "first"})
)
counts_gender.rename(columns={"ELE_G_POND_ESC2": "weighted_count"}, inplace=True)

df_counts_gender = gpd.GeoDataFrame(counts_gender, geometry="geometry")
df_counts_gender.to_file(r'..\data\processed\trip_counts_gender_georef.geojson', driver="GeoJSON")

In [None]:
# Trip counts and summed weights per (departure stop, arrival stop, activity),
# exported as GeoJSON.
# NOTE(review): reads ID_ETAPA, ELE_G_POND_ESC2, activity_simple and geometry,
# which the df_stations built in the preprocessing cells does not carry --
# confirm which version of df_stations this cell is meant to run against.
counts_activity = (
    df_stations
    .groupby(["ESUBIDA", "EBAJADA", "activity_simple"])
    .agg({"ID_ETAPA": "count", "ELE_G_POND_ESC2": "sum", "geometry": "first"})
)
counts_activity.rename(columns={"ELE_G_POND_ESC2": "weighted_count"}, inplace=True)

df_counts_activity = gpd.GeoDataFrame(counts_activity, geometry="geometry")
df_counts_activity.to_file(r'..\data\processed\trip_counts_activity_georef.geojson', driver="GeoJSON")

In [89]:
# Trip counts and summed weights per (departure stop, arrival stop, motive),
# exported as GeoJSON.
# NOTE(review): reads ID_ETAPA, ELE_G_POND_ESC2, motive_simple and geometry,
# which the df_stations built in the preprocessing cells does not carry --
# confirm which version of df_stations this cell is meant to run against.
counts_motive = (
    df_stations
    .groupby(["ESUBIDA", "EBAJADA", "motive_simple"])
    .agg({"ID_ETAPA": "count", "ELE_G_POND_ESC2": "sum", "geometry": "first"})
)
counts_motive.rename(columns={"ELE_G_POND_ESC2": "weighted_count"}, inplace=True)

df_counts_motive = gpd.GeoDataFrame(counts_motive, geometry="geometry")
df_counts_motive.to_file(r'..\data\processed\trip_counts_motive_georef.geojson', driver="GeoJSON")

### comparison to car

In [46]:
import herepy

In [47]:
# The HERE API key is a credential and must not be stored in the notebook.
# Read it from the HERE_API_KEY environment variable instead; the key that was
# previously committed here should be treated as leaked and revoked.
import os

here_api_key = os.environ.get("HERE_API_KEY")
if not here_api_key:
    raise RuntimeError(
        "HERE_API_KEY is not set; export your HERE API key before running the routing cells."
    )
routingApi = herepy.RoutingApi(here_api_key)

In [48]:
# Parse the HHMM start-of-trip string into a timestamp. No date is given in the
# format, so pandas fills in its default date (1900-01-01).
df_stations["start_time"] = pd.to_datetime(
    df_stations["VORIHORAINI"],
    format="%H%M",
)
# df_stations['end_time'] = pd.to_datetime(df_stations.VDESHORAFIN, format = '%H%M', errors = 'coerce')
# df_stations['duration'] = df_stations.end_time - df_stations.start_time

In [49]:
# Build the departure timestamp string "<year>-<MM>-<DD>T<HH>:<MM>:00" that is
# later passed to the routing calls as `departure`.
date_part = (
    df_stations.DANNO.astype(str)
    + '-' + df_stations.DMES.astype(str).str.zfill(2)
    + '-' + df_stations.DDIA.astype(str).str.zfill(2)
)
clock_part = (
    df_stations.VORIHORAINI.str.slice(0, 2)
    + ":" + df_stations.VORIHORAINI.str.slice(2, 4)
    + ':00'
)
df_stations["formatted_time"] = date_part + 'T' + clock_part

In [50]:
# Create empty placeholders for the travel times that the routing loop fills in.
for empty_column in ("car_traveltime", "pt_traveltime"):
    df_stations[empty_column] = None

In [51]:
# One row per distinct departure/arrival pair so that each route is requested
# from the routing service only once. `.copy()` detaches the result from df_stations.
route_key = ["ESUBIDA", "EBAJADA", "geometry_dep", "geometry_arrival"]
df_unique_routes = df_stations.drop_duplicates(subset=route_key).copy()

In [52]:
# Replace the trip index with 0..n-1 so the routing loop can address rows with
# `.loc[i, ...]` using its integer counter.
df_unique_routes.reset_index(inplace=True, drop=True)

In [54]:
# Request a car and a public-transport route for every unique departure/arrival
# pair and store the "travelTime" value of the first returned route. The values
# are divided by 60 in a later cell to obtain minutes.
#
# Changes compared with the previous version of this cell:
#   * `except Exception` instead of a bare `except:` so that Ctrl-C / kernel
#     interrupt can still stop this long-running loop, and the real error is
#     printed instead of being reported as "no route found" unconditionally.
#   * the "already routed" check looks at a single cell instead of recomputing
#     `notna()` over the whole column on every iteration.
#   * the row, coordinates and departure time are looked up once per iteration.

def _summary_travel_time(routing_response):
    """Return the travelTime entry of the first route's summary in a routing response."""
    return routing_response.response["route"][0]["summary"]["travelTime"]


for i in range(len(df_unique_routes)):
    # Rows that already hold a car travel time were handled by an earlier
    # (possibly interrupted) run of this cell; do not query them again.
    if pd.notna(df_unique_routes.loc[i, "car_traveltime"]):
        continue
    if i % 1000 == 0:
        print(i)

    route_row = df_unique_routes.iloc[i]
    # Coordinates are passed as [y, x] of the stop geometries.
    origin = [route_row.geometry_dep.y, route_row.geometry_dep.x]
    destination = [route_row.geometry_arrival.y, route_row.geometry_arrival.x]
    departure = df_unique_routes.loc[i, "formatted_time"]

    try:
        resp_car = routingApi.car_route(
            origin,
            destination,
            [herepy.RouteMode.car, herepy.RouteMode.fastest],
            departure=departure,
        )
        df_unique_routes.loc[i, "car_traveltime"] = _summary_travel_time(resp_car)
    except Exception as car_error:
        print('car no route found, id:', i, '-', repr(car_error))
        df_unique_routes.loc[i, "car_traveltime"] = None

    try:
        resp_pt = routingApi.public_transport(
            origin,
            destination,
            True,
            modes=[herepy.RouteMode.publicTransport, herepy.RouteMode.fastest],
            departure=departure,
        )
        df_unique_routes.loc[i, "pt_traveltime"] = _summary_travel_time(resp_pt)
    except Exception as pt_error:
        print('pt no route found, id:', i, '-', repr(pt_error))
        df_unique_routes.loc[i, "pt_traveltime"] = None
        

0
pt no route found, id: 215
1000
2000
pt no route found, id: 2098
pt no route found, id: 2100
pt no route found, id: 2101
3000
pt no route found, id: 3092
pt no route found, id: 3991
4000
pt no route found, id: 4398
pt no route found, id: 4509
pt no route found, id: 4876
pt no route found, id: 4957
5000
pt no route found, id: 5205
pt no route found, id: 5206
pt no route found, id: 5365
car no route found, id: 5746
pt no route found, id: 5782
6000
pt no route found, id: 6265
pt no route found, id: 6769
7000
pt no route found, id: 7877
pt no route found, id: 7886
pt no route found, id: 7934
pt no route found, id: 7960
8000
pt no route found, id: 8125
pt no route found, id: 8139
pt no route found, id: 8697
pt no route found, id: 8736
pt no route found, id: 8757
9000
pt no route found, id: 9132
pt no route found, id: 9202
10000
pt no route found, id: 10930
11000
pt no route found, id: 11464
pt no route found, id: 11507
12000
pt no route found, id: 12752
13000
pt no route found, id: 13072


In [58]:
# (rows, columns) of the unique routes that have no public-transport travel time.
df_unique_routes[df_unique_routes.pt_traveltime.isna()].shape

(68, 20)

In [57]:
# (rows, columns) of the unique routes that have no car travel time.
df_unique_routes[df_unique_routes.car_traveltime.isna()].shape

(1, 20)

In [56]:
# Persist the routing results so the routing requests do not have to be repeated.
unique_routes_csv_path = r'..\data\processed\unique_routings_run2_2.csv'
df_unique_routes.to_csv(unique_routes_csv_path)

In [59]:
# Derive the "*_min" columns by dividing the stored travel times by 60.
for source_column, minutes_column in (
    ("car_traveltime", "car_traveltime_min"),
    ("pt_traveltime", "pt_traveltime_min"),
):
    df_unique_routes[minutes_column] = df_unique_routes[source_column] / 60

In [60]:
# Attach the per-route travel times (in minutes) to every trip, matching on the
# departure and arrival stop names.
route_times = df_unique_routes.set_index(["ESUBIDA", "EBAJADA"])[
    ["car_traveltime_min", "pt_traveltime_min"]
]
df_stations = df_stations.join(route_times, on=["ESUBIDA", "EBAJADA"])

In [66]:
# Make sure df_stations carries the C2SEXO column.
# The trip-level aggregation that builds legs_start_end already includes C2SEXO,
# in which case joining it again raises "columns overlap but no suffix
# specified"; only join when the column is actually missing. This also makes
# the cell safe to re-run.
if "C2SEXO" not in df_stations.columns:
    # `legs` repeats the trip key when a trip has several rows; look the value
    # up from the first row per key so the join cannot multiply df_stations rows.
    sex_per_trip = legs.loc[~legs.index.duplicated(keep="first"), "C2SEXO"]
    df_stations = df_stations.join(sex_per_trip, how="left")

In [99]:
# Attach the age group of each trip to df_stations.
# The guard makes the cell safe to re-run: joining a column that is already
# present raises "columns overlap but no suffix specified".
if "age_group" not in df_stations.columns:
    # `legs` repeats the trip key when a trip has several rows; look the value
    # up from the first row per key so the join cannot multiply df_stations rows.
    # NOTE(review): assumes age_group is constant within a trip -- confirm.
    age_group_per_trip = legs.loc[~legs.index.duplicated(keep="first"), "age_group"]
    df_stations = df_stations.join(age_group_per_trip, how="left")

In [69]:
#days, seconds = df_stations.duration.dt.days, df_stations.duration.dt.seconds
#df_stations["minutes"] = seconds % 3600

In [100]:
# Remove rows that repeat the same trip (identical index key AND identical
# column values), e.g. the copies produced by joining against the multi-row
# `legs` table.
# A plain `drop_duplicates()` ignores the index, so two different trips that
# happen to share every column value (same stops, date, start time, duration,
# ...) would be merged into one and silently lost from the statistics below.
is_repeated_trip = df_stations.reset_index().duplicated().values
df_stations = df_stations[~is_repeated_trip]

In [101]:
# Start with an empty tt_ratio column; the next cell fills it for selected rows.
df_stations["tt_ratio"] = None

In [102]:
# Ratio of public-transport to car travel time (both from the routing service).
# The previous version only excluded rows with pt_traveltime_min == 0, although
# the division is by car_traveltime_min. Rows with a zero public-transport time
# are still skipped as before, and rows with a zero car time are now skipped as
# well so the denominator can never be 0. Skipped rows keep their empty tt_ratio.
has_usable_times = (df_stations.pt_traveltime_min != 0) & (df_stations.car_traveltime_min != 0)
df_stations.loc[has_usable_times, "tt_ratio"] = (
    df_stations.loc[has_usable_times, "pt_traveltime_min"]
    / df_stations.loc[has_usable_times, "car_traveltime_min"]
)

In [103]:
# Ratio of the trip's `duration` column to the routed car travel time, computed
# only where the car travel time is not 0.
car_time_nonzero = df_stations.car_traveltime_min != 0
df_stations.loc[car_time_nonzero, "tt_ratio_duration"] = (
    df_stations.loc[car_time_nonzero, "duration"]
    / df_stations.loc[car_time_nonzero, "car_traveltime_min"]
)

In [104]:
# Visual check of the travel-time columns and the two ratios derived from them.
df_stations[["start_time", "duration", "car_traveltime_min", "pt_traveltime_min", "tt_ratio", "tt_ratio_duration", "age_group"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,start_time,duration,car_traveltime_min,pt_traveltime_min,tt_ratio,tt_ratio_duration,age_group
ID_HOGAR,ID_IND,ID_VIAJE,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
742,2,1.0,1900-01-01 08:00:00,90.0,21.1333,67.9167,3.21372,4.25868,adults
742,2,2.0,1900-01-01 17:00:00,80.0,20,69.8333,3.49167,4,adults
1096,1,1.0,1900-01-01 07:40:00,20.0,23.0833,77.9833,3.37834,0.866426,adults
1096,1,2.0,1900-01-01 15:00:00,60.0,21.7167,75.15,3.46048,2.76285,adults
1479,2,1.0,1900-01-01 07:30:00,45.0,9.88333,29.5,2.98482,4.55312,kids
...,...,...,...,...,...,...,...,...,...
6008999,2,1.0,1900-01-01 07:40:00,25.0,11.2167,63.9,5.69688,2.22883,kids
6008999,2,2.0,1900-01-01 14:30:00,30.0,11.2333,49.95,4.44659,2.67062,kids
6009043,2,3.0,1900-01-01 16:40:00,110.0,25.6167,114.417,4.46649,4.29408,kids
6010779,1,1.0,1900-01-01 10:00:00,30.0,20.3667,35.8167,1.75859,1.473,young adults


In [105]:
# tt_ratio was created as a column of None; cast it to float so describe()
# treats it as numeric.
df_stations["tt_ratio"] = df_stations["tt_ratio"].astype(float)

In [106]:
# Cast tt_ratio_duration to float so describe() treats it as numeric.
df_stations["tt_ratio_duration"] = df_stations["tt_ratio_duration"].astype(float)

In [107]:
# Summary statistics of duration / car travel time per age group and C2SEXO value.
df_stations.groupby(["age_group", "C2SEXO"]).tt_ratio_duration.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
age_group,C2SEXO,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
adults,1,4661.0,2.867918,1.641428,0.229709,1.962388,2.562278,3.321033,28.658074
adults,2,7757.0,2.882704,1.835693,0.0,2.038505,2.608696,3.347639,100.0
kids,1,1256.0,2.927261,1.855637,0.123421,1.804783,2.553191,3.546808,27.835052
kids,2,841.0,3.336633,10.406611,0.310976,1.854141,2.605863,3.680982,300.0
seniors,1,3181.0,2.76429,1.936612,0.19544,1.814516,2.442334,3.2287,53.181818
seniors,2,3860.0,2.795835,1.821012,0.12766,1.894239,2.521892,3.315681,69.902913
young adults,1,1979.0,2.888981,2.283238,0.310976,1.967751,2.523364,3.311128,60.0
young adults,2,2151.0,2.889548,1.6322,0.253807,1.988072,2.578797,3.336425,22.429907


In [108]:
# Summary statistics of public-transport / car travel time per age group and C2SEXO value.
df_stations.groupby(["age_group", "C2SEXO"]).tt_ratio.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
age_group,C2SEXO,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
adults,1,4652.0,1.936849,0.755302,0.177074,1.462174,1.80762,2.265432,11.897915
adults,2,7732.0,1.990268,0.77689,0.317623,1.484778,1.84065,2.337422,8.84507
kids,1,1252.0,2.293603,1.770997,0.177074,1.420405,1.885696,2.71517,35.548485
kids,2,840.0,2.314621,1.665524,0.331776,1.441784,1.846016,2.608686,19.468713
seniors,1,3178.0,1.900978,0.870605,0.363319,1.410738,1.748097,2.211457,19.187146
seniors,2,3854.0,1.95292,1.069895,0.27334,1.416247,1.775855,2.282697,36.279736
young adults,1,1964.0,2.116669,1.362108,0.27334,1.499041,1.882353,2.457666,36.279736
young adults,2,2143.0,2.090513,1.05858,0.363319,1.516022,1.892463,2.437484,20.562926
