In [9]:
import os

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#### Read in Datasets and Files

* railOD_tt_fare contains the travel times, distances and fares provided by WMATA
* stndists contains the OD pair distances computed via OSMnx
* mstns contains the station ID and two formats of the station name

In [10]:
# WMATA-provided travel times, distances and fares for the rail OD pairs.
railOD_tt_fare = pd.read_excel(
    "../../Data/railOD_TravelTimesAndFares.xlsx"
)

# OD-pair distance table; only the columns at positions 1, 2, 3 and 5 are kept.
stndists = pd.read_excel(
    "../../Data/25_35_drive_times_distances.xlsx"
).iloc[:, [1, 2, 3, 5]]

# Station lookup table (station ID plus station-name variants).
mstns = pd.read_csv(
    "../../Data/metro_walkshed_name.csv"
)

#### Data Preprocessing

* Remove extra zeros in the origin and destination MSTN ids and create an OD pair column
* Filter the necessary columns
* Consolidate the MSTNs for merging
* Assign MSTNs for station names in the stndists table
* Merge the stndists and railOD_peak_fare and calculate the peak fare per mile
* Merge the stndists and railOD_off_peak_fare and calculate the off-peak fare per mile
* Export both the peak and off peak fares per mile

In [11]:
# Create the OD pair key from the numeric part of the origin/destination station IDs.
def _station_number(station_ids):
    """Return each station ID's number as a string without leading zeros.

    The whole trailing run of digits is used ('MSTN_001' -> '1',
    'MSTN_062' -> '62', 'MSTN_100' -> '100'). Slicing only the last two
    characters would turn 'MSTN_100' into an empty string and truncate any
    station number of 100 or more.
    """
    return station_ids.str.extract(r'(\d+)$', expand=False).str.lstrip('0')


railOD_tt_fare['mstn_id_o'] = _station_number(railOD_tt_fare['O_MSTN_ID'])
railOD_tt_fare['mstn_id_d'] = _station_number(railOD_tt_fare['D_MSTN_ID'])
# Pair key = origin number + '0' + destination number (e.g. 1 -> 2 gives '102').
railOD_tt_fare['pairs'] = railOD_tt_fare['mstn_id_o'] + '0' + railOD_tt_fare['mstn_id_d']
railOD_tt_fare.head()

Unnamed: 0,O_MSTN_ID,D_MSTN_ID,O_PRIMARY_NAME,D_PRIMARY_NAME,COMP_MILE,PEAK_FARE,OFF_PEAK_FARE,SD_FARE,TRAVEL_TIME,mstn_id_o,mstn_id_d,pairs
0,MSTN_001,MSTN_001,Anacostia,Anacostia,0.05,2.25,2.0,1.1,0,1,1,101
1,MSTN_001,MSTN_002,Anacostia,Archives,2.83,2.25,2.0,1.1,9,1,2,102
2,MSTN_001,MSTN_003,Anacostia,Benning Road,5.76,3.15,2.6,1.55,26,1,3,103
3,MSTN_001,MSTN_004,Anacostia,Brookland-CUA,5.98,3.2,2.6,1.6,27,1,4,104
4,MSTN_001,MSTN_005,Anacostia,Capitol South,2.55,2.25,2.0,1.1,16,1,5,105


In [12]:
# Preview the first five rows of the fare table.
railOD_tt_fare.head(n=5)

Unnamed: 0,O_MSTN_ID,D_MSTN_ID,O_PRIMARY_NAME,D_PRIMARY_NAME,COMP_MILE,PEAK_FARE,OFF_PEAK_FARE,SD_FARE,TRAVEL_TIME,mstn_id_o,mstn_id_d,pairs
0,MSTN_001,MSTN_001,Anacostia,Anacostia,0.05,2.25,2.0,1.1,0,1,1,101
1,MSTN_001,MSTN_002,Anacostia,Archives,2.83,2.25,2.0,1.1,9,1,2,102
2,MSTN_001,MSTN_003,Anacostia,Benning Road,5.76,3.15,2.6,1.55,26,1,3,103
3,MSTN_001,MSTN_004,Anacostia,Brookland-CUA,5.98,3.2,2.6,1.6,27,1,4,104
4,MSTN_001,MSTN_005,Anacostia,Capitol South,2.55,2.25,2.0,1.1,16,1,5,105


In [13]:
# One frame per fare type; each keeps the OD station IDs and the pair key.
railOD_peak_fare = railOD_tt_fare.loc[
    :, ['O_MSTN_ID', 'D_MSTN_ID', 'pairs', 'PEAK_FARE']
]
railOD_off_peak_fare = railOD_tt_fare.loc[
    :, ['O_MSTN_ID', 'D_MSTN_ID', 'pairs', 'OFF_PEAK_FARE']
]

In [14]:
# Name -> station ID lookup: one row per distinct Name_1 (first occurrence kept),
# with Name_1 exposed as NM for the merges that follow.
nm_mstn = (
    mstns
    .drop_duplicates(subset='Name_1')
    .reset_index()
    .loc[:, ['Name_1', 'MSTN']]
    .rename(columns={'Name_1': 'NM'})
)
nm_mstn.head()

Unnamed: 0,NM,MSTN
0,ADDISON ROAD-SEAT PLEASANT,MSTN_062
1,ANACOSTIA,MSTN_001
2,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,MSTN_002
3,ARLINGTON CEMETERY,MSTN_065
4,BALLSTON-MU,MSTN_068


In [15]:
# Attach origin and destination station IDs to every OD distance row by matching
# the station names against nm_mstn.

# Always start from the four base columns. This cell overwrites `stndists`, so
# without this step a re-run would merge onto a frame that already carries
# O_MSTN_ID / D_MSTN_ID and end up with duplicate ID columns.
stndists_base = stndists[['index', 'origins', 'destinations', 'distance']]

# The joins below are inner joins: rows whose station name has no entry in
# nm_mstn are dropped. Report those names so the loss is visible, not silent.
unmatched_names = (
    set(stndists_base['origins']) | set(stndists_base['destinations'])
) - set(nm_mstn['NM'])
if unmatched_names:
    print(
        f"{len(unmatched_names)} station name(s) have no MSTN id; their rows are dropped: "
        f"{sorted(map(str, unmatched_names))}"
    )

stndists = (
    stndists_base
    # validate guards against a non-unique lookup silently duplicating OD rows.
    .merge(nm_mstn, left_on='origins', right_on='NM', validate='many_to_one')
    .rename(columns={'MSTN': 'O_MSTN_ID'})
    .loc[:, ['index', 'origins', 'destinations', 'distance', 'O_MSTN_ID']]
    .merge(nm_mstn, left_on='destinations', right_on='NM', validate='many_to_one')
    .rename(columns={'MSTN': 'D_MSTN_ID'})
    .loc[:, ['index', 'origins', 'destinations', 'O_MSTN_ID', 'D_MSTN_ID', 'distance']]
)
stndists.head()

Unnamed: 0,index,origins,destinations,O_MSTN_ID,D_MSTN_ID,distance
0,0,ANACOSTIA,ADDISON ROAD-SEAT PLEASANT,MSTN_001,MSTN_062,6.664029
1,1,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,ADDISON ROAD-SEAT PLEASANT,MSTN_002,MSTN_062,7.358313
2,2,ARLINGTON CEMETERY,ADDISON ROAD-SEAT PLEASANT,MSTN_065,MSTN_062,9.583608
3,4,BALLSTON-MU,ADDISON ROAD-SEAT PLEASANT,MSTN_068,MSTN_062,12.456791
4,5,BENNING ROAD,ADDISON ROAD-SEAT PLEASANT,MSTN_003,MSTN_062,2.611865


In [16]:
# Join peak fares to the OD distances on the station-ID pair, divide the fare by
# the distance, and keep only the identifying columns plus the resulting ratio.
railOD_peak_fare_per_mile = (
    railOD_peak_fare
    .merge(stndists, on=['O_MSTN_ID', 'D_MSTN_ID'])
    .assign(peak_fare_per_mile=lambda od: od['PEAK_FARE'] / od['distance'])
    .loc[:, ['pairs', 'O_MSTN_ID', 'D_MSTN_ID', 'peak_fare_per_mile']]
)

In [17]:
# Preview the first five peak fare-per-mile rows.
railOD_peak_fare_per_mile.head(n=5)

Unnamed: 0,pairs,O_MSTN_ID,D_MSTN_ID,peak_fare_per_mile
0,102,MSTN_001,MSTN_002,0.747972
1,103,MSTN_001,MSTN_003,0.75851
2,104,MSTN_001,MSTN_004,0.539648
3,105,MSTN_001,MSTN_005,1.063716
4,106,MSTN_001,MSTN_006,0.489141


In [18]:
# Join off-peak fares to the OD distances on the station-ID pair, divide the fare
# by the distance, and keep only the identifying columns plus the resulting ratio.
railOD_off_peak_fare_per_mile = (
    railOD_off_peak_fare
    .merge(stndists, on=['O_MSTN_ID', 'D_MSTN_ID'])
    .assign(off_peak_fare_per_mile=lambda od: od['OFF_PEAK_FARE'] / od['distance'])
    .loc[:, ['pairs', 'O_MSTN_ID', 'D_MSTN_ID', 'off_peak_fare_per_mile']]
)

In [19]:
# Preview the first five off-peak fare-per-mile rows.
railOD_off_peak_fare_per_mile.head(n=5)

Unnamed: 0,pairs,O_MSTN_ID,D_MSTN_ID,off_peak_fare_per_mile
0,102,MSTN_001,MSTN_002,0.664864
1,103,MSTN_001,MSTN_003,0.626072
2,104,MSTN_001,MSTN_004,0.438464
3,105,MSTN_001,MSTN_005,0.945526
4,106,MSTN_001,MSTN_006,0.379632


In [20]:
# Export the off-peak fare-per-mile table to Excel.
# to_excel does not create missing parent folders, so make sure "output/" exists
# first; this is a no-op when the folder is already there.
os.makedirs("output", exist_ok=True)
railOD_off_peak_fare_per_mile.to_excel(
    "output/railOD_off_peak_fare_per_mile.xlsx",
    sheet_name='railOD_off_peak_fare_per_mile',
    index=True,  # the row index is written to the sheet as well
)

In [21]:
# Export the peak fare-per-mile table to Excel.
# to_excel does not create missing parent folders, so make sure "output/" exists
# first; this is a no-op when the folder is already there.
os.makedirs("output", exist_ok=True)
railOD_peak_fare_per_mile.to_excel(
    "output/railOD_peak_fare_per_mile.xlsx",
    sheet_name='railOD_peak_fare_per_mile',
    index=True,  # the row index is written to the sheet as well
)