In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import networkx as nx
import osmnx as ox
from shapely.geometry import LineString, Point, MultiPoint, Polygon
import itertools
import warnings
warnings.filterwarnings("ignore")

#### Read In Datasets and Files

* am2015 is the AM dataframe that was created previously
* pm2015 is the PM dataframe that was created previously
* off2015 is the Off-Peak dataframe that was created previously
* track_miles contains the track distance in miles between OD pairs
* mstns contains the station names and ID numbers

In [None]:
# The three 2015 period tables (AM peak, PM peak, off-peak) are read in the
# same order as before; the generator is consumed left to right by the unpacking.
am2015, pm2015, off2015 = (
    pd.read_excel(workbook)
    for workbook in (
        "../../Data/100_AM_Peak_Other_Regular_Riders.xlsx",
        "../../Data/200_PMPeak_Model_Long_file.xlsx",
        "../../Data/300_OffPeak_Other_Regular_Riders.xlsx",
    )
)

# Origin x destination matrix of track miles: the second sheet row holds the
# column labels and the first column holds the row labels.
track_miles = pd.read_excel(
    "../../Data/railOD_trackMiles_spring2022.xlsx",
    header=1,
    index_col=0,
)

# Lookup of station IDs and station names.
mstns = pd.read_excel("../../Data/mstn_id_to_stn_name.xlsx")

#### AM Preprocessing

* Create a names column for the station names in mstns so that track_miles and the mstns can merge easily
* Format the track_miles into OD pairs
* Merge the station IDs into track_miles, drop unnecessary columns
* Create an OD pair column in the track_miles dataframe that can be used to merge track_miles and 2015 dataframes together
* Format the 2015 OD pair column into strings
* Subset am 2015 by the OD Pair column, auto travel time, auto travel time per mile and log auto travel time per mile columns
* Merge the track_miles and 2015 AM dataframes together by the od pairs columns

In [None]:
# Sort the station lookup by its FY23 primary name so that the positional list
# assigned below lines up row-for-row with the stations.
mstns.sort_values(by="PRIMARY_NAME_FY23", inplace=True)
# Station names as they are spelled in the track_miles table, one entry per
# row of the sorted mstns frame (the list must have exactly as many entries as
# mstns has rows). This column is the join key used to attach station IDs to
# track_miles.
# NOTE(review): the empty strings presumably mark stations that have no
# counterpart in track_miles -- confirm against the track-miles workbook.
mstns['track_miles_names'] = ['Addison Road', 'Anacostia', 'Archives', 'Arlington Cemetery', "",
       'Ballston-MU', 'Benning Road', 'Bethesda', 'Braddock Road',
       'Branch Ave', 'Brookland-CUA', 'Capitol Heights', 'Capitol South',
       'Cheverly', 'Clarendon', 'Cleveland Park', 'College Park-U of Md',
       'Columbia Heights', 'Congress Heights', 'Court House',
       'Crystal City', 'Deanwood', 'Largo Town Center', "", 'Dunn Loring', 'Dupont Circle',
       'East Falls Church', 'Eastern Market', 'Eisenhower Ave',
       'Farragut North', 'Farragut West', 'Federal Center SW',
       'Federal Triangle', 'Foggy Bottom-GWU', 'Forest Glen',
       'Fort Totten', 'Franconia-Springfield', 'Friendship Heights',
       'Gallery Place', 'Georgia Ave-Petworth', 'Glenmont', 'Greenbelt',
       'Greensboro', 'Grosvenor-Strathmore', "",'Huntington',"Prince George's Plaza", "",
       'Judiciary Square', 'King St-Old Town', "L'Enfant Plaza",
       'Landover',"", 'McLean', 'McPherson Square',
       'Medical Center', 'Metro Center', 'Minnesota Ave',
       'Morgan Boulevard', 'Mt Vernon Sq', 'Navy Yard-Ballpark',
       'Naylor Road', 'New Carrollton', 'NoMa-Gallaudet U','White Flint', 'Pentagon',
       'Pentagon City', 'Potomac Ave', "","",
       'Rhode Island Ave', 'Rockville',
       'Ronald Reagan Washington National Airport', 'Rosslyn',
       'Shady Grove', 'Shaw-Howard Univ', 'Silver Spring', 'Smithsonian',
       'Southern Ave', 'Spring Hill', 'Stadium-Armory', 'Suitland',
       'Takoma', 'Tenleytown-AU', 'Twinbrook', 'Tysons Corner',
       'U Street', 'Union Station', 'Van Dorn Street', 'Van Ness-UDC',
       'Vienna', 'Virginia Square-GMU', 'Waterfront', 'West Falls Church',
       'West Hyattsville', 'Wheaton',  'Wiehle',
       'Woodley Park']

In [None]:
# Reshape the wide origin x destination matrix into one row per OD pair with
# columns O, D and track_miles.
od_distances = track_miles.stack()
od_distances = od_distances.rename_axis(('O', 'D'))
track_miles = od_distances.reset_index(name="track_miles")
track_miles.head()

In [None]:
# First join: attach the station record of the origin (O) to every OD pair.
origin_tagged = pd.merge(mstns, track_miles, left_on='track_miles_names', right_on='O')
# Second join: attach the station record of the destination (D). Column names
# shared by both sides get pandas' default suffixes: "_x" for the left
# (destination) frame and "_y" for the right (origin) frame.
track_miles2 = pd.merge(mstns, origin_tagged, left_on='track_miles_names', right_on='D')

In [None]:
# Discard the station-name columns carried over from both joins; the ID
# columns, O, D and track_miles remain.
redundant_name_cols = [
    "PRIMARY_NAME_FY23_x",
    "PRIMARY_NAME_FY23_y",
    "track_miles_names_x",
    "track_miles_names_y",
]
track_miles2 = track_miles2.drop(columns=redundant_name_cols)

In [None]:
def _od_key(row, first_col, second_col):
    """Join two ID values of a row into one string with a literal "0" between them."""
    return str(row[first_col]) + str(0) + str(row[second_col])


# 'pairs' is built from the full station IDs (ID_y first, then ID_x).
track_miles2['pairs'] = track_miles2.apply(_od_key, axis=1, args=('ID_y', 'ID_x'))

# Shortened IDs: remove the four-capital-letter + "_0" prefix, then any
# remaining leading zeros.
for short_col, full_col in (('ID1', 'ID_y'), ('ID2', 'ID_x')):
    without_prefix = track_miles2[full_col].replace('[A-Z]{4}_0', '', regex=True)
    track_miles2[short_col] = without_prefix.str.lstrip("0")

# 'pairs2' is the same key built from the shortened IDs; it is the column the
# 2015 frames are later merged on.
track_miles2['pairs2'] = track_miles2.apply(_od_key, axis=1, args=('ID1', 'ID2'))
track_miles2.head()

In [None]:
# Cast the OD-pair key to string so it can be matched against track_miles2['pairs2'].
am2015 = am2015.astype({'odpair_mstn': str})

In [None]:
# Keep only the merge key and the three auto travel-time columns.
auto_time_cols = ['odpair_mstn', 'auto_tt', 'auto_tt_per_mile', 'log_auto_tt_per_mile']
subset = am2015.loc[:, auto_time_cols]
subset.head()

In [None]:
# Left join: every OD pair from track_miles2 is kept; pairs with no AM record
# get NaN in the auto travel-time columns.
am_merged = pd.merge(
    track_miles2,
    subset,
    how='left',
    left_on='pairs2',
    right_on='odpair_mstn',
)

* Filter the rows that are NA in the auto_tt_per_mile column
* Fit a simple linear regression of auto travel time on track miles
* Apply the fitted regression to estimate a new auto travel time for the rows where auto travel time per mile was missing
* Divide the new auto travel time by track miles to get the new auto travel time per mile
* Subset only the required variable names in am_merged2
* Merge am_merged and am_merged2, then fill the missing values of auto travel time and auto travel time per mile with the regression estimates
* Plot the relationship between the old auto travel time per mile and the new auto travel time per mile
* Export

In [None]:
# Rows whose auto travel time per mile is missing after the merge.
am_lacks_auto_time = am_merged['auto_tt_per_mile'].isna()
am_merged2 = am_merged.loc[am_lacks_auto_time]

In [None]:
# Fit auto_tt ~ const + track_mile on the AM data.
# Rows with a missing auto_tt are excluded first, exactly as the PM and
# Off-Peak cells do: sm.OLS does not drop missing values by default, so a
# single NaN in the response would otherwise yield NaN coefficients.
am_has_auto_tt = am2015['auto_tt'].notna()
Y = am2015.loc[am_has_auto_tt, 'auto_tt']
X = am2015.loc[am_has_auto_tt, 'track_mile']
X = sm.add_constant(X)  # adds the intercept column
model = sm.OLS(Y, X)
results = model.fit()
results.summary()

In [None]:
# Display the column labels of the missing-auto-time rows before new columns are added.
am_merged2.columns

In [None]:
# Intercept and slope used to estimate AM auto travel time from track miles.
# NOTE(review): these look hand-transcribed from the regression fitted in the
# previous cell -- confirm they still match results.params whenever the input
# data change.
AM_AUTO_TT_INTERCEPT = 6.488705
AM_AUTO_TT_SLOPE = 2.111382

# Build the estimates from am_merged instead of modifying am_merged2 in place.
# The earlier form (a) assigned new columns into a boolean-filtered slice, which
# is chained assignment, and (b) overwrote its own input with a three-column
# subset, so running the cell a second time raised KeyError on 'track_miles'.
# .assign() works on a fresh frame and the cell can now be re-run safely.
am_missing_auto_time = am_merged[am_merged['auto_tt_per_mile'].isna()]
am_merged2 = (
    am_missing_auto_time
    .assign(new_auto_tt=lambda d: AM_AUTO_TT_INTERCEPT + AM_AUTO_TT_SLOPE * d['track_miles'])
    .assign(new_auto_tt_per_mile=lambda d: d['new_auto_tt'] / d['track_miles'])
    .loc[:, ['pairs2', 'new_auto_tt', 'new_auto_tt_per_mile']]
)

In [None]:
# Preview the first rows of the regression-based estimates (pairs2, new_auto_tt, new_auto_tt_per_mile).
am_merged2.head()

In [None]:
# Bring the estimates back onto the full AM table; pairs that already had an
# observed auto travel time get NaN in the two new_* columns.
am_merged3 = pd.merge(am_merged, am_merged2, how='left', on='pairs2')

In [None]:
# Observed values take priority; the regression estimate is used only where
# the observed value is missing.
am_per_mile_filled = am_merged3['auto_tt_per_mile'].fillna(am_merged3['new_auto_tt_per_mile'])
am_total_filled = am_merged3['auto_tt'].fillna(am_merged3['new_auto_tt'])
am_merged3 = am_merged3.assign(
    new_auto_tt_per_mile2=am_per_mile_filled,
    new_auto_tt2=am_total_filled,
)
am_merged3.head(2)

In [None]:
# Scatter of observed per-mile auto time (x) against the filled series (y).
am_merged3.plot.scatter(x='auto_tt_per_mile', y='new_auto_tt_per_mile2')

In [None]:
# Export the full-ID pair key together with the two filled travel-time columns.
am_export_cols = ['pairs','new_auto_tt2','new_auto_tt_per_mile2']
am_interpolated_times = am_merged3.loc[:, am_export_cols]
am_interpolated_times.to_csv("output/am_interpolated_auto_times.csv")

#### PM Preprocessing

* Format the 2015 OD pair column into strings
* Subset pm 2015 by the OD Pair column, auto travel time, auto travel time per mile and log auto travel time per mile columns
* Merge the track_miles and 2015 PM dataframes together by the od pairs columns

In [None]:
# Cast the OD-pair key to string so it can be matched against track_miles2['pairs2'].
pm2015 = pm2015.astype({'odpair_mstn': str})

In [None]:
# Keep only the merge key and the three auto travel-time columns of the PM table.
pm_auto_time_cols = ['odpair_mstn', 'auto_tt', 'auto_tt_per_mile', 'log_auto_tt_per_mile']
subset = pm2015.loc[:, pm_auto_time_cols]
subset

In [None]:
# Left join: every OD pair from track_miles2 is kept; pairs with no PM record
# get NaN in the auto travel-time columns.
pm_merged = pd.merge(
    track_miles2,
    subset,
    how='left',
    left_on='pairs2',
    right_on='odpair_mstn',
)

* Filter the rows that are NA in the auto_tt_per_mile column
* Fit a simple linear regression of auto travel time on track miles
* Apply the fitted regression to estimate a new auto travel time for the rows where auto travel time per mile was missing
* Divide the new auto travel time by track miles to get the new auto travel time per mile
* Subset only the required variable names in pm_merged2
* Merge pm_merged and pm_merged2, then fill the missing values of auto travel time and auto travel time per mile with the regression estimates
* Plot the relationship between the old auto travel time per mile and the new auto travel time per mile
* Export

In [None]:
# Rows whose auto travel time per mile is missing after the merge, followed by
# a look at their track distances.
pm_lacks_auto_time = pm_merged['auto_tt_per_mile'].isna()
pm_merged2 = pm_merged.loc[pm_lacks_auto_time]
pm_merged2['track_miles']

In [None]:
# Fit auto_tt ~ const + track_mile on the PM rows that have an observed auto_tt.
pm_has_auto_tt = pm2015['auto_tt'].notna()
Y = pm2015.loc[pm_has_auto_tt, 'auto_tt']
X = sm.add_constant(pm2015.loc[pm_has_auto_tt, 'track_mile'])  # adds the intercept column
model = sm.OLS(Y, X)
results = model.fit()
results.summary()

In [None]:
# Intercept and slope used to estimate PM auto travel time from track miles.
# NOTE(review): these look hand-transcribed from the regression fitted in the
# previous cell -- confirm they still match results.params whenever the input
# data change.
PM_AUTO_TT_INTERCEPT = 6.5882
PM_AUTO_TT_SLOPE = 2.0964

# Build the estimates from pm_merged instead of modifying pm_merged2 in place.
# The earlier form (a) assigned new columns into a boolean-filtered slice, which
# is chained assignment, and (b) overwrote its own input with a three-column
# subset, so running the cell a second time raised KeyError on 'track_miles'.
# .assign() works on a fresh frame and the cell can now be re-run safely.
pm_missing_auto_time = pm_merged[pm_merged['auto_tt_per_mile'].isna()]
pm_merged2 = (
    pm_missing_auto_time
    .assign(new_auto_tt=lambda d: PM_AUTO_TT_INTERCEPT + PM_AUTO_TT_SLOPE * d['track_miles'])
    .assign(new_auto_tt_per_mile=lambda d: d['new_auto_tt'] / d['track_miles'])
    .loc[:, ['pairs2', 'new_auto_tt', 'new_auto_tt_per_mile']]
)

In [None]:
# Preview the first rows of the regression-based estimates (pairs2, new_auto_tt, new_auto_tt_per_mile).
pm_merged2.head()

In [None]:
# Bring the estimates back onto the full PM table; pairs that already had an
# observed auto travel time get NaN in the two new_* columns.
pm_merged3 = pd.merge(pm_merged, pm_merged2, how='left', on='pairs2')

In [None]:
# Observed values take priority; the regression estimate is used only where
# the observed value is missing.
pm_per_mile_filled = pm_merged3['auto_tt_per_mile'].fillna(pm_merged3['new_auto_tt_per_mile'])
pm_total_filled = pm_merged3['auto_tt'].fillna(pm_merged3['new_auto_tt'])
pm_merged3 = pm_merged3.assign(
    new_auto_tt_per_mile2=pm_per_mile_filled,
    new_auto_tt2=pm_total_filled,
)
pm_merged3.head()

In [None]:
# Scatter of observed per-mile auto time (x) against the filled series (y).
pm_merged3.plot.scatter(x='auto_tt_per_mile', y='new_auto_tt_per_mile2')

In [None]:
# Export the full-ID pair key together with the two filled travel-time columns.
pm_export_cols = ['pairs','new_auto_tt2','new_auto_tt_per_mile2']
pm_interpolated_times = pm_merged3.loc[:, pm_export_cols]
pm_interpolated_times.to_csv("output/pm_interpolated_auto_times.csv")

#### Off-Peak Preprocessing

* Format the 2015 OD pair column into strings
* Subset off 2015 by the OD Pair column, auto travel time, auto travel time per mile and log auto travel time per mile columns
* Merge the track_miles and 2015 Off-Peak dataframes together by the od pairs columns

In [None]:
# Cast the OD-pair key to string so it can be matched against track_miles2['pairs2'].
off2015 = off2015.astype({'odpair_mstn': str})

In [None]:
# Keep only the merge key and the three auto travel-time columns of the Off-Peak table.
off_auto_time_cols = ['odpair_mstn', 'auto_tt', 'auto_tt_per_mile', 'log_auto_tt_per_mile']
subset = off2015.loc[:, off_auto_time_cols]
subset

In [None]:
# Left join: every OD pair from track_miles2 is kept; pairs with no Off-Peak
# record get NaN in the auto travel-time columns.
off_merged = pd.merge(
    track_miles2,
    subset,
    how='left',
    left_on='pairs2',
    right_on='odpair_mstn',
)

* Filter the rows that are NA in the auto_tt_per_mile column
* Fit a simple linear regression of auto travel time on track miles
* Apply the fitted regression to estimate a new auto travel time for the rows where auto travel time per mile was missing
* Divide the new auto travel time by track miles to get the new auto travel time per mile
* Subset only the required variable names in off_merged2
* Merge off_merged and off_merged2, then fill the missing values of auto travel time and auto travel time per mile with the regression estimates
* Plot the relationship between the old auto travel time per mile and the new auto travel time per mile
* Export

In [None]:
# Rows whose auto travel time per mile is missing after the merge, followed by
# a look at their track distances.
off_lacks_auto_time = off_merged['auto_tt_per_mile'].isna()
off_merged2 = off_merged.loc[off_lacks_auto_time]
off_merged2['track_miles']

In [None]:
# Fit auto_tt ~ const + track_mile on the Off-Peak rows that have an observed auto_tt.
off_has_auto_tt = off2015['auto_tt'].notna()
Y = off2015.loc[off_has_auto_tt, 'auto_tt']
X = sm.add_constant(off2015.loc[off_has_auto_tt, 'track_mile'])  # adds the intercept column
model = sm.OLS(Y, X)
results = model.fit()
results.summary()

In [None]:
# Intercept and slope used to estimate Off-Peak auto travel time from track miles.
# NOTE(review): these look hand-transcribed from the regression fitted in the
# previous cell -- confirm they still match results.params whenever the input
# data change.
OFF_AUTO_TT_INTERCEPT = 5.1336
OFF_AUTO_TT_SLOPE = 1.4237

# Build the estimates from off_merged instead of modifying off_merged2 in place.
# The earlier form (a) assigned new columns into a boolean-filtered slice, which
# is chained assignment, and (b) overwrote its own input with a three-column
# subset, so running the cell a second time raised KeyError on 'track_miles'.
# .assign() works on a fresh frame and the cell can now be re-run safely.
off_missing_auto_time = off_merged[off_merged['auto_tt_per_mile'].isna()]
off_merged2 = (
    off_missing_auto_time
    .assign(new_auto_tt=lambda d: OFF_AUTO_TT_INTERCEPT + OFF_AUTO_TT_SLOPE * d['track_miles'])
    .assign(new_auto_tt_per_mile=lambda d: d['new_auto_tt'] / d['track_miles'])
    .loc[:, ['pairs2', 'new_auto_tt', 'new_auto_tt_per_mile']]
)

In [None]:
# Preview the first rows of the regression-based estimates (pairs2, new_auto_tt, new_auto_tt_per_mile).
off_merged2.head()

In [None]:
# Bring the estimates back onto the full Off-Peak table; pairs that already had
# an observed auto travel time get NaN in the two new_* columns.
off_merged3 = pd.merge(off_merged, off_merged2, how='left', on='pairs2')

In [None]:
# Observed values take priority; the regression estimate is used only where
# the observed value is missing.
off_per_mile_filled = off_merged3['auto_tt_per_mile'].fillna(off_merged3['new_auto_tt_per_mile'])
off_total_filled = off_merged3['auto_tt'].fillna(off_merged3['new_auto_tt'])
off_merged3 = off_merged3.assign(
    new_auto_tt_per_mile2=off_per_mile_filled,
    new_auto_tt2=off_total_filled,
)
off_merged3

In [None]:
# Scatter of observed per-mile auto time (x) against the filled series (y).
off_merged3.plot.scatter(x='auto_tt_per_mile', y='new_auto_tt_per_mile2')

In [None]:
# Export the full-ID pair key together with the two filled travel-time columns.
off_export_cols = ['pairs','new_auto_tt2','new_auto_tt_per_mile2']
off_interpolated_times = off_merged3.loc[:, off_export_cols]
off_interpolated_times.to_csv("output/off_interpolated_auto_times.csv")