In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import openpyxl
import warnings
warnings.filterwarnings("ignore")

# AM Dataframe

* passengers includes the number of passengers in the desired rider class
* fare_track_miles includes the fare per track mile
* auto_tt_minpmile includes the interpolated auto travel times
* bus_tt_minpmile includes the bus travel times per mile
* households_half_mile includes the proportional number of households within each walkshed
* am_parking_users includes the number of parking users per OD pair
* num_bus_lines contains the number of bus lines and stops
* jobs_half_mile contains the proportional number of jobs within each walkshed
* trains_per_hour contains the number of trains in each time period (only AM will be used)
* terminal_station contains a dummy variable of whether or not a station is at the end of a line (1) or not (0)
* dist_to_core contains the distance from a station to the Metro Center station in miles
* parking_capacity contains the number of parking spots available per station
* mstns contains the station names and IDs used for merging the variables together

In [2]:
# Inputs produced under "../Data Preprocessing/<step>/output/".
passengers = pd.read_excel(
    "../Data Preprocessing/Ridership Data/output/full_fare_ridership.xlsx"
)
fare_track_miles = pd.read_excel(
    "../Data Preprocessing/Fare per Track Mile/output/railOD_peak_fare_per_mile.xlsx"
)
auto_tt_minpmile = pd.read_csv(
    "../Data Preprocessing/Interpolated Auto Travel Times/output/am_interpolated_auto_times.csv"
)
bus_tt_minpmile = pd.read_excel(
    "../Data Preprocessing/Bus Travel Time/output/busttpermile_ML.xlsx"
)
households_half_mile = pd.read_excel(
    "../Data Preprocessing/Proportion of Households/output/walkshed_proportional_households_stations.xlsx"
)
am_parking_users = pd.read_csv(
    "../Data Preprocessing/Parking Users/output/am_parking_updated.csv"
)
num_bus_lines = pd.read_csv(
    "../Data Preprocessing/Bus Lines and Stops/output/bus_line_stop.csv"
)
jobs_half_mile = pd.read_excel(
    "../Data Preprocessing/Proportion of Jobs/output/proportional_jobs_stations_ML2.xlsx"
)

# Inputs read directly from "../Data/".
trains_per_hour = pd.read_excel(
    "../Data/avgTrainThroughput_byPeriod_PeakOnly.xlsx"
)
terminal_station = pd.read_csv(
    "../Data/metro_ternimal_dummy.csv"
)
# Second spreadsheet row holds the column labels; first column becomes the row index.
track_miles = pd.read_excel(
    "../Data/railOD_trackMiles_spring2022.xlsx",
    header=1,
    index_col=0,
)
dist_to_core = pd.read_excel(
    "../Data Preprocessing/Distance to the Core/output/stations.xlsx"
)
parking_capacity = pd.read_excel(
    "../Data/parkingCapacityByStation.xlsx"
)
mstns = pd.read_excel(
    "../Data/mstn_id_to_stn_name.xlsx"
)

* Sort the station names alphabetically
* Add columns to the mstns dataframe holding alternatively formatted station names, so each source file can be matched to a station ID when merging

In [3]:
# Order stations alphabetically by primary name; the name lists assigned to
# mstns right after this are positional and follow the same alphabetical order.
mstns = mstns.sort_values(by='PRIMARY_NAME_FY23')
# Station names as spelled in the walkshed files, one entry per row of the
# sorted mstns frame. The assignment is positional, so the list must have
# exactly len(mstns) entries in the same order as the sorted rows.
# These names are later used as the join key against jobs_half_mile['Name_1'].
# NOTE(review): the "" entry presumably marks a station with no walkshed name -- confirm.
mstns['walkshed_filename'] = ['ADDISON ROAD-SEAT PLEASANT','ANACOSTIA','ARCHIVES-NAVY MEMORIAL-PENN QUARTER','ARLINGTON CEMETERY','Ashburn','BALLSTON-MU',
                              'BENNING ROAD','BETHESDA','BRADDOCK ROAD','BRANCH AVE','BROOKLAND-CUA','CAPITOL HEIGHTS','CAPITOL SOUTH','CHEVERLY',
                              'CLARENDON','CLEVELAND PARK','COLLEGE PARK-U OF MD','COLUMBIA HEIGHTS','CONGRESS HEIGHTS','COURT HOUSE','CRYSTAL CITY',
                              'DEANWOOD','LARGO TOWN CENTER','Washington Dulles International Airport', 'DUNN LORING-MERRIFIELD','DUPONT CIRCLE','EAST FALLS CHURCH','EASTERN MARKET','EISENHOWER AVENUE','FARRAGUT NORTH',
                              'FARRAGUT WEST','FEDERAL CENTER SW','FEDERAL TRIANGLE','FOGGY BOTTOM-GWU','FOREST GLEN','FORT TOTTEN','FRANCONIA-SPRINGFIELD',
                              'FRIENDSHIP HEIGHTS','GALLERY PL-CHINATOWN','GEORGIA AVE-PETWORTH','GLENMONT','GREENBELT','GREENSBORO','GROSVENOR-STRATHMORE',
                              'Herndon','HUNTINGTON',"PRINCE GEORGE'S PLAZA",'Innovation Center','JUDICIARY SQUARE','KING ST-OLD TOWN',"L'ENFANT PLAZA",'LANDOVER',
                              'Loudoun Gateway','MCLEAN','MCPHERSON SQUARE','MEDICAL CENTER','METRO CENTER','MINNESOTA AVE','MORGAN BOULEVARD',
                              'MT VERNON SQ 7TH ST-CONVENTION CENTER','NAVY YARD-BALLPARK','NAYLOR ROAD','NEW CARROLLTON','NOMA-GALLAUDET','WHITE FLINT','PENTAGON',
                              'PENTAGON CITY','POTOMAC AVE', "",'Reston Town Center','RHODE ISLAND AVE-BRENTWOOD','ROCKVILLE','RONALD REAGAN WASHINGTON NATIONAL AIRPORT',
                              'ROSSLYN','SHADY GROVE','SHAW-HOWARD U','SILVER SPRING','SMITHSONIAN','SOUTHERN AVENUE','SPRING HILL',
                              'STADIUM-ARMORY','SUITLAND','TAKOMA','TENLEYTOWN-AU','TWINBROOK','TYSONS CORNER','U STREET/AFRICAN-AMER CIVIL WAR MEMORIAL/CARDOZO',
                              'UNION STATION','VAN DORN STREET','VAN NESS-UDC','VIENNA/FAIRFAX-GMU','VIRGINIA SQUARE-GMU','WATERFRONT','WEST FALLS CHURCH-VT/UVA',
                              'WEST HYATTSVILLE','WHEATON','WIEHLE-RESTON EAST','WOODLEY PARK-ZOO/ADAMS MORGAN',]

# Alternative station-name spellings, assigned positionally to the sorted
# mstns rows (same length and order as the rows). This column is later used as
# the join key against START_PLACE_NAME / END_PLACE_NAME in am_parking_users.
# NOTE(review): "" entries presumably mark stations that have no name in that
# source, so they would not match any parking row -- confirm.
mstns['WMATA_filename'] = ['Addison Road', 'Anacostia', 'Archives-Navy Memorial',
       'Arlington Cemetery', "", 'Ballston', 'Benning Road', 'Bethesda',
       'Braddock Road', 'Branch Avenue', 'Brookland', 'Capitol Heights',
       'Capitol South', 'Cheverly', 'Clarendon', 'Cleveland Park',
       'College Park-U of MD', 'Columbia Heights', 'Congress Heights',
       'Court House', 'Crystal City', 'Deanwood','Largo Town Center', "", 'Dunn Loring',
       'Dupont Circle', 'East Falls Church', 'Eastern Market',
       'Eisenhower Avenue', 'Farragut North', 'Farragut West',
       'Federal Center SW', 'Federal Triangle', 'Foggy Bottom',
       'Forest Glen', 'Fort Totten', 'Franconia-Springfield',
       'Friendship Heights', 'Gallery Place-Chinatown',
       'Georgia Avenue-Petworth', 'Glenmont', 'Greenbelt', 'Greensboro',
       'Grosvenor', "", 'Huntington',"Prince George's Plaza", "", 'Judiciary Square', 'King Street',
       "L'Enfant Plaza", 'Landover', "", 'McLean',
       'McPherson Square', 'Medical Center', 'Metro Center',
       'Minnesota Avenue', 'Morgan Blvd.', 'Mt. Vernon Square-UDC',
       'Navy Yard', 'Naylor Road', 'New Carrollton', 'New York Ave','White Flint',
       'Pentagon', 'Pentagon City', 'Potomac Avenue', "", "",'Rhode Island Avenue','Rockville', 
        'Reagan Washington National Airport',
        'Rosslyn', 'Shady Grove',
       'Shaw-Howard University', 'Silver Spring', 'Smithsonian',
       'Southern Avenue', 'Spring Hill', 'Stadium-Armory', 'Suitland',
       'Takoma', 'Tenleytown-AU', 'Twinbrook', 'Tysons Corner',
       'U Street-Cardozo', 'Union Station', 'Van Dorn Street',
       'Van Ness-UDC', 'Vienna', 'Virginia Square-GMU', 'Waterfront',
       'West Falls Church', 'West Hyattsville', 'Wheaton', 
       'Wiehle', 'Woodley Park-Zoo']

# Alternative station-name spellings, assigned positionally to the sorted
# mstns rows (same length and order as the rows). This column is later used as
# the join key against the 'O' and 'D' station names of the track_miles table.
# NOTE(review): "" entries presumably mark stations missing from the track-miles
# matrix, so they would drop out of the inner merges -- confirm.
mstns['track_miles_names'] = ['Addison Road', 'Anacostia', 'Archives', 'Arlington Cemetery', "",
       'Ballston-MU', 'Benning Road', 'Bethesda', 'Braddock Road',
       'Branch Ave', 'Brookland-CUA', 'Capitol Heights', 'Capitol South',
       'Cheverly', 'Clarendon', 'Cleveland Park', 'College Park-U of Md',
       'Columbia Heights', 'Congress Heights', 'Court House',
       'Crystal City', 'Deanwood', 'Largo Town Center', "", 'Dunn Loring', 'Dupont Circle',
       'East Falls Church', 'Eastern Market', 'Eisenhower Ave',
       'Farragut North', 'Farragut West', 'Federal Center SW',
       'Federal Triangle', 'Foggy Bottom-GWU', 'Forest Glen',
       'Fort Totten', 'Franconia-Springfield', 'Friendship Heights',
       'Gallery Place', 'Georgia Ave-Petworth', 'Glenmont', 'Greenbelt',
       'Greensboro', 'Grosvenor-Strathmore', "",'Huntington',"Prince George's Plaza", "",
       'Judiciary Square', 'King St-Old Town', "L'Enfant Plaza",
       'Landover',"", 'McLean', 'McPherson Square',
       'Medical Center', 'Metro Center', 'Minnesota Ave',
       'Morgan Boulevard', 'Mt Vernon Sq', 'Navy Yard-Ballpark',
       'Naylor Road', 'New Carrollton', 'NoMa-Gallaudet U','White Flint', 'Pentagon',
       'Pentagon City', 'Potomac Ave', "","",
       'Rhode Island Ave', 'Rockville',
       'Ronald Reagan Washington National Airport', 'Rosslyn',
       'Shady Grove', 'Shaw-Howard Univ', 'Silver Spring', 'Smithsonian',
       'Southern Ave', 'Spring Hill', 'Stadium-Armory', 'Suitland',
       'Takoma', 'Tenleytown-AU', 'Twinbrook', 'Tysons Corner',
       'U Street', 'Union Station', 'Van Dorn Street', 'Van Ness-UDC',
       'Vienna', 'Virginia Square-GMU', 'Waterfront', 'West Falls Church',
       'West Hyattsville', 'Wheaton',  'Wiehle',
       'Woodley Park']
# Upper-case station-name spellings, assigned positionally to the sorted mstns
# rows (same length and order as the rows).
# NOTE(review): this column is not referenced by any merge in the cells visible
# here; presumably it matches the auto travel-time source -- confirm it is still needed.
mstns['auto_filename'] = ['ADDISON ROAD-SEAT PLEASANT', 'ANACOSTIA',
       'ARCHIVES-NAVY MEMORIAL-PENN QUARTER', 'ARLINGTON CEMETERY',
       'ASHBURN', 'BALLSTON-MU', 'BENNING ROAD', 'BETHESDA',
       'BRADDOCK ROAD', 'BRANCH AVE', 'BROOKLAND-CUA', 'CAPITOL HEIGHTS',
       'CAPITOL SOUTH', 'CHEVERLY', 'CLARENDON', 'CLEVELAND PARK',
       'COLLEGE PARK-U OF MD', 'COLUMBIA HEIGHTS', 'CONGRESS HEIGHTS',
       'COURT HOUSE', 'CRYSTAL CITY', 'DEANWOOD', 'DOWNTOWN LARGO', 'WASHINGTON DULLES INTERNATIONAL AIRPORT',
       'DUNN LORING-MERRIFIELD', 'DUPONT CIRCLE', 'EAST FALLS CHURCH',
       'EASTERN MARKET', 'EISENHOWER AVENUE', 'FARRAGUT NORTH',
       'FARRAGUT WEST', 'FEDERAL CENTER SW', 'FEDERAL TRIANGLE',
       'FOGGY BOTTOM-GWU', 'FOREST GLEN', 'FORT TOTTEN',
       'FRANCONIA-SPRINGFIELD', 'FRIENDSHIP HEIGHTS',
       'GALLERY PLACE-CHINATOWN', 'GEORGIA AVENUE-PETWORTH', 'GLENMONT',
       'GREENBELT', 'GREENSBORO', 'GROSVENOR-STRATHMORE', 'HERNDON',
       'HUNTINGTON', 'HYATTSVILLE CROSSING', 'INNOVATION CENTER',
       'JUDICIARY SQUARE', 'KING STREET-OLD TOWN', "L'ENFANT PLAZA",
       'LANDOVER', 'LOUDOUN GATEWAY', 'MCLEAN', 'MCPHERSON SQUARE',
       'MEDICAL CENTER', 'METRO CENTER', 'MINNESOTA AVENUE',
       'MORGAN BOULEVARD', 'MT VERNON SQ 7TH ST-CONVENTION CENTER',
       'NAVY YARD-BALLPARK', 'NAYLOR ROAD', 'NEW CARROLLTON',
       'NOMA-GALLAUDET U', 'NORTH BETHESDA', 'PENTAGON', 'PENTAGON CITY',
       'POTOMAC AVE', "",'RESTON', 'RHODE ISLAND AVE-BRENTWOOD', 'ROCKVILLE',
       'RONALD REAGAN WASHINGTON NATIONAL AIRPORT', 'ROSSLYN',
       'SHADY GROVE', 'SHAW-HOWARD UNIVERSITY', 'SILVER SPRING',
       'SMITHSONIAN', 'SOUTHERN AVENUE', 'SPRING HILL', 'STADIUM-ARMORY',
       'SUITLAND', 'TAKOMA', 'TENLEYTOWN-AU', 'TWINBROOK', 'TYSONS',
       'U STREET/AFRICAN-AMER CIVIL WAR MEMORIAL/CARDOZO',
       'UNION STATION', 'VAN DORN STREET', 'VAN NESS-UDC',
       'VIENNA/FAIRFAX-GMU', 'VIRGINIA SQUARE-GMU',
       'WATERFRONT',
       'WEST FALLS CHURCH-VT/UVA', 'WEST HYATTSVILLE', 'WHEATON',
       'WIEHLE-RESTON EAST', 'WOODLEY PARK-ZOO/ADAMS MORGAN']

## AM Passenger Miles

* Filter for the AM trip passengers
* Format track_miles into OD pairs
* Give station names in track_miles proper MSTN IDs
* Create an od_pair key column in both the passengers and track miles dataframes
* Merge track miles onto the AM passengers, multiply track miles by the passenger count to get rider-miles, keep only the needed columns, and rename COUNT to passengers

In [4]:
# Keep only the AM-peak ridership rows. `.copy()` makes the result an
# independent frame: a `pairs` column is assigned to am_passengers in a later
# cell, and writing into a filtered slice of `passengers` triggers pandas'
# SettingWithCopyWarning (hidden here by warnings.filterwarnings("ignore")).
am_passengers = passengers.loc[passengers['period'] == 'AM Peak'].copy()

In [5]:
# Reshape the station-by-station matrix to long form: stack() moves the column
# labels into a second index level, the two levels are named O and D, and the
# stacked values become the 'track_miles' column.
od_distance_series = track_miles.stack()
od_distance_series = od_distance_series.rename_axis(('O', "D"))
track_miles = od_distance_series.reset_index(name='track_miles')

In [6]:
# Attach station records to each OD row by name. The first join matches the
# 'O' name; the second matches the 'D' name. After the second join pandas
# suffixes the clashing mstns columns: _x = record matched on D, _y = record
# matched on O.
track_miles2 = pd.merge(
    mstns,
    track_miles,
    left_on='track_miles_names',
    right_on='O',
)
track_miles2 = pd.merge(
    mstns,
    track_miles2,
    left_on='track_miles_names',
    right_on='D',
)

In [7]:
# Build the OD key "<first ID>0<second ID>" used to join the two tables.
# Vectorized string concatenation replaces the row-wise apply(axis=1): same
# strings, but without a Python-level call per row, and it also works on an
# empty frame.
am_passengers['pairs'] = am_passengers['ID'].astype(str) + '0' + am_passengers['ID_1'].astype(str)
# In track_miles2, ID_y is the station matched on 'O' and ID_x the one matched on 'D'.
track_miles2['pairs'] = track_miles2['ID_y'].astype(str) + '0' + track_miles2['ID_x'].astype(str)


In [8]:
# Join track miles to AM ridership on the OD key, then compute rider-miles
# (track miles x passenger count) per OD pair.
milesmerge = track_miles2.merge(am_passengers, left_on='pairs', right_on='pairs')
milesmerge['riders_miles'] = milesmerge['track_miles'] * milesmerge['COUNT']

# Keep only the modelling columns and rename COUNT -> passengers.
# rename() is chained so it returns a new frame; the previous
# rename(..., inplace=True) mutated a column slice of milesmerge, which raises
# SettingWithCopyWarning.
am_dataframe_new = milesmerge[
    ['O', 'D', 'walkshed_filename_x', 'walkshed_filename_y', 'track_miles',
     'ID_x', 'ID_y', 'pairs', 'COUNT', 'riders_miles']
].rename(columns={'COUNT': 'passengers'})
am_dataframe_new.head()

Unnamed: 0,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,9,93.15
1,Ballston-MU,Addison Road,ADDISON ROAD-SEAT PLEASANT,BALLSTON-MU,14.27,MSTN_062,MSTN_068,MSTN_0680MSTN_062,8,114.16
2,Benning Road,Addison Road,ADDISON ROAD-SEAT PLEASANT,BENNING ROAD,2.46,MSTN_062,MSTN_003,MSTN_0030MSTN_062,56,137.76
3,Bethesda,Addison Road,ADDISON ROAD-SEAT PLEASANT,BETHESDA,16.36,MSTN_062,MSTN_052,MSTN_0520MSTN_062,7,114.52
4,Braddock Road,Addison Road,ADDISON ROAD-SEAT PLEASANT,BRADDOCK ROAD,15.0,MSTN_062,MSTN_078,MSTN_0780MSTN_062,4,60.0


## AM Trains per Hour

* Filter only for weekday services and in the AM peak
* Groupby and get the sum of trains per hour
* Merge into the main AM dataframe

In [9]:
# Restrict train throughput to weekday service in the AM peak.
is_am_peak = trains_per_hour['ARRIVAL_PERIOD'] == 'AM Peak'
is_weekday = trains_per_hour['SERVICETYPE'] == 'Weekday'
trains_per_hour2 = trains_per_hour[is_am_peak & is_weekday]
trains_per_hour2.head()

Unnamed: 0,STATION_ID,STOP_ID,DIR_ID,SERVICETYPE,ARRIVAL_PERIOD,AVG_TRAINS
8,MSTN_001,F06,1,Weekday,AM Peak,17
10,MSTN_001,F06,2,Weekday,AM Peak,16
20,MSTN_002,F02,1,Weekday,AM Peak,31
22,MSTN_002,F02,2,Weekday,AM Peak,31
32,MSTN_003,G01,1,Weekday,AM Peak,24


In [10]:
# Total AM-peak trains per station (summed over the rows of each STATION_ID).
# AVG_TRAINS is selected *before* aggregating: a bare groupby().sum() also
# tries to "sum" the string columns (STOP_ID, SERVICETYPE, ARRIVAL_PERIOD),
# which is wasted work and behaves differently across pandas versions.
trains_per_hour2 = trains_per_hour2.groupby('STATION_ID', as_index=False)['AVG_TRAINS'].sum()
trains_per_hour2.head()

Unnamed: 0,STATION_ID,AVG_TRAINS
0,MSTN_001,33
1,MSTN_002,62
2,MSTN_003,51
3,MSTN_004,56
4,MSTN_005,78


In [11]:
# Attach the per-station train counts twice: once via ID_x, once via ID_y.
# The second pass makes pandas suffix the clashing columns with _x / _y.
for station_id_col in ('ID_x', 'ID_y'):
    am_dataframe_new = am_dataframe_new.merge(
        trains_per_hour2,
        left_on=station_id_col,
        right_on='STATION_ID',
    )

In [12]:
# The join keys brought in by the train-count merges are no longer needed.
am_dataframe_new = am_dataframe_new.drop(columns=['STATION_ID_x', 'STATION_ID_y'])

## Fare per Track Mile

* Subset the origin and destination station ID columns and the peak fare per mile from fare_track_miles
* Create a unique OD pair column
* Merge into the main AM dataframe

In [13]:
# Keep only the OD station IDs and the peak fare per mile. `.copy()` detaches
# the subset from the frame it was sliced from, because a `pairs` column is
# assigned to it in the next cell (assigning into a slice raises
# SettingWithCopyWarning).
fare_track_miles = fare_track_miles[['O_MSTN_ID', 'D_MSTN_ID', 'peak_fare_per_mile']].copy()

In [14]:
# OD key "<origin ID>0<destination ID>", built with vectorized string
# concatenation instead of a row-wise apply(axis=1) (same strings, no
# per-row Python call).
fare_track_miles['pairs'] = (
    fare_track_miles['O_MSTN_ID'].astype(str) + '0' + fare_track_miles['D_MSTN_ID'].astype(str)
)

In [15]:
# Join fares on the OD key. The key is named explicitly: without `on=`, merge
# joins on every column name the two frames happen to share, so an unrelated
# shared column would silently change the join.
am_dataframe_new = am_dataframe_new.merge(fare_track_miles, on='pairs')
am_dataframe_new.head()

Unnamed: 0,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,AVG_TRAINS_x,AVG_TRAINS_y,O_MSTN_ID,D_MSTN_ID,peak_fare_per_mile
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,9,93.15,53,33,MSTN_001,MSTN_062,0.570226
1,Anacostia,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,ANACOSTIA,3.16,MSTN_002,MSTN_001,MSTN_0010MSTN_002,436,1377.76,62,33,MSTN_001,MSTN_002,0.747972
2,Anacostia,Arlington Cemetery,ARLINGTON CEMETERY,ANACOSTIA,6.24,MSTN_065,MSTN_001,MSTN_0010MSTN_065,5,31.2,27,33,MSTN_001,MSTN_065,0.602491
3,Anacostia,Ballston-MU,BALLSTON-MU,ANACOSTIA,9.12,MSTN_068,MSTN_001,MSTN_0010MSTN_068,95,866.4,53,33,MSTN_001,MSTN_068,0.477162
4,Anacostia,Benning Road,BENNING ROAD,ANACOSTIA,7.89,MSTN_003,MSTN_001,MSTN_0010MSTN_003,12,94.68,51,33,MSTN_001,MSTN_003,0.75851


## Auto Travel Time in Minutes/Mile

* Merge the interpolated auto travel times into the main AM dataframe

In [16]:
# Inner join of the interpolated auto travel times on the OD key.
am_dataframe_new = pd.merge(
    am_dataframe_new,
    auto_tt_minpmile,
    on='pairs',
)
am_dataframe_new.head()

Unnamed: 0.1,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,AVG_TRAINS_x,AVG_TRAINS_y,O_MSTN_ID,D_MSTN_ID,peak_fare_per_mile,Unnamed: 0,new_auto_tt2,new_auto_tt_per_mile2
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,9,93.15,53,33,MSTN_001,MSTN_062,0.570226,1,13.18,1.641345
1,Anacostia,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,ANACOSTIA,3.16,MSTN_002,MSTN_001,MSTN_0010MSTN_002,436,1377.76,62,33,MSTN_001,MSTN_002,0.747972,183,13.160672,4.16477
2,Anacostia,Arlington Cemetery,ARLINGTON CEMETERY,ANACOSTIA,6.24,MSTN_065,MSTN_001,MSTN_0010MSTN_065,5,31.2,27,33,MSTN_001,MSTN_065,0.602491,274,19.663729,3.151239
3,Anacostia,Ballston-MU,BALLSTON-MU,ANACOSTIA,9.12,MSTN_068,MSTN_001,MSTN_0010MSTN_068,95,866.4,53,33,MSTN_001,MSTN_068,0.477162,365,38.189999,4.896153
4,Anacostia,Benning Road,BENNING ROAD,ANACOSTIA,7.89,MSTN_003,MSTN_001,MSTN_0010MSTN_003,12,94.68,51,33,MSTN_001,MSTN_003,0.75851,456,23.147509,2.933778


## Bus Travel Time in Minutes/Mile

* Create a unique OD pair column
* Subset only the pairs, bus travel time and bus travel time per mile columns
* Merge into the main AM dataframe

In [17]:
# OD key "<origin ID>0<destination ID>", built with vectorized string
# concatenation instead of a row-wise apply(axis=1) (same strings, no
# per-row Python call).
bus_tt_minpmile['pairs'] = (
    bus_tt_minpmile['O_MSTN_ID'].astype(str) + '0' + bus_tt_minpmile['D_MSTN_ID'].astype(str)
)

In [18]:
# Keep the OD key and the two bus travel-time columns, renaming
# 'Travel Time' -> 'bus_tt'. rename() is chained (returns a new frame) rather
# than applied in place to a column slice, which raises SettingWithCopyWarning.
bus_tt_minpmile2 = (
    bus_tt_minpmile[['pairs', 'Travel Time', 'bus_tt_per_mile']]
    .rename(columns={'Travel Time': 'bus_tt'})
)

In [19]:
# Inner join of the bus travel times on the OD key.
am_dataframe_new = pd.merge(
    am_dataframe_new,
    bus_tt_minpmile2,
    on='pairs',
)
am_dataframe_new.head()

Unnamed: 0.1,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,AVG_TRAINS_x,AVG_TRAINS_y,O_MSTN_ID,D_MSTN_ID,peak_fare_per_mile,Unnamed: 0,new_auto_tt2,new_auto_tt_per_mile2,bus_tt,bus_tt_per_mile
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,9,93.15,53,33,MSTN_001,MSTN_062,0.570226,1,13.18,1.641345,82.0,12.304868
1,Anacostia,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,ANACOSTIA,3.16,MSTN_002,MSTN_001,MSTN_0010MSTN_002,436,1377.76,62,33,MSTN_001,MSTN_002,0.747972,183,13.160672,4.16477,31.0,10.305395
2,Anacostia,Arlington Cemetery,ARLINGTON CEMETERY,ANACOSTIA,6.24,MSTN_065,MSTN_001,MSTN_0010MSTN_065,5,31.2,27,33,MSTN_001,MSTN_065,0.602491,274,19.663729,3.151239,,
3,Anacostia,Ballston-MU,BALLSTON-MU,ANACOSTIA,9.12,MSTN_068,MSTN_001,MSTN_0010MSTN_068,95,866.4,53,33,MSTN_001,MSTN_068,0.477162,365,38.189999,4.896153,95.0,12.088109
4,Anacostia,Benning Road,BENNING ROAD,ANACOSTIA,7.89,MSTN_003,MSTN_001,MSTN_0010MSTN_003,12,94.68,51,33,MSTN_001,MSTN_003,0.75851,456,23.147509,2.933778,44.0,10.595066


## Households per Half Mile

* Subset only the ID, proportion of houses and the total number of households. Rename ID to MSTN
* Merge once based on the destinations and again based on the origins
* Drop duplicate columns

In [20]:
# Keep the station ID and the two household columns, renaming ID -> MSTN.
# rename() is chained (returns a new frame) rather than applied in place to a
# column slice, which raises SettingWithCopyWarning.
households_half_mile = (
    households_half_mile[['ID', 'proportionhouses', 'Total Households']]
    .rename(columns={'ID': 'MSTN'})
)

In [21]:
# Attach household figures twice: once via ID_x, once via ID_y. The second
# pass makes pandas suffix the clashing household columns with _x / _y.
for station_id_col in ('ID_x', 'ID_y'):
    am_dataframe_new = am_dataframe_new.merge(
        households_half_mile,
        left_on=station_id_col,
        right_on='MSTN',
    )
am_dataframe_new.head()

Unnamed: 0,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,...,new_auto_tt2,new_auto_tt_per_mile2,bus_tt,bus_tt_per_mile,MSTN_x,proportionhouses_x,Total Households_x,MSTN_y,proportionhouses_y,Total Households_y
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,9,93.15,...,13.18,1.641345,82.0,12.304868,MSTN_062,529.032225,1663,MSTN_001,2076.692445,3639
1,Anacostia,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,ANACOSTIA,3.16,MSTN_002,MSTN_001,MSTN_0010MSTN_002,436,1377.76,...,13.160672,4.16477,31.0,10.305395,MSTN_002,2215.020592,3007,MSTN_001,2076.692445,3639
2,Anacostia,Arlington Cemetery,ARLINGTON CEMETERY,ANACOSTIA,6.24,MSTN_065,MSTN_001,MSTN_0010MSTN_065,5,31.2,...,19.663729,3.151239,,,MSTN_065,0.003139,1,MSTN_001,2076.692445,3639
3,Anacostia,Ballston-MU,BALLSTON-MU,ANACOSTIA,9.12,MSTN_068,MSTN_001,MSTN_0010MSTN_068,95,866.4,...,38.189999,4.896153,95.0,12.088109,MSTN_068,11605.121631,13574,MSTN_001,2076.692445,3639
4,Anacostia,Benning Road,BENNING ROAD,ANACOSTIA,7.89,MSTN_003,MSTN_001,MSTN_0010MSTN_003,12,94.68,...,23.147509,2.933778,44.0,10.595066,MSTN_003,3304.921667,4621,MSTN_001,2076.692445,3639


In [22]:
# Remove join keys and the stray 'Unnamed: 0' column picked up by earlier merges.
leftover_columns = ['O_MSTN_ID', 'D_MSTN_ID', 'MSTN_x', 'MSTN_y', 'Unnamed: 0']
am_dataframe_new = am_dataframe_new.drop(columns=leftover_columns)

## AM Parking Users

* Give station names for the am_parking_users proper MSTN IDs
* Create a unique OD pairs column
* Subset only the pairs and the parking_user column
* Merge into the main AM dataframe

In [23]:
# Attach station records to each parking row by name: first matching
# START_PLACE_NAME, then END_PLACE_NAME. After the second join the clashing
# mstns columns are suffixed: _x = matched on END, _y = matched on START.
am_parking_users2 = pd.merge(
    mstns,
    am_parking_users,
    left_on='WMATA_filename',
    right_on='START_PLACE_NAME',
)
am_parking_users2 = pd.merge(
    mstns,
    am_parking_users2,
    left_on='WMATA_filename',
    right_on='END_PLACE_NAME',
)
am_parking_users2.head()

Unnamed: 0.1,ID_x,PRIMARY_NAME_FY23_x,walkshed_filename_x,WMATA_filename_x,track_miles_names_x,auto_filename_x,ID_y,PRIMARY_NAME_FY23_y,walkshed_filename_y,WMATA_filename_y,track_miles_names_y,auto_filename_y,Unnamed: 0,START_PLACE_NAME,END_PLACE_NAME,parking_user
0,MSTN_062,Addison Road,ADDISON ROAD-SEAT PLEASANT,Addison Road,Addison Road,ADDISON ROAD-SEAT PLEASANT,MSTN_001,Anacostia,ANACOSTIA,Anacostia,Anacostia,ANACOSTIA,87,Anacostia,Addison Road,0.0
1,MSTN_062,Addison Road,ADDISON ROAD-SEAT PLEASANT,Addison Road,Addison Road,ADDISON ROAD-SEAT PLEASANT,MSTN_068,Ballston-MU,BALLSTON-MU,Ballston,Ballston-MU,BALLSTON-MU,258,Ballston,Addison Road,0.0
2,MSTN_062,Addison Road,ADDISON ROAD-SEAT PLEASANT,Addison Road,Addison Road,ADDISON ROAD-SEAT PLEASANT,MSTN_003,Benning Road,BENNING ROAD,Benning Road,Benning Road,BENNING ROAD,348,Benning Road,Addison Road,0.0
3,MSTN_062,Addison Road,ADDISON ROAD-SEAT PLEASANT,Addison Road,Addison Road,ADDISON ROAD-SEAT PLEASANT,MSTN_052,Bethesda,BETHESDA,Bethesda,Bethesda,BETHESDA,437,Bethesda,Addison Road,0.0
4,MSTN_062,Addison Road,ADDISON ROAD-SEAT PLEASANT,Addison Road,Addison Road,ADDISON ROAD-SEAT PLEASANT,MSTN_078,Braddock Road,BRADDOCK ROAD,Braddock Road,Braddock Road,BRADDOCK ROAD,521,Braddock Road,Addison Road,0.0


In [24]:
# OD key "<ID_y>0<ID_x>" (ID_y = station matched on START_PLACE_NAME,
# ID_x = station matched on END_PLACE_NAME), built with vectorized string
# concatenation instead of a row-wise apply(axis=1).
am_parking_users2['pairs'] = (
    am_parking_users2['ID_y'].astype(str) + '0' + am_parking_users2['ID_x'].astype(str)
)

In [25]:
# Only the OD key and the parking-user count are needed for the merge.
parking_columns = ['pairs', 'parking_user']
am_parking_users2 = am_parking_users2[parking_columns]

In [26]:
# Left join: OD pairs without a parking record are kept, with parking_user NaN.
am_dataframe_new = pd.merge(
    am_dataframe_new,
    am_parking_users2,
    how='left',
    on='pairs',
)
am_dataframe_new.head()

Unnamed: 0,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,...,peak_fare_per_mile,new_auto_tt2,new_auto_tt_per_mile2,bus_tt,bus_tt_per_mile,proportionhouses_x,Total Households_x,proportionhouses_y,Total Households_y,parking_user
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,9,93.15,...,0.570226,13.18,1.641345,82.0,12.304868,529.032225,1663,2076.692445,3639,0.0
1,Anacostia,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,ANACOSTIA,3.16,MSTN_002,MSTN_001,MSTN_0010MSTN_002,436,1377.76,...,0.747972,13.160672,4.16477,31.0,10.305395,2215.020592,3007,2076.692445,3639,17.0
2,Anacostia,Arlington Cemetery,ARLINGTON CEMETERY,ANACOSTIA,6.24,MSTN_065,MSTN_001,MSTN_0010MSTN_065,5,31.2,...,0.602491,19.663729,3.151239,,,0.003139,1,2076.692445,3639,0.0
3,Anacostia,Ballston-MU,BALLSTON-MU,ANACOSTIA,9.12,MSTN_068,MSTN_001,MSTN_0010MSTN_068,95,866.4,...,0.477162,38.189999,4.896153,95.0,12.088109,11605.121631,13574,2076.692445,3639,0.0
4,Anacostia,Benning Road,BENNING ROAD,ANACOSTIA,7.89,MSTN_003,MSTN_001,MSTN_0010MSTN_003,12,94.68,...,0.75851,23.147509,2.933778,44.0,10.595066,3304.921667,4621,2076.692445,3639,0.0


## Parking Capacity

* Drop the station column from parking_capacity and rename the MSTN_ID column to avoid errors while merging
* Merge into the AM dataframe first based on the destinations (ID_x) and again based on the origins (ID_y)
* Fill NAs in parking capacity with 0

In [27]:
# Drop the station-name column and rename the ID column to 'mstn1' so it does
# not clash with other key columns when merged.
parking_capacity = (
    parking_capacity
    .drop(columns='STATION')
    .rename(columns={'MSTN_ID': 'mstn1'})
)

In [28]:
# Left-join parking capacity twice: once via ID_x, once via ID_y. Stations
# absent from parking_capacity keep their rows and get NaN capacity.
for station_id_col in ('ID_x', 'ID_y'):
    am_dataframe_new = am_dataframe_new.merge(
        parking_capacity,
        how='left',
        left_on=station_id_col,
        right_on='mstn1',
    )
am_dataframe_new.head()

Unnamed: 0,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,...,bus_tt_per_mile,proportionhouses_x,Total Households_x,proportionhouses_y,Total Households_y,parking_user,mstn1_x,PARKING_CAPACITY_x,mstn1_y,PARKING_CAPACITY_y
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,9,93.15,...,12.304868,529.032225,1663,2076.692445,3639,0.0,MSTN_062,1268.0,MSTN_001,808.0
1,Anacostia,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,ANACOSTIA,3.16,MSTN_002,MSTN_001,MSTN_0010MSTN_002,436,1377.76,...,10.305395,2215.020592,3007,2076.692445,3639,17.0,,,MSTN_001,808.0
2,Anacostia,Arlington Cemetery,ARLINGTON CEMETERY,ANACOSTIA,6.24,MSTN_065,MSTN_001,MSTN_0010MSTN_065,5,31.2,...,,0.003139,1,2076.692445,3639,0.0,,,MSTN_001,808.0
3,Anacostia,Ballston-MU,BALLSTON-MU,ANACOSTIA,9.12,MSTN_068,MSTN_001,MSTN_0010MSTN_068,95,866.4,...,12.088109,11605.121631,13574,2076.692445,3639,0.0,,,MSTN_001,808.0
4,Anacostia,Benning Road,BENNING ROAD,ANACOSTIA,7.89,MSTN_003,MSTN_001,MSTN_0010MSTN_003,12,94.68,...,10.595066,3304.921667,4621,2076.692445,3639,0.0,,,MSTN_001,808.0


In [29]:
# Missing capacity (no match in the left joins) is treated as zero spaces.
capacity_columns = ['PARKING_CAPACITY_x', 'PARKING_CAPACITY_y']
am_dataframe_new[capacity_columns] = am_dataframe_new[capacity_columns].fillna(0)

## Number of Bus Lines

* Rename the bus line count and bus stop columns to distinguish them
* Merge into the main AM dataframe, first based on destinations and again based on origins
* Drop columns that are redundant

In [30]:
# Prefix the count columns with "bus_" and keep only the station ID plus the
# two counts.
num_bus_lines = (
    num_bus_lines
    .rename(columns={'line_count':'bus_line_count','stop_count':'bus_stop_count'})
    [['MSTN','bus_line_count','bus_stop_count']]
)

In [31]:
# Attach bus line/stop counts twice: once via ID_x, once via ID_y. The second
# pass makes pandas suffix the clashing columns with _x / _y.
for station_id_col in ('ID_x', 'ID_y'):
    am_dataframe_new = am_dataframe_new.merge(
        num_bus_lines,
        left_on=station_id_col,
        right_on='MSTN',
    )
am_dataframe_new.head()

Unnamed: 0,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,...,mstn1_x,PARKING_CAPACITY_x,mstn1_y,PARKING_CAPACITY_y,MSTN_x,bus_line_count_x,bus_stop_count_x,MSTN_y,bus_line_count_y,bus_stop_count_y
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,9,93.15,...,MSTN_062,1268.0,MSTN_001,808.0,MSTN_062,29.0,18.0,MSTN_001,54.0,36.0
1,Anacostia,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,ANACOSTIA,3.16,MSTN_002,MSTN_001,MSTN_0010MSTN_002,436,1377.76,...,,0.0,MSTN_001,808.0,MSTN_002,68.0,50.0,MSTN_001,54.0,36.0
2,Anacostia,Arlington Cemetery,ARLINGTON CEMETERY,ANACOSTIA,6.24,MSTN_065,MSTN_001,MSTN_0010MSTN_065,5,31.2,...,,0.0,MSTN_001,808.0,MSTN_065,4.0,0.0,MSTN_001,54.0,36.0
3,Anacostia,Ballston-MU,BALLSTON-MU,ANACOSTIA,9.12,MSTN_068,MSTN_001,MSTN_0010MSTN_068,95,866.4,...,,0.0,MSTN_001,808.0,MSTN_068,28.0,30.0,MSTN_001,54.0,36.0
4,Anacostia,Benning Road,BENNING ROAD,ANACOSTIA,7.89,MSTN_003,MSTN_001,MSTN_0010MSTN_003,12,94.68,...,,0.0,MSTN_001,808.0,MSTN_003,19.0,34.0,MSTN_001,54.0,36.0


In [32]:
# Remove the join keys left behind by the bus-line and parking-capacity merges.
redundant_keys = ['MSTN_x', 'MSTN_y', 'mstn1_x', 'mstn1_y']
am_dataframe_new = am_dataframe_new.drop(columns=redundant_keys)

## Jobs per Half Mile

* Merge Jobs into the AM dataframe first based on the destinations (walkshed_filename_x) and again based on the origins (walkshed_filename_y)

In [33]:
# Attach jobs figures by walkshed name twice: once via walkshed_filename_x,
# once via walkshed_filename_y. Inner joins: rows whose walkshed name has no
# exact match in jobs_half_mile['Name_1'] are dropped.
for walkshed_col in ('walkshed_filename_x', 'walkshed_filename_y'):
    am_dataframe_new = am_dataframe_new.merge(
        jobs_half_mile,
        left_on=walkshed_col,
        right_on='Name_1',
    )

In [34]:
# Preview the first five rows after the jobs merge.
am_dataframe_new.head(5)

Unnamed: 0,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,...,PARKING_CAPACITY_x,PARKING_CAPACITY_y,bus_line_count_x,bus_stop_count_x,bus_line_count_y,bus_stop_count_y,Name_1_x,All_Jobs_x,Name_1_y,All_Jobs_y
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,9,93.15,...,1268.0,808.0,29.0,18.0,54.0,36.0,ADDISON ROAD-SEAT PLEASANT,205.510639,ANACOSTIA,2332.571897
1,Anacostia,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,ANACOSTIA,3.16,MSTN_002,MSTN_001,MSTN_0010MSTN_002,436,1377.76,...,0.0,808.0,68.0,50.0,54.0,36.0,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,0.0,ANACOSTIA,2332.571897
2,Anacostia,Arlington Cemetery,ARLINGTON CEMETERY,ANACOSTIA,6.24,MSTN_065,MSTN_001,MSTN_0010MSTN_065,5,31.2,...,0.0,808.0,4.0,0.0,54.0,36.0,ARLINGTON CEMETERY,122.300629,ANACOSTIA,2332.571897
3,Anacostia,Ballston-MU,BALLSTON-MU,ANACOSTIA,9.12,MSTN_068,MSTN_001,MSTN_0010MSTN_068,95,866.4,...,0.0,808.0,28.0,30.0,54.0,36.0,BALLSTON-MU,51873.732661,ANACOSTIA,2332.571897
4,Anacostia,Benning Road,BENNING ROAD,ANACOSTIA,7.89,MSTN_003,MSTN_001,MSTN_0010MSTN_003,12,94.68,...,0.0,808.0,19.0,34.0,54.0,36.0,BENNING ROAD,1301.51626,ANACOSTIA,2332.571897


## Distance to the Core

* Give the station names in dist_to_core proper formatting to merge with mstns
* Give station names proper MSTN IDs
* Subset only the ID and distance_miles columns. Rename both for easy merging and clarity
* Merge into the AM dataframe first based on destinations, then based on origins

In [35]:
# Strip the literal range suffixes appended to the station names.
# regex=False is explicit because the default of Series.str.replace differs
# between pandas versions, and the second pattern contains '.', which would
# match any character if the pattern were treated as a regular expression.
dist_to_core['STATIONNAM'] = dist_to_core['STATIONNAM'].str.replace(' : 0 - 2640', '', regex=False)
dist_to_core['STATIONNAM'] = dist_to_core['STATIONNAM'].str.replace(' : 0 - 22.4525758392805', '', regex=False)

# Title-case, then map whole names to the spellings used in mstns['PRIMARY_NAME_FY23']
# (Series.replace with a dict swaps exact values only).
dist_to_core['STATIONNAM'] = dist_to_core['STATIONNAM'].str.title()
dist_to_core['STATIONNAM'] = dist_to_core['STATIONNAM'].replace({
    "Addison Road-Seat Pleasant": 'Addison Road',
    "Archives-Navy Mem'L-Penn Quarter": "Archives",
    "Ballston-Mu": "Ballston-MU",
    "Brookland-Cua": "Brookland-CUA",
    "Dunn Loring-Merrifield": "Dunn Loring",
    "Largo Town Center": "Downtown Largo",
    "Eisenhower Avenue": "Eisenhower Ave",
    'Federal Center Sw': 'Federal Center SW',
    "Foggy Bottom-Gwu": "Foggy Bottom-GWU",
    "Gallery Pl-Chinatown": "Gallary Place",
    "Mclean": "McLean",
    "Mcpherson Square": "McPherson Sq",
    "Mt Vernon Sq/7Th St-Convention Center": "Mt Vernon Sq",
    "Noma-Gallaudet U-New York Ave": "NoMa-Gallaudet U",
    "Rhode Island Ave-Brentwood": "Rhode Island Ave",
    "Prince George'S Plaza": "Hyattsville Crossing",
    "Tenleytown-Au": "Tenleytown-AU",
    "Tysons Corner": "Tysons",
    "U St/African-Amer Civil War Memorial/Cardozo": "U Street",
    "Southern Avenue": "Southern Ave",
    "Van Ness-Udc": "Van Ness-UDC",
    "Virginia Square-Gmu": "Virginia Sq-GMU",
    "Vienna/Fairfax-Gmu": "Vienna",
    "Washington Dulles International Airport": "Dulles Airport",
    "West Falls Church-Vt/Uva": "West Falls Church",
    "Woodley Park-Zoo/Adams Morgan": "Woodley Park",
    "White Flint": "North Bethesda",
})
# Second pass: remaining fix-ups, including correcting the "Gallary Place"
# spelling produced by the mapping above.
dist_to_core['STATIONNAM'] = dist_to_core['STATIONNAM'].replace({
    "College Park-U Of Md": 'College Park-U of Md',
    "Gallary Place": "Gallery Place",
    "Shaw-Howard Univ": "Shaw-Howard U",
    "Innovation": "Innovation Center",
    "Reston": "Reston Town Center",
})

In [36]:
# Look up the station record for each cleaned station name (inner join).
dist_to_core2 = pd.merge(
    mstns,
    dist_to_core,
    left_on='PRIMARY_NAME_FY23',
    right_on='STATIONNAM',
)

In [37]:
# Keep the station ID and distance, renamed to id1 / distance_to_core.
# rename() is chained (returns a new frame) rather than applied in place to a
# column slice, which raises SettingWithCopyWarning.
dist_to_core2 = (
    dist_to_core2[['ID', 'distance_miles']]
    .rename(columns={'ID': 'id1', 'distance_miles': 'distance_to_core'})
)

In [38]:
# Attach distance to the core twice: once via ID_x, once via ID_y. The second
# pass makes pandas suffix the clashing columns with _x / _y.
for station_id_col in ('ID_x', 'ID_y'):
    am_dataframe_new = am_dataframe_new.merge(
        dist_to_core2,
        left_on=station_id_col,
        right_on='id1',
    )
am_dataframe_new.head()

Unnamed: 0,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,...,bus_line_count_y,bus_stop_count_y,Name_1_x,All_Jobs_x,Name_1_y,All_Jobs_y,id1_x,distance_to_core_x,id1_y,distance_to_core_y
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,9,93.15,...,54.0,36.0,ADDISON ROAD-SEAT PLEASANT,205.510639,ANACOSTIA,2332.571897,MSTN_062,7.181143,MSTN_001,3.007895
1,Anacostia,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,ANACOSTIA,3.16,MSTN_002,MSTN_001,MSTN_0010MSTN_002,436,1377.76,...,54.0,36.0,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,0.0,ANACOSTIA,2332.571897,MSTN_002,0.435964,MSTN_001,3.007895
2,Anacostia,Arlington Cemetery,ARLINGTON CEMETERY,ANACOSTIA,6.24,MSTN_065,MSTN_001,MSTN_0010MSTN_065,5,31.2,...,54.0,36.0,ARLINGTON CEMETERY,122.300629,ANACOSTIA,2332.571897,MSTN_065,2.127338,MSTN_001,3.007895
3,Anacostia,Ballston-MU,BALLSTON-MU,ANACOSTIA,9.12,MSTN_068,MSTN_001,MSTN_0010MSTN_068,95,866.4,...,54.0,36.0,BALLSTON-MU,51873.732661,ANACOSTIA,2332.571897,MSTN_068,4.652737,MSTN_001,3.007895
4,Anacostia,Benning Road,BENNING ROAD,ANACOSTIA,7.89,MSTN_003,MSTN_001,MSTN_0010MSTN_003,12,94.68,...,54.0,36.0,BENNING ROAD,1301.51626,ANACOSTIA,2332.571897,MSTN_003,4.917774,MSTN_001,3.007895


## Terminal Station Dummy

* Subset only the dummy column and the station IDs
* Merge into the AM dataframe once based on destinations and again based on origins

In [39]:
# Lookup table for the terminal flag: station ID plus the 2023 terminal dummy only.
terminal_station2 = terminal_station.loc[:, ['MSTN', 'terminal_dummy_2023']]

In [40]:
# Attach the terminal dummy for the ID_x station and then for the ID_y station.
am_dataframe_new = (
    am_dataframe_new
    .merge(terminal_station2, left_on='ID_x', right_on='MSTN')
    .merge(terminal_station2, left_on='ID_y', right_on='MSTN')
)

## Log of Variables and Export

* Rename the merge suffixes: `_x` (destination side) becomes `_D` and `_y` (origin side) becomes `_O`
* Drop redundant columns
* Fill 0s and NAs with 0.01 to prevent errors with taking the natural log
* Fill terminal dummy columns with 0

In [41]:
# Rename the pandas merge suffixes: _x -> _D and _y -> _O.
# The patterns are anchored to the end of the name so only a trailing suffix is rewritten
# (an "_x"/"_y" elsewhere in a column name is left alone), and regex=True is explicit because
# the default of str.replace differs between pandas versions.
am_dataframe_new.columns = (
    am_dataframe_new.columns
    .str.replace(r'_x$', '_D', regex=True)
    .str.replace(r'_y$', '_O', regex=True)
)
am_dataframe_new.columns

Index(['O', 'D', 'walkshed_filename_D', 'walkshed_filename_O', 'track_miles',
       'ID_D', 'ID_O', 'pairs', 'passengers', 'riders_miles', 'AVG_TRAINS_D',
       'AVG_TRAINS_O', 'peak_fare_per_mile', 'new_auto_tt2',
       'new_auto_tt_per_mile2', 'bus_tt', 'bus_tt_per_mile',
       'proportionhouses_D', 'Total Households_D', 'proportionhouses_O',
       'Total Households_O', 'parking_user', 'PARKING_CAPACITY_D',
       'PARKING_CAPACITY_O', 'bus_line_count_D', 'bus_stop_count_D',
       'bus_line_count_O', 'bus_stop_count_O', 'Name_1_D', 'All_Jobs_D',
       'Name_1_O', 'All_Jobs_O', 'id1_D', 'distance_to_core_D', 'id1_O',
       'distance_to_core_O', 'MSTN_D', 'terminal_dummy_2023_D', 'MSTN_O',
       'terminal_dummy_2023_O'],
      dtype='object')

In [42]:
# Remove the join-key and label columns that the merges left behind.
am_redundant_columns = [
    'Name_1_O', 'Name_1_D',
    'id1_O', 'id1_D',
    'MSTN_O', 'MSTN_D',
    'walkshed_filename_D', 'walkshed_filename_O',
]
am_dataframe_new = am_dataframe_new.drop(columns=am_redundant_columns)

In [43]:
# Source column -> name of its natural-log column, listed in the order the log columns are
# appended to the dataframe.
am_log_names = {
    'passengers': 'log_passengers',
    'riders_miles': 'log_riders_miles',
    'AVG_TRAINS_O': 'log_AVG_TRAINS_O',
    'AVG_TRAINS_D': 'log_AVG_TRAINS_D',
    'peak_fare_per_mile': 'log_peak_fare_per_mile',
    'new_auto_tt2': 'log_auto_tt2',
    'new_auto_tt_per_mile2': 'log_auto_tt_per_mile_2',
    'bus_tt_per_mile': 'log_bus_tt_per_mile',
    'bus_tt': 'log_bus_tt',
    'proportionhouses_O': 'log_proportionhouses_O',
    'Total Households_O': 'log_Total Households_O',
    'proportionhouses_D': 'log_proportionhouses_D',
    'Total Households_D': 'log_Total Households_D',
    'parking_user': 'log_parking_user',
    'PARKING_CAPACITY_O': 'log_PARKING_CAPACITY_O',
    'PARKING_CAPACITY_D': 'log_PARKING_CAPACITY_D',
    'bus_line_count_O': 'log_bus_line_count_O',
    'bus_stop_count_O': 'log_bus_stop_count_O',
    'bus_line_count_D': 'log_bus_line_count_D',
    'bus_stop_count_D': 'log_bus_stop_count_D',
    'All_Jobs_O': 'log_All_Jobs_O',
    'All_Jobs_D': 'log_All_Jobs_D',
    'distance_to_core_O': 'log_distance_to_core_O',
    'distance_to_core_D': 'log_distance_to_core_D',
}

# Zeros and missing values are swapped for 0.01 first so that np.log never sees 0 or NaN.
am_log_inputs = am_dataframe_new[list(am_log_names.keys())].replace({0 : 0.01, np.nan : 0.01})
am_dataframe_new[list(am_log_names.values())] = np.log(am_log_inputs)


In [44]:
# A missing terminal flag is stored as 0.
for dummy_col in ('terminal_dummy_2023_O', 'terminal_dummy_2023_D'):
    am_dataframe_new[dummy_col] = am_dataframe_new[dummy_col].fillna(0)

In [45]:
# Preview the first rows of the finished AM table, including the new log_ columns.
am_dataframe_new.head()

Unnamed: 0,O,D,track_miles,ID_D,ID_O,pairs,passengers,riders_miles,AVG_TRAINS_D,AVG_TRAINS_O,...,log_PARKING_CAPACITY_O,log_PARKING_CAPACITY_D,log_bus_line_count_O,log_bus_stop_count_O,log_bus_line_count_D,log_bus_stop_count_D,log_All_Jobs_O,log_All_Jobs_D,log_distance_to_core_O,log_distance_to_core_D
0,Anacostia,Addison Road,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,9,93.15,53,33,...,6.694562,7.145196,3.988984,3.583519,3.367296,2.890372,7.754727,5.325498,1.10124,1.971459
1,Anacostia,Archives,3.16,MSTN_002,MSTN_001,MSTN_0010MSTN_002,436,1377.76,62,33,...,6.694562,-4.60517,3.988984,3.583519,4.219508,3.912023,7.754727,-4.60517,1.10124,-0.830196
2,Anacostia,Arlington Cemetery,6.24,MSTN_065,MSTN_001,MSTN_0010MSTN_065,5,31.2,27,33,...,6.694562,-4.60517,3.988984,3.583519,1.386294,-4.60517,7.754727,4.806482,1.10124,0.754871
3,Anacostia,Ballston-MU,9.12,MSTN_068,MSTN_001,MSTN_0010MSTN_068,95,866.4,53,33,...,6.694562,-4.60517,3.988984,3.583519,3.332205,3.401197,7.754727,10.856568,1.10124,1.537456
4,Anacostia,Benning Road,7.89,MSTN_003,MSTN_001,MSTN_0010MSTN_003,12,94.68,51,33,...,6.694562,-4.60517,3.988984,3.583519,2.944439,3.526361,7.754727,7.171285,1.10124,1.592856


In [46]:
# Write the finished AM table to disk. No index argument is passed, so the DataFrame index
# is written too (pandas default).
am_dataframe_new.to_csv("Outputs/am_dataframe_new2.csv")

# PM Dataframe

* pm_parking_users contains the number of passengers that used parking in the PM peak period
* pm_auto_tt contains the interpolated auto travel times for the PM peak period

In [47]:
# PM-specific inputs: parking users per OD pair and the interpolated auto travel times.
# The remaining inputs are reused from the frames loaded at the top of the notebook.
pm_parking_users = pd.read_csv("../Data Preprocessing/Parking Users/output/pm_parking_updated.csv")
pm_auto_tt = pd.read_csv("../Data Preprocessing/Interpolated Auto Travel Times/output/pm_interpolated_auto_times.csv")

## PM Passenger Miles

* Filter for only passengers in the PM Peak
* Create an od_pair key column
* Merge track miles to passengers in the PM peak, multiply the track miles by passengers, consolidate the columns, and rename the count column to passengers

In [48]:
# Keep only PM Peak ridership. .copy() gives an independent frame, so adding the key column
# below does not write into a filtered view of `passengers`.
pm_passengers = passengers[passengers['period'] == 'PM Peak'].copy()

# OD key: <ID> + "0" + <ID_1>, the same format as the 'pairs' key used by the other tables.
# Built with vectorised string concatenation instead of a row-wise apply, which is faster
# and also works when the filter returns no rows.
pm_passengers['pairs'] = pm_passengers['ID'].astype(str) + '0' + pm_passengers['ID_1'].astype(str)

In [49]:
# Join track miles to the PM ridership counts on the OD key.
pmmilesmerge = track_miles2.merge(pm_passengers, on='pairs')

# Passenger-miles for each OD pair.
pmmilesmerge['riders_miles'] = pmmilesmerge['track_miles'] * pmmilesmerge['COUNT']

# Base PM modelling table. rename() is chained rather than applied in place so it never
# operates on a column-subset view of pmmilesmerge.
pm_dataframe_new = pmmilesmerge[
    ['O', 'D', 'walkshed_filename_x', 'walkshed_filename_y', 'track_miles',
     'ID_x', 'ID_y', 'pairs', 'COUNT', 'riders_miles']
].rename(columns={'COUNT': 'passengers'})
pm_dataframe_new.head()

Unnamed: 0,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,24,248.4
1,Archives,Addison Road,ADDISON ROAD-SEAT PLEASANT,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,8.31,MSTN_062,MSTN_002,MSTN_0020MSTN_062,69,573.39
2,Arlington Cemetery,Addison Road,ADDISON ROAD-SEAT PLEASANT,ARLINGTON CEMETERY,11.39,MSTN_062,MSTN_065,MSTN_0650MSTN_062,4,45.56
3,Ballston-MU,Addison Road,ADDISON ROAD-SEAT PLEASANT,BALLSTON-MU,14.27,MSTN_062,MSTN_068,MSTN_0680MSTN_062,56,799.12
4,Benning Road,Addison Road,ADDISON ROAD-SEAT PLEASANT,BENNING ROAD,2.46,MSTN_062,MSTN_003,MSTN_0030MSTN_062,88,216.48


## Fare per Track Mile

* Merge fare_track_miles into the main PM dataframe

In [50]:
# Join the peak fare per track mile onto each OD row. No `on=` is given, so pandas joins on
# every column name the two frames have in common.
# NOTE(review): presumably 'pairs' is the only shared column - confirm, then pass on='pairs'.
pm_dataframe_new = pm_dataframe_new.merge(fare_track_miles)
pm_dataframe_new.head()

Unnamed: 0,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,O_MSTN_ID,D_MSTN_ID,peak_fare_per_mile
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,24,248.4,MSTN_001,MSTN_062,0.570226
1,Archives,Addison Road,ADDISON ROAD-SEAT PLEASANT,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,8.31,MSTN_062,MSTN_002,MSTN_0020MSTN_062,69,573.39,MSTN_002,MSTN_062,0.502833
2,Arlington Cemetery,Addison Road,ADDISON ROAD-SEAT PLEASANT,ARLINGTON CEMETERY,11.39,MSTN_062,MSTN_065,MSTN_0650MSTN_062,4,45.56,MSTN_065,MSTN_062,0.464334
3,Ballston-MU,Addison Road,ADDISON ROAD-SEAT PLEASANT,BALLSTON-MU,14.27,MSTN_062,MSTN_068,MSTN_0680MSTN_062,56,799.12,MSTN_068,MSTN_062,0.421457
4,Benning Road,Addison Road,ADDISON ROAD-SEAT PLEASANT,BENNING ROAD,2.46,MSTN_062,MSTN_003,MSTN_0030MSTN_062,88,216.48,MSTN_003,MSTN_062,0.861453


## Auto Travel Time in Minutes/Mile

* Merge PM auto travel times into the main PM dataframe

In [51]:
# Add the PM auto travel-time columns, matched on the OD key.
pm_dataframe_new = pd.merge(pm_dataframe_new, pm_auto_tt, on='pairs')
pm_dataframe_new.head()

Unnamed: 0.1,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,O_MSTN_ID,D_MSTN_ID,peak_fare_per_mile,Unnamed: 0,new_auto_tt2,new_auto_tt_per_mile2
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,24,248.4,MSTN_001,MSTN_062,0.570226,1,13.18,1.641345
1,Archives,Addison Road,ADDISON ROAD-SEAT PLEASANT,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,8.31,MSTN_062,MSTN_002,MSTN_0020MSTN_062,69,573.39,MSTN_002,MSTN_062,0.502833,2,13.08,1.718791
2,Arlington Cemetery,Addison Road,ADDISON ROAD-SEAT PLEASANT,ARLINGTON CEMETERY,11.39,MSTN_062,MSTN_065,MSTN_0650MSTN_062,4,45.56,MSTN_065,MSTN_062,0.464334,3,30.466196,2.67482
3,Ballston-MU,Addison Road,ADDISON ROAD-SEAT PLEASANT,BALLSTON-MU,14.27,MSTN_062,MSTN_068,MSTN_0680MSTN_062,56,799.12,MSTN_068,MSTN_062,0.421457,4,40.669998,3.116475
4,Benning Road,Addison Road,ADDISON ROAD-SEAT PLEASANT,BENNING ROAD,2.46,MSTN_062,MSTN_003,MSTN_0030MSTN_062,88,216.48,MSTN_003,MSTN_062,0.861453,5,4.9,2.008197


## Bus Travel Time in Minutes/Mile

* Merge bus travel times into the main PM dataframe

In [52]:
# Add the bus travel-time columns, matched on the OD key.
pm_dataframe_new = pd.merge(pm_dataframe_new, bus_tt_minpmile2, on='pairs')
pm_dataframe_new

Unnamed: 0.1,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,O_MSTN_ID,D_MSTN_ID,peak_fare_per_mile,Unnamed: 0,new_auto_tt2,new_auto_tt_per_mile2,bus_tt,bus_tt_per_mile
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,24,248.40,MSTN_001,MSTN_062,0.570226,1,13.180000,1.641345,82.0,12.304868
1,Archives,Addison Road,ADDISON ROAD-SEAT PLEASANT,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,8.31,MSTN_062,MSTN_002,MSTN_0020MSTN_062,69,573.39,MSTN_002,MSTN_062,0.502833,2,13.080000,1.718791,84.0,11.415660
2,Arlington Cemetery,Addison Road,ADDISON ROAD-SEAT PLEASANT,ARLINGTON CEMETERY,11.39,MSTN_062,MSTN_065,MSTN_0650MSTN_062,4,45.56,MSTN_065,MSTN_062,0.464334,3,30.466196,2.674820,122.0,12.730069
3,Ballston-MU,Addison Road,ADDISON ROAD-SEAT PLEASANT,BALLSTON-MU,14.27,MSTN_062,MSTN_068,MSTN_0680MSTN_062,56,799.12,MSTN_068,MSTN_062,0.421457,4,40.669998,3.116475,180.0,14.449949
4,Benning Road,Addison Road,ADDISON ROAD-SEAT PLEASANT,BENNING ROAD,2.46,MSTN_062,MSTN_003,MSTN_0030MSTN_062,88,216.48,MSTN_003,MSTN_062,0.861453,5,4.900000,2.008197,32.0,12.251781
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6198,Waterfront,Woodley Park,WOODLEY PARK-ZOO/ADAMS MORGAN,WATERFRONT,4.46,MSTN_040,MSTN_039,MSTN_0390MSTN_040,65,289.90,MSTN_039,MSTN_040,0.627445,8275,,3.359744,55.0,13.022452
6199,West Falls Church,Woodley Park,WOODLEY PARK-ZOO/ADAMS MORGAN,WEST FALLS CHURCH-VT/UVA,12.38,MSTN_040,MSTN_059,MSTN_0590MSTN_040,3,37.14,MSTN_059,MSTN_040,0.443870,8276,,3.359744,130.0,13.265092
6200,West Hyattsville,Woodley Park,WOODLEY PARK-ZOO/ADAMS MORGAN,WEST HYATTSVILLE,9.74,MSTN_040,MSTN_055,MSTN_0550MSTN_040,4,38.96,MSTN_055,MSTN_040,0.606483,8277,,3.359744,72.0,12.129653
6201,Wheaton,Woodley Park,WOODLEY PARK-ZOO/ADAMS MORGAN,WHEATON,14.38,MSTN_040,MSTN_046,MSTN_0460MSTN_040,27,388.26,MSTN_046,MSTN_040,0.519015,8278,,3.359744,77.0,8.594437


## Households per Half Mile

* Merge into the PM dataframe first based on the destinations (`ID_x`) and again based on the origins (`ID_y`)
* Drop redundant columns

In [53]:
# Household counts for the ID_x station, then for the ID_y station; the second merge adds
# _x / _y suffixes to the repeated lookup columns.
pm_dataframe_new = (
    pm_dataframe_new
    .merge(households_half_mile, left_on='ID_x', right_on='MSTN')
    .merge(households_half_mile, left_on='ID_y', right_on='MSTN')
)
pm_dataframe_new.head()

Unnamed: 0,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,...,new_auto_tt2,new_auto_tt_per_mile2,bus_tt,bus_tt_per_mile,MSTN_x,proportionhouses_x,Total Households_x,MSTN_y,proportionhouses_y,Total Households_y
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,24,248.4,...,13.18,1.641345,82.0,12.304868,MSTN_062,529.032225,1663,MSTN_001,2076.692445,3639
1,Anacostia,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,ANACOSTIA,3.16,MSTN_002,MSTN_001,MSTN_0010MSTN_002,99,312.84,...,13.212824,4.181273,31.0,10.305395,MSTN_002,2215.020592,3007,MSTN_001,2076.692445,3639
2,Anacostia,Arlington Cemetery,ARLINGTON CEMETERY,ANACOSTIA,6.24,MSTN_065,MSTN_001,MSTN_0010MSTN_065,1,6.24,...,19.669736,3.152201,,,MSTN_065,0.003139,1,MSTN_001,2076.692445,3639
3,Anacostia,Ballston-MU,BALLSTON-MU,ANACOSTIA,9.12,MSTN_068,MSTN_001,MSTN_0010MSTN_068,21,191.52,...,38.189999,4.896153,95.0,12.088109,MSTN_068,11605.121631,13574,MSTN_001,2076.692445,3639
4,Anacostia,Benning Road,BENNING ROAD,ANACOSTIA,7.89,MSTN_003,MSTN_001,MSTN_0010MSTN_003,20,157.8,...,23.128796,2.931406,44.0,10.595066,MSTN_003,3304.921667,4621,MSTN_001,2076.692445,3639


In [54]:
# Discard the join keys and the leftover 'Unnamed: 0' column brought in by the merges.
pm_leftover_columns = ['O_MSTN_ID', 'D_MSTN_ID', 'MSTN_x', 'MSTN_y', 'Unnamed: 0']
pm_dataframe_new = pm_dataframe_new.drop(columns=pm_leftover_columns)

## PM Trains per Hour

* Filter only for weekday services and in the PM peak
* Groupby and get the sum of trains per hour
* Merge into the PM dataframe first based on the destinations (`ID_x`) and again based on the origins (`ID_y`)
* Drop redundant columns

In [55]:
# Keep only weekday service arriving in the PM peak.
is_pm_peak = trains_per_hour['ARRIVAL_PERIOD'] == 'PM Peak'
is_weekday = trains_per_hour['SERVICETYPE'] == 'Weekday'
trains_per_hour2 = trains_per_hour[is_pm_peak & is_weekday]
trains_per_hour2.head()

Unnamed: 0,STATION_ID,STOP_ID,DIR_ID,SERVICETYPE,ARRIVAL_PERIOD,AVG_TRAINS
9,MSTN_001,F06,1,Weekday,PM Peak,17
11,MSTN_001,F06,2,Weekday,PM Peak,17
21,MSTN_002,F02,1,Weekday,PM Peak,31
23,MSTN_002,F02,2,Weekday,PM Peak,31
33,MSTN_003,G01,1,Weekday,PM Peak,24


In [56]:
# Total AVG_TRAINS per station (all rows left after the PM Peak / Weekday filter are summed).
# Only AVG_TRAINS is aggregated: summing the whole frame would also add up or concatenate the
# unrelated ID and label columns before they were discarded.
trains_per_hour2 = (
    trains_per_hour2
    .groupby('STATION_ID', as_index=False)['AVG_TRAINS']
    .sum()
)
trains_per_hour2.head()

Unnamed: 0,STATION_ID,AVG_TRAINS
0,MSTN_001,34
1,MSTN_002,62
2,MSTN_003,48
3,MSTN_004,53
4,MSTN_005,74


In [57]:
# Train counts for the ID_x station, then for the ID_y station.
pm_dataframe_new = (
    pm_dataframe_new
    .merge(trains_per_hour2, left_on='ID_x', right_on='STATION_ID')
    .merge(trains_per_hour2, left_on='ID_y', right_on='STATION_ID')
)

In [58]:
# The station-ID join keys are no longer needed once the train counts are attached.
pm_dataframe_new = pm_dataframe_new.drop(columns=['STATION_ID_x', 'STATION_ID_y'])

## PM Parking Users

* Give station names in pm_parking_users proper MSTN IDs
* Create an OD pair column
* Subset only the pairs column and the parking_users
* Merge into the main PM dataframe

In [59]:
# Step 1: look up the station record for each trip's START_PLACE_NAME.
pm_parking_with_start = mstns.merge(
    pm_parking_users, left_on='WMATA_filename', right_on='START_PLACE_NAME'
)
# Step 2: look up the station record for END_PLACE_NAME. The mstns columns now exist on both
# sides, so the end-station copy gets the _x suffix and the start-station copy gets _y.
pm_parking_users2 = mstns.merge(
    pm_parking_with_start, left_on='WMATA_filename', right_on='END_PLACE_NAME'
)
pm_parking_users2.head()

Unnamed: 0.1,ID_x,PRIMARY_NAME_FY23_x,walkshed_filename_x,WMATA_filename_x,track_miles_names_x,auto_filename_x,ID_y,PRIMARY_NAME_FY23_y,walkshed_filename_y,WMATA_filename_y,track_miles_names_y,auto_filename_y,Unnamed: 0,START_PLACE_NAME,END_PLACE_NAME,parking_user
0,MSTN_062,Addison Road,ADDISON ROAD-SEAT PLEASANT,Addison Road,Addison Road,ADDISON ROAD-SEAT PLEASANT,MSTN_001,Anacostia,ANACOSTIA,Anacostia,Anacostia,ANACOSTIA,76,Anacostia,Addison Road,0.0
1,MSTN_062,Addison Road,ADDISON ROAD-SEAT PLEASANT,Addison Road,Addison Road,ADDISON ROAD-SEAT PLEASANT,MSTN_002,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,Archives-Navy Memorial,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,163,Archives-Navy Memorial,Addison Road,14.0
2,MSTN_062,Addison Road,ADDISON ROAD-SEAT PLEASANT,Addison Road,Addison Road,ADDISON ROAD-SEAT PLEASANT,MSTN_065,Arlington Cemetery,ARLINGTON CEMETERY,Arlington Cemetery,Arlington Cemetery,ARLINGTON CEMETERY,253,Arlington Cemetery,Addison Road,0.0
3,MSTN_062,Addison Road,ADDISON ROAD-SEAT PLEASANT,Addison Road,Addison Road,ADDISON ROAD-SEAT PLEASANT,MSTN_068,Ballston-MU,BALLSTON-MU,Ballston,Ballston-MU,BALLSTON-MU,337,Ballston,Addison Road,1.0
4,MSTN_062,Addison Road,ADDISON ROAD-SEAT PLEASANT,Addison Road,Addison Road,ADDISON ROAD-SEAT PLEASANT,MSTN_003,Benning Road,BENNING ROAD,Benning Road,Benning Road,BENNING ROAD,427,Benning Road,Addison Road,0.0


In [60]:
# OD key: <start-station ID> + "0" + <end-station ID>, the same format as the 'pairs' key of
# the main dataframe. Vectorised string concatenation replaces the row-wise apply: it is
# faster and also works when the frame has no rows.
pm_parking_users2['pairs'] = (
    pm_parking_users2['ID_y'].astype(str) + '0' + pm_parking_users2['ID_x'].astype(str)
)

In [61]:
# Only the OD key and the parking-user count are needed for the merge.
pm_parking_users2 = pm_parking_users2.loc[:, ['pairs', 'parking_user']]

In [62]:
# Left join: OD pairs without a parking record stay in the table with a missing parking_user.
pm_dataframe_new = pd.merge(pm_dataframe_new, pm_parking_users2, how='left', on='pairs')
pm_dataframe_new.head()

Unnamed: 0,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,...,new_auto_tt_per_mile2,bus_tt,bus_tt_per_mile,proportionhouses_x,Total Households_x,proportionhouses_y,Total Households_y,AVG_TRAINS_x,AVG_TRAINS_y,parking_user
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,24,248.4,...,1.641345,82.0,12.304868,529.032225,1663,2076.692445,3639,48,34,0.0
1,Anacostia,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,ANACOSTIA,3.16,MSTN_002,MSTN_001,MSTN_0010MSTN_002,99,312.84,...,4.181273,31.0,10.305395,2215.020592,3007,2076.692445,3639,62,34,0.0
2,Anacostia,Arlington Cemetery,ARLINGTON CEMETERY,ANACOSTIA,6.24,MSTN_065,MSTN_001,MSTN_0010MSTN_065,1,6.24,...,3.152201,,,0.003139,1,2076.692445,3639,24,34,0.0
3,Anacostia,Ballston-MU,BALLSTON-MU,ANACOSTIA,9.12,MSTN_068,MSTN_001,MSTN_0010MSTN_068,21,191.52,...,4.896153,95.0,12.088109,11605.121631,13574,2076.692445,3639,50,34,0.0
4,Anacostia,Benning Road,BENNING ROAD,ANACOSTIA,7.89,MSTN_003,MSTN_001,MSTN_0010MSTN_003,20,157.8,...,2.931406,44.0,10.595066,3304.921667,4621,2076.692445,3639,48,34,0.0


## Number of Bus Lines

* Merge into the PM dataframe first based on the destinations (`ID_x`) and again based on the origins (`ID_y`)
* Drop redundant columns

In [63]:
# Bus line / stop counts for the ID_x station, then for the ID_y station.
pm_dataframe_new = (
    pm_dataframe_new
    .merge(num_bus_lines, left_on='ID_x', right_on='MSTN')
    .merge(num_bus_lines, left_on='ID_y', right_on='MSTN')
)
pm_dataframe_new.head()

Unnamed: 0,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,...,Total Households_y,AVG_TRAINS_x,AVG_TRAINS_y,parking_user,MSTN_x,bus_line_count_x,bus_stop_count_x,MSTN_y,bus_line_count_y,bus_stop_count_y
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,24,248.4,...,3639,48,34,0.0,MSTN_062,29.0,18.0,MSTN_001,54.0,36.0
1,Anacostia,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,ANACOSTIA,3.16,MSTN_002,MSTN_001,MSTN_0010MSTN_002,99,312.84,...,3639,62,34,0.0,MSTN_002,68.0,50.0,MSTN_001,54.0,36.0
2,Anacostia,Arlington Cemetery,ARLINGTON CEMETERY,ANACOSTIA,6.24,MSTN_065,MSTN_001,MSTN_0010MSTN_065,1,6.24,...,3639,24,34,0.0,MSTN_065,4.0,0.0,MSTN_001,54.0,36.0
3,Anacostia,Ballston-MU,BALLSTON-MU,ANACOSTIA,9.12,MSTN_068,MSTN_001,MSTN_0010MSTN_068,21,191.52,...,3639,50,34,0.0,MSTN_068,28.0,30.0,MSTN_001,54.0,36.0
4,Anacostia,Benning Road,BENNING ROAD,ANACOSTIA,7.89,MSTN_003,MSTN_001,MSTN_0010MSTN_003,20,157.8,...,3639,48,34,0.0,MSTN_003,19.0,34.0,MSTN_001,54.0,36.0


In [64]:
# Drop the bus-table join keys.
pm_dataframe_new = pm_dataframe_new.drop(columns=['MSTN_x', 'MSTN_y'])

## Jobs per Half Mile

* Merge into the PM dataframe first based on the destinations (`walkshed_filename_x`) and again based on the origins (`walkshed_filename_y`)

In [65]:
# Job counts are keyed by walkshed name rather than station ID: join on walkshed_filename_x
# first, then on walkshed_filename_y.
pm_dataframe_new = (
    pm_dataframe_new
    .merge(jobs_half_mile, left_on='walkshed_filename_x', right_on='Name_1')
    .merge(jobs_half_mile, left_on='walkshed_filename_y', right_on='Name_1')
)
pm_dataframe_new.head()

Unnamed: 0,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,...,AVG_TRAINS_y,parking_user,bus_line_count_x,bus_stop_count_x,bus_line_count_y,bus_stop_count_y,Name_1_x,All_Jobs_x,Name_1_y,All_Jobs_y
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,24,248.4,...,34,0.0,29.0,18.0,54.0,36.0,ADDISON ROAD-SEAT PLEASANT,205.510639,ANACOSTIA,2332.571897
1,Anacostia,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,ANACOSTIA,3.16,MSTN_002,MSTN_001,MSTN_0010MSTN_002,99,312.84,...,34,0.0,68.0,50.0,54.0,36.0,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,0.0,ANACOSTIA,2332.571897
2,Anacostia,Arlington Cemetery,ARLINGTON CEMETERY,ANACOSTIA,6.24,MSTN_065,MSTN_001,MSTN_0010MSTN_065,1,6.24,...,34,0.0,4.0,0.0,54.0,36.0,ARLINGTON CEMETERY,122.300629,ANACOSTIA,2332.571897
3,Anacostia,Ballston-MU,BALLSTON-MU,ANACOSTIA,9.12,MSTN_068,MSTN_001,MSTN_0010MSTN_068,21,191.52,...,34,0.0,28.0,30.0,54.0,36.0,BALLSTON-MU,51873.732661,ANACOSTIA,2332.571897
4,Anacostia,Benning Road,BENNING ROAD,ANACOSTIA,7.89,MSTN_003,MSTN_001,MSTN_0010MSTN_003,20,157.8,...,34,0.0,19.0,34.0,54.0,36.0,BENNING ROAD,1301.51626,ANACOSTIA,2332.571897


## Terminal Station Dummy

* Merge into the PM dataframe first based on the destinations (`ID_x`) and again based on the origins (`ID_y`)

In [66]:
# Terminal dummy for the ID_x station, then for the ID_y station.
pm_dataframe_new = (
    pm_dataframe_new
    .merge(terminal_station2, left_on='ID_x', right_on='MSTN')
    .merge(terminal_station2, left_on='ID_y', right_on='MSTN')
)

## Log of Variables and Export

* Rename the merge suffixes: `_x` (destination side) becomes `_D` and `_y` (origin side) becomes `_O`
* Drop redundant columns
* Fill 0s and NAs with 0.01 to prevent errors with taking the natural log
* Fill terminal dummy columns with 0

In [67]:
# Rename the pandas merge suffixes: _x -> _D and _y -> _O.
# The patterns are anchored to the end of the name so only a trailing suffix is rewritten
# (an "_x"/"_y" elsewhere in a column name is left alone), and regex=True is explicit because
# the default of str.replace differs between pandas versions.
pm_dataframe_new.columns = (
    pm_dataframe_new.columns
    .str.replace(r'_x$', '_D', regex=True)
    .str.replace(r'_y$', '_O', regex=True)
)
pm_dataframe_new.columns

Index(['O', 'D', 'walkshed_filename_D', 'walkshed_filename_O', 'track_miles',
       'ID_D', 'ID_O', 'pairs', 'passengers', 'riders_miles',
       'peak_fare_per_mile', 'new_auto_tt2', 'new_auto_tt_per_mile2', 'bus_tt',
       'bus_tt_per_mile', 'proportionhouses_D', 'Total Households_D',
       'proportionhouses_O', 'Total Households_O', 'AVG_TRAINS_D',
       'AVG_TRAINS_O', 'parking_user', 'bus_line_count_D', 'bus_stop_count_D',
       'bus_line_count_O', 'bus_stop_count_O', 'Name_1_D', 'All_Jobs_D',
       'Name_1_O', 'All_Jobs_O', 'MSTN_D', 'terminal_dummy_2023_D', 'MSTN_O',
       'terminal_dummy_2023_O'],
      dtype='object')

In [68]:
# Remove the walkshed-name and station-ID join keys left over from the merges.
pm_redundant_columns = ['Name_1_O', 'Name_1_D', 'MSTN_O', 'MSTN_D']
pm_dataframe_new = pm_dataframe_new.drop(columns=pm_redundant_columns)

In [69]:
# Columns that get a natural-log counterpart, in the order the log columns are appended.
pm_log_sources = [
    'passengers', 'riders_miles',
    'peak_fare_per_mile',
    'new_auto_tt2', 'new_auto_tt_per_mile2', 'bus_tt_per_mile', 'bus_tt',
    'proportionhouses_O', 'Total Households_O', 'proportionhouses_D',
    'Total Households_D', 'parking_user', 'bus_line_count_O',
    'bus_stop_count_O', 'bus_line_count_D', 'bus_stop_count_D',
    'All_Jobs_O', 'All_Jobs_D', 'AVG_TRAINS_O',
    'AVG_TRAINS_D',
]
# Each log column is named "log_" + its source column.
pm_log_targets = ['log_' + source for source in pm_log_sources]

# Zeros and missing values are swapped for 0.01 first so that np.log never sees 0 or NaN.
pm_log_inputs = pm_dataframe_new[pm_log_sources].replace({0 : 0.01, np.nan:0.01})
pm_dataframe_new[pm_log_targets] = np.log(pm_log_inputs)


In [70]:
# A missing terminal flag is stored as 0.
for dummy_col in ('terminal_dummy_2023_O', 'terminal_dummy_2023_D'):
    pm_dataframe_new[dummy_col] = pm_dataframe_new[dummy_col].fillna(0)

In [71]:
# Preview the first rows of the finished PM table, including the new log_ columns.
pm_dataframe_new.head()

Unnamed: 0,O,D,walkshed_filename_D,walkshed_filename_O,track_miles,ID_D,ID_O,pairs,passengers,riders_miles,...,log_Total Households_D,log_parking_user,log_bus_line_count_O,log_bus_stop_count_O,log_bus_line_count_D,log_bus_stop_count_D,log_All_Jobs_O,log_All_Jobs_D,log_AVG_TRAINS_O,log_AVG_TRAINS_D
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,24,248.4,...,7.416378,-4.60517,3.988984,3.583519,3.367296,2.890372,7.754727,5.325498,3.526361,3.871201
1,Anacostia,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,ANACOSTIA,3.16,MSTN_002,MSTN_001,MSTN_0010MSTN_002,99,312.84,...,8.008698,-4.60517,3.988984,3.583519,4.219508,3.912023,7.754727,-4.60517,3.526361,4.127134
2,Anacostia,Arlington Cemetery,ARLINGTON CEMETERY,ANACOSTIA,6.24,MSTN_065,MSTN_001,MSTN_0010MSTN_065,1,6.24,...,0.0,-4.60517,3.988984,3.583519,1.386294,-4.60517,7.754727,4.806482,3.526361,3.178054
3,Anacostia,Ballston-MU,BALLSTON-MU,ANACOSTIA,9.12,MSTN_068,MSTN_001,MSTN_0010MSTN_068,21,191.52,...,9.515911,-4.60517,3.988984,3.583519,3.332205,3.401197,7.754727,10.856568,3.526361,3.912023
4,Anacostia,Benning Road,BENNING ROAD,ANACOSTIA,7.89,MSTN_003,MSTN_001,MSTN_0010MSTN_003,20,157.8,...,8.438366,-4.60517,3.988984,3.583519,2.944439,3.526361,7.754727,7.171285,3.526361,3.871201


In [72]:
# Write the finished PM table to disk. No index argument is passed, so the DataFrame index
# is written too (pandas default).
pm_dataframe_new.to_csv("Outputs/pm_dataframe_new2.csv")

# Off Peak Dataframe

* off_parking_users contains the number of passengers that used parking in the off peak period
* off_fare_track_miles contains the fare per track mile of each OD pair in the Off-peak period
* nite_wkend_jobs contains the number of night and weekend jobs within a walkshed
* median_income contains the median household income of each walkshed
* off_auto_tt contains the interpolated auto travel times for the off peak period

In [73]:
# Off-peak-specific inputs.
# Parking users per OD pair in the off-peak period.
off_parking_users = pd.read_csv("../Data Preprocessing/Parking Users/output/off_parking_updated.csv")
# Off-peak fare per track mile for each OD pair.
off_fare_track_miles = pd.read_excel("../Data Preprocessing/Fare per Track Mile/output/railOD_off_peak_fare_per_mile.xlsx")
# Night and weekend jobs within each walkshed.
nite_wkend_jobs = pd.read_excel("../Data Preprocessing/Proportion of Night and Weekend Jobs/output/Proportional_night_weekend_jobs_ML.xlsx")
# Median household income of each walkshed.
median_income = pd.read_excel("../Data Preprocessing/Median Household Income/output/proportional_walkshed_household_income_updated.xlsx")
# Interpolated auto travel times for the off-peak period.
off_auto_tt = pd.read_csv("../Data Preprocessing/Interpolated Auto Travel Times/output/off_interpolated_auto_times.csv")

## Off Peak Passenger Miles

* Filter for only passengers in the Off Peak
* Create an od_pair key column
* Merge track miles to passengers in the Off Peak, multiply the track miles by passengers, consolidate the columns, and rename the count column to passengers

In [74]:
# Keep only Off Peak ridership. .copy() gives an independent frame, so adding the key column
# below does not write into a filtered view of `passengers`.
off_passengers = passengers[passengers['period'] == 'Off Peak'].copy()

# OD key: <ID> + "0" + <ID_1>, the same format as the 'pairs' key used by the other tables.
# Built with vectorised string concatenation instead of a row-wise apply, which is faster
# and also works when the filter returns no rows.
off_passengers['pairs'] = off_passengers['ID'].astype(str) + '0' + off_passengers['ID_1'].astype(str)

In [75]:
# Join track miles to the Off Peak ridership counts on the OD key.
offmilesmerge = track_miles2.merge(off_passengers, on='pairs')

# Passenger-miles for each OD pair.
offmilesmerge['riders_miles'] = offmilesmerge['track_miles'] * offmilesmerge['COUNT']

# Base Off Peak modelling table. rename() is chained rather than applied in place so it
# never operates on a column-subset view of offmilesmerge.
off_dataframe_new = offmilesmerge[
    ['O', 'D', 'walkshed_filename_x', 'walkshed_filename_y', 'track_miles',
     'ID_x', 'ID_y', 'pairs', 'COUNT', 'riders_miles']
].rename(columns={'COUNT': 'passengers'})
off_dataframe_new.head()

Unnamed: 0,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,12,124.2
1,Archives,Addison Road,ADDISON ROAD-SEAT PLEASANT,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,8.31,MSTN_062,MSTN_002,MSTN_0020MSTN_062,20,166.2
2,Arlington Cemetery,Addison Road,ADDISON ROAD-SEAT PLEASANT,ARLINGTON CEMETERY,11.39,MSTN_062,MSTN_065,MSTN_0650MSTN_062,2,22.78
3,Ballston-MU,Addison Road,ADDISON ROAD-SEAT PLEASANT,BALLSTON-MU,14.27,MSTN_062,MSTN_068,MSTN_0680MSTN_062,25,356.75
4,Benning Road,Addison Road,ADDISON ROAD-SEAT PLEASANT,BENNING ROAD,2.46,MSTN_062,MSTN_003,MSTN_0030MSTN_062,109,268.14


## Fare per Track Mile

* Subset only the pair key, station ID, and off-peak fare columns from off_fare_track_miles
* Create a unique OD pair column
* Merge into the main Off Peak dataframe

In [76]:
# Keep the key, the two station IDs and the off-peak fare. .copy() makes the subset an
# independent frame so the assignment below does not write into a view of the loaded table.
off_fare_track_miles = off_fare_track_miles[
    ['pairs', 'O_MSTN_ID', 'D_MSTN_ID', 'off_peak_fare_per_mile']
].copy()

# Rebuild the OD key as <origin ID> + "0" + <destination ID>, overwriting the loaded 'pairs'
# values. Vectorised string concatenation replaces the row-wise apply.
off_fare_track_miles['pairs'] = (
    off_fare_track_miles['O_MSTN_ID'].astype(str) + '0' + off_fare_track_miles['D_MSTN_ID'].astype(str)
)

In [77]:
# Join the off-peak fare per track mile onto each OD row. The key is named explicitly:
# 'pairs' is the only column the two frames share, so the result is unchanged, but the join
# no longer silently changes if either frame gains another same-named column.
off_dataframe_new = off_dataframe_new.merge(off_fare_track_miles, on='pairs')
off_dataframe_new.head()

Unnamed: 0,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,O_MSTN_ID,D_MSTN_ID,off_peak_fare_per_mile
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,12,124.2,MSTN_001,MSTN_062,0.472687
1,Archives,Addison Road,ADDISON ROAD-SEAT PLEASANT,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,8.31,MSTN_062,MSTN_002,MSTN_0020MSTN_062,20,166.2,MSTN_002,MSTN_062,0.421292
2,Arlington Cemetery,Addison Road,ADDISON ROAD-SEAT PLEASANT,ARLINGTON CEMETERY,11.39,MSTN_062,MSTN_065,MSTN_0650MSTN_062,2,22.78,MSTN_065,MSTN_062,0.380859
3,Ballston-MU,Addison Road,ADDISON ROAD-SEAT PLEASANT,BALLSTON-MU,14.27,MSTN_062,MSTN_068,MSTN_0680MSTN_062,25,356.75,MSTN_068,MSTN_062,0.309068
4,Benning Road,Addison Road,ADDISON ROAD-SEAT PLEASANT,BENNING ROAD,2.46,MSTN_062,MSTN_003,MSTN_0030MSTN_062,109,268.14,MSTN_003,MSTN_062,0.765736


## Auto Travel Time in Minutes/Mile

* Merge into the main Off Peak dataframe

In [78]:
# Add the off-peak auto travel-time columns, matched on the OD key.
off_dataframe_new = pd.merge(off_dataframe_new, off_auto_tt, on='pairs')
off_dataframe_new.head()

Unnamed: 0.1,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,O_MSTN_ID,D_MSTN_ID,off_peak_fare_per_mile,Unnamed: 0,new_auto_tt2,new_auto_tt_per_mile2
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,12,124.2,MSTN_001,MSTN_062,0.472687,1,13.1,1.2657
1,Archives,Addison Road,ADDISON ROAD-SEAT PLEASANT,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,8.31,MSTN_062,MSTN_002,MSTN_0020MSTN_062,20,166.2,MSTN_002,MSTN_062,0.421292,2,13.23,1.592058
2,Arlington Cemetery,Addison Road,ADDISON ROAD-SEAT PLEASANT,ARLINGTON CEMETERY,11.39,MSTN_062,MSTN_065,MSTN_0650MSTN_062,2,22.78,MSTN_065,MSTN_062,0.380859,3,21.349543,1.874411
3,Ballston-MU,Addison Road,ADDISON ROAD-SEAT PLEASANT,BALLSTON-MU,14.27,MSTN_062,MSTN_068,MSTN_0680MSTN_062,25,356.75,MSTN_068,MSTN_062,0.309068,4,33.349998,2.337071
4,Benning Road,Addison Road,ADDISON ROAD-SEAT PLEASANT,BENNING ROAD,2.46,MSTN_062,MSTN_003,MSTN_0030MSTN_062,109,268.14,MSTN_003,MSTN_062,0.765736,5,4.91,1.995935


## Households per Half Mile

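* Merge into the Off Peak dataframe first based on the destinations (`ID_x`) and again based on the origins (`ID_y`)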
* Drop redundant columns

In [79]:
# Household counts for the ID_x station, then for the ID_y station; the second merge adds
# _x / _y suffixes to the repeated lookup columns.
off_dataframe_new = (
    off_dataframe_new
    .merge(households_half_mile, left_on='ID_x', right_on='MSTN')
    .merge(households_half_mile, left_on='ID_y', right_on='MSTN')
)
off_dataframe_new.head()

Unnamed: 0.1,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,...,off_peak_fare_per_mile,Unnamed: 0,new_auto_tt2,new_auto_tt_per_mile2,MSTN_x,proportionhouses_x,Total Households_x,MSTN_y,proportionhouses_y,Total Households_y
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,12,124.2,...,0.472687,1,13.1,1.2657,MSTN_062,529.032225,1663,MSTN_001,2076.692445,3639
1,Anacostia,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,ANACOSTIA,3.16,MSTN_002,MSTN_001,MSTN_0010MSTN_002,206,650.96,...,0.664864,183,9.632492,3.048257,MSTN_002,2215.020592,3007,MSTN_001,2076.692445,3639
2,Anacostia,Arlington Cemetery,ARLINGTON CEMETERY,ANACOSTIA,6.24,MSTN_065,MSTN_001,MSTN_0010MSTN_065,3,18.72,...,0.510586,274,14.017488,2.246392,MSTN_065,0.003139,1,MSTN_001,2076.692445,3639
3,Anacostia,Ballston-MU,BALLSTON-MU,ANACOSTIA,9.12,MSTN_068,MSTN_001,MSTN_0010MSTN_068,23,209.76,...,0.394454,365,26.440001,2.899123,MSTN_068,11605.121631,13574,MSTN_001,2076.692445,3639
4,Anacostia,Benning Road,BENNING ROAD,ANACOSTIA,7.89,MSTN_003,MSTN_001,MSTN_0010MSTN_003,19,149.91,...,0.626072,456,16.366593,2.074346,MSTN_003,3304.921667,4621,MSTN_001,2076.692445,3639


In [80]:
# Discard the fare-table and household-table join keys.
off_leftover_columns = ['O_MSTN_ID', 'D_MSTN_ID', 'MSTN_x', 'MSTN_y']
off_dataframe_new = off_dataframe_new.drop(columns=off_leftover_columns)

## Off Peak Parking Users

* Give station names in off_parking_users proper MSTN IDs
* Create an OD pair column
* Subset only the pairs column and the parking_users
* Merge into the main Off Peak dataframe

In [81]:
# Step 1: look up the station record for each trip's START_PLACE_NAME.
off_parking_with_start = mstns.merge(
    off_parking_users, left_on='WMATA_filename', right_on='START_PLACE_NAME'
)
# Step 2: look up the station record for END_PLACE_NAME. The mstns columns now exist on both
# sides, so the end-station copy gets the _x suffix and the start-station copy gets _y.
off_parking_users2 = mstns.merge(
    off_parking_with_start, left_on='WMATA_filename', right_on='END_PLACE_NAME'
)

In [82]:
# OD key: <start-station ID> + "0" + <end-station ID>, the same format as the 'pairs' key of
# the main dataframe. Vectorised string concatenation replaces the row-wise apply: it is
# faster and also works when the frame has no rows.
off_parking_users2['pairs'] = (
    off_parking_users2['ID_y'].astype(str) + '0' + off_parking_users2['ID_x'].astype(str)
)

In [83]:
# Keep only the merge key and the parking-user count.
off_parking_users2 = off_parking_users2.loc[:, ['pairs', 'parking_user']]

In [84]:
# Left join so OD pairs without a parking record are kept (parking_user is NaN there).
off_dataframe_new = pd.merge(
    off_dataframe_new,
    off_parking_users2,
    how='left',
    on='pairs',
)
off_dataframe_new.head()

Unnamed: 0.1,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,off_peak_fare_per_mile,Unnamed: 0,new_auto_tt2,new_auto_tt_per_mile2,proportionhouses_x,Total Households_x,proportionhouses_y,Total Households_y,parking_user
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,12,124.2,0.472687,1,13.1,1.2657,529.032225,1663,2076.692445,3639,0.0
1,Anacostia,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,ANACOSTIA,3.16,MSTN_002,MSTN_001,MSTN_0010MSTN_002,206,650.96,0.664864,183,9.632492,3.048257,2215.020592,3007,2076.692445,3639,6.0
2,Anacostia,Arlington Cemetery,ARLINGTON CEMETERY,ANACOSTIA,6.24,MSTN_065,MSTN_001,MSTN_0010MSTN_065,3,18.72,0.510586,274,14.017488,2.246392,0.003139,1,2076.692445,3639,0.0
3,Anacostia,Ballston-MU,BALLSTON-MU,ANACOSTIA,9.12,MSTN_068,MSTN_001,MSTN_0010MSTN_068,23,209.76,0.394454,365,26.440001,2.899123,11605.121631,13574,2076.692445,3639,0.0
4,Anacostia,Benning Road,BENNING ROAD,ANACOSTIA,7.89,MSTN_003,MSTN_001,MSTN_0010MSTN_003,19,149.91,0.626072,456,16.366593,2.074346,3304.921667,4621,2076.692445,3639,0.0


## Number of Bus Lines

* Merge into the Off Peak dataframe first based on the origins and again based on the destinations
* Drop redundant columns

In [85]:
# Attach bus line/stop counts for the ID_x station, then for the ID_y station.
# The second merge makes the count columns collide, so pandas suffixes the
# first set with _x and the second set with _y.
off_dataframe_new = (
    off_dataframe_new
    .merge(num_bus_lines, left_on='ID_x', right_on='MSTN')
    .merge(num_bus_lines, left_on='ID_y', right_on='MSTN')
)
off_dataframe_new.head()

Unnamed: 0,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,...,Total Households_x,proportionhouses_y,Total Households_y,parking_user,MSTN_x,bus_line_count_x,bus_stop_count_x,MSTN_y,bus_line_count_y,bus_stop_count_y
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,12,124.2,...,1663,2076.692445,3639,0.0,MSTN_062,29.0,18.0,MSTN_001,54.0,36.0
1,Anacostia,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,ANACOSTIA,3.16,MSTN_002,MSTN_001,MSTN_0010MSTN_002,206,650.96,...,3007,2076.692445,3639,6.0,MSTN_002,68.0,50.0,MSTN_001,54.0,36.0
2,Anacostia,Arlington Cemetery,ARLINGTON CEMETERY,ANACOSTIA,6.24,MSTN_065,MSTN_001,MSTN_0010MSTN_065,3,18.72,...,1,2076.692445,3639,0.0,MSTN_065,4.0,0.0,MSTN_001,54.0,36.0
3,Anacostia,Ballston-MU,BALLSTON-MU,ANACOSTIA,9.12,MSTN_068,MSTN_001,MSTN_0010MSTN_068,23,209.76,...,13574,2076.692445,3639,0.0,MSTN_068,28.0,30.0,MSTN_001,54.0,36.0
4,Anacostia,Benning Road,BENNING ROAD,ANACOSTIA,7.89,MSTN_003,MSTN_001,MSTN_0010MSTN_003,19,149.91,...,4621,2076.692445,3639,0.0,MSTN_003,19.0,34.0,MSTN_001,54.0,36.0


In [86]:
# The MSTN key columns brought in by the bus-line merges duplicate ID_x / ID_y.
off_dataframe_new = off_dataframe_new.drop(columns=['MSTN_x', 'MSTN_y'])

## Night and Weekend Jobs

* Merge into the Off Peak dataframe first based on the origins and again based on the destinations

In [87]:
# Join night/weekend jobs on the walkshed name, first for walkshed_filename_x
# and then for walkshed_filename_y; the job columns end up suffixed _x and _y.
off_dataframe_new = (
    off_dataframe_new
    .merge(nite_wkend_jobs, left_on='walkshed_filename_x', right_on='Name_1')
    .merge(nite_wkend_jobs, left_on='walkshed_filename_y', right_on='Name_1')
)
off_dataframe_new.head()

Unnamed: 0,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,...,Total Households_y,parking_user,bus_line_count_x,bus_stop_count_x,bus_line_count_y,bus_stop_count_y,Name_1_x,Proportion night weekend jobs_x,Name_1_y,Proportion night weekend jobs_y
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,12,124.2,...,3639,0.0,29.0,18.0,54.0,36.0,ADDISON ROAD-SEAT PLEASANT,10.449704,ANACOSTIA,134.785117
1,Anacostia,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,ANACOSTIA,3.16,MSTN_002,MSTN_001,MSTN_0010MSTN_002,206,650.96,...,3639,6.0,68.0,50.0,54.0,36.0,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,0.0,ANACOSTIA,134.785117
2,Anacostia,Arlington Cemetery,ARLINGTON CEMETERY,ANACOSTIA,6.24,MSTN_065,MSTN_001,MSTN_0010MSTN_065,3,18.72,...,3639,0.0,4.0,0.0,54.0,36.0,ARLINGTON CEMETERY,63.154119,ANACOSTIA,134.785117
3,Anacostia,Ballston-MU,BALLSTON-MU,ANACOSTIA,9.12,MSTN_068,MSTN_001,MSTN_0010MSTN_068,23,209.76,...,3639,0.0,28.0,30.0,54.0,36.0,BALLSTON-MU,2731.168835,ANACOSTIA,134.785117
4,Anacostia,Benning Road,BENNING ROAD,ANACOSTIA,7.89,MSTN_003,MSTN_001,MSTN_0010MSTN_003,19,149.91,...,3639,0.0,19.0,34.0,54.0,36.0,BENNING ROAD,309.339636,ANACOSTIA,134.785117


## Median Household Income

* Rename the station name column to make it easier for merging 
* Merge into the Off Peak dataframe first based on the origins and again based on the destinations
* Fill any NAs with 0

In [88]:
# Use a distinct key name (Name_2) so it does not collide with the Name_1
# columns already added by the night/weekend jobs merge.
median_income = median_income.rename(columns={'Name_1': 'Name_2'})

In [89]:
# Join median household income on the walkshed name, first for
# walkshed_filename_x and then for walkshed_filename_y.
off_dataframe_new = (
    off_dataframe_new
    .merge(median_income, left_on='walkshed_filename_x', right_on='Name_2')
    .merge(median_income, left_on='walkshed_filename_y', right_on='Name_2')
)
off_dataframe_new.head()

Unnamed: 0.1,O,D,walkshed_filename_x,walkshed_filename_y,track_miles,ID_x,ID_y,pairs,passengers,riders_miles,...,Name_1_x,Proportion night weekend jobs_x,Name_1_y,Proportion night weekend jobs_y,Unnamed: 0_y,Name_2_x,Median household income_x,Unnamed: 0,Name_2_y,Median household income_y
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,12,124.2,...,ADDISON ROAD-SEAT PLEASANT,10.449704,ANACOSTIA,134.785117,0,ADDISON ROAD-SEAT PLEASANT,77702.0,1,ANACOSTIA,36323.0
1,Anacostia,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,ANACOSTIA,3.16,MSTN_002,MSTN_001,MSTN_0010MSTN_002,206,650.96,...,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,0.0,ANACOSTIA,134.785117,2,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,135011.0,1,ANACOSTIA,36323.0
2,Anacostia,Arlington Cemetery,ARLINGTON CEMETERY,ANACOSTIA,6.24,MSTN_065,MSTN_001,MSTN_0010MSTN_065,3,18.72,...,ARLINGTON CEMETERY,63.154119,ANACOSTIA,134.785117,3,ARLINGTON CEMETERY,,1,ANACOSTIA,36323.0
3,Anacostia,Ballston-MU,BALLSTON-MU,ANACOSTIA,9.12,MSTN_068,MSTN_001,MSTN_0010MSTN_068,23,209.76,...,BALLSTON-MU,2731.168835,ANACOSTIA,134.785117,5,BALLSTON-MU,127512.0,1,ANACOSTIA,36323.0
4,Anacostia,Benning Road,BENNING ROAD,ANACOSTIA,7.89,MSTN_003,MSTN_001,MSTN_0010MSTN_003,19,149.91,...,BENNING ROAD,309.339636,ANACOSTIA,134.785117,6,BENNING ROAD,52158.0,1,ANACOSTIA,36323.0


In [90]:
# Missing median incomes become 0; only the two income columns are touched.
off_dataframe_new = off_dataframe_new.fillna(
    {
        'Median household income_x': 0,
        'Median household income_y': 0,
    }
)

## Terminal Station Dummy

* Merge into the Off Peak dataframe first based on the origins and again based on the destinations

In [91]:
# Attach the terminal-station dummy for the ID_x station, then for the ID_y
# station; the second merge suffixes the dummy and MSTN columns with _x / _y.
off_dataframe_new = (
    off_dataframe_new
    .merge(terminal_station2, left_on='ID_x', right_on='MSTN')
    .merge(terminal_station2, left_on='ID_y', right_on='MSTN')
)

## Log of Variables and Export

* Rename the merge suffixes in the column names: `_x` becomes `_D` (destination) and `_y` becomes `_O` (origin)
* Drop redundant columns
* Fill 0s and NAs with 0.01 to prevent errors with taking the natural log
* Fill terminal dummy columns with 0

In [92]:
# Turn the pandas merge suffixes into origin/destination labels:
# _x -> _D and _y -> _O. The patterns are anchored to the end of the name
# (regex=True with `$`) so that only a trailing suffix is rewritten and a
# column that merely contains "_x" or "_y" in the middle is left untouched.
off_dataframe_new.columns = (
    off_dataframe_new.columns
    .str.replace(r'_x$', '_D', regex=True)
    .str.replace(r'_y$', '_O', regex=True)
)
off_dataframe_new.columns

Index(['O', 'D', 'walkshed_filename_D', 'walkshed_filename_O', 'track_miles',
       'ID_D', 'ID_O', 'pairs', 'passengers', 'riders_miles',
       'off_peak_fare_per_mile', 'Unnamed: 0_D', 'new_auto_tt2',
       'new_auto_tt_per_mile2', 'proportionhouses_D', 'Total Households_D',
       'proportionhouses_O', 'Total Households_O', 'parking_user',
       'bus_line_count_D', 'bus_stop_count_D', 'bus_line_count_O',
       'bus_stop_count_O', 'Name_1_D', 'Proportion night weekend jobs_D',
       'Name_1_O', 'Proportion night weekend jobs_O', 'Unnamed: 0_O',
       'Name_2_D', 'Median household income_D', 'Unnamed: 0', 'Name_2_O',
       'Median household income_O', 'MSTN_D', 'terminal_dummy_2023_D',
       'MSTN_O', 'terminal_dummy_2023_O'],
      dtype='object')

In [93]:
# Discard the merge keys and leftover 'Unnamed: 0' index columns that the
# lookup tables brought along.
off_dataframe_new = off_dataframe_new.drop(
    columns=[
        'Name_1_O', 'Name_1_D',
        'Unnamed: 0_O', 'Name_2_O',
        'Unnamed: 0_D', 'Name_2_D',
        'MSTN_O', 'MSTN_D',
        'Unnamed: 0',
    ]
)

In [94]:
# Show the column labels that remain after the preceding drop.
off_dataframe_new.columns

Index(['O', 'D', 'walkshed_filename_D', 'walkshed_filename_O', 'track_miles',
       'ID_D', 'ID_O', 'pairs', 'passengers', 'riders_miles',
       'off_peak_fare_per_mile', 'new_auto_tt2', 'new_auto_tt_per_mile2',
       'proportionhouses_D', 'Total Households_D', 'proportionhouses_O',
       'Total Households_O', 'parking_user', 'bus_line_count_D',
       'bus_stop_count_D', 'bus_line_count_O', 'bus_stop_count_O',
       'Proportion night weekend jobs_D', 'Proportion night weekend jobs_O',
       'Median household income_D', 'Median household income_O',
       'terminal_dummy_2023_D', 'terminal_dummy_2023_O'],
      dtype='object')

In [95]:
# Columns to log-transform; each result is stored as 'log_<source column>'.
log_source_columns = [
    'passengers', 'riders_miles',
    'off_peak_fare_per_mile',
    'new_auto_tt_per_mile2', 'new_auto_tt2',
    'proportionhouses_O', 'Total Households_O',
    'proportionhouses_D', 'Total Households_D',
    'parking_user',
    'bus_line_count_O', 'bus_stop_count_O',
    'bus_line_count_D', 'bus_stop_count_D',
    'Proportion night weekend jobs_O', 'Proportion night weekend jobs_D',
    'Median household income_O', 'Median household income_D',
]

# np.log(0) is -inf and NaN stays NaN, so both are swapped for 0.01 before
# taking the natural log.
log_inputs = off_dataframe_new[log_source_columns].replace({0: 0.01, np.nan: 0.01})
off_dataframe_new[['log_' + name for name in log_source_columns]] = np.log(log_inputs)


In [96]:
# Any missing terminal-station dummy is treated as 0.
off_dataframe_new = off_dataframe_new.fillna(
    {
        'terminal_dummy_2023_O': 0,
        'terminal_dummy_2023_D': 0,
    }
)

In [97]:
# Display the assembled Off Peak dataframe (pandas truncates the rendering).
off_dataframe_new

Unnamed: 0,O,D,walkshed_filename_D,walkshed_filename_O,track_miles,ID_D,ID_O,pairs,passengers,riders_miles,...,log_Total Households_D,log_parking_user,log_bus_line_count_O,log_bus_stop_count_O,log_bus_line_count_D,log_bus_stop_count_D,log_Proportion night weekend jobs_O,log_Proportion night weekend jobs_D,log_Median household income_O,log_Median household income_D
0,Anacostia,Addison Road,ADDISON ROAD-SEAT PLEASANT,ANACOSTIA,10.35,MSTN_062,MSTN_001,MSTN_0010MSTN_062,12,124.20,...,7.416378,-4.605170,3.988984,3.583519,3.367296,2.890372,4.903682,2.346574,10.500206,11.260636
1,Anacostia,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,ANACOSTIA,3.16,MSTN_002,MSTN_001,MSTN_0010MSTN_002,206,650.96,...,8.008698,1.791759,3.988984,3.583519,4.219508,3.912023,4.903682,-4.605170,10.500206,11.813112
2,Anacostia,Arlington Cemetery,ARLINGTON CEMETERY,ANACOSTIA,6.24,MSTN_065,MSTN_001,MSTN_0010MSTN_065,3,18.72,...,0.000000,-4.605170,3.988984,3.583519,1.386294,-4.605170,4.903682,4.145578,10.500206,-4.605170
3,Anacostia,Ballston-MU,BALLSTON-MU,ANACOSTIA,9.12,MSTN_068,MSTN_001,MSTN_0010MSTN_068,23,209.76,...,9.515911,-4.605170,3.988984,3.583519,3.332205,3.401197,4.903682,7.912485,10.500206,11.755966
4,Anacostia,Benning Road,BENNING ROAD,ANACOSTIA,7.89,MSTN_003,MSTN_001,MSTN_0010MSTN_003,19,149.91,...,8.438366,-4.605170,3.988984,3.583519,2.944439,3.526361,4.903682,5.734440,10.500206,10.862033
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6290,Addison Road,West Hyattsville,WEST HYATTSVILLE,ADDISON ROAD-SEAT PLEASANT,15.52,MSTN_055,MSTN_062,MSTN_0620MSTN_055,5,77.60,...,7.992945,-4.605170,3.367296,2.890372,2.397895,2.639057,2.346574,4.562612,11.260636,11.128586
6291,Addison Road,Wheaton,WHEATON,ADDISON ROAD-SEAT PLEASANT,20.28,MSTN_046,MSTN_062,MSTN_0620MSTN_046,6,121.68,...,8.263333,-4.605170,3.367296,2.890372,3.135494,2.995732,2.346574,7.236123,11.260636,11.418406
6292,Addison Road,Wiehle,WIEHLE-RESTON EAST,ADDISON ROAD-SEAT PLEASANT,29.36,MSTN_091,MSTN_062,MSTN_0620MSTN_091,7,205.52,...,8.272571,-4.605170,3.367296,2.890372,0.693147,0.000000,2.346574,-4.605170,11.260636,11.992486
6293,Addison Road,Woodley Park,WOODLEY PARK-ZOO/ADAMS MORGAN,ADDISON ROAD-SEAT PLEASANT,11.41,MSTN_040,MSTN_062,MSTN_0620MSTN_040,24,273.84,...,8.929170,-4.605170,3.367296,2.890372,2.772589,2.995732,2.346574,8.054467,11.260636,11.887120


In [98]:
# Write the final Off Peak dataframe (index included) to the Outputs folder.
off_peak_csv_path = "Outputs/off_dataframe_new2.csv"
off_dataframe_new.to_csv(off_peak_csv_path)