# Prepare Data Pt. 1 (`data_prepar_locals`)

This emulates `data_prepar_locals.m`.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the 2010 AADT estimates.  Column names are supplied explicitly, so the
# first line of the file is read as data rather than as a header.
aadts_raw = pd.read_csv(
    '/home/czhu/Data/btp_sandbox_prep_kriglocaldata/all_AADT_2010.txt',
    header=None,
    names=['centreline_id', 'AADT'],
)

In [3]:
# Size of the raw AADT table as (rows, columns).
aadts_raw.shape

(7160, 2)

In [4]:
# Count the distinct centreline IDs; a value smaller than the row count means
# some IDs appear more than once.
aadts_raw['centreline_id'].unique().shape

(7097,)

In [5]:
# Group the raw rows by centreline ID, then show a sample of the IDs that occur
# more than once (the pandas equivalent of SQL's GROUP BY ... HAVING COUNT(*) > 1).
# https://stackoverflow.com/questions/22105452/what-is-the-equivalent-of-sql-group-by-having-on-pandas
aadts_gb = aadts_raw.groupby('centreline_id')
(
    aadts_gb
    .filter(lambda grp: grp.shape[0] > 1)
    .sort_values(by='centreline_id')
    .head()
)

Unnamed: 0,centreline_id,AADT
94,890,68240.51802
7097,890,61815.30708
100,1022,64098.49742
7098,1022,55670.13574
655,106797,2650.273397


These are recognizably the PTCs, which are duplicated because they're often also STTCs in the off years.  The loop over `input_population_data` in `data_prepar_locals.m` effectively overwrites the STTC estimate with the PTC one, so let's do that here as well.

In [6]:
# Reproduces input_population_data (why is it called that??)
# Keep one AADT per centreline ID: the last (non-null) value listed in the file
# for that ID, returned as a one-column frame indexed by centreline_id.
aadts = aadts_gb['AADT'].last().to_frame()

Read Excel spreadsheets.  According to Arman:

> `Landuse_pop_lane_speed.xlsx` - It has been generated for all roads in the city and contains land use data and information come from different sources, e.g., the City of Toronto (the City of Toronto Open Data portal (June 2016), university shape files ([Census](https://mdl.library.utoronto.ca/collections/numeric-data/census-canada/2016)). Land use, building footprint, were extracted from the DMTI Spatial Inc.

> `predictors_300m.xlsx` - This has been also generated for all roads using 300 meter buffers in the city and IT contains land use data and information come from different sources, e.g., the City of Toronto, university shape files,  etc.

> `poprate300.xlsx`: This is the population rate that has been estimated based on 2016 and 2011 population in Toronto. Values extracted by intersecting roads buffer and population map with 200 and 300 meters buffer. I think this one has 300 meter buffers as indicated in its name. As it extracted for all roads in the city no need to any further analysis.

From `data_prepar_locals.m` I found that `poprate` in `predictors_300m.xlsx` is the year-on-year fractional growth factor such that `Population(this_year) = Population(census_year) * (1 + growth_factor)^(this_year - census_year)`.  Also, `census_year = 2011`.

In [7]:
# preds300 = pd.read_excel(('/mnt/c/Users/czhu5/Documents/VolumeModel/'
#                            'TEPS-dev/PRTCS/negative/locals/predictors_300m.xlsx'),
#                           usecols=range(7))
# landuse = pd.read_excel(('/mnt/c/Users/czhu5/Documents/VolumeModel/'
#                            'TEPS-dev/PRTCS/negative/locals/Landuse_pop_lane_speed.xlsx'))
# poprate = pd.read_excel(('/mnt/c/Users/czhu5/Documents/VolumeModel/'
#                            'TEPS-dev/PRTCS/negative/locals/poprate300.xlsx'))
# # Combine the files into a single network features data store.
# nf = pd.HDFStore('/home/czhu/Data/btp_sandbox_prep_kriglocaldata/land_data.hdf5', 'w')
# nf['preds300'] = preds300
# nf['landuse'] = landuse
# nf['poprate'] = poprate
# nf.close()

# Load the three network-feature tables from the combined HDF5 store.  The store
# is opened read-only: pandas' default mode is 'a' (append), which requires write
# access and would silently create an empty file if the path were wrong.
with pd.HDFStore('/home/czhu/Data/btp_sandbox_prep_kriglocaldata/land_data.hdf5', mode='r') as nf:
    preds300 = nf['preds300'].copy()
    landuse = nf['landuse'].copy()
    poprate = nf['poprate'].copy()

# Replace the spreadsheet headers with the column names used in this notebook.
preds300.columns = ['centreline_id', 'population density', 'res', 'com', 'emp', 'gov', 'ind']
landuse.columns = ['centreline_id', 'AADT', 'Sum_pop', 'number of lanes', 'speed limit', 'employment', 'commercial',
                   'industrial', 'government', 'road type']
poprate.columns = ['centreline_id', 'poprate']

# Index every table by centreline_id so they can be joined on the index later.
# Rebinding (rather than inplace=True) keeps each step a plain assignment.
preds300 = preds300.set_index('centreline_id')
poprate = poprate.set_index('centreline_id')

# Drop dummy AADT values.
landuse = landuse.set_index('centreline_id').drop(columns='AADT')

# Store road type as a string label: the integer code as text, or the literal
# string 'NaN' when the value is missing.
landuse['road type'] = landuse['road type'].apply(
    lambda code: 'NaN' if np.isnan(code) else str(int(code)))

preds300.head()

Unnamed: 0_level_0,population density,res,com,emp,gov,ind
centreline_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
103,50.226852,0.0,0.0,0.0,0.0,0.0
106,135.564489,27377.604272,0.0,0.0,0.0,0.0
107,67.888991,30447.029986,0.0,0.0,0.0,0.0
108,50.125568,23876.798219,0.0,0.0,0.0,0.0
112,103.211378,31282.014748,0.0,0.0,0.0,14884.321115


In [8]:
# Preview the land-use table (indexed by centreline_id, dummy AADT column removed).
landuse.head()

Unnamed: 0_level_0,Sum_pop,number of lanes,speed limit,employment,commercial,industrial,government,road type
centreline_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
103,7824.516305,0,0,1025477.0,29105.308526,1122606.0,33081.12802,
106,8490.111947,0,0,274663.0,84597.684531,399681.0,322477.021673,
107,7520.466102,0,0,214657.9,84955.419497,329367.5,352416.65571,
108,7297.781592,2,45,250720.6,84597.684531,358985.5,301465.93209,37.0
112,8050.696644,0,0,192357.0,99521.912333,292031.1,418089.930909,


In [9]:
# Preview the poprate table (indexed by centreline_id).
poprate.head()

Unnamed: 0_level_0,poprate
centreline_id,Unnamed: 1_level_1
103,0.026667
106,0.025
107,0.025
108,0.025
112,0.025


In [10]:
# Replicates PRTCS/negative/locals/AADT_Landuse_pop_lane_speed2_{YEAR}.xlsx, except that:
# - AADTs are NaN instead of 0 when they're not in all_AADT_{YEAR}.txt.
# Left-join on the shared centreline_id index so every land-use row is kept,
# then reorder the columns so AADT comes first.
aadt_landuse_2_2010 = landuse.join(aadts, how='left').loc[:, [
    'AADT',
    'Sum_pop',
    'number of lanes',
    'speed limit',
    'employment',
    'commercial',
    'industrial',
    'government',
    'road type',
]]

In [11]:
# Preview the first 10 rows of the merged land-use/AADT table.
aadt_landuse_2_2010.head(10)

Unnamed: 0_level_0,AADT,Sum_pop,number of lanes,speed limit,employment,commercial,industrial,government,road type
centreline_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
103,,7824.516305,0,0,1025477.0,29105.308526,1122606.0,33081.12802,
106,,8490.111947,0,0,274663.0,84597.684531,399681.0,322477.021673,
107,,7520.466102,0,0,214657.9,84955.419497,329367.5,352416.65571,
108,,7297.781592,2,45,250720.6,84597.684531,358985.5,301465.93209,37.0
112,,8050.696644,0,0,192357.0,99521.912333,292031.1,418089.930909,
116,,8982.620851,0,0,1335915.0,60848.25338,1349389.0,28300.515157,
117,1572.582156,7483.820976,2,45,195526.9,100485.535641,291165.3,367538.382684,37.0
118,,7325.674941,2,45,158568.7,100485.535641,249236.3,397774.774659,37.0
119,,8059.289082,0,0,123578.1,98726.479331,214507.3,428313.713906,
120,,8122.45283,2,45,385064.4,84753.810793,503392.5,283146.293595,37.0


In [12]:
# List the columns left in landuse after the dummy AADT column was dropped.
landuse.columns

Index(['Sum_pop', 'number of lanes', 'speed limit', 'employment', 'commercial',
       'industrial', 'government', 'road type'],
      dtype='object')

In [13]:
# Replicates PRTCS/output_for_local{YEAR}negative/AADT_Landuse_pop_lane_speed3_2010.csv, except that:
# - When AADT is NaN it's np.nan and not -999
# - When road type is NaN it's 'NaN' and not -999
#
# Steps, all joined on the centreline_id index with left joins:
#   1. start from the 300 m buffer land-use predictors,
#   2. attach AADT, the population growth rate and the road attributes,
#   3. scale Sum_pop from the 2011 census year to 2010 using the growth rate,
#   4. keep only the output columns (this discards Sum_pop and poprate),
#   5. drop all null rows, like data_prepar_locals drops `output2`.
aadt_landuse_pls_3_2010 = (
    preds300[['com', 'emp', 'gov', 'ind']]
    .join(aadts, how='left')
    .join(poprate, how='left')
    .join(landuse[['Sum_pop', 'speed limit', 'number of lanes', 'road type']], how='left')
    .assign(Sum_pop_2010=lambda frame: frame['Sum_pop'] * (1. + frame['poprate'])**(2010 - 2011))
    .loc[:, ['AADT', 'Sum_pop_2010', 'number of lanes', 'speed limit',
             'emp', 'com', 'ind', 'gov', 'road type']]
    .dropna()
)

In [14]:
# Preview the first 10 rows of the null-free predictor table.
aadt_landuse_pls_3_2010.head(10)

Unnamed: 0_level_0,AADT,Sum_pop_2010,number of lanes,speed limit,emp,com,ind,gov,road type
centreline_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
117,1572.582156,7301.288757,2.0,45.0,0.0,0.0,4835.90926,0.0,37
127,1732.243201,7319.25842,2.0,45.0,0.0,0.0,11700.732326,0.0,37
131,1647.319288,7321.374076,2.0,45.0,0.0,0.0,5530.18479,0.0,37
142,1584.905327,7332.225423,2.0,45.0,0.0,0.0,1289.312517,0.0,37
161,928.023971,7310.267086,2.0,45.0,0.0,0.0,0.0,0.0,14
163,1662.666983,7331.786449,2.0,45.0,0.0,0.0,672.399736,0.0,37
169,1819.566543,7271.304906,2.0,45.0,0.0,0.0,0.0,0.0,14
170,103.462577,7890.884929,2.0,45.0,0.0,0.0,178.010183,0.0,37
175,2414.197091,7877.694268,2.0,45.0,0.0,0.0,0.0,0.0,37
180,1957.822783,7247.324752,2.0,45.0,0.0,0.0,0.0,807.531505,14


**Checked these against the xlsx and csv outputs from TEPs - they're identical.**

In [15]:
# Create pkdata.hdf5 for part 2 of this exercise.
# Mode 'w' overwrites any existing file.  The `with` block closes the store on
# exit, so no explicit close() call is needed.
# NOTE(review): only aadt_landuse_2_2010 is saved here; aadt_landuse_pls_3_2010
# is not written — confirm part 2 does not need it.
with pd.HDFStore('/home/czhu/Data/btp_sandbox_prep_kriglocaldata/data/pkdata2010.hdf5', 'w') as pkstore:
    pkstore['aadt_landuse_2_2010'] = aadt_landuse_2_2010