# Prepare Data Pt. 2 (`data_prep_kriging`)

This emulates `data_prep_kriging.m`.  **IT MUST BE RUN AFTER `Prepare Locals.ipynb`, SINCE THAT CREATES `pkdata.hdf5`.**

In [1]:
import pandas as pd
import numpy as np

In [4]:
# Load the direction table and the 2010 AADT / land-use table from the HDF5 store.
#
# For reference, the disabled snippet below shows how `directions` was originally
# built from a CSV and written into an HDF5 store:
# directions = pd.read_csv(('/mnt/c/Users/czhu5/Documents/VolumeModel/'
#                           'TEPS-dev/PRTCS/direction3.csv'))
# directions.sort_values('centreline_id', inplace=True)
# directions.reset_index(drop=True, inplace=True)
# pkstore = pd.HDFStore('./data/pkdata.hdf5', 'r+')
# pkstore.put('directions', directions)
# pkstore.close()

# Open read-only: the default mode ('a') would silently create an empty store if the
# file were missing and then fail with a confusing KeyError on the first lookup.
with pd.HDFStore('/home/czhu/Data/btp_sandbox_prep_kriglocaldata/pkdata2010.hdf5',
                 mode='r') as pkstore:
    directions = pkstore['directions']
    aadt_landuse_2_2010 = pkstore['aadt_landuse_2_2010']

In [5]:
# Preview the first rows of the direction table.
directions.head()

Unnamed: 0,centreline_id,dir_bin
0,108,-1
1,108,1
2,117,-1
3,117,1
4,118,-1


In [6]:
# Preview the first rows of the AADT / land-use table.
aadt_landuse_2_2010.head()

Unnamed: 0_level_0,AADT,Sum_pop,number of lanes,speed limit,employment,commercial,industrial,government,road type
centreline_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
103,,7824.516305,0,0,1025477.0,29105.308526,1122606.0,33081.12802,
106,,8490.111947,0,0,274663.0,84597.684531,399681.0,322477.021673,
107,,7520.466102,0,0,214657.9,84955.419497,329367.5,352416.65571,
108,,7297.781592,2,45,250720.6,84597.684531,358985.5,301465.93209,37.0
112,,8050.696644,0,0,192357.0,99521.912333,292031.1,418089.930909,


In [7]:
# Number of index entries (rows) in the AADT / land-use table.
aadt_landuse_2_2010.index.shape

(64848,)

### Produce `resmat{YEAR}.txt`

Equivalent to `b` in `data_prep_kriging.m`.

The exact definition of `b`'s columns is given by `b2`'s:
```
b2=[ref_id_start_point_t ref_id_stop_point count_t speed_limit_t population_t ...
    lane_num_t end_point_t start_point_t speed_limit_t.*lane_num_t commer_t ...
    indus_t gov_t emplo_t dummy_road_type_t Dist_t];
```

The only thing my `resmat` doesn't have is `speed_limit_t.*lane_num_t`, which is trivial to calculate.  (Also we have speed limits from two different sources, but I'm lazy.)

In [37]:
# Read in the raw centreline-to-centreline distance table, keeping only the
# origin/destination IDs, the network distance and the speed limit.
# Open read-only: the default mode ('a') would silently create an empty store if the
# file were missing and then fail with a confusing KeyError on the lookup.
with pd.HDFStore('/home/czhu/Data/btp_sandbox_prep_kriglocaldata/resmat.hdf5',
                 mode='r') as rsm:
    resmat = rsm['resmat']
    resmat = resmat[['origin_centreline_id', 'dest_centreline_id',
                     'network_distance', 'speed_limit']]

In [38]:
# (rows, columns) of the distance table as loaded.
resmat.shape

(4657659, 4)

In [39]:
# Attach the AADT / land-use attributes of each destination segment.  A left join
# keeps every distance row, whether or not a land-use record exists for it.
resmat = resmat.join(
    aadt_landuse_2_2010,
    on='dest_centreline_id',
    how='left',
    rsuffix='_r',
)
resmat.shape

(4657659, 13)

In [40]:
# Keep pairs whose destination has at least one lane and whose network distance is
# below 2.  Comparisons against NaN are False, so null rows are dropped as well.
has_lanes = resmat['number of lanes'].gt(0)
is_nearby = resmat['network_distance'].lt(2.)
resmat = resmat[has_lanes & is_nearby]

In [41]:
# (rows, columns) of the distance table at this point in the notebook.
resmat.shape

(3011569, 13)

### Produce `distance_short{YEAR}.csv`

`ACSPedit{YEAR}.csv` is this but for reference IDs, and is unused.

In [42]:
# Origin/destination pairs with their network distance only.
distance_short = resmat.loc[:, ['origin_centreline_id',
                                'dest_centreline_id',
                                'network_distance']]

In [79]:
# Persist the short-distance table; mode 'w' overwrites any existing file.
with pd.HDFStore('/home/czhu/Data/btp_sandbox_prep_kriglocaldata/distance_short.hdf5',
                 mode='w') as dsh:
    dsh.put('distance_short', distance_short)

### Produce `data_for_pred{YEAR}.txt` and `data_for_fit{YEAR}.txt`

Their `id` counterparts are just the endpoint centreline_id versions of these.

In [64]:
# Per-destination attribute columns.  Together with the de-duplication that follows,
# this mirrors picking the first unique instance of b(:7) in
# [~,idx]=unique(c,'rows'); out2=b(idx,:);
# The speed limit used here is 'speed limit' (from the land use xlsx).
pred_columns = [
    'dest_centreline_id', 'speed limit', 'AADT', 'Sum_pop',
    'number of lanes', 'employment', 'commercial',
    'industrial', 'government', 'road type',
]
data_for_pred = resmat[pred_columns]

In [65]:
# Collapse repeated rows, then renumber the index from zero.
data_for_pred = data_for_pred.drop_duplicates()
data_for_pred = data_for_pred.reset_index(drop=True)

In [66]:
# Preview the de-duplicated prediction table.
data_for_pred.head()

Unnamed: 0,dest_centreline_id,speed limit,AADT,Sum_pop,number of lanes,employment,commercial,industrial,government,road type
0,9109255,45.0,763.356954,12558.251379,2.0,1084553.0,,1312336.0,187747.021115,37.0
1,14188125,0.0,,8374.53915,2.0,711947.5,,460115.2,359721.633864,
2,914742,45.0,571.524269,10633.715781,2.0,633798.4,,776660.8,308111.966045,37.0
3,914856,55.0,,7071.569392,4.0,195870.3,,432605.1,,10.0
4,7585679,55.0,,12267.637772,4.0,850630.9,,1089469.0,55399.259756,10.0


In [67]:
# (rows, columns) of the prediction table at this point in the notebook.
data_for_pred.shape

(18077, 10)

In [68]:
# Keep only segments with a positive speed limit.
data_for_pred = data_for_pred[data_for_pred['speed limit'].gt(0.)]

In [69]:
# Rows of the direction table whose direction bin is negative.
ctrline_negdir = directions.loc[directions['dir_bin'].lt(0)]

In [70]:
# Each centreline_id must appear at most once among the negative-direction rows,
# otherwise the merge below would duplicate prediction rows.
assert ctrline_negdir['centreline_id'].nunique(dropna=False) == len(ctrline_negdir)

In [71]:
# Inner merge: keep only prediction rows whose centreline ID has a negative-direction
# entry, then discard the helper key column and renumber the index.
negdir_ids = ctrline_negdir[['centreline_id']]
data_for_pred = data_for_pred.merge(negdir_ids,
                                    left_on='dest_centreline_id',
                                    right_on='centreline_id')
data_for_pred = data_for_pred.drop(columns='centreline_id').reset_index(drop=True)

In [85]:
# Drop segments whose road type is zero.  The comparison is done numerically so it
# works whether 'road type' is stored as numbers or as strings; comparing a numeric
# column against the string '0' is always unequal and would filter nothing.
# Rows with a missing road type are kept (NaN != 0 evaluates True).
road_type_numeric = pd.to_numeric(data_for_pred['road type'], errors='coerce')
data_for_pred = data_for_pred[road_type_numeric != 0].reset_index(drop=True)

In [86]:
# The fitting data is every road section whose AADT estimate is strictly greater than
# 2000.  NaN compares False, so sections without an AADT estimate drop out.
has_enough_aadt = data_for_pred['AADT'].gt(2000.)
data_for_fit = data_for_pred[has_enough_aadt].reset_index(drop=True)

In [87]:
# Preview the final prediction table.
data_for_pred.head()

Unnamed: 0,dest_centreline_id,speed limit,AADT,Sum_pop,number of lanes,employment,commercial,industrial,government,road type
0,9109255,45.0,763.356954,12558.251379,2.0,1084553.0,,1312336.0,187747.021115,37
1,914742,45.0,571.524269,10633.715781,2.0,633798.4,,776660.8,308111.966045,37
2,914856,55.0,,7071.569392,4.0,195870.3,,432605.1,,10
3,7585679,55.0,,12267.637772,4.0,850630.9,,1089469.0,55399.259756,10
4,7586931,55.0,,12233.734943,4.0,900779.0,,1098013.0,92275.284926,10


In [88]:
# Preview the fitting subset.
data_for_fit.head()

Unnamed: 0,dest_centreline_id,speed limit,AADT,Sum_pop,number of lanes,employment,commercial,industrial,government,road type
0,914446,55.0,8961.192262,9590.566862,4.0,1871610.0,,1366294.0,352847.10482,10
1,914320,55.0,10756.43209,7671.348647,4.0,1980414.0,,1389806.0,216803.398979,10
2,3326929,55.0,12739.53196,7602.013661,4.0,666139.5,,786494.0,106614.317019,10
3,11070075,50.0,8240.331834,7394.967113,4.0,690602.3,,866898.4,106614.317019,33
4,20048292,55.0,5824.497184,9216.707898,4.0,1264567.0,,883634.4,362648.479967,10


In [89]:
# Write the prediction and fitting tables to a fresh HDF5 store (mode 'w' overwrites
# any existing file).
with pd.HDFStore('/home/czhu/Data/btp_sandbox_prep_kriglocaldata/data_for_pf2010negative.hdf5',
                 mode='w') as dfp:
    dfp.put('data_for_pred', data_for_pred)
    dfp.put('data_for_fit', data_for_fit)

## Check that `data_for_pred` contains major arterials

In [106]:
import contextlib
import pathlib, os
import configparser
import psycopg2
import geopandas as gpd

# Connection settings are read from the [POSTGRES] section of ~/.charlesconfig.
filepath = pathlib.Path.home().joinpath('.charlesconfig')
config = configparser.RawConfigParser()
# read() silently skips files it cannot open and returns the list of files parsed,
# so fail loudly here rather than with an opaque KeyError on 'POSTGRES' below.
if not config.read(filepath.as_posix()):
    raise FileNotFoundError('Could not read config file: {0}'.format(filepath))

# psycopg2's own `with connection:` block only ends the transaction; it does NOT
# close the connection.  contextlib.closing guarantees the connection is released.
with contextlib.closing(
        psycopg2.connect(database='bigdata', user=config['POSTGRES']['user'],
                         password=config['POSTGRES']['password'],
                         host=config['POSTGRES']['host'],
                         port=config['POSTGRES']['port'])) as db_con:
    gis_centreline = gpd.read_postgis('SELECT * FROM gis.centreline', db_con)

# Cast geo_id to int so it can be merged against the integer centreline IDs.
gis_centreline['geo_id'] = gis_centreline['geo_id'].astype(int)

In [114]:
# Attach the centreline feature code and description to each prediction row; the
# inner merge drops rows whose ID is absent from gis_centreline.
centreline_codes = gis_centreline[['geo_id', 'fcode', 'fcode_desc']]
road_descriptions = data_for_pred.merge(centreline_codes, how='inner',
                                        left_on='dest_centreline_id',
                                        right_on='geo_id')

In [119]:
# Tally the matched prediction rows by `fcode_desc`.
road_descriptions['fcode_desc'].value_counts()

Collector              5932
Major Arterial         5422
Minor Arterial         3269
Expressway              718
Expressway Ramp         584
Major Arterial Ramp     109
Local                   104
Collector Ramp           10
Name: fcode_desc, dtype: int64

In [120]:
# Same lookup for the fitting subset: inner merge against the centreline codes.
centreline_codes = gis_centreline[['geo_id', 'fcode', 'fcode_desc']]
road_descriptions_fit = data_for_fit.merge(centreline_codes, how='inner',
                                           left_on='dest_centreline_id',
                                           right_on='geo_id')

In [121]:
# Tally the matched fitting rows by `fcode_desc`.
road_descriptions_fit['fcode_desc'].value_counts()

Major Arterial         1797
Minor Arterial          901
Collector               730
Expressway              114
Expressway Ramp          43
Major Arterial Ramp      14
Local                     5
Name: fcode_desc, dtype: int64

Close enough, I guess.

In [125]:
# Redo the lookup as a left merge so unmatched prediction rows are retained (with
# null geo_id) and can be inspected.  Note: this rebinds `road_descriptions`.
centreline_codes = gis_centreline[['geo_id', 'fcode', 'fcode_desc']]
road_descriptions = data_for_pred.merge(centreline_codes, how='left',
                                        left_on='dest_centreline_id',
                                        right_on='geo_id')

In [127]:
# List the destination IDs whose `geo_id` is null, i.e. rows with no match in
# gis_centreline.
road_descriptions.loc[road_descriptions['geo_id'].isnull(), 'dest_centreline_id']

35         914545
307       3086207
416       7668066
770      14204001
1150      8407393
1291       909410
1373      9655693
1876      7963393
2253         8436
2306      8791748
2312     10759021
2355      8166335
2356      8929258
2457     14004162
2526      9781235
2620     11461945
2624       446598
3031      8206114
3032      8206105
3047     13974047
3182      6913362
3258      1140382
3310     14003783
3420      1139499
3466      1140633
3558      2981086
3559      1141672
3825     14673528
3891      4153107
3920     20235606
           ...   
14824     6257558
14885     6247978
14889     6247982
14890     6247983
14969     9468212
15108      106132
15169     4786680
15229      107510
15241      107539
15263    20040264
15285      106675
15292      105619
15294     6793391
15330     7195953
15360      105988
15369    14191397
15415      443550
15457    14258081
15480    13502910
15563      109757
15637     6674323
15679      105368
15880    14661315
15891    20057375
15893    1

So there are some roads whose codes don't exist in the current centreline.