# CountMatch Matcher Development
## Part 3: Rewriting a Sensible Prototype Matcher

This notebook investigates why our prototype in Part 1 produces a much noisier ground truth vs. predictions plot, and much worse mean absolute error, for 2011.

In [1]:
%matplotlib inline
import sys
sys.path.append('../')
import importlib
import matplotlib.pyplot as plt
import numpy as np
import knowyourdata as kyd

import pandas as pd
from traffic_prophet import cfg
import pathlib, os
import configparser

from traffic_prophet import connection
from traffic_prophet.countmatch import reader
from traffic_prophet.countmatch import growthfactor as gf
from traffic_prophet.countmatch import neighbour

# Colour list taken from matplotlib's active property cycle.
defaultcolours = plt.rcParams['axes.prop_cycle'].by_key()['color']

# Look for ~/.charlesconfig first; otherwise fall back to ~/cf.txt, which uses
# different section names, table names and Plotly key name.
# NOTE(review): Connection presumably takes (config file, section, table) -
# confirm against traffic_prophet.connection.
filepath = pathlib.Path.home().joinpath('.charlesconfig')
if os.path.isfile(filepath):
    vol_conn = connection.Connection(filepath, 'POSTGRES',
                                     'czhu.btp_centreline_daily_counts')
    ll_conn = connection.Connection(filepath, 'POSTGRES',
                                    'czhu.btp_centreline_lonlat')
    # Mapbox and Plotly credentials are read from the same config file
    # instead of being hard-coded in the notebook.
    config = configparser.RawConfigParser()
    config.read(filepath.as_posix())
    MAPBOX_TOKEN = config['MAPBOX']['token']
    PLOTLY_USER = config['PLOTLY']['user']
    PLOTLY_KEY = config['PLOTLY']['key']
else:
    filepath = pathlib.Path.home().joinpath('cf.txt')
    vol_conn = connection.Connection(filepath, 'localpg',
                                     'prj_vol.btp_centreline_daily_counts')
    ll_conn = connection.Connection(filepath, 'localpg',
                                    'gis.btp_centreline_lonlat')
    # Same credentials as the other branch, but stored under lower-case
    # sections and with the Plotly key named 'apikey'.
    config = configparser.RawConfigParser()
    config.read(filepath.as_posix())
    MAPBOX_TOKEN = config['mapbox']['token']
    PLOTLY_USER = config['plotly']['user']
    PLOTLY_KEY = config['plotly']['apikey']

In [2]:
rdr = reader.Reader(vol_conn)
%time rdr.read()

CPU times: user 1min 34s, sys: 220 ms, total: 1min 35s
Wall time: 1min 35s


In [3]:
gf.get_growth_factors(rdr)

In [4]:
# Keys of rdr.ptcs can be negative (e.g. -104870 is used as a key later in this
# notebook), so take absolute values and de-duplicate before building the
# neighbour finder.
# NOTE(review): presumably the sign encodes direction of travel - confirm.
ptc_ids = np.unique(np.abs(list(rdr.ptcs.keys())))
# NOTE(review): 20 is presumably the number of neighbours to search for -
# confirm against NeighbourLonLatEuclidean's signature.
nb = neighbour.NeighbourLonLatEuclidean(ll_conn, 20, ptc_ids)
%time nb.find_neighbours()

CPU times: user 18.2 s, sys: 80.2 ms, total: 18.3 s
Wall time: 18.5 s


We won't worry about growth factors just yet; let's get the MSE matcher working first.

## `D_ijd` Calculator

Let's calculate the ratio of AADT to the average daily traffic for each month and day of week (`AADT / DoMADT`), just like the DoM factor.

In [5]:
# Sandbox first:

myptc = next(iter(rdr.ptcs.values()))
myptc.centreline_id

8540609

In [6]:
myptc.data['Daily Count'].loc[2015].head(30)

Unnamed: 0_level_0,Date,Daily Count
Day of Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2015-01-01,135.0
2,2015-01-02,311.0
3,2015-01-03,161.0
4,2015-01-04,135.0
5,2015-01-05,257.0
6,2015-01-06,322.0
7,2015-01-07,221.0
8,2015-01-08,284.0
9,2015-01-09,247.0
10,2015-01-10,135.0


In [7]:
myptc.data['DoMADT'].loc[2015]

Day of Week,0,1,2,3,4,5,6
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,345.0,331.0,407.25,308.75,335.2,169.8,96.75
2,42.0,98.5,107.0,130.0,98.25,37.0,53.5
3,554.2,460.8,244.0,392.5,299.0,200.75,266.8
4,872.0,666.333333,1076.8,1040.8,557.0,613.25,439.75
5,1666.0,2166.5,2120.75,2297.0,1977.4,1208.4,1201.0
6,1866.4,2014.6,2398.0,2339.25,1853.75,1229.5,959.25
7,2411.5,2029.25,2240.0,2393.6,1944.6,1337.25,1431.0
8,1736.2,1946.25,2207.0,2035.5,1828.0,1333.2,1366.2
9,1735.25,1941.8,2192.4,2409.25,2079.75,976.75,1101.5
10,1391.25,1658.25,1356.0,1647.8,1452.4,828.4,929.25


In [8]:
# Sandbox: one AADT / DoMADT ratio table per year, each re-indexed by
# (Year, <original index name>) so the tables can be stacked afterwards.
doyadt = []
for year in myptc.data['AADT'].index:
    annual_aadt = myptc.data['AADT'].at[year, 'AADT']
    _ctable = annual_aadt / myptc.data['DoMADT'].loc[year]
    inner_index = _ctable.index
    _ctable.index = pd.MultiIndex.from_product(
        [[year], inner_index], names=['Year', inner_index.name])
    doyadt.append(_ctable)

In [9]:
doyadt = pd.concat(doyadt)
doyadt.loc[2017]

Day of Week,0,1,2,3,4,5,6
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,2.401763,2.452139,2.159893,2.204459,2.346318,4.490803,5.225309
2,2.381592,2.224742,2.13084,1.943355,2.723232,4.112932,6.219438
3,1.918468,1.674232,1.702524,1.679127,2.442626,5.630014,4.915219
4,0.816292,0.899626,0.819203,1.168088,0.933253,1.588552,1.75347
5,0.759605,0.594781,0.556455,0.787919,1.166605,1.152268,1.227054
6,0.601872,0.678796,0.467735,0.597318,0.727604,0.779982,1.04269
7,0.638139,0.48589,0.488802,0.652755,0.623858,0.894349,0.934214
8,0.64271,0.569196,0.52125,0.609054,0.713923,0.885864,0.843417
9,0.592175,0.637241,0.479901,0.508308,0.606849,0.965308,1.021818
10,0.792741,0.603914,0.694076,0.608731,0.678295,1.459832,1.365988


In [10]:
doyadt.loc[2015]

Day of Week,0,1,2,3,4,5,6
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3.336559,3.477683,2.826551,3.728301,3.434108,6.779228,11.897808
2,27.40745,11.686425,10.758065,8.854715,11.716162,31.11116,21.516129
3,2.077071,2.498075,4.717676,2.932772,3.849876,5.734062,4.314516
4,1.320084,1.727533,1.069013,1.105989,2.06663,1.87707,2.617653
5,0.690944,0.531324,0.542786,0.501138,0.582135,0.952593,0.958462
6,0.616756,0.571385,0.48003,0.492086,0.620964,0.936245,1.200013
7,0.477343,0.56726,0.51389,0.480913,0.591954,0.860806,0.804412
8,0.663007,0.591452,0.521574,0.565519,0.629712,0.863421,0.842565
9,0.66337,0.592807,0.525047,0.477789,0.553486,1.178513,1.045041
10,0.827395,0.694173,0.848903,0.698576,0.792559,1.389562,1.238755


We need to install some kind of anomaly detector for PTCs - obviously something terrible is happening in February 2015.

In [120]:
myptc.data['AADT']

Unnamed: 0_level_0,AADT
Year,Unnamed: 1_level_1
2006,1062.784683
2011,1076.228148
2012,1513.75356
2013,1100.390389
2014,1107.003235
2015,1151.112911
2016,1214.887862
2017,1148.522887


In [132]:
# Get ratio between AADT and daily count
doyr = myptc.data['AADT'].copy()
doyr['DoYADT'] = np.empty(doyr.shape[0])
for year in doyr.index.values:
    doyr.loc[year, 'DoYADT'] = (
        myptc.data['AADT'].loc[year, 'AADT'] /
        myptc.data['Daily Count'].loc[year, 'Daily Count']).mean()

In [133]:
doyr

Unnamed: 0_level_0,AADT,DoYADT
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2006,1062.784683,1.930788
2011,1076.228148,2.022777
2012,1513.75356,2.062894
2013,1100.390389,5.988307
2014,1107.003235,4.081941
2015,1151.112911,5.245396
2016,1214.887862,2.126913
2017,1148.522887,2.238702


In [134]:
myptc.data['AADT'].loc[2013, 'AADT'] 

1100.3903892150468

In [138]:
myptc.data['AADT'].loc[2013, 'AADT'] / myptc.data['Daily Count'].loc[2013, 'Daily Count']

Day of Year
1       8.273612
2       3.874614
3       4.956713
4       3.680235
5       4.642997
         ...    
361    91.699199
362     8.464541
363     6.589164
364     5.447477
365     9.737968
Name: Daily Count, Length: 349, dtype: float64

In [None]:
# This expression is not the last one in the cell, so it displays nothing.
myptc.data['Daily Count'] 

# FIXME: `doyr` as built earlier has only the columns 'AADT' and 'DoYADT', so
# selecting 'Day-to-AADT Ratio' raises KeyError.  This cell was never executed.
# NOTE(review): it looks like it expects a long-form table of daily ratios that
# is not constructed in this notebook - confirm the intent or remove the cell.
pd.DataFrame(doyr.groupby('Year')['Day-to-AADT Ratio'].mean())

### Functionalize `D_ijd` calculator

In [11]:
def get_Dijd(ptc):
    """Attach AADT-to-DoMADT ratio tables to a count object.

    For every year in ``ptc.data['AADT']``, that year's AADT is divided by the
    year's slice of ``ptc.data['DoMADT']``.  The yearly tables are stacked and
    stored as ``ptc.data['DoYADT']``, indexed by ``Year`` plus the inner index
    of the ``DoMADT`` table.

    Parameters
    ----------
    ptc : object
        Must expose a dict-like ``data`` attribute containing an ``'AADT'``
        table (indexed by year, with an ``'AADT'`` column) and a ``'DoMADT'``
        table whose outer index level is the year.

    Returns
    -------
    None
        The result is written to ``ptc.data['DoYADT']``.
    """
    aadt = ptc.data['AADT']
    domadt = ptc.data['DoMADT']

    doyadt = []
    for year in aadt.index:
        _ctable = aadt.at[year, 'AADT'] / domadt.loc[year]
        # Prefix the year so the yearly tables stay distinguishable once
        # concatenated.
        _ctable.index = pd.MultiIndex.from_product(
            [[year, ], _ctable.index],
            names=['Year', _ctable.index.name])
        doyadt.append(_ctable)

    # Stack the yearly tables exactly once.
    ptc.data['DoYADT'] = pd.concat(doyadt)

for ptc in rdr.ptcs.values():
    get_Dijd(ptc)

In [12]:
rdr.ptcs[-104870].data['DoYADT']

Unnamed: 0_level_0,Day of Week,0,1,2,3,4,5,6
Year,Month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2006,1,2.801225,2.424235,2.882128,2.550173,2.173384,4.465482,7.047644
2006,2,3.177234,3.068676,2.441312,2.548644,3.318609,4.776560,6.798196
2006,3,2.028215,2.318803,1.951496,1.899865,2.145306,3.403634,3.907297
2006,4,1.113641,1.097635,1.135151,0.840146,1.234839,2.133249,2.095189
2006,5,0.707862,0.736817,0.745683,1.081715,1.009532,1.399782,1.640100
...,...,...,...,...,...,...,...,...
2017,8,0.642710,0.569196,0.521250,0.609054,0.713923,0.885864,0.843417
2017,9,0.592175,0.637241,0.479901,0.508308,0.606849,0.965308,1.021818
2017,10,0.792741,0.603914,0.694076,0.608731,0.678295,1.459832,1.365988
2017,11,1.024552,0.862902,0.933455,1.152210,1.184044,2.448876,3.024418


## Incomplete Data matcher

TEPs-I uses a closest day-of-week and year matching algorithm to handle gaps in PTC data when comparing against STTCs.

In [13]:
test_doyadt = rdr.ptcs[-104870].data['DoYADT'].copy()
test_doyadt.loc[:, 5] = np.nan
test_doyadt.loc[(2017, 4), 0] = np.nan

In [14]:
test_doyadt.loc[2017]

Day of Week,0,1,2,3,4,5,6
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,2.401763,2.452139,2.159893,2.204459,2.346318,,5.225309
2,2.381592,2.224742,2.13084,1.943355,2.723232,,6.219438
3,1.918468,1.674232,1.702524,1.679127,2.442626,,4.915219
4,,0.899626,0.819203,1.168088,0.933253,,1.75347
5,0.759605,0.594781,0.556455,0.787919,1.166605,,1.227054
6,0.601872,0.678796,0.467735,0.597318,0.727604,,1.04269
7,0.638139,0.48589,0.488802,0.652755,0.623858,,0.934214
8,0.64271,0.569196,0.52125,0.609054,0.713923,,0.843417
9,0.592175,0.637241,0.479901,0.508308,0.606849,,1.021818
10,0.792741,0.603914,0.694076,0.608731,0.678295,,1.365988


In [15]:
%timeit test_doyadt.loc[(slice(None), 1), :]

505 µs ± 17.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [16]:
# Flag, for each month and day of week, whether at least one year has data.
# Missing values are tested directly with notnull(), so the result does not
# depend on the magnitude or sign of the stored ratios.
# https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html
doyadt_has_data = test_doyadt.notnull().groupby(level=1).any()

In [17]:
%timeit doyadt_has_data.loc[3, 1]

4.54 µs ± 29.1 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [89]:
def has_data_lookup_speedtest():
    """Benchmark body for the slice-and-reset-index year lookup.

    Reads the notebook globals ``doyadt_has_data`` and ``test_doyadt``.  If
    the availability flag at ``[3, 1]`` is set, the matching slice is pulled
    out, its year labels are extracted and the one closest to 2010 is
    selected.  The result is discarded; only the runtime is of interest.
    """
    if not doyadt_has_data.loc[3, 1]:
        return
    month_slice = test_doyadt.loc[(slice(None), 3), 1]
    unique_years = month_slice.reset_index(level=1, drop=True).index.values
    year_gaps = np.abs(unique_years - 2010)
    closest_year = unique_years[np.argmin(year_gaps)]
        
%timeit has_data_lookup_speedtest()

713 µs ± 3.87 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


But this index resetting still takes a long time.  What if we precalculated what years are available?

In [91]:
def get_available_years(doyadt):
    """Tabulate which years have non-null entries in a ratio table.

    Parameters
    ----------
    doyadt : pandas.DataFrame
        Table with a two-level index; the rows are grouped by the second
        level and the first level supplies the year labels.

    Returns
    -------
    pandas.DataFrame
        One row per group label and one column per column of ``doyadt`` (in
        order, labelled positionally).  Each cell is an array of the years
        whose value in that group and column is not null.
    """
    present = doyadt.notnull()

    group_labels = []
    rows = []
    for label, block in present.groupby(level=1):
        by_year = block.reset_index(level=1, drop=True)
        year_labels = by_year.index.values
        rows.append([year_labels[by_year[col].values]
                     for col in block.columns])
        group_labels.append(label)

    return pd.DataFrame(rows, index=group_labels)

get_available_years(test_doyadt)

Unnamed: 0,0,1,2,3,4,5,6
1,"[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]",[],"[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]"
2,"[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]",[],"[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]"
3,"[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]",[],"[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]"
4,"[2006, 2011, 2012, 2013, 2014, 2015, 2016]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]",[],"[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]"
5,"[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]",[],"[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]"
6,"[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]",[],"[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]"
7,"[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]",[],"[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]"
8,"[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]",[],"[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]"
9,"[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]",[],"[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]"
10,"[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]","[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]",[],"[2006, 2011, 2012, 2013, 2014, 2015, 2016, 2017]"


In [97]:
test_doyadt_availyears = get_available_years(test_doyadt)

def has_data_lookup_speedtest2():
    """Benchmark body for the precomputed available-years lookup.

    Reads the notebook global ``test_doyadt_availyears``, takes the year array
    stored at ``[3, 1]`` and, when it is non-empty, selects the year closest
    to 2010.  The result is discarded; only the runtime is of interest.
    """
    candidate_years = test_doyadt_availyears.loc[3, 1]
    if len(candidate_years) == 0:
        return
    year_gaps = np.abs(candidate_years - 2010)
    closest_year = candidate_years[np.argmin(year_gaps)]
        
%timeit has_data_lookup_speedtest2()

8.98 µs ± 208 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


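NICE - the precomputed lookup takes about 9 µs per call, versus about 713 µs for the index-resetting version.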

In [99]:
def get_available_years(ptc):
    """Store a table of years with non-null ``DoYADT`` entries on ``ptc``.

    ``ptc.data['DoYADT']`` is grouped by its second index level.  For every
    group and every column, the first-level labels (years) whose value is not
    null are collected into an array.  The resulting table, one row per group
    label and one positionally-labelled column per ``DoYADT`` column, is
    written to ``ptc.data['DoYADT_avail_year']``.  Nothing is returned.
    """
    present = ptc.data['DoYADT'].notnull()

    group_labels = []
    rows = []
    for label, block in present.groupby(level=1):
        by_year = block.reset_index(level=1, drop=True)
        year_labels = by_year.index.values
        rows.append([year_labels[by_year[col].values]
                     for col in block.columns])
        group_labels.append(label)

    ptc.data['DoYADT_avail_year'] = pd.DataFrame(rows, index=group_labels)

for ptc in rdr.ptcs.values():
    get_available_years(ptc)

## Preliminary AADT estimator

In [102]:
# Sandbox
sttc = next(iter(rdr.sttcs.values()))
ptc = next(iter(rdr.ptcs.values()))

In [113]:
# Preprocessing - flatten the daily counts into a single-index table and tag
# every row with the day of week and month derived from its date.
daily_count = (
    sttc.data
    .reset_index()
    .drop(columns='Day of Year')
    .assign(**{
        'Day of Week': lambda counts: counts['Date'].dt.dayofweek,
        'Month': lambda counts: counts['Date'].dt.month,
    })
)

In [118]:
# One row per distinct (year, day of week, month) combination in the counts.
unique_days = daily_count[['Year', 'Day of Week', 'Month']].drop_duplicates()

# iterrows() yields (index label, row Series) pairs; iterating the DataFrame
# itself would only yield its column labels.
for i, row in unique_days.iterrows():
    # TODO: loop body not yet written.
    pass

In [None]:
def get_aadt_prelim(sttc, ptc):
    """Preliminary AADT estimator (not yet implemented).

    Parameters
    ----------
    sttc, ptc
        NOTE(review): presumably a short-term count and the permanent count
        it is matched against - confirm once the body is written.

    Raises
    ------
    NotImplementedError
        Always; the estimator has not been written yet.
    """
    raise NotImplementedError("get_aadt_prelim has not been implemented yet.")


In [100]:
sttc

NameError: name 'sttc' is not defined