# Data Validation of the top 10 imbalanced routes:
(analysis_id = 1453262, 1453284, 1453305, 1453367, 1453395, 1453445, 1453464, 1453483, 1454196, 1454209, \
1454352, 1454366, 1454523, 1454549, 1454670, 1454683, 1455243, 1455256, 1455385, 1455400) 
- NOTE: No bt data found for 1455243 and 1455400
- Period: 2 days (From '2019-10-09 00:00:00' to '2019-10-10 23:59:00')

In [1]:
from psycopg2 import connect
import psycopg2.sql as pg
import configparser
import pandas.io.sql as pandasql
from IPython.display import HTML
def print_table(sql, con):
    """Run ``sql`` on connection ``con`` and return the result as an HTML table.

    The query result is loaded into a DataFrame, rendered to HTML without the
    index column, and wrapped in an IPython ``HTML`` object for display.
    """
    frame = pandasql.read_sql(sql, con)
    markup = frame.to_html(index=False)
    return HTML(markup)

In [2]:
# Set up the PostgreSQL connection.
# The keyword arguments for connect() are read from the [DBSETTINGS] section
# of the local config file; `con` is the connection passed to print_table().
CONFIG = configparser.ConfigParser()
CONFIG.read(r'/home/jchew/local/db.cfg')
dbset = CONFIG['DBSETTINGS']
con = connect(**dbset)

## 1. Comparing number of observations
### 1.1 Bluetooth obs
**Select bt obs data of those analysis_id within that time period**
```
-- Total Bluetooth observations per segment over the two-day validation window,
-- restricted to the analysis_ids of the routes under review.
CREATE OR REPLACE VIEW jchew.bt_top10_2days AS
SELECT
    bt.analysis_id,
    seg.bt_id,
    seg.street_name,
    seg.direction,
    seg.from_intersection,
    seg.to_intersection,
    SUM(obs) AS sum_bt_obs
FROM bluetooth.aggr_5min AS bt
INNER JOIN king_pilot.bt_segments AS seg
    USING (analysis_id)
WHERE
    seg.analysis_id IN (
        1453262, 1453284, 1453305, 1453367, 1453395,
        1453445, 1453464, 1453483, 1454196, 1454209,
        1454352, 1454366, 1454523, 1454549, 1454670,
        1454683, 1455243, 1455256, 1455385, 1455400
    )
    AND bt.datetime_bin BETWEEN '2019-10-09 00:00:00' AND '2019-10-10 23:59:00'
GROUP BY
    bt.analysis_id,
    seg.bt_id,
    seg.street_name,
    seg.direction,
    seg.from_intersection,
    seg.to_intersection
```
**Ratio for bt obs**
```
-- Directional imbalance of Bluetooth observation counts.
-- Each EB/NB segment (a) is paired with the segment on the same street whose
-- from/to intersections are reversed (b), i.e. the opposing direction.
CREATE OR REPLACE VIEW jchew.ratio_bt_top10_2days AS
WITH pairs AS (
    SELECT
        a.analysis_id AS analysis_id_1,
        a.bt_id AS bt_id_1,
        b.analysis_id AS analysis_id_2,
        b.bt_id AS bt_id_2,
        a.street_name,
        a.direction AS eb_nb,
        b.direction AS wb_sb,
        a.from_intersection AS intersection_1,
        a.to_intersection AS intersection_2,
        a.sum_bt_obs AS eb_nb_obs,
        b.sum_bt_obs AS wb_sb_obs
    FROM jchew.bt_top10_2days AS a
    INNER JOIN jchew.bt_top10_2days AS b
        ON a.street_name = b.street_name
        AND a.from_intersection = b.to_intersection
        AND a.to_intersection = b.from_intersection
    -- Keep one row per pair by anchoring on the EB/NB side.
    WHERE a.direction IN ('EB', 'NB')
)
SELECT
    analysis_id_1,
    bt_id_1,
    analysis_id_2,
    bt_id_2,
    street_name,
    eb_nb,
    wb_sb,
    intersection_1,
    intersection_2,
    eb_nb_obs,
    wb_sb_obs,
    -- Share of observations carried by the busier direction.
    -- GREATEST makes a perfectly balanced pair evaluate to 0.5 (previously NULL);
    -- NULLIF yields NULL instead of a division-by-zero error when both counts are 0.
    (GREATEST(eb_nb_obs, wb_sb_obs) * 1.0)
        / NULLIF(eb_nb_obs + wb_sb_obs, 0) AS "EB/WB or NB/SB Ratio",
    -- Direction with more observations; NULL when the counts are tied or missing.
    CASE
        WHEN eb_nb = 'EB' AND eb_nb_obs > wb_sb_obs THEN 'EB'
        WHEN eb_nb = 'EB' AND eb_nb_obs < wb_sb_obs THEN 'WB'
        WHEN eb_nb = 'NB' AND eb_nb_obs > wb_sb_obs THEN 'NB'
        WHEN eb_nb = 'NB' AND eb_nb_obs < wb_sb_obs THEN 'SB'
    END AS "Bias Towards"
FROM pairs
ORDER BY analysis_id_1
```

### 1.2 HERE obs
**Select HERE obs data of those analysis_id within that time period** \
The number of obs for HERE data is calculated by counting the datetime bins (`tx`) recorded for each link
```
-- HERE observation counts per Bluetooth segment over the two-day window.
-- Each segment's HERE links (pp_link_dir) are matched to per-link counts of
-- tx records; segments with no HERE rows are kept with NULL sums (LEFT JOIN).
CREATE MATERIALIZED VIEW jchew.here_top10_2days AS
WITH link_obs AS (
    -- One row per (link_dir, length) with the number of tx records in the window.
    SELECT
        link_dir,
        COUNT(tx) AS total,
        length
    FROM here.ta
    WHERE tx BETWEEN '2019-10-09 00:00:00' AND '2019-10-10 23:59:00'
    GROUP BY
        link_dir,
        length
)
SELECT
    seg.analysis_id,
    seg.street_name,
    seg.direction,
    seg.from_intersection_name AS from_intersection,
    seg.to_intersection_name AS to_intersection,
    SUM(link_obs.total) AS sum_here_obs,
    SUM(link_obs.length) AS sum_length
FROM jchew.validation_bt_here AS seg
LEFT JOIN link_obs
    ON seg.pp_link_dir = link_obs.link_dir
WHERE seg.analysis_id IN (
    1453262, 1453284, 1453305, 1453367, 1453395,
    1453445, 1453464, 1453483, 1454196, 1454209,
    1454352, 1454366, 1454523, 1454549, 1454670,
    1454683, 1455243, 1455256, 1455385, 1455400
)
GROUP BY
    seg.analysis_id,
    seg.street_name,
    seg.direction,
    seg.from_intersection_name,
    seg.to_intersection_name
```
**ratio for HERE obs**
```
-- Directional imbalance of HERE observation counts.
-- Each EB/NB segment (a) is paired with the segment on the same street whose
-- from/to intersections are reversed (b), i.e. the opposing direction.
CREATE OR REPLACE VIEW jchew.ratio_here_top10_2days AS
WITH pairs AS (
    SELECT
        a.analysis_id AS analysis_id_1,
        b.analysis_id AS analysis_id_2,
        a.street_name,
        a.direction AS eb_nb,
        b.direction AS wb_sb,
        a.from_intersection AS intersection_1,
        a.to_intersection AS intersection_2,
        a.sum_here_obs AS eb_nb_obs,
        b.sum_here_obs AS wb_sb_obs
    FROM jchew.here_top10_2days AS a
    INNER JOIN jchew.here_top10_2days AS b
        ON a.street_name = b.street_name
        AND a.from_intersection = b.to_intersection
        AND a.to_intersection = b.from_intersection
    -- Keep one row per pair by anchoring on the EB/NB side.
    WHERE a.direction IN ('EB', 'NB')
)
SELECT
    analysis_id_1,
    analysis_id_2,
    street_name,
    eb_nb,
    wb_sb,
    intersection_1,
    intersection_2,
    eb_nb_obs,
    wb_sb_obs,
    -- Share of observations carried by the busier direction.
    -- GREATEST makes a perfectly balanced pair evaluate to 0.5 (previously NULL);
    -- NULLIF yields NULL instead of a division-by-zero error when both counts are 0.
    (GREATEST(eb_nb_obs, wb_sb_obs) * 1.0)
        / NULLIF(eb_nb_obs + wb_sb_obs, 0) AS "EB/WB or NB/SB Ratio",
    -- Direction with more observations; NULL when the counts are tied or missing.
    CASE
        WHEN eb_nb = 'EB' AND eb_nb_obs > wb_sb_obs THEN 'EB'
        WHEN eb_nb = 'EB' AND eb_nb_obs < wb_sb_obs THEN 'WB'
        WHEN eb_nb = 'NB' AND eb_nb_obs > wb_sb_obs THEN 'NB'
        WHEN eb_nb = 'NB' AND eb_nb_obs < wb_sb_obs THEN 'SB'
    END AS "Bias Towards"
FROM pairs
ORDER BY analysis_id_1
```

### 1.3 Comparing ratio of obs for both bt and HERE

In [5]:
# Side-by-side comparison of the Bluetooth and HERE observation ratios.
# The HERE view drives the join so routes without Bluetooth data still appear
# (their bt_* columns are NULL). An explicit ORDER BY is required because the
# ORDER BY inside the source views does not guarantee the order of a join result.
sql = '''
SELECT
    here.analysis_id_1,
    here.analysis_id_2,
    here.street_name,
    here.eb_nb,
    here.wb_sb,
    here.intersection_1,
    here.intersection_2,
    bt.eb_nb_obs AS bt_eb_nb_obs,
    bt.wb_sb_obs AS bt_wb_sb_obs,
    here.eb_nb_obs AS here_eb_nb_obs,
    here.wb_sb_obs AS here_wb_sb_obs,
    bt."EB/WB or NB/SB Ratio" AS bt_ratio,
    here."EB/WB or NB/SB Ratio" AS here_ratio,
    bt."Bias Towards" AS bt_bias,
    here."Bias Towards" AS here_bias
FROM jchew.ratio_here_top10_2days AS here
LEFT JOIN jchew.ratio_bt_top10_2days AS bt
    USING (analysis_id_1)
ORDER BY here.analysis_id_1
'''
print_table(sql, con)

analysis_id_1,analysis_id_2,street_name,eb_nb,wb_sb,intersection_1,intersection_2,bt_eb_nb_obs,bt_wb_sb_obs,here_eb_nb_obs,here_wb_sb_obs,bt_ratio,here_ratio,bt_bias,here_bias
1453262,1453483,Dundas,EB,WB,Dufferin,Bathurst,310.0,265.0,4059.0,4138.0,0.53913,0.504819,EB,WB
1453284,1453464,Dundas,EB,WB,Bathurst,Spadina,475.0,371.0,1842.0,2012.0,0.561466,0.522055,EB,WB
1453305,1453445,Dundas,EB,WB,Spadina,University,306.0,178.0,2082.0,1942.0,0.632231,0.517396,EB,EB
1453367,1453395,Dundas,EB,WB,Jarvis,Parliament,593.0,303.0,4055.0,1899.0,0.66183,0.681055,EB,EB
1454196,1454366,King,EB,WB,Spadina,University,11.0,21.0,312.0,296.0,0.65625,0.513158,WB,EB
1454209,1454352,King,EB,WB,University,Yonge,40.0,34.0,687.0,324.0,0.540541,0.679525,EB,EB
1454523,1454683,Front,EB,WB,Bathurst,Spadina,297.0,241.0,528.0,672.0,0.552045,0.56,EB,WB
1454549,1454670,Front,EB,WB,Spadina,University,87.0,43.0,1931.0,1345.0,0.669231,0.589438,EB,EB
1455385,1455256,University,NB,SB,Queen,Dundas,298.0,829.0,1106.0,1361.0,0.735581,0.551682,SB,SB
1455400,1455243,University,NB,SB,Dundas,College,,,3158.0,4806.0,,0.603466,,SB


## 2. Comparing median speed
### 2.1 Bluetooth spd
**Select bt spd data of those analysis_id within that time period** \
For bluetooth data, `speed = (length of segment) / (median travel time)`
```
-- Median Bluetooth travel time and the derived speed per segment for the
-- two-day validation window.
-- NOTE(review): the speed formula presumably assumes length in metres and tt in
-- seconds (giving km/h) -- confirm against bluetooth.segments / aggr_5min.
CREATE OR REPLACE VIEW jchew.bt_top10_2days_spd AS
WITH X AS (
    SELECT
        bt.analysis_id,
        PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY tt) AS median_tt,
        l.length
    FROM bluetooth.aggr_5min AS bt
    -- Previously a RIGHT JOIN, but the WHERE filter on bt.datetime_bin removed
    -- every unmatched segment row, so it always behaved as an inner join.
    INNER JOIN bluetooth.segments AS l
        USING (analysis_id)
    WHERE
        bt.datetime_bin BETWEEN '2019-10-09 00:00:00' AND '2019-10-10 23:59:00'
        -- Filter before aggregating so the median is only computed for the
        -- routes under review rather than for every Bluetooth segment.
        AND bt.analysis_id IN (
            1453262, 1453284, 1453305, 1453367, 1453395,
            1453445, 1453464, 1453483, 1454196, 1454209,
            1454352, 1454366, 1454523, 1454549, 1454670,
            1454683, 1455243, 1455256, 1455385, 1455400
        )
    GROUP BY
        bt.analysis_id,
        l.length
)
SELECT
    X.analysis_id,
    seg.bt_id,
    seg.street_name,
    seg.direction,
    seg.from_intersection,
    seg.to_intersection,
    X.length,
    X.median_tt,
    -- NULLIF returns NULL instead of raising division by zero when median_tt is 0.
    (X.length * 0.001) / (NULLIF(X.median_tt, 0) / 3600) AS speed
FROM X
INNER JOIN king_pilot.bt_segments AS seg
    USING (analysis_id)
```
**ratio for bt spd**
```
-- Directional difference in Bluetooth speed.
-- Each EB/NB segment (a) is paired with the segment on the same street whose
-- from/to intersections are reversed (b), i.e. the opposing direction.
CREATE OR REPLACE VIEW jchew.ratio_bt_top10_2days_spd AS
WITH pairs AS (
    SELECT
        a.analysis_id AS analysis_id_1,
        a.bt_id AS bt_id_1,
        b.analysis_id AS analysis_id_2,
        b.bt_id AS bt_id_2,
        a.street_name,
        a.direction AS eb_nb,
        b.direction AS wb_sb,
        a.from_intersection AS intersection_1,
        a.to_intersection AS intersection_2,
        a.speed AS eb_nb_spd,
        b.speed AS wb_sb_spd
    FROM jchew.bt_top10_2days_spd AS a
    INNER JOIN jchew.bt_top10_2days_spd AS b
        ON a.street_name = b.street_name
        AND a.from_intersection = b.to_intersection
        AND a.to_intersection = b.from_intersection
    -- Keep one row per pair by anchoring on the EB/NB side.
    WHERE a.direction IN ('EB', 'NB')
)
SELECT
    analysis_id_1,
    bt_id_1,
    analysis_id_2,
    bt_id_2,
    street_name,
    eb_nb,
    wb_sb,
    intersection_1,
    intersection_2,
    eb_nb_spd,
    wb_sb_spd,
    -- Signed difference, EB/NB minus WB/SB: positive means EB/NB is faster.
    -- The former CASE computed this same expression in both branches and only
    -- differed by returning NULL (rather than 0) when the speeds were equal.
    eb_nb_spd - wb_sb_spd AS "Speed Difference",
    -- Faster direction; NULL when the speeds are tied or missing.
    CASE
        WHEN eb_nb = 'EB' AND eb_nb_spd > wb_sb_spd THEN 'EB'
        WHEN eb_nb = 'EB' AND eb_nb_spd < wb_sb_spd THEN 'WB'
        WHEN eb_nb = 'NB' AND eb_nb_spd > wb_sb_spd THEN 'NB'
        WHEN eb_nb = 'NB' AND eb_nb_spd < wb_sb_spd THEN 'SB'
    END AS "Bias Towards"
FROM pairs
ORDER BY analysis_id_1
```

### 2.2 HERE spd
**Select HERE spd data of those analysis_id within that time period** \
Column `pct_50` is used to represent median speed in HERE data
```
-- Average HERE pct_50 speed per Bluetooth segment over the two-day window.
-- The time filter lives in the ON clause so segments without HERE rows in the
-- window are still returned (with a NULL speed), as with the original LEFT JOIN
-- against a pre-filtered subquery.
CREATE OR REPLACE VIEW jchew.here_top10_2days_spd AS
SELECT
    seg.analysis_id,
    seg.street_name,
    seg.direction,
    seg.from_intersection_name,
    seg.to_intersection_name,
    AVG(ta.pct_50) AS speed
FROM jchew.validation_bt_here AS seg
LEFT JOIN here.ta AS ta
    ON seg.pp_link_dir = ta.link_dir
    AND ta.tx BETWEEN '2019-10-09 00:00:00' AND '2019-10-10 23:59:00'
WHERE seg.analysis_id IN (
    1453262, 1453284, 1453305, 1453367, 1453395,
    1453445, 1453464, 1453483, 1454196, 1454209,
    1454352, 1454366, 1454523, 1454549, 1454670,
    1454683, 1455243, 1455256, 1455385, 1455400
)
GROUP BY
    seg.analysis_id,
    seg.street_name,
    seg.direction,
    seg.from_intersection_name,
    seg.to_intersection_name
```
**ratio for HERE spd**
```
-- Directional difference in HERE speed.
-- Each EB/NB segment (a) is paired with the segment on the same street whose
-- from/to intersections are reversed (b), i.e. the opposing direction.
CREATE OR REPLACE VIEW jchew.ratio_here_top10_2days_spd AS
WITH pairs AS (
    SELECT
        a.analysis_id AS analysis_id_1,
        b.analysis_id AS analysis_id_2,
        a.street_name,
        a.direction AS eb_nb,
        b.direction AS wb_sb,
        a.from_intersection_name AS intersection_1,
        a.to_intersection_name AS intersection_2,
        a.speed AS eb_nb_spd,
        b.speed AS wb_sb_spd
    FROM jchew.here_top10_2days_spd AS a
    INNER JOIN jchew.here_top10_2days_spd AS b
        ON a.street_name = b.street_name
        AND a.from_intersection_name = b.to_intersection_name
        AND a.to_intersection_name = b.from_intersection_name
    -- Keep one row per pair by anchoring on the EB/NB side.
    WHERE a.direction IN ('EB', 'NB')
)
SELECT
    analysis_id_1,
    analysis_id_2,
    street_name,
    eb_nb,
    wb_sb,
    intersection_1,
    intersection_2,
    eb_nb_spd,
    wb_sb_spd,
    -- Signed difference, EB/NB minus WB/SB: positive means EB/NB is faster.
    -- The former CASE computed this same expression in both branches and only
    -- differed by returning NULL (rather than 0) when the speeds were equal.
    eb_nb_spd - wb_sb_spd AS "Speed Difference",
    -- Faster direction; NULL when the speeds are tied or missing.
    CASE
        WHEN eb_nb = 'EB' AND eb_nb_spd > wb_sb_spd THEN 'EB'
        WHEN eb_nb = 'EB' AND eb_nb_spd < wb_sb_spd THEN 'WB'
        WHEN eb_nb = 'NB' AND eb_nb_spd > wb_sb_spd THEN 'NB'
        WHEN eb_nb = 'NB' AND eb_nb_spd < wb_sb_spd THEN 'SB'
    END AS "Bias Towards"
FROM pairs
ORDER BY analysis_id_1
```

### 2.3 Comparing difference in spd for both bt and HERE
- *Speed difference is computed as EB - WB (or NB - SB), so a positive value means EB/NB is the faster direction*

In [15]:
# Side-by-side comparison of the Bluetooth and HERE speed differences.
# The HERE view drives the join so routes without Bluetooth data still appear
# (their bt_* columns are NULL). An explicit ORDER BY is required because the
# ORDER BY inside the source views does not guarantee the order of a join result.
sql = '''
SELECT
    here.analysis_id_1,
    here.analysis_id_2,
    here.street_name,
    here.eb_nb,
    here.wb_sb,
    here.intersection_1,
    here.intersection_2,
    bt.eb_nb_spd AS bt_eb_nb_spd,
    bt.wb_sb_spd AS bt_wb_sb_spd,
    here.eb_nb_spd AS here_eb_nb_spd,
    here.wb_sb_spd AS here_wb_sb_spd,
    bt."Speed Difference" AS bt_diff_spd,
    here."Speed Difference" AS here_diff_spd,
    bt."Bias Towards" AS bt_bias,
    here."Bias Towards" AS here_bias
FROM jchew.ratio_here_top10_2days_spd AS here
LEFT JOIN jchew.ratio_bt_top10_2days_spd AS bt
    USING (analysis_id_1)
ORDER BY here.analysis_id_1
'''
print_table(sql, con)

analysis_id_1,analysis_id_2,street_name,eb_nb,wb_sb,intersection_1,intersection_2,bt_eb_nb_spd,bt_wb_sb_spd,here_eb_nb_spd,here_wb_sb_spd,bt_diff_spd,here_diff_spd,bt_bias,here_bias
1453262,1453483,Dundas,EB,WB,Dufferin,Bathurst,19.473575,20.488525,27.305494,28.392218,-1.014949,-1.086724,WB,WB
1453284,1453464,Dundas,EB,WB,Bathurst,Spadina,21.696429,24.795918,28.093377,28.624254,-3.09949,-0.530878,WB,WB
1453305,1453445,Dundas,EB,WB,Spadina,University,17.532326,17.271429,24.576369,22.988671,0.260898,1.587697,EB,EB
1453367,1453395,Dundas,EB,WB,Jarvis,Parliament,22.594142,20.032472,27.952651,26.358083,2.56167,1.594568,EB,EB
1454196,1454366,King,EB,WB,Spadina,University,15.648241,11.081851,20.195513,19.304054,4.566391,0.891459,EB,EB
1454209,1454352,King,EB,WB,University,Yonge,14.325,10.471066,20.921397,20.040123,3.853934,0.881274,EB,EB
1454523,1454683,Front,EB,WB,Bathurst,Spadina,23.448128,29.627027,28.445076,26.846726,-6.178899,1.59835,WB,EB
1454549,1454670,Front,EB,WB,Spadina,University,14.980165,13.550562,22.009839,21.565799,1.429603,0.44404,EB,EB
1455385,1455256,University,NB,SB,Queen,Dundas,27.034483,22.142466,35.342676,31.951506,4.892017,3.39117,NB,NB
1455400,1455243,University,NB,SB,Dundas,College,,,35.482901,28.786309,,6.696592,,NB


## 3. Comparing both obs and spd

In [16]:
# Combined view of the observation-ratio and speed-difference comparisons.
# Each CTE left-joins the Bluetooth view onto the HERE view so routes without
# Bluetooth data are kept, and selects only the columns the final SELECT uses.
# An explicit ORDER BY makes the displayed row order deterministic.
sql = '''
WITH X AS (
    SELECT
        here.analysis_id_1,
        bt."EB/WB or NB/SB Ratio" AS bt_ratio,
        here."EB/WB or NB/SB Ratio" AS here_ratio,
        bt."Bias Towards" AS bt_bias,
        here."Bias Towards" AS here_bias
    FROM jchew.ratio_here_top10_2days AS here
    LEFT JOIN jchew.ratio_bt_top10_2days AS bt
        USING (analysis_id_1)
),
Y AS (
    SELECT
        here.analysis_id_1,
        here.analysis_id_2,
        here.street_name,
        here.eb_nb,
        here.wb_sb,
        here.intersection_1,
        here.intersection_2,
        bt."Speed Difference" AS bt_diff_spd,
        here."Speed Difference" AS here_diff_spd,
        bt."Bias Towards" AS bt_bias,
        here."Bias Towards" AS here_bias
    FROM jchew.ratio_here_top10_2days_spd AS here
    LEFT JOIN jchew.ratio_bt_top10_2days_spd AS bt
        USING (analysis_id_1)
)
SELECT
    Y.analysis_id_1,
    Y.analysis_id_2,
    Y.street_name,
    Y.eb_nb,
    Y.wb_sb,
    Y.intersection_1,
    Y.intersection_2,
    X.bt_ratio,
    X.here_ratio,
    X.bt_bias AS bt_bias_obs,
    X.here_bias AS here_bias_obs,
    Y.bt_diff_spd,
    Y.here_diff_spd,
    Y.bt_bias AS bt_bias_spd,
    Y.here_bias AS here_bias_spd
FROM X
INNER JOIN Y
    USING (analysis_id_1)
ORDER BY Y.analysis_id_1
'''
print_table(sql, con)

analysis_id_1,analysis_id_2,street_name,eb_nb,wb_sb,intersection_1,intersection_2,bt_ratio,here_ratio,bt_bias_obs,here_bias_obs,bt_diff_spd,here_diff_spd,bt_bias_spd,here_bias_spd
1453262,1453483,Dundas,EB,WB,Dufferin,Bathurst,0.53913,0.504819,EB,WB,-1.014949,-1.086724,WB,WB
1453284,1453464,Dundas,EB,WB,Bathurst,Spadina,0.561466,0.522055,EB,WB,-3.09949,-0.530878,WB,WB
1453305,1453445,Dundas,EB,WB,Spadina,University,0.632231,0.517396,EB,EB,0.260898,1.587697,EB,EB
1453367,1453395,Dundas,EB,WB,Jarvis,Parliament,0.66183,0.681055,EB,EB,2.56167,1.594568,EB,EB
1454196,1454366,King,EB,WB,Spadina,University,0.65625,0.513158,WB,EB,4.566391,0.891459,EB,EB
1454209,1454352,King,EB,WB,University,Yonge,0.540541,0.679525,EB,EB,3.853934,0.881274,EB,EB
1454523,1454683,Front,EB,WB,Bathurst,Spadina,0.552045,0.56,EB,WB,-6.178899,1.59835,WB,EB
1454549,1454670,Front,EB,WB,Spadina,University,0.669231,0.589438,EB,EB,1.429603,0.44404,EB,EB
1455385,1455256,University,NB,SB,Queen,Dundas,0.735581,0.551682,SB,SB,4.892017,3.39117,NB,NB
1455400,1455243,University,NB,SB,Dundas,College,,0.603466,,SB,,6.696592,,NB
