# Data validation of the top 10 imbalanced routes (1 month):
(analysis_id = 1453262, 1453284, 1453305, 1453367, 1453395, 1453445, 1453464, 1453483, 1454196, 1454209, \
1454352, 1454366, 1454523, 1454549, 1454670, 1454683, 1455243, 1455256, 1455385, 1455400)

- NOTE: No bt data found for 1455243, 1455400
- Period: 1 month (From '2019-11-01 00:00:00' to '2019-11-30 23:59:00')

In [1]:
from psycopg2 import connect
import psycopg2.sql as pg
import configparser
import pandas.io.sql as pandasql
from IPython.display import HTML
def print_table(sql, con):
    """Run ``sql`` on the connection ``con`` and return the result as an HTML table.

    The query result is loaded into a DataFrame with ``pandas.io.sql.read_sql``
    and rendered with ``to_html(index=False)``, so the DataFrame index is left
    out of the table. The markup is wrapped in ``IPython.display.HTML``.
    """
    frame = pandasql.read_sql(sql, con)
    markup = frame.to_html(index=False)
    return HTML(markup)

In [2]:
# setting up pgsql connection
# Connection parameters come from the [DBSETTINGS] section of a local config
# file and are passed as keyword arguments to psycopg2's connect().
CONFIG_PATH = r'/home/jchew/local/db.cfg'
CONFIG = configparser.ConfigParser()
# ConfigParser.read() returns the list of files it managed to parse and
# silently skips missing/unreadable ones; fail here with a clear message
# instead of a later KeyError on 'DBSETTINGS'.
if not CONFIG.read(CONFIG_PATH):
    raise FileNotFoundError('Could not read database config file: ' + CONFIG_PATH)
dbset = CONFIG['DBSETTINGS']
con = connect(**dbset)

## 1. Comparing number of observations
### 1.1 Bluetooth obs
**Select bt obs data of those analysis_id within that time period**
```
-- Total Bluetooth observations per segment over the study period.
-- One row per selected analysis_id, carrying the segment descriptors from
-- king_pilot.bt_segments; sum_bt_obs adds up `obs` over every 5-minute row
-- whose datetime_bin falls inside the period.
CREATE OR REPLACE VIEW jchew.bt_top10_1month AS
SELECT
    bt.analysis_id,
    seg.bt_id,
    seg.street_name,
    seg.direction,
    seg.from_intersection,
    seg.to_intersection,
    SUM(obs) AS sum_bt_obs
FROM bluetooth.aggr_5min AS bt
INNER JOIN king_pilot.bt_segments AS seg
    USING (analysis_id)
WHERE
    bt.datetime_bin BETWEEN '2019-11-01 00:00:00' AND '2019-11-30 23:59:00'
    AND seg.analysis_id IN (
        1453262, 1453284, 1453305, 1453367, 1453395,
        1453445, 1453464, 1453483, 1454196, 1454209,
        1454352, 1454366, 1454523, 1454549, 1454670,
        1454683, 1455243, 1455256, 1455385, 1455400
    )
GROUP BY
    bt.analysis_id,
    seg.bt_id,
    seg.street_name,
    seg.direction,
    seg.from_intersection,
    seg.to_intersection
ORDER BY bt.analysis_id
```
**Ratio for bt obs**
```
-- Directional imbalance of Bluetooth observations.
-- Each EB/NB segment is paired with the segment on the same street whose
-- from/to intersections are swapped (the opposite direction).
--   "EB/WB or NB/SB Ratio": share of the busier direction in the two-way
--       total. A tie yields 0.5; the ratio is NULL only when an observation
--       count is missing or the two-way total is 0.
--   "Bias Towards": direction with more observations; NULL on a tie.
CREATE OR REPLACE VIEW jchew.ratio_bt_top10_1month AS
WITH paired AS (
    SELECT
        fwd.analysis_id AS analysis_id_1,
        fwd.bt_id AS bt_id_1,
        rev.analysis_id AS analysis_id_2,
        rev.bt_id AS bt_id_2,
        fwd.street_name,
        fwd.direction AS eb_nb,
        rev.direction AS wb_sb,
        fwd.from_intersection AS intersection_1,
        fwd.to_intersection AS intersection_2,
        fwd.sum_bt_obs AS eb_nb_obs,
        rev.sum_bt_obs AS wb_sb_obs
    FROM jchew.bt_top10_1month AS fwd
    INNER JOIN jchew.bt_top10_1month AS rev
        ON fwd.street_name = rev.street_name
        AND fwd.from_intersection = rev.to_intersection
        AND fwd.to_intersection = rev.from_intersection
    WHERE fwd.direction IN ('EB', 'NB')
)
SELECT
    paired.analysis_id_1,
    paired.bt_id_1,
    paired.analysis_id_2,
    paired.bt_id_2,
    paired.street_name,
    paired.eb_nb,
    paired.wb_sb,
    paired.intersection_1,
    paired.intersection_2,
    paired.eb_nb_obs,
    paired.wb_sb_obs,
    -- "* 1.0" forces non-integer division; NULLIF guards a zero total.
    GREATEST(paired.eb_nb_obs, paired.wb_sb_obs) * 1.0
        / NULLIF(paired.eb_nb_obs + paired.wb_sb_obs, 0) AS "EB/WB or NB/SB Ratio",
    CASE
        WHEN paired.eb_nb = 'EB' AND paired.eb_nb_obs < paired.wb_sb_obs THEN 'WB'
        WHEN paired.eb_nb = 'EB' AND paired.eb_nb_obs > paired.wb_sb_obs THEN 'EB'
        WHEN paired.eb_nb = 'NB' AND paired.eb_nb_obs < paired.wb_sb_obs THEN 'SB'
        WHEN paired.eb_nb = 'NB' AND paired.eb_nb_obs > paired.wb_sb_obs THEN 'NB'
    END AS "Bias Towards"
FROM paired
ORDER BY paired.analysis_id_1
```

### 1.2 HERE obs
**Select HERE obs data of those analysis_id within that time period**    \
The number of obs for HERE data is calculated by counting the number of datetime bins (`tx`) for each link and summing those counts over the links of each segment
```
-- HERE observation totals per Bluetooth segment over the study period.
-- For every selected analysis_id, the HERE links mapped to it in
-- jchew.validation_bt_here are matched to per-link row counts from here.ta;
-- sum_here_obs adds up those counts and sum_length adds up the link lengths.
-- Segments whose links have no HERE rows in the period are kept with NULL sums.
CREATE MATERIALIZED VIEW jchew.here_top10_1month AS
WITH segment_links AS (
    -- HERE links (pp_link_dir) belonging to each selected segment.
    -- reference_length is carried along but not used by the outer query.
    SELECT
        analysis_id,
        street_name,
        direction,
        from_intersection_name,
        to_intersection_name,
        pp_link_dir AS link_dir,
        reference_length
    FROM jchew.validation_bt_here
    WHERE analysis_id IN (
        1453262, 1453284, 1453305, 1453367, 1453395,
        1453445, 1453464, 1453483, 1454196, 1454209,
        1454352, 1454366, 1454523, 1454549, 1454670,
        1454683, 1455243, 1455256, 1455385, 1455400
    )
),
link_counts AS (
    -- Number of HERE rows (non-NULL tx) per link within the period.
    SELECT
        link_dir,
        COUNT(tx) AS total,
        length
    FROM here.ta
    WHERE tx BETWEEN '2019-11-01 00:00:00' AND '2019-11-30 23:59:00'
    GROUP BY
        link_dir,
        length
)
SELECT
    sl.analysis_id,
    sl.street_name,
    sl.direction,
    sl.from_intersection_name AS from_intersection,
    sl.to_intersection_name AS to_intersection,
    SUM(lc.total) AS sum_here_obs,
    SUM(lc.length) AS sum_length
FROM segment_links AS sl
LEFT JOIN link_counts AS lc
    USING (link_dir)
GROUP BY
    sl.analysis_id,
    sl.street_name,
    sl.direction,
    sl.from_intersection_name,
    sl.to_intersection_name
```
**Ratio for HERE obs**
```
-- Directional imbalance of HERE observations.
-- Each EB/NB segment is paired with the segment on the same street whose
-- from/to intersections are swapped (the opposite direction).
--   "EB/WB or NB/SB Ratio": share of the busier direction in the two-way
--       total. A tie yields 0.5; the ratio is NULL only when an observation
--       count is missing or the two-way total is 0.
--   "Bias Towards": direction with more observations; NULL on a tie.
CREATE OR REPLACE VIEW jchew.ratio_here_top10_1month AS
WITH paired AS (
    SELECT
        fwd.analysis_id AS analysis_id_1,
        rev.analysis_id AS analysis_id_2,
        fwd.street_name,
        fwd.direction AS eb_nb,
        rev.direction AS wb_sb,
        fwd.from_intersection AS intersection_1,
        fwd.to_intersection AS intersection_2,
        fwd.sum_here_obs AS eb_nb_obs,
        rev.sum_here_obs AS wb_sb_obs
    FROM jchew.here_top10_1month AS fwd
    INNER JOIN jchew.here_top10_1month AS rev
        ON fwd.street_name = rev.street_name
        AND fwd.from_intersection = rev.to_intersection
        AND fwd.to_intersection = rev.from_intersection
    WHERE fwd.direction IN ('EB', 'NB')
)
SELECT
    paired.analysis_id_1,
    paired.analysis_id_2,
    paired.street_name,
    paired.eb_nb,
    paired.wb_sb,
    paired.intersection_1,
    paired.intersection_2,
    paired.eb_nb_obs,
    paired.wb_sb_obs,
    -- "* 1.0" forces non-integer division; NULLIF guards a zero total.
    GREATEST(paired.eb_nb_obs, paired.wb_sb_obs) * 1.0
        / NULLIF(paired.eb_nb_obs + paired.wb_sb_obs, 0) AS "EB/WB or NB/SB Ratio",
    CASE
        WHEN paired.eb_nb = 'EB' AND paired.eb_nb_obs < paired.wb_sb_obs THEN 'WB'
        WHEN paired.eb_nb = 'EB' AND paired.eb_nb_obs > paired.wb_sb_obs THEN 'EB'
        WHEN paired.eb_nb = 'NB' AND paired.eb_nb_obs < paired.wb_sb_obs THEN 'SB'
        WHEN paired.eb_nb = 'NB' AND paired.eb_nb_obs > paired.wb_sb_obs THEN 'NB'
    END AS "Bias Towards"
FROM paired
ORDER BY paired.analysis_id_1
```

### 1.3 Comparing ratio of obs for both bt and HERE

In [3]:
# Bluetooth vs HERE observation ratios side by side, one row per segment pair.
# The HERE view drives the join so pairs without Bluetooth data are still
# listed (their bt_* columns come back NULL). ORDER BY makes the row order
# deterministic instead of depending on how the join happens to be executed.
sql = '''
SELECT b.analysis_id_1, b.analysis_id_2, b.street_name, 
b.eb_nb, b.wb_sb, b.intersection_1, b.intersection_2, 
a.eb_nb_obs AS bt_eb_nb_obs, a.wb_sb_obs AS bt_wb_sb_obs,
b.eb_nb_obs AS here_eb_nb_obs, b.wb_sb_obs AS here_wb_sb_obs,
a."EB/WB or NB/SB Ratio" AS bt_ratio,
b."EB/WB or NB/SB Ratio" AS here_ratio,
a."Bias Towards" AS bt_bias,
b."Bias Towards" AS here_bias
FROM jchew.ratio_here_top10_1month b
LEFT JOIN jchew.ratio_bt_top10_1month a
USING (analysis_id_1)
ORDER BY analysis_id_1
'''
print_table(sql, con)

analysis_id_1,analysis_id_2,street_name,eb_nb,wb_sb,intersection_1,intersection_2,bt_eb_nb_obs,bt_wb_sb_obs,here_eb_nb_obs,here_wb_sb_obs,bt_ratio,here_ratio,bt_bias,here_bias
1453262,1453483,Dundas,EB,WB,Dufferin,Bathurst,4240.0,3644.0,52265.0,56206.0,0.537798,0.518166,EB,WB
1453284,1453464,Dundas,EB,WB,Bathurst,Spadina,7131.0,5439.0,26886.0,29242.0,0.567303,0.520988,EB,WB
1453305,1453445,Dundas,EB,WB,Spadina,University,5025.0,3023.0,29355.0,32048.0,0.624379,0.521929,EB,WB
1453367,1453395,Dundas,EB,WB,Jarvis,Parliament,8973.0,4379.0,50715.0,26399.0,0.672034,0.657663,EB,EB
1454196,1454366,King,EB,WB,Spadina,University,270.0,369.0,4926.0,4960.0,0.577465,0.50172,WB,WB
1454209,1454352,King,EB,WB,University,Yonge,660.0,549.0,6679.0,4346.0,0.545906,0.605805,EB,EB
1454523,1454683,Front,EB,WB,Bathurst,Spadina,5362.0,3769.0,8356.0,10166.0,0.58723,0.548861,EB,WB
1454549,1454670,Front,EB,WB,Spadina,University,1568.0,661.0,24481.0,19584.0,0.703454,0.555566,EB,EB
1455385,1455256,University,NB,SB,Queen,Dundas,3684.0,11630.0,15616.0,17543.0,0.759436,0.529057,SB,SB
1455400,1455243,University,NB,SB,Dundas,College,,,43992.0,61196.0,,0.581777,,SB


## 2. Comparing median speed
### 2.1 Bluetooth spd
**Select bt spd data of those analysis_id within that time period**    \
For bluetooth data, `speed = (length of segment) / (median travel time)`
```
-- Median Bluetooth travel time and the implied speed per segment over the
-- study period.
--   median_tt: median of `tt` across the segment's 5-minute rows in the period.
--   speed: (length * 0.001) / (median_tt / 3600)
--       (presumably length in metres and tt in seconds, giving km/h -- confirm).
-- The join to bluetooth.segments is an INNER JOIN: the period filter on
-- bt.datetime_bin already discarded any segment without Bluetooth rows, so an
-- outer join had no effect. The analysis_id filter is applied before
-- aggregating so the median is only computed for the segments of interest.
CREATE OR REPLACE VIEW jchew.bt_top10_1month_spd AS
WITH median_tt_by_segment AS (
    SELECT
        bt.analysis_id,
        PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY tt) AS median_tt,
        l.length
    FROM bluetooth.aggr_5min AS bt
    INNER JOIN bluetooth.segments AS l
        USING (analysis_id)
    WHERE
        bt.datetime_bin BETWEEN '2019-11-01 00:00:00' AND '2019-11-30 23:59:00'
        AND bt.analysis_id IN (
            1453262, 1453284, 1453305, 1453367, 1453395,
            1453445, 1453464, 1453483, 1454196, 1454209,
            1454352, 1454366, 1454523, 1454549, 1454670,
            1454683, 1455243, 1455256, 1455385, 1455400
        )
    GROUP BY
        bt.analysis_id,
        l.length
)
SELECT
    m.analysis_id,
    seg.bt_id,
    seg.street_name,
    seg.direction,
    seg.from_intersection,
    seg.to_intersection,
    m.length,
    m.median_tt,
    -- NULLIF: a zero median travel time gives a NULL speed instead of a
    -- division-by-zero error that would fail the whole view.
    (m.length * 0.001) / NULLIF(m.median_tt / 3600, 0) AS speed
FROM median_tt_by_segment AS m
INNER JOIN king_pilot.bt_segments AS seg
    USING (analysis_id)
```
**Ratio for bt spd**
```
-- Directional difference in Bluetooth speed.
-- Each EB/NB segment is paired with the segment on the same street whose
-- from/to intersections are swapped (the opposite direction).
--   "Speed Difference": EB/NB speed minus WB/SB speed, so a positive value
--       means EB/NB is faster. Equal speeds give 0; the value is NULL only
--       when a speed is missing.
--   "Bias Towards": the faster direction; NULL when the speeds are equal.
CREATE OR REPLACE VIEW jchew.ratio_bt_top10_1month_spd AS
WITH paired AS (
    SELECT
        fwd.analysis_id AS analysis_id_1,
        fwd.bt_id AS bt_id_1,
        rev.analysis_id AS analysis_id_2,
        rev.bt_id AS bt_id_2,
        fwd.street_name,
        fwd.direction AS eb_nb,
        rev.direction AS wb_sb,
        fwd.from_intersection AS intersection_1,
        fwd.to_intersection AS intersection_2,
        fwd.speed AS eb_nb_spd,
        rev.speed AS wb_sb_spd
    FROM jchew.bt_top10_1month_spd AS fwd
    INNER JOIN jchew.bt_top10_1month_spd AS rev
        ON fwd.street_name = rev.street_name
        AND fwd.from_intersection = rev.to_intersection
        AND fwd.to_intersection = rev.from_intersection
    WHERE fwd.direction IN ('EB', 'NB')
)
SELECT
    paired.analysis_id_1,
    paired.bt_id_1,
    paired.analysis_id_2,
    paired.bt_id_2,
    paired.street_name,
    paired.eb_nb,
    paired.wb_sb,
    paired.intersection_1,
    paired.intersection_2,
    paired.eb_nb_spd,
    paired.wb_sb_spd,
    paired.eb_nb_spd - paired.wb_sb_spd AS "Speed Difference",
    CASE
        WHEN paired.eb_nb = 'EB' AND paired.eb_nb_spd < paired.wb_sb_spd THEN 'WB'
        WHEN paired.eb_nb = 'EB' AND paired.eb_nb_spd > paired.wb_sb_spd THEN 'EB'
        WHEN paired.eb_nb = 'NB' AND paired.eb_nb_spd < paired.wb_sb_spd THEN 'SB'
        WHEN paired.eb_nb = 'NB' AND paired.eb_nb_spd > paired.wb_sb_spd THEN 'NB'
    END AS "Bias Towards"
FROM paired
ORDER BY paired.analysis_id_1
```

### 2.2 HERE spd
**Select HERE spd data of those analysis_id within that time period** \
The `pct_50` column is used to represent median speed in HERE data; it is averaged over all links and datetime bins of each segment
```
-- Average HERE speed per Bluetooth segment over the study period.
-- For every selected analysis_id, the HERE links mapped to it in
-- jchew.validation_bt_here are matched to the here.ta rows of the period and
-- `speed` is the plain average of pct_50 over all matched rows.
-- The period filter sits in the ON clause so that segments without any HERE
-- rows in the period are still returned, with a NULL speed.
CREATE OR REPLACE VIEW jchew.here_top10_1month_spd AS
SELECT
    seg.analysis_id,
    seg.street_name,
    seg.direction,
    seg.from_intersection_name,
    seg.to_intersection_name,
    AVG(ta.pct_50) AS speed
FROM jchew.validation_bt_here AS seg
LEFT JOIN here.ta AS ta
    ON seg.pp_link_dir = ta.link_dir
    AND ta.tx BETWEEN '2019-11-01 00:00:00' AND '2019-11-30 23:59:00'
WHERE seg.analysis_id IN (
    1453262, 1453284, 1453305, 1453367, 1453395,
    1453445, 1453464, 1453483, 1454196, 1454209,
    1454352, 1454366, 1454523, 1454549, 1454670,
    1454683, 1455243, 1455256, 1455385, 1455400
)
GROUP BY
    seg.analysis_id,
    seg.street_name,
    seg.direction,
    seg.from_intersection_name,
    seg.to_intersection_name
```
**Ratio for HERE spd**
```
-- Directional difference in HERE speed.
-- Each EB/NB segment is paired with the segment on the same street whose
-- from/to intersections are swapped (the opposite direction).
--   "Speed Difference": EB/NB speed minus WB/SB speed, so a positive value
--       means EB/NB is faster. Equal speeds give 0; the value is NULL only
--       when a speed is missing.
--   "Bias Towards": the faster direction; NULL when the speeds are equal.
CREATE OR REPLACE VIEW jchew.ratio_here_top10_1month_spd AS
WITH paired AS (
    SELECT
        fwd.analysis_id AS analysis_id_1,
        rev.analysis_id AS analysis_id_2,
        fwd.street_name,
        fwd.direction AS eb_nb,
        rev.direction AS wb_sb,
        fwd.from_intersection_name AS intersection_1,
        fwd.to_intersection_name AS intersection_2,
        fwd.speed AS eb_nb_spd,
        rev.speed AS wb_sb_spd
    FROM jchew.here_top10_1month_spd AS fwd
    INNER JOIN jchew.here_top10_1month_spd AS rev
        ON fwd.street_name = rev.street_name
        AND fwd.from_intersection_name = rev.to_intersection_name
        AND fwd.to_intersection_name = rev.from_intersection_name
    WHERE fwd.direction IN ('EB', 'NB')
)
SELECT
    paired.analysis_id_1,
    paired.analysis_id_2,
    paired.street_name,
    paired.eb_nb,
    paired.wb_sb,
    paired.intersection_1,
    paired.intersection_2,
    paired.eb_nb_spd,
    paired.wb_sb_spd,
    paired.eb_nb_spd - paired.wb_sb_spd AS "Speed Difference",
    CASE
        WHEN paired.eb_nb = 'EB' AND paired.eb_nb_spd < paired.wb_sb_spd THEN 'WB'
        WHEN paired.eb_nb = 'EB' AND paired.eb_nb_spd > paired.wb_sb_spd THEN 'EB'
        WHEN paired.eb_nb = 'NB' AND paired.eb_nb_spd < paired.wb_sb_spd THEN 'SB'
        WHEN paired.eb_nb = 'NB' AND paired.eb_nb_spd > paired.wb_sb_spd THEN 'NB'
    END AS "Bias Towards"
FROM paired
ORDER BY paired.analysis_id_1
```

### 2.3 Comparing difference in spd for both bt and HERE
- Speed difference is computed as EB/NB speed minus WB/SB speed, so a positive value means the EB/NB direction is faster

In [5]:
# Bluetooth vs HERE speed differences side by side, one row per segment pair.
# The HERE view drives the join so pairs without Bluetooth data are still
# listed (their bt_* columns come back NULL). ORDER BY makes the row order
# deterministic instead of depending on how the join happens to be executed.
sql = '''
SELECT b.analysis_id_1, b.analysis_id_2, b.street_name, 
b.eb_nb, b.wb_sb, b.intersection_1, b.intersection_2, 
a.eb_nb_spd AS bt_eb_nb_spd, a.wb_sb_spd AS bt_wb_sb_spd,
b.eb_nb_spd AS here_eb_nb_spd, b.wb_sb_spd AS here_wb_sb_spd,
a."Speed Difference" AS bt_diff_spd,
b."Speed Difference" AS here_diff_spd,
a."Bias Towards" AS bt_bias,
b."Bias Towards" AS here_bias
FROM jchew.ratio_here_top10_1month_spd b
LEFT JOIN jchew.ratio_bt_top10_1month_spd a
USING (analysis_id_1)
ORDER BY analysis_id_1
'''
print_table(sql, con)

analysis_id_1,analysis_id_2,street_name,eb_nb,wb_sb,intersection_1,intersection_2,bt_eb_nb_spd,bt_wb_sb_spd,here_eb_nb_spd,here_wb_sb_spd,bt_diff_spd,here_diff_spd,bt_bias,here_bias
1453262,1453483,Dundas,EB,WB,Dufferin,Bathurst,19.398194,19.464763,26.728843,26.785183,-0.06657,-0.05634,WB,WB
1453284,1453464,Dundas,EB,WB,Bathurst,Spadina,20.858369,23.592233,28.424645,27.501368,-2.733864,0.923277,WB,EB
1453305,1453445,Dundas,EB,WB,Spadina,University,17.068235,13.558879,23.38634,20.440371,3.509357,2.945969,EB,EB
1453367,1453395,Dundas,EB,WB,Jarvis,Parliament,21.343874,19.669565,27.648999,25.703095,1.674308,1.945904,EB,EB
1454196,1454366,King,EB,WB,Spadina,University,14.416667,13.84,20.353837,19.973589,0.576667,0.380248,EB,EB
1454209,1454352,King,EB,WB,University,Yonge,13.844295,13.660927,21.447672,19.711919,0.183368,1.735753,EB,EB
1454523,1454683,Front,EB,WB,Bathurst,Spadina,22.428645,28.847368,27.019866,26.596498,-6.418724,0.423368,WB,EB
1454549,1454670,Front,EB,WB,Spadina,University,15.361017,14.077821,22.167844,22.699959,1.283196,-0.532115,EB,WB
1455385,1455256,University,NB,SB,Queen,Dundas,27.5625,22.45,35.062564,30.78681,5.1125,4.275754,NB,NB
1455400,1455243,University,NB,SB,Dundas,College,,,36.010388,29.606723,,6.403666,,NB


## 3. Comparing both obs and spd

In [6]:
# Observation ratio and speed difference for Bluetooth and HERE in one table,
# one row per segment pair.
# The four ratio views are joined directly on analysis_id_1 rather than
# re-embedding the two comparison queries as CTEs. The HERE views drive the
# result (inner join between them); the Bluetooth views are LEFT JOINed so
# pairs without Bluetooth data are kept with NULL bt_* columns.
# ORDER BY makes the row order deterministic.
sql = '''
SELECT here_spd.analysis_id_1, here_spd.analysis_id_2, here_spd.street_name,
here_spd.eb_nb, here_spd.wb_sb, here_spd.intersection_1, here_spd.intersection_2,
bt_obs."EB/WB or NB/SB Ratio" AS bt_ratio,
here_obs."EB/WB or NB/SB Ratio" AS here_ratio,
bt_obs."Bias Towards" AS bt_bias_obs,
here_obs."Bias Towards" AS here_bias_obs,
bt_spd."Speed Difference" AS bt_diff_spd,
here_spd."Speed Difference" AS here_diff_spd,
bt_spd."Bias Towards" AS bt_bias_spd,
here_spd."Bias Towards" AS here_bias_spd
FROM jchew.ratio_here_top10_1month_spd here_spd
INNER JOIN jchew.ratio_here_top10_1month here_obs
USING (analysis_id_1)
LEFT JOIN jchew.ratio_bt_top10_1month bt_obs
USING (analysis_id_1)
LEFT JOIN jchew.ratio_bt_top10_1month_spd bt_spd
USING (analysis_id_1)
ORDER BY analysis_id_1
'''
print_table(sql, con)

analysis_id_1,analysis_id_2,street_name,eb_nb,wb_sb,intersection_1,intersection_2,bt_ratio,here_ratio,bt_bias_obs,here_bias_obs,bt_diff_spd,here_diff_spd,bt_bias_spd,here_bias_spd
1453262,1453483,Dundas,EB,WB,Dufferin,Bathurst,0.537798,0.518166,EB,WB,-0.06657,-0.05634,WB,WB
1453284,1453464,Dundas,EB,WB,Bathurst,Spadina,0.567303,0.520988,EB,WB,-2.733864,0.923277,WB,EB
1453305,1453445,Dundas,EB,WB,Spadina,University,0.624379,0.521929,EB,WB,3.509357,2.945969,EB,EB
1453367,1453395,Dundas,EB,WB,Jarvis,Parliament,0.672034,0.657663,EB,EB,1.674308,1.945904,EB,EB
1454196,1454366,King,EB,WB,Spadina,University,0.577465,0.50172,WB,WB,0.576667,0.380248,EB,EB
1454209,1454352,King,EB,WB,University,Yonge,0.545906,0.605805,EB,EB,0.183368,1.735753,EB,EB
1454523,1454683,Front,EB,WB,Bathurst,Spadina,0.58723,0.548861,EB,WB,-6.418724,0.423368,WB,EB
1454549,1454670,Front,EB,WB,Spadina,University,0.703454,0.555566,EB,EB,1.283196,-0.532115,EB,WB
1455385,1455256,University,NB,SB,Queen,Dundas,0.759436,0.529057,SB,SB,5.1125,4.275754,NB,NB
1455400,1455243,University,NB,SB,Dundas,College,,0.581777,,SB,,6.403666,,NB
