# Repackage Shortest Paths Zipfiles

This notebook repackages the zip files in `shortest_path.zip` as a single pandas HDF5 to be read in by `Prepare Kriging.ipynb`.

In [1]:
# Check for duplicate ref IDs.
import glob
import re
import numpy as np
import pandas as pd

files = glob.glob(('/mnt/c/Users/czhu5/Documents/VolumeModel/TEPS-dev/'
                   'PRTCS/negative/shortest_path_csv/*.csv'))
# Sort by file number without leading zeros.
extract_fileno = re.compile(r'\d+')
files = sorted(files, key=lambda x: int(extract_fileno.findall(x.split("/")[-1])[0]))

for file in files:
    df = pd.read_csv(file)
    if df['ref_id'].unique().shape[0] != 1:
        raise ValueError('Found a file with more than one ref id!')

So every file has a unique ref ID.  This is great.

In [2]:
seen_values = []
bad_files = []

for file in files:
    df = pd.read_csv(file)
    ref_id = df['ref_id'].unique()[0]
    if ref_id in seen_values:
        bad_files.append(file)
    else:
        seen_values.append(ref_id)

In [3]:
len(bad_files)

7028

In [4]:
bad_files

['/mnt/c/Users/czhu5/Documents/VolumeModel/TEPS-dev/PRTCS/negative/shortest_path_csv/routh_13243.dbf.csv',
 '/mnt/c/Users/czhu5/Documents/VolumeModel/TEPS-dev/PRTCS/negative/shortest_path_csv/routh_13244.dbf.csv',
 '/mnt/c/Users/czhu5/Documents/VolumeModel/TEPS-dev/PRTCS/negative/shortest_path_csv/routh_13245.dbf.csv',
 '/mnt/c/Users/czhu5/Documents/VolumeModel/TEPS-dev/PRTCS/negative/shortest_path_csv/routh_13246.dbf.csv',
 '/mnt/c/Users/czhu5/Documents/VolumeModel/TEPS-dev/PRTCS/negative/shortest_path_csv/routh_13247.dbf.csv',
 '/mnt/c/Users/czhu5/Documents/VolumeModel/TEPS-dev/PRTCS/negative/shortest_path_csv/routh_13248.dbf.csv',
 '/mnt/c/Users/czhu5/Documents/VolumeModel/TEPS-dev/PRTCS/negative/shortest_path_csv/routh_13249.dbf.csv',
 '/mnt/c/Users/czhu5/Documents/VolumeModel/TEPS-dev/PRTCS/negative/shortest_path_csv/routh_13250.dbf.csv',
 '/mnt/c/Users/czhu5/Documents/VolumeModel/TEPS-dev/PRTCS/negative/shortest_path_csv/routh_13251.dbf.csv',
 '/mnt/c/Users/czhu5/Documents/Volume

So `ref_id` is not unique...

Is there any logic to how `ref_id` works?

In [5]:
ref_id = []
file_id = []
centreline_id = []

dupctr_file_id = []
dupctr_centrelines = []

extract_fileno = re.compile(r'\d+')

for file in files:
    df = pd.read_csv(file)
    ref_id.append(df['ref_id'].unique()[0])
    file_id.append(int(extract_fileno.findall(file.split("/")[-1])[0]))
    unique_ctrline_ids = df.loc[df['network_distance'] < 1e-10, 'centreline_id'].unique()
    if len(unique_ctrline_ids) != 1:
        dupctr_file_id.append(file)
        dupctr_centrelines.append(unique_ctrline_ids)
    centreline_id.append(unique_ctrline_ids[0])

In [6]:
dupctr_file_id 

['/mnt/c/Users/czhu5/Documents/VolumeModel/TEPS-dev/PRTCS/negative/shortest_path_csv/routh_246.dbf.csv',
 '/mnt/c/Users/czhu5/Documents/VolumeModel/TEPS-dev/PRTCS/negative/shortest_path_csv/routh_279.dbf.csv',
 '/mnt/c/Users/czhu5/Documents/VolumeModel/TEPS-dev/PRTCS/negative/shortest_path_csv/routh_480.dbf.csv',
 '/mnt/c/Users/czhu5/Documents/VolumeModel/TEPS-dev/PRTCS/negative/shortest_path_csv/routh_481.dbf.csv']

In [7]:
dupctr_centrelines

[array([1143129, 8128848]),
 array([1143129, 8128848]),
 array([103008, 103141]),
 array([103008, 103141])]

Simple solution - assign `1143129` to `routh_246.dbf.csv`, `8128848` to `routh_279.dbf.csv`, etc.

In [8]:
rc_df = pd.DataFrame({
    'ref_id': ref_id,
    'file_id': file_id,
    'centreline_id': centreline_id
})

In [9]:
rc_counts = rc_df.groupby('ref_id')['ref_id'].count()
rc_counts[rc_counts > 1].head()

ref_id
1    3
2    3
3    3
4    3
5    3
Name: ref_id, dtype: int64

In [10]:
rc_df.groupby('ref_id').get_group(2)

Unnamed: 0,ref_id,file_id,centreline_id
1,2,2,14188125
12833,2,13244,912255
18432,2,20638,20061414


In [11]:
np.all(rc_df.loc[:12831, 'ref_id'] == rc_df.loc[:12831, 'file_id'] )

True

In [12]:
rc_df.loc[12832:16000, 'ref_id']

12832       1
12833       2
12834       3
12835       4
12836       5
12837       6
12838       7
12839       8
12840       9
12841      10
12842      11
12843      12
12844      13
12845      14
12846      15
12847      16
12848      17
12849      18
12850      19
12851      20
12852      21
12853      22
12854      23
12855      24
12856      25
12857      26
12858      27
12859      28
12860      29
12861      30
         ... 
15971    3679
15972    3680
15973    3681
15974    3682
15975    3683
15976    3684
15977    3685
15978    3686
15979    3687
15980    3688
15981    3689
15982    3690
15983    3691
15984    3692
15985    3693
15986    3694
15987    3695
15988    3696
15989    3697
15990    3698
15991    3699
15992    3700
15993    3701
15994    3702
15995    3703
15996    3704
15997    3705
15998    3706
15999    3707
16000    3708
Name: ref_id, Length: 3169, dtype: int64

In [13]:
rc_df.loc[12832:16000, 'ref_id'].shape

(3169,)

So once the `ref_id`s cycle back to 1 they don't increment in steps of 1, and by 19000 they start looping back to 100, 200, etc.  We should just use the file name as the reference number.

Tried a `dask` thing:

In [14]:
# We probably don't need Dask for data this size, but it helps us read a bunch of CSVs in without appending to
# the same dataframe 20k times.
# import dask.dataframe as dd
# ddf = dd.read_csv('/mnt/bad_filesc/Users/czhu5/Documents/VolumeModel/TEPS-dev/PRTCS/negative/shortest_path_csv/*.csv')
# # Drop duplicates
# df = ddf.drop_duplicates().compute()
# # This is also way slower than just reading all 20k files into RAM.

But this doesn't work because we have too many preprocessing steps prior.

In [15]:
extract_fileno = re.compile(r'\d+')
duplicated_centreline_lookup = pd.Series(
    [1143129, 8128848, 103008, 103141], index=dupctr_file_id
)

def preproc_file(filename):
    cdf = pd.read_csv(filename)

    # sanity check that there's only one ref id.
    if cdf['ref_id'].unique().shape[0] != 1:
        raise ValueError('Found a file with more than one ref id!')

    # Extract reference number from file, and use that as the reference ID 
    extract_fileno = re.compile(r'\d+')
    file_ref_num = int(extract_fileno.findall(filename.split("/")[-1])[0])
    if cdf['ref_id'].unique()[0] != file_ref_num:
        cdf['ref_id'] = file_ref_num

    # Find and record the origin centreline ID (for a given reference number it's the)
    # one with network distance of zero.  The exception is for files with duplicate
    # origins - these are manually fixed using `duplicated_centreline_lookup`.
    if filename in duplicated_centreline_lookup.index:
        wanted_id = duplicated_centreline_lookup[filename]
        cdf['origin_centreline_id'] = wanted_id
        cdf.loc[(cdf['network_distance'] < 1e-10) &
                (cdf['centreline_id'] != wanted_id), 'network_distance'] = 0.001
    else:
        unique_ctrline_ids = cdf.loc[cdf['network_distance'] < 1e-10, 'centreline_id'].unique()
        if len(unique_ctrline_ids) != 1:
            raise ValueError('somehow file {0} has multiple zero-distance centrelines.'.format(filename))
        cdf['origin_centreline_id'] = unique_ctrline_ids[0]

    # Ensure unique from-to IDs (ignore street information).
    cdf.drop_duplicates(subset=('ref_id', 'centreline_id',
                                'speed_limit', 'network_distance'), inplace=True)
    
    # Sanity check of prior processing steps.
    cdf_zero_dist = cdf.loc[cdf['network_distance'] < 1e-10, :]
    assert cdf_zero_dist.shape[0] == 1
    assert cdf_zero_dist['centreline_id'].values == cdf_zero_dist['origin_centreline_id'].values
    return cdf

df = pd.concat([preproc_file(file) for file in files], ignore_index=True)

In [16]:
# Ensure street name formatting is consistent.
df['street_name'] = df['street_name'].str.upper()
# Point fix for Elizabeth street turning into Dr. Emily Stowe Way.
df.loc[df['centreline_id'].isin([14647000, 14023284, 14646999, 14647005]), 'street_name'] = 'DR. EMILY STOWE WAY'

In [17]:
# Create a map between reference ID and centreline ID.
ref_ctrline_table = df.loc[df['network_distance'] == 0, ['ref_id', 'centreline_id']]

In [18]:
# Check that there are no ref_ids that are used by two different centreline_ids.
unique_map_test = ref_ctrline_table.groupby('ref_id')['centreline_id'].count()
assert not np.any(unique_map_test != 1)

In [19]:
# Check that there are no centreline_ids used by two different ref_ids
unique_map_test = ref_ctrline_table.groupby('centreline_id')['ref_id'].count()
assert not np.any(unique_map_test != 1)

AssertionError: 

Oh.

In [20]:
unique_map_test[unique_map_test > 1].head(10)

centreline_id
1003    2
1821    2
1899    2
7228    2
7640    2
7777    2
7968    2
8090    2
8174    2
8310    2
Name: ref_id, dtype: int64

In [21]:
ref_ctrline_table.loc[ref_ctrline_table['centreline_id'] == 8310, :]

Unnamed: 0,ref_id,centreline_id
34568,127,8310
867608,2997,8310


In [22]:
ref_ctrline_table.loc[ref_ctrline_table['centreline_id'] == 30017253, :]

Unnamed: 0,ref_id,centreline_id
4269568,14330,30017253
5145530,21203,30017253


In [23]:
df.loc[df['ref_id'] == 127, :].head()

Unnamed: 0,ref_id,centreline_id,street_name,speed_limit,network_distance,origin_centreline_id
34557,127,7074349,WESTON RD,50,1.93365,8310
34558,127,7640,WESTON RD,50,1.6907,8310
34559,127,8879,ROGERS RD,50,1.82065,8310
34560,127,8155608,ROGERS RD,50,1.95372,8310
34561,127,9085034,GULLIVER RD,50,2.82439,8310


In [24]:
df.loc[df['ref_id'] == 2997, :].head()

Unnamed: 0,ref_id,centreline_id,street_name,speed_limit,network_distance,origin_centreline_id
867597,2997,7074349,WESTON RD,50,1.93365,8310
867598,2997,7640,WESTON RD,50,1.6907,8310
867599,2997,8879,ROGERS RD,50,1.82065,8310
867600,2997,8155608,ROGERS RD,50,1.95372,8310
867601,2997,9085034,GULLIVER RD,50,2.82439,8310


Okay, we need to drop duplicates.

In [25]:
df.drop_duplicates(subset=['centreline_id', 'street_name', 'speed_limit',
                           'network_distance', 'origin_centreline_id'], inplace=True)

In [26]:
ref_ctrline_table = df.loc[df['network_distance'] == 0, ['ref_id', 'centreline_id']]

In [27]:
# Check again that there are no centreline_ids used by two different ref_ids
unique_map_test = ref_ctrline_table.groupby('centreline_id')['ref_id'].count()
assert not np.any(unique_map_test != 1)

Yay!  However this does not end the saga, since what we really need to do is also make sure every given *origin* centreline ID refers only to one reference ID.  In some cases where the two `ref_id`s represent two separate queries for the same centreline, there will be some duplicate rows (including the zero network distance rows) and some that aren't, but still need to have their `ref_id` changed so that there's a 1-to-1 mapping between `ref_id` and `origin_centreline_id`.

In [28]:
# Find all origin centreline IDs mapped to more than one reference ID.
duplicate_ref_ids = (df.groupby(['origin_centreline_id', 'ref_id'])['centreline_id']
                     .count().reset_index().drop(columns='centreline_id')
                     .groupby('origin_centreline_id')['ref_id'].count())

In [29]:
duplicate_ref_ids[duplicate_ref_ids > 1].index.values

array([    1003,   102817,   102823, ..., 30029636, 30033762, 30033763])

In [30]:
# Create a one-to-one mapping between ref_id and origin_centreline_id (which, when network distance is zero, is
# the same as `centreline_id`).
ref_ctrline_map = pd.Series(ref_ctrline_table['ref_id'].values,
                            index=ref_ctrline_table['centreline_id'].values)

In [31]:
ref_ctrline_map[102817]

470

In [34]:
from tqdm.notebook import tqdm

for oci in tqdm(duplicate_ref_ids[duplicate_ref_ids > 1].index.values):
    df.loc[df['origin_centreline_id'] == oci, 'ref_id'] = ref_ctrline_map[oci]

HBox(children=(FloatProgress(value=0.0, max=2530.0), HTML(value='')))




In [43]:
# Double check that we've removed all duplicate reference IDs.
oci_refid_map = df.groupby('origin_centreline_id')['ref_id'].agg(['max', 'min'])
assert np.array_equal(oci_refid_map['max'], oci_refid_map['min'])
oci_refid_map['mapping'] = ref_ctrline_map[oci_refid_map.index]
assert np.array_equal(oci_refid_map['max'], oci_refid_map['mapping'])

YAY!

In [44]:
df.columns

Index(['ref_id', 'centreline_id', 'street_name', 'speed_limit',
       'network_distance', 'origin_centreline_id'],
      dtype='object')

In [45]:
df.rename(columns={
    'ref_id': 'origin_ref_id',
    'centreline_id': 'dest_centreline_id'}, inplace=True)

In [51]:
oci_ref_table = df.loc[df['network_distance'] == 0, ['origin_ref_id', 'origin_centreline_id']].reset_index(drop=True)
oci_ref_table.columns = ['dest_ref_id', 'dest_centreline_id']

In [53]:
df = pd.merge(df, oci_ref_table, how='left', left_on='dest_centreline_id', right_on='dest_centreline_id')

There are still some NaNs in `dest_ref_id`, but this means that `df` is not a full distance matrix (and it may not matter for constructing variograms).

In [69]:
df.reset_index(drop=True, inplace=True)

# This is basically resmat.txt without the land use data.
with pd.HDFStore('/home/czhu/Data/btp_sandbox_prep_kriglocaldata/resmat.hdf5', 'w') as store:
    store['resmat'] = df