Usage: create a shapefile for comparing two highway networks in Tableau

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import os

In [2]:
# Root folder that holds one sub-folder per network version.
working_dir = 'C:\\Users\\ywang\\Documents\\ArcGIS\\Projects\\Network_comparison'

# Sub-folder names; they are also used as version labels in the load messages.
folder_1, folder_2 = '2015_Baseline', '2050_Blueprint'

# Highway link shapefile of each version.
file_1, file_2 = (os.path.join(working_dir, version, 'freeflow.shp')
                  for version in (folder_1, folder_2))

In [3]:
# Load the highway link shapefile of each network version, then report the
# number of links read and the CRS geopandas found in the file.

gdf_1 = gpd.read_file(file_1)
print(f'Read {len(gdf_1)} rows of {folder_1} highway link data from {file_1}')
print(gdf_1.crs)

gdf_2 = gpd.read_file(file_2)
print(f'Read {len(gdf_2)} rows of {folder_2} highway link data from {file_2}')
print(gdf_2.crs)

Read 33798 rows of 2015_Baseline highway link data from C:\Users\ywang\Documents\ArcGIS\Projects\Network_comparison\2015_Baseline\freeflow.shp
{}
Read 34582 rows of 2050_Blueprint highway link data from C:\Users\ywang\Documents\ArcGIS\Projects\Network_comparison\2050_Blueprint\freeflow.shp
{}


In [4]:
def hwy_link_to_pair(link_df_raw):
    """
    Collapse directional highway links into one row per undirected node pair.

    input: (Geo)DataFrame of highway network links, number of rows = number of links in
    the network. Must contain the columns A, B, DISTANCE, LANES, AT, FT, BRT, PROJ and
    geometry. The input is copied, not modified.

    output: DataFrame of highway network links collapsed by node pair (number of rows =
    number of node pairs) with the following attributes:
        AB_pair: node A and node B pair written as '<smaller node>-<larger node>'; each
                 pair may contain one link (AB or BA) or two links (AB and BA)
        DIS_AB, DIS_BA: distances of the link(s) on each node pair
        LN_AB, LN_BA: number of lanes of the link(s) on each node pair
        AT_AB, AT_BA: Area Type of the link(s) on each node pair
        FT_AB, FT_BA: Facility Type of the link(s) on each node pair
        BRT_AB, BRT_BA: BRT type of the link(s) on each node pair
        P_AB, P_BA: transportation projects associated with the link(s) on each node pair;
                    a blank project name is reported as missing
        DIR: direction of the node pair, 'one-way' or 'two-way'
        geometry: linestring of the node pair (the AB link's geometry when there is one,
                  otherwise the BA link's)

    '_AB' columns come from the link whose A node is not larger than its B node, '_BA'
    columns from the link running the other way. A missing value in an '_AB'/'_BA' column
    means either that the pair has no link in that direction or that the link's own value
    was missing.

    The function also prints row counts and a few QA/QC counts.

    NOTE(review): the logic assumes at most one link per direction on a node pair.
    Duplicated A-B links would multiply rows in the merges below, and a pair with more
    than two links would be labelled 'one-way' -- confirm the inputs are free of duplicates.
    """

    attr_cols = ['DIS', 'LN', 'AT', 'FT', 'BRT', 'P', 'geometry']

    link_df = link_df_raw.copy()
    link_df.rename(columns = {'DISTANCE': 'DIS',
                              'LANES'   : 'LN',
                              'PROJ'    : 'P'}, inplace=True)

    # clean up the 'P' column: a blank project name means "no project", so store it as
    # missing like None/NaN (the original statement selected these rows but never
    # changed them, so blanks were later treated as real project names)
    link_df.loc[link_df['P'] == '', 'P'] = None

    # create a variable 'AB_link' to represent A-B link with the original direction,
    # and a variable 'AB_pair' to represent the pair of linked nodes without showing the link direction
    link_df['AB_link'] = link_df['A'].astype(str) + '-' + link_df['B'].astype(str)
    link_df['AB_pair'] = link_df['AB_link']
    link_df.loc[link_df.A > link_df.B, 'AB_pair'] = link_df['B'].astype(str) + '-' + link_df['A'].astype(str)

    # only keep needed columns
    link_df = link_df[['AB_pair', 'AB_link'] + attr_cols].sort_values(by='AB_pair')

    # calculate each AB_pair has how many links
    link_df['link_cnt'] = link_df.groupby('AB_pair')['AB_link'].transform('size')

    # create columns to unstack links of the same node pairs: each link's value goes to
    # the '_AB_temp' column when the link runs in the pair's own direction, otherwise to
    # the '_BA_temp' column (the temp columns are object dtype because they start as None)
    is_ab = link_df['AB_pair'] == link_df['AB_link']
    for attr in attr_cols:
        link_df[attr + '_AB_temp'] = None
        link_df.loc[is_ab, attr + '_AB_temp'] = link_df[attr]
        link_df[attr + '_BA_temp'] = None
        link_df.loc[~is_ab, attr + '_BA_temp'] = link_df[attr]

    # fill out values for node pairs: broadcast each direction's value to every row of
    # the pair, so both rows of a two-way pair end up identical
    for attr in attr_cols:
        df_AB = link_df.loc[link_df[attr + '_AB_temp'].notnull(), ['AB_pair', attr + '_AB_temp']]
        df_AB = df_AB.rename(columns={attr + '_AB_temp': attr + '_AB'})
        df_BA = link_df.loc[link_df[attr + '_BA_temp'].notnull(), ['AB_pair', attr + '_BA_temp']]
        df_BA = df_BA.rename(columns={attr + '_BA_temp': attr + '_BA'})

        link_df = link_df.merge(df_AB, on='AB_pair', how='left').merge(df_BA, on='AB_pair', how='left')
        link_df.drop(columns = [attr + '_AB_temp', attr + '_BA_temp'], inplace=True)
    print('link_df has {} rows'.format(link_df.shape[0]))

    # keep unique node pairs
    link_pair = link_df[['AB_pair', 'link_cnt', 'DIS_AB', 'DIS_BA',
                         'LN_AB', 'LN_BA', 'AT_AB', 'AT_BA',
                         'FT_AB', 'FT_BA', 'BRT_AB', 'BRT_BA', 'P_AB', 'P_BA']].drop_duplicates()

    # label whether each link pair is on a one-way segment or a two-way segment
    link_pair['DIR'] = 'one-way'
    link_pair.loc[link_pair.link_cnt == 2, 'DIR'] = 'two-way'
    print('link_pair has {} rows'.format(link_pair.shape[0]))

    # QA/QC: check the number of one-way links
    print('Num of node pairs with DIS value in only one direction: {}'.format(
        link_pair.loc[link_pair.DIS_AB.isnull() | link_pair.DIS_BA.isnull()].shape[0]))
    print('Num of one-way node pairs: {}'.format(link_pair.loc[link_pair.DIR == 'one-way'].shape[0]))
    print('Num of two-way node pairs: {}'.format(link_pair.loc[link_pair.DIR == 'two-way'].shape[0]))

    # obtain geometry from the link input and append to the node-pair dataframe:
    # prefer the AB link's geometry, fall back to the BA link's
    link_df['geometry'] = link_df['geometry_AB']
    link_df.loc[link_df['geometry_AB'].isnull(), 'geometry'] = link_df['geometry_BA']
    pair_geo = link_df[['AB_pair', 'geometry']].drop_duplicates()
    link_pair_geo = link_pair.merge(pair_geo, on='AB_pair', how='left')
    link_pair_geo.drop(columns = ['link_cnt'], inplace=True)

    return link_pair_geo

In [5]:
# Step1: collapse the links of each network version into one row per node pair
# (hwy_link_to_pair prints its own QA/QC counts before each summary line)

link_shp_1 = hwy_link_to_pair(gdf_1)
print(f'network 1 converted to {len(link_shp_1)} node pairs')
link_shp_2 = hwy_link_to_pair(gdf_2)
print(f'network 2 converted to {len(link_shp_2)} node pairs')

link_df has 33798 rows
link_pair has 19864 rows
Num of node pairs with DIS value in only one direction: 5930
Num of one-way node pairs: 5930
Num of two-way node pairs: 13934
network 1 converted to 19864 node pairs
link_df has 34582 rows
link_pair has 20559 rows
Num of node pairs with DIS value in only one direction: 6536
Num of one-way node pairs: 6536
Num of two-way node pairs: 14023
network 2 converted to 20559 node pairs


In [6]:
# Step2: merge the node-pair tables of the two networks

# keep the attribute columns (no geometry) and tag every column, including the
# AB_pair key, with the network version it comes from
df1 = link_shp_1[['AB_pair',
                  'DIS_AB', 'LN_AB', 'AT_AB', 'FT_AB', 'BRT_AB', 'P_AB',
                  'DIS_BA', 'LN_BA', 'AT_BA', 'FT_BA', 'BRT_BA', 'P_BA',
                  'DIR']].add_suffix('_n1')

df2 = link_shp_2[['AB_pair',
                  'DIS_AB', 'LN_AB', 'AT_AB', 'FT_AB', 'BRT_AB', 'P_AB',
                  'DIS_BA', 'LN_BA', 'AT_BA', 'FT_BA', 'BRT_BA', 'P_BA',
                  'DIR']].add_suffix('_n2')

# outer join, so node pairs present in only one network are kept
# (their other-version columns are missing)
df_comp = pd.merge(df1, df2, left_on='AB_pair_n1', right_on='AB_pair_n2', how='outer')

# QA/QC: the merge should yield exactly one row per distinct node pair
print(f'Number of rows after merge: {len(df_comp)}')
print(f"Unique node pairs of the two networks: {len(df_comp.drop_duplicates(subset=['AB_pair_n1', 'AB_pair_n2']))}")

Number of rows after merge: 20640
Unique node pairs of the two networks: 20640


In [7]:
# Step3: create columns 'diff_xx' to compare each feature: 1 for value change, 0 for no value change

# per-direction flags: a value changed when the two versions differ, except when
# both are missing (NaN != NaN evaluates True, so that case is excluded explicitly)
for col in ['DIS_AB', 'LN_AB', 'AT_AB', 'FT_AB', 'BRT_AB', 'P_AB',
            'DIS_BA', 'LN_BA', 'AT_BA', 'FT_BA', 'BRT_BA', 'P_BA']:
    before, after = df_comp[col + '_n1'], df_comp[col + '_n2']
    both_missing = before.isnull() & after.isnull()
    df_comp['diff_' + col] = ((before != after) & ~both_missing).astype('int64')

# per-attribute flags: changed when either direction changed
for attr in ['DIS', 'LN', 'AT', 'FT', 'BRT', 'P']:
    changed = (df_comp['diff_' + attr + '_AB'] == 1) | (df_comp['diff_' + attr + '_BA'] == 1)
    df_comp['diff_' + attr] = changed.astype('int64')

# 'diff_any' first holds the number of changed attributes per node pair
compare_columns = ['diff_' + x for x in ['DIS', 'LN', 'AT', 'FT', 'BRT', 'P']]
df_comp['diff_any'] = df_comp[compare_columns].sum(axis=1)

# if a node pair has one attribute change and P_diff==1 (project change), then
# there are changes in attributes other than DISTANCE, LANES, AT, FT, BRT
df_comp['diff_other'] = ((df_comp['diff_any'] == 1) & (df_comp['diff_P'] == 1)).astype('int64')

# convert diff_any from a count to 1 or 0
df_comp['diff_any'] = (df_comp['diff_any'] > 0).astype('int64')

In [8]:
# Step 4: create columns 'link_chg' to represent if link(s) were added or deleted between the two networks
dir_before, dir_after = df_comp['DIR_n1'], df_comp['DIR_n2']

# link added: the pair is missing from network 1, or went from one-way to two-way
link_added = (dir_before.isnull() & dir_after.notnull()) | (
    (dir_before == 'one-way') & (dir_after == 'two-way'))

# link deleted: the pair is missing from network 2, or went from two-way to one-way
link_deleted = (dir_before.notnull() & dir_after.isnull()) | (
    (dir_before == 'two-way') & (dir_after == 'one-way'))

# later labels take precedence over earlier ones
df_comp['link_chg'] = (
    pd.Series('No link change', index=df_comp.index)
    .mask(df_comp['diff_any'] == 1, 'Other link change')
    .mask(link_added, 'Link added')
    .mask(link_deleted, 'Link deleted')
)

In [9]:
# Step 5: consolidate Proj_AB and Proj_BA into one variable - easy for visualization
for version in ['_n1', '_n2']:
    proj_ab, proj_ba = df_comp['P_AB' + version], df_comp['P_BA' + version]
    has_ab, has_ba = proj_ab.notnull(), proj_ba.notnull()
    text_ab, text_ba = proj_ab.astype(str), proj_ba.astype(str)
    label = 'P' + version

    # stays None when neither direction has a project
    df_comp[label] = None
    # project on one direction only
    df_comp.loc[~has_ab & has_ba, label] = text_ba + '(BA)'
    df_comp.loc[has_ab & ~has_ba, label] = text_ab + '(AB)'
    # projects on both directions: list both, then shorten when they are the same project
    df_comp.loc[has_ab & has_ba, label] = text_ab + '(AB)/' + text_ba + '(BA)'
    df_comp.loc[has_ab & has_ba & (proj_ab == proj_ba), label] = text_ab + '(AB/BA)'

In [10]:
# Step 6: join geometry back to the dataframe

# attach network 1's geometry, then network 2's; the overlapping 'AB_pair' and
# 'geometry' names get the '_x' (network 1) and '_y' (network 2) suffixes
with_geo_1 = df_comp.merge(link_shp_1[['AB_pair', 'geometry']],
                           left_on='AB_pair_n1', right_on='AB_pair', how='left')
df_comp_geo = with_geo_1.merge(link_shp_2[['AB_pair', 'geometry']],
                               left_on='AB_pair_n2', right_on='AB_pair', how='left',
                               suffixes=('_x', '_y'))

# prefer network 1's geometry; fall back to network 2's for pairs missing from network 1
missing_in_n1 = df_comp_geo['geometry_x'].isnull()
df_comp_geo['geometry'] = df_comp_geo['geometry_x']
df_comp_geo.loc[missing_in_n1, 'geometry'] = df_comp_geo['geometry_y']

# the helper join columns are no longer needed
df_comp_geo = df_comp_geo.drop(columns=['AB_pair_x', 'AB_pair_y', 'geometry_x', 'geometry_y'])

In [11]:
# Step 7: define geometry and export
# NOTE(review): the CRS is assigned here, not reprojected; this presumes the link
# coordinates are already in EPSG:26910 -- confirm against the source shapefiles.
df_comp_geo = gpd.GeoDataFrame(df_comp_geo, geometry='geometry', crs="EPSG:26910")
print(f'export {len(df_comp_geo)} rows of node-pair comparison data')
df_comp_geo.to_file(os.path.join(working_dir, 'hwy_link_comp.shp'))

export 20640 rows of node-pair comparison data
