In [8]:
import os, glob, getpass
import pandas as pd
import fiona
import geopandas as gpd

# Login name of the current OS user; used to build the per-user Box folder paths.
username = getpass.getuser()

## Set up input/output folders

In [77]:
# Root of the current user's local Box folder; every path below hangs off it.
box_root = os.path.join('/Users', username, 'Box')

# Raw input layers (<layer name>.geojson).
# NOTE(review): this path spells the folder 'Dataviz Projects' while match_dir
# spells it 'DataViz Projects'. Both resolve only on a case-insensitive file
# system -- confirm the on-disk name and make the two agree.
geojson_dir = os.path.join(
    box_root,
    'Dataviz Projects',
    'Spatial Analysis and Mapping',
    'Active Transportation Plan',
    'Data',
    'geojson',
)

# shst match results (<layer name>*.geojson).
match_dir = os.path.join(
    box_root,
    'DataViz Projects',
    'Spatial Analysis and Mapping',
    'Active Transportation Plan',
    'Data',
    'shst_match_results',
    'matched',
)

# Destination folder for the final network datasets.
output_dir = os.path.join(
    box_root,
    'MTC Data for Toole Design',
    'final_nw_datasets',
)

## Functions

In [6]:
# Base names (no file extension) of the bike network layers.
# NOTE(review): 'batc_bike_network_epsg4326' and 'vta_bike_network_epsg4326'
# have no entry in col_name_dict inside merge_deduplicate_matched_nw, so
# passing either as in_file raises KeyError -- confirm whether the v2 layers
# supersede them.
layer_list = [
    'san_jose_bike_nw_epsg4326',
    'vta_bike_network_v2_epsg4326',
    'batc_bike_network_v2_epsg4326',
    'caltrans_d4_bike_network_epsg4326',
    'oakland_bike_network_epsg4326',
    'batc_bike_network_epsg4326',
    'vta_bike_network_epsg4326',
    'tam_bike_network_epsg4326',
    'sta_bike_network_epsg4326',
    'sfcta_bike_network_epsg4326',
    'scta_bike_network_epsg4326',
    'nvta_bike_network_epsg4326',
    'ccta_bike_network_epsg4326',
    'ccag_bike_network_epsg4326',
    'actc_bike_network_epsg4326',
]

In [46]:
# Layer base name (no extension) processed by the cells below; this value is one of the layer_list entries.
in_file = 'actc_bike_network_epsg4326'

In [121]:
def merge_deduplicate_matched_nw(match_dir,in_file):
    """
    Merge and deduplicate matched existing and proposed bike network datasets.

    Every file in ``match_dir`` whose name matches ``<in_file>?*.geojson`` is
    read and stacked. The generic ``pp_ex_class`` / ``pp_pln_class`` /
    ``source`` columns are renamed with a layer-specific prefix, and one record
    is kept per (shstReferenceId, shstGeometryId) pair: the one with the
    longest geometry.

    Parameters
    ----------
    match_dir : str
        Folder holding the shst match result GeoJSON files.
    in_file : str
        Layer base name; must be a key of ``col_name_dict`` below.

    Returns
    -------
    Frame with the SharedStreets id columns, the prefixed class/source columns
    and ``mtc_facility_id`` (the geometry column is not included).

    Raises
    ------
    KeyError
        If ``in_file`` has no column prefix in ``col_name_dict``.
    FileNotFoundError
        If no match result file is found for ``in_file``.
    """

    col_name_dict = {
        'san_jose_bike_nw_epsg4326': 'sj',
        'vta_bike_network_v2_epsg4326': 'cma',
        'batc_bike_network_v2_epsg4326': 'batc',
        'caltrans_d4_bike_network_epsg4326': 'caltrans',
        'oakland_bike_network_epsg4326': 'oak',
        'tam_bike_network_epsg4326': 'cma',
        'sta_bike_network_epsg4326': 'cma',
        'sfcta_bike_network_epsg4326': 'cma',
        'scta_bike_network_epsg4326': 'cma',
        'nvta_bike_network_epsg4326': 'cma',
        'ccta_bike_network_epsg4326': 'cma',
        'ccag_bike_network_epsg4326': 'cma',
        'actc_bike_network_epsg4326': 'cma'
    }

    # Fail fast, before any file is read, when the layer has no prefix mapping.
    if in_file not in col_name_dict:
        raise KeyError('No column prefix defined for layer: ' + repr(in_file))
    prefix = col_name_dict[in_file]

    # sorted() makes the read order independent of the directory listing order.
    match_pattern = os.path.join(match_dir, in_file + '?*.geojson')
    match_files = sorted(glob.glob(match_pattern))
    if not match_files:
        raise FileNotFoundError('No shst match results found for pattern: ' + match_pattern)

    print('----------Start reading shst match results data-------------')

    # Collect the frames and concatenate once instead of growing a frame in the loop.
    gdf_list = []
    for file in match_files:
        print('Reading shst match results data: ' + file)
        gdf_list.append(gpd.read_file(file))
    concat_gdf = pd.concat(gdf_list,
                           ignore_index=True,
                           sort=False)

    print('----------Finished reading shst match results data----------')

    print('\n-----------Renaming columns-------------------------------')

    rename_dict = {
        'pp_ex_class':prefix + '_ex_class',
        'pp_pln_class':prefix + '_pln_class',
        'source':prefix + '_source',
        'pp_mtc_facility_id':'mtc_facility_id'
    }

    concat_gdf = concat_gdf.rename(columns=rename_dict)

    print('\n-----------Reprojecting-----------------------------------')

    concat_gdf = concat_gdf.to_crs('EPSG:3857')

    print('\n-----------Adding link length column----------------------')

    # The length is only used to rank duplicates; it is not part of the returned columns.
    concat_gdf['length'] = concat_gdf['geometry'].length

    print('\n--Sorting by length decending, drop duplicates by subset--')
    print('\nCount of records: ',concat_gdf.shape[0])

    subset = [
        'shstReferenceId',
        'shstGeometryId'
    ]
    # mergesort is stable, so records tied on length keep their read order and
    # the surviving duplicate is reproducible from run to run.
    dedup_gdf = (
        concat_gdf
        .sort_values('length',ascending=False,kind='mergesort')
        .drop_duplicates(subset=subset,keep='first')
    )

    print('\nFinal count of deduped records: ',dedup_gdf.shape[0])

    col_subset = [
        'shstReferenceId',
        'shstGeometryId',
        'fromIntersectionId',
        'toIntersectionId',
        prefix + '_ex_class',
        prefix + '_pln_class',
        prefix + '_source',
        'mtc_facility_id'
    ]
    return dedup_gdf[col_subset]

In [122]:
# Merge and deduplicate the shst match results for the layer named by in_file.
actc_dedup = merge_deduplicate_matched_nw(match_dir=match_dir,in_file=in_file)

----------Start reading shst match results data-------------
Reading shst match results data: /Users/jcroff/Box/DataViz Projects/Spatial Analysis and Mapping/Active Transportation Plan/Data/shst_match_results/matched/actc_bike_network_epsg4326_exst_matched.geojson
Reading shst match results data: /Users/jcroff/Box/DataViz Projects/Spatial Analysis and Mapping/Active Transportation Plan/Data/shst_match_results/matched/actc_bike_network_epsg4326_ppsd_matched.geojson
----------Finished reading shst match results data----------

-----------------Renaming columns-------------------------

----------------Reprojecting------------------------------

-----------Adding link length column----------------------

--Sorting by length decending, drop duplicates by subset--

Count of records:  89550

Final count of deduped records:  45519


In [123]:
def compare_matched_raw_nw(matched_df,geojson_dir,in_file):
    """
    Compare matched bike network to original raw input and return unmatched as gdf.

    Parameters
    ----------
    matched_df : DataFrame
        Matched records; must contain an ``mtc_facility_id`` column.
    geojson_dir : str
        Folder holding the raw ``<in_file>.geojson`` file.
    in_file : str
        Layer base name (without the ``.geojson`` extension).

    Returns
    -------
    The rows of the raw layer whose ``mtc_facility_id`` has no counterpart in
    ``matched_df``. The result carries the merged columns, including the
    ``_merge`` indicator (always ``'left_only'``).
    """
    print('-----------Reading raw files------------------------------')

    raw_file = os.path.join(geojson_dir,in_file +'.geojson')

    raw_gdf = gpd.read_file(raw_file)

    print('\n-----------Merging matched and raw files------------------')

    # Merge against the matched_df argument (not a notebook global) so the
    # function works for any layer and on a fresh kernel.
    raw_match_merge = raw_gdf.merge(matched_df,
                                    how='left',
                                    on='mtc_facility_id',
                                    indicator=True)

    print('\n----------Creating new unmatched dataframe----------------')

    # 'left_only' marks raw rows that found no partner in matched_df.
    unmatched_gdf = raw_match_merge[raw_match_merge['_merge'] == 'left_only'].copy()

    print('\nUnmatched records: ',unmatched_gdf.shape[0])

    return unmatched_gdf

In [124]:
# Raw records of the in_file layer whose mtc_facility_id has no match in the deduplicated results.
actc_unmatched = compare_matched_raw_nw(matched_df=actc_dedup,geojson_dir=geojson_dir,in_file=in_file)

-----------Reading raw files------------------------------

-----------Merging matched and raw files------------------

----------Creating new unmatched dataframe----------------

Unmatched records:  6542
