### Script for aggregating quadtree data to census tract level

In [1]:
#import packages
import os
import os.path
import shutil
import zipfile

import geopandas as gpd
import numpy as np
import pandas as pd
from pygeotile.tile import Tile
from shapely.geometry import Point
from shapely.geometry import Polygon

import convert_quadtree_to_latlon as con

In [2]:
#set working directory; the relative file names used in the later cells resolve against it
os.chdir('/projects/mpi/shared/Data')
os.getcwd() #echo the directory to confirm the change took effect

'/projects/mpi/shared/Data'

In [3]:
#grab file strings: the csv file name for each city, relative to the working directory
#NOTE: nyc, la and sf are rebound to DataFrames by the qtid_to_geo cell (and chicago by the
#qtid_ct_join cell), so re-run this cell before re-running those
chicago = 'chicago_all.csv'
nyc = 'nyc_all.csv'
la = 'la_all.csv'
sf = 'sf_all.csv'

In [4]:
#column names assigned to the csv files, which are read without a header row
col_list = [
    'qtid_origin',
    'qtid_destination',
    'OD_counts',
    'num_dates',
    'avg_travel_distance (mile)',
    'avg_travel_time (second)',
    'time_period',
    'dominant_mode_1',
    'dominant_mode_2',
]

In [5]:
def qt2geom(qt):
    '''Return the bounding box of the quadtree tile qt as a shapely Polygon.'''
    corners = Tile.from_quad_tree(qt).bounds
    low, high = corners[0], corners[1]
    #each corner is indexed (lat, lng), while from_bounds expects
    #min_lng, min_lat, max_lng, max_lat - hence the swapped indices
    return Polygon.from_bounds(low[1], low[0], high[1], high[0])

In [6]:
def qtid_to_geo(filepath,col_list):
    '''Read a headerless OD csv and attach a polygon for each quadtree id.

    filepath: path of the csv file; it is read without a header row
    col_list: names given to the csv columns, in file order; must contain
        'qtid_origin' and 'qtid_destination'
    returns: pandas DataFrame holding the csv columns followed by 'origin_geo'
        and 'dest_geo', the polygons qt2geom builds for the row's origin and
        destination quadtree ids
    '''
    #keep the ids exactly as written in the file (object dtype) instead of
    #letting pandas parse them as numbers
    df = pd.read_csv(filepath, header=None, names=col_list,
                     dtype={'qtid_origin': object, 'qtid_destination': object})

    #build each distinct polygon once, keyed by its qtid
    unique_qtids = pd.concat([df['qtid_origin'], df['qtid_destination']]).unique()
    geom_by_qtid = {qtid: qt2geom(qtid) for qtid in unique_qtids}

    #look the polygons up by key; this replaces joining two frames that were
    #only aligned by the iteration order of a set
    df['origin_geo'] = df['qtid_origin'].map(geom_by_qtid)
    df['dest_geo'] = df['qtid_destination'].map(geom_by_qtid)

    return df

In [7]:
#attach origin and destination polygons to the OD table of each city
#(nyc, la and sf hold file names before this cell and DataFrames after it)
chic = qtid_to_geo(filepath=chicago, col_list=col_list)
nyc = qtid_to_geo(filepath=nyc, col_list=col_list)
la = qtid_to_geo(filepath=la, col_list=col_list)
sf = qtid_to_geo(filepath=sf, col_list=col_list)

In [51]:
#list of census tract files. source: https://www.census.gov/geo/maps-data/data/cbf/cbf_tracts.html

census_tract_zips = ['cb_2017_06_tract_500k.zip','cb_2017_09_tract_500k.zip','cb_2017_17_tract_500k.zip',
                    'cb_2017_34_tract_500k.zip','cb_2017_36_tract_500k.zip','cb_2017_18_tract_500k.zip',
                    'cb_2017_55_tract_500k.zip','cb_2017_26_tract_500k.zip']

#make sure every archive sits in ./shapefiles, then extract it there so the
#.shp files land in the folder they are read from afterwards (a plain
#`unzip <path>` extracts into the working directory instead)
shapefile_dir = os.path.join(os.getcwd(), 'shapefiles')
os.makedirs(shapefile_dir, exist_ok=True)

for zip_name in census_tract_zips:
    zip_path = os.path.join(shapefile_dir, zip_name)
    if not os.path.isfile(zip_path):
        #archive is still in the working directory; this raises if it is in
        #neither place rather than failing silently
        shutil.move(zip_name, zip_path)
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(shapefile_dir)

In [8]:
#folder holding the unzipped tract shapefiles, and the files to read from it
#(one per state, named by state FIPS code)
folder = os.getcwd() + '/shapefiles/'

census_tracts = [
    'cb_2017_06_tract_500k.shp',  #California
    'cb_2017_09_tract_500k.shp',  #Connecticut
    'cb_2017_17_tract_500k.shp',  #Illinois
    'cb_2017_34_tract_500k.shp',  #New Jersey
    'cb_2017_36_tract_500k.shp',  #New York
    'cb_2017_18_tract_500k.shp',  #Indiana
    'cb_2017_55_tract_500k.shp',  #Wisconsin
    'cb_2017_26_tract_500k.shp',  #Michigan
]

In [9]:
#function for reading in the shapefiles and concatenating them
def geo_agg(tract_file_list,path):
    '''Read every shapefile in tract_file_list and stack them into one frame.

    tract_file_list: shapefile names, each read from inside path
    path: folder containing the shapefiles (with or without a trailing slash)
    returns: the concatenation of all files, with a fresh 0..n-1 row index
    raises: ValueError (from pd.concat) when tract_file_list is empty
    '''
    gdf_list = [gpd.read_file(os.path.join(path, name)) for name in tract_file_list]

    #ignore_index gives every tract its own row label; without it each file
    #keeps the labels it was read with, so labels can repeat across files
    all_tracts = pd.concat(gdf_list, ignore_index=True)

    return all_tracts

#read every listed tract shapefile from the shapefiles folder into a single frame
all_tracts = geo_agg(census_tracts,folder)

In [10]:
#list the distinct state FIPS codes to confirm that every shapefile was loaded
all_tracts['STATEFP'].unique()

array(['06', '09', '17', '34', '36', '18', '55', '26'], dtype=object)

In [11]:
#preview the first rows of the combined tract table
all_tracts.head()

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,geometry
0,6,1,400600,1400000US06001400600,6001400600,4006.0,CT,297856,0,"POLYGON ((-122.26807 37.844136, -122.26514 37...."
1,6,1,400900,1400000US06001400900,6001400900,4009.0,CT,420877,0,"POLYGON ((-122.285576 37.839778, -122.283186 3..."
2,6,1,401400,1400000US06001401400,6001401400,4014.0,CT,758204,0,"POLYGON ((-122.278611 37.826878, -122.268563 3..."
3,6,1,403000,1400000US06001403000,6001403000,4030.0,CT,352394,0,"POLYGON ((-122.274757 37.79883299999999, -122...."
4,6,1,405902,1400000US06001405902,6001405902,4059.02,CT,487280,0,"POLYGON ((-122.247175 37.789913, -122.243512 3..."


In [12]:
all_tracts.shape #(rows, columns); the row count is the total number of census tracts

(24580, 10)

In [12]:
#save aggregated CT file; the relative name resolves against the working directory
all_tracts.to_file('all_cts.shp')

In [13]:
def qtid_ct_join(df,shp):
    '''function to geojoin cts to qtid
    input: df= dataframe of OD pair data carrying 'origin_geo' and 'dest_geo'
               polygon columns; shp = GeoDataFrame of census tracts
    output: qtid level data frame with census tracts tagged. The columns are
            qtid_origin_x, qtid_destination_x, OD_counts_x, num_dates_x,
            time_period_x, origin_st, origin_ct, dest_st, dest_ct.
    NOTE(review): the final rename passes its mapping positionally without
    columns=, which pandas applies to the row labels, so the '_x' column
    names are returned unchanged. The later groupby cell relies on the '_x'
    names; fix both together if the rename is ever corrected.'''
    #turn dataframes into geodata frames and assign geometry for origin and destination
    #both frames are built from the same df; only the active geometry differs
    origin_df = gpd.GeoDataFrame(df, crs={'init': 'epsg:4269'}, geometry=df['origin_geo'])
    dest_df = gpd.GeoDataFrame(df, crs={'init': 'epsg:4269'}, geometry=df['dest_geo'])
    
    #join each origin and destination geo_df to the census tract file
    #how='left' keeps every OD row; a polygon that intersects several tracts
    #presumably yields one output row per tract, all sharing the same index label
    origin_geo = gpd.sjoin(origin_df,shp, how='left', op='intersects')
    dest_geo = gpd.sjoin(dest_df,shp, how='left', op='intersects')
    
    #clean up geodataframes: keep only the state and tract codes from the tract file
    origin_geo.drop(['index_right','COUNTYFP','AFFGEOID','GEOID', 'NAME', 'LSAD', 'ALAND', 'AWATER'],axis=1,inplace=True)
    origin_geo.rename(columns={'STATEFP':'origin_st','TRACTCE':'origin_ct'},inplace=True)
    
    dest_geo.drop(['index_right','COUNTYFP','AFFGEOID','GEOID', 'NAME', 'LSAD', 'ALAND', 'AWATER'],axis=1,inplace=True)
    dest_geo.rename(columns={'STATEFP':'dest_st','TRACTCE':'dest_ct'},inplace=True)
    
    #merge on the row index; columns present on both sides get _x (origin side)
    #and _y (destination side) suffixes
    df_od = pd.merge(left=origin_geo,right=dest_geo,left_index=True,right_index=True) #merge dataframes
    
    #keep a single row per original OD record: the last one for each index label
    df_od = df_od[~df_od.index.duplicated(keep='last')] #drop the duplicate indices
    
    df_od = df_od[['qtid_origin_x','qtid_destination_x','OD_counts_x',
                'num_dates_x','time_period_x','origin_st', 'origin_ct','dest_st', 'dest_ct']] #just keep the columns needed
    
    #NOTE(review): no columns= here, so this targets index labels and leaves the column names as they are
    df_od.rename({'qtid_origin_x':'qtid_origin','qtid_destination_x':'qtid_dest','OD_counts_x':'od_counts',
                'num_dates_x':'num_dates','time_period_x':'time_period'},inplace=True)
    
    return df_od

In [15]:
#create a linked file (OD rows tagged with census tracts) for each city
#(chicago, la and sf are rebound to the linked frames here)
chicago = qtid_ct_join(df=chic, shp=all_tracts)
new_york = qtid_ct_join(df=nyc, shp=all_tracts)
la = qtid_ct_join(df=la, shp=all_tracts)
sf = qtid_ct_join(df=sf, shp=all_tracts)

In [16]:
#output the qtid level data with ct tags, one csv per city
qtid_level_outputs = [
    (chicago, 'chic_qtid_ct.csv'),
    (new_york, 'ny_qtid_ct.csv'),
    (la, 'la_qtid_ct.csv'),
    (sf, 'sf_qtid_ct.csv'),
]
for tagged_frame, csv_name in qtid_level_outputs:
    tagged_frame.to_csv(csv_name)

In [None]:
#aggregate to census tract level: sum the counts over all quadtree pairs that
#share the same origin tract, destination tract, time period and state codes
tract_keys = ['origin_ct','dest_ct','time_period_x','origin_st','dest_st']
summed_cols = ['OD_counts_x','num_dates_x']

chic = chicago.groupby(tract_keys)[summed_cols].sum()
ny = new_york.groupby(tract_keys)[summed_cols].sum()
la = la.groupby(tract_keys)[summed_cols].sum()
sf = sf.groupby(tract_keys)[summed_cols].sum()


In [None]:
#output the census tract level sums, one csv per city
tract_level_outputs = [
    (chic, "chic_ct.csv"),
    (ny, "ny_ct.csv"),
    (la, "la_ct.csv"),
    (sf, "sf_ct.csv"),
]
for tract_frame, tract_csv in tract_level_outputs:
    tract_frame.to_csv(tract_csv)