In [3]:
cd ..

/Users/kylemagida/git/blobs


In [4]:
# Import required libraries and assign root directory of blobs to root
import blobs
import numpy as np
import pandas as pd
import pysal as ps
import os
from shapely import geometry
import geopandas as gpd #Build most recent version from Git, don't use pip install, libspatialindex and Rtree are dependencies as well
from geopandas.tools import sjoin

root = os.getcwd()
%pylab inline
SHAPE_FILE = '/Users/kylemagida/Downloads/tl_2015_33_tract/tl_2015_33_tract.shp' #root + '/tracts/CensusTractsTIGER2010.shp'
DBF_FILE = '/Users/kylemagida/Downloads/tl_2015_33_tract/tl_2015_33_tract.dbf'
OUTPUT_FILE = root + '/test_data.csv'

Your version of PySAL is 188 days old.
There have likely been 1 new release(s).
Disable this check? [Y/n]n
Populating the interactive namespace from numpy and matplotlib


In [5]:
#Load shapefiles into geopandas and convert to the correct projection

POINT_PROJECTION = 4326 #Projection system of points, 4326 is traditional lat/long
GEOMETRY = 'geometry' # Location of geometry in the shapefile database
GEO_ID = 'TRACTCE' # ID in the shapefile to be aggregated on, can be changed to blocks or tracts
# Fallback CRS, applied only when the shapefile itself carries none.
# NOTE(review): these parameters describe a transverse-Mercator projection in US feet, not
# geographic lat/long - confirm they match the shapefile being loaded before relying on them.
DEFAULT_CRS = {'datum': 'NAD83','k': 0.999975, 'lat_0': 36.66666666666666,'lon_0': -88.33333333333333,
 'no_defs': True,'proj': u'tmerc',u'units': u'us-ft',u'x_0': 300000, u'y_0': 0} # Default CRS assuming census tiger tracts with NAD83 projections

shape_df = gpd.read_file(SHAPE_FILE)
# A missing CRS can show up as an empty dict or as None; test truthiness so both cases
# get the fallback instead of failing later in to_crs().
if not shape_df.crs:
    shape_df = gpd.GeoDataFrame(shape_df,crs=DEFAULT_CRS)
# Reproject the shape geometries into the coordinate system used by the points.
shape_df[GEOMETRY] = shape_df[GEOMETRY].to_crs(epsg=POINT_PROJECTION)
shape_df = gpd.GeoDataFrame(shape_df,crs=None) #Remove crs to complete join

In [21]:
# Preview the points frame rather than dumping every row.
# Requires the file-processing cell (which assigns point_df) to have been run first.
point_df.head(10)

Unnamed: 0,cmte_id,lat,long,"Bush, Jeb","Carson, Benjamin S.","Christie, Christopher J.","Clinton, Hillary Rodham","Cruz, Rafael Edward 'Ted'","Fiorina, Carly","Graham, Lindsey O.",...,"Pataki, George E.","Paul, Rand","Perry, James R. (Rick)","Rubio, Marco","Sanders, Bernard","Santorum, Richard J.","Trump, Donald J.","Walker, Scott","Webb, James Henry Jr.",geometry
0,,,,,,,,,,,...,,,,,,,,,,POINT (nan nan)
1,C00577130,44.336411,-71.785029,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,POINT (-71.78502940000001 44.3364107)
2,,,,,,,,,,,...,,,,,,,,,,POINT (nan nan)
3,C00577130,42.982344,-70.823887,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,POINT (-70.8238874 42.9823441)
4,,,,,,,,,,,...,,,,,,,,,,POINT (nan nan)
5,C00458844,43.008856,-71.447747,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,POINT (-71.4477469 43.0088559)
6,,,,,,,,,,,...,,,,,,,,,,POINT (nan nan)
7,C00577130,42.992747,-71.833415,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,POINT (-71.8334145 42.9927468)
8,,,,,,,,,,,...,,,,,,,,,,POINT (nan nan)
9,C00577130,43.726034,-72.142917,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,POINT (-72.1429172 43.7260338)


In [22]:
# Run through each file in filelist and build a final dataframe with points from all the specified files, each column with the given header

# This filelist can be built more dynamically if we want to rely on people pulling from Plenario
file_list = [{'location':'/Users/kylemagida/Downloads/new_hampshire_dummies_lat_long.csv','lat':'lat','long':'long',
             'unique_id':'cmte_id'}]

init_calls = pd.DataFrame()

for file_dict in file_list:
    unique_id = file_dict['unique_id']
    point_df = build_new_points(file_dict)
    # Every column that is not the geometry, the record id or a coordinate is a value column to total per GEO_ID.
    # NOTE(review): this also keeps a leftover CSV index column such as 'Unnamed: 0' - confirm that is intended.
    non_data_cols = {GEOMETRY, unique_id, file_dict['lat'], file_dict['long']}
    data_cols = [column for column in point_df.columns if column not in non_data_cols]
    final_df = points_in_shapes(point_df,shape_df,file_dict)

    # Select the value columns BEFORE summing so the aggregation never touches the
    # geometry, id strings or the shapefile attributes brought in by the join.
    grouped = final_df.groupby(by=GEO_ID)[data_cols].sum()
    init_calls = init_calls.join(grouped,how='outer')

# Cast every aggregated column to float in one step.
init_calls = init_calls.astype('float')
# GEO_ID is the index here, not a column, so its exported name is set through index_label.
init_calls.to_csv(OUTPUT_FILE,index_label='tractce10')

In [15]:
# Functions to use to log the associated tracts of a given set of points
def points_in_shapes(point_df,shape_df,file_dict):
    '''
    Joins points to shapes and returns the joined dataframe.

    Points are first filtered with point_out_of_bounds, then left-joined to
    shape_df with sjoin.
    '''
    in_bounds_df = point_out_of_bounds(point_df, shape_df, file_dict)
    return sjoin(in_bounds_df, shape_df, how="left")

def point_out_of_bounds(point_df, shape_df,file_dict):
    '''
    Keeps only the points lying strictly inside the bounding box of shape_df.

    Rows whose latitude or longitude is missing fail every comparison and are
    therefore dropped as well.
    '''
    west, south, east, north = shape_df.total_bounds
    latitudes = point_df[file_dict['lat']]
    longitudes = point_df[file_dict['long']]
    lat_in_range = (latitudes > south) & (latitudes < north)
    long_in_range = (longitudes > west) & (longitudes < east)
    return point_df[lat_in_range & long_in_range]
   
def build_new_points(file_dict):
    '''
    Creates dataframe from file to join to shapefile

    file_dict: dict giving the CSV path under 'location' and the names of the
               coordinate columns under 'long' and 'lat'.
    Returns the CSV contents as a GeoDataFrame with one Point(long, lat) per
    row stored in the GEOMETRY column.
    '''
    point_df = gpd.GeoDataFrame(pd.read_csv(file_dict['location']))
    # Read the two coordinate columns directly; iterrows() would build a full
    # Series for every row just to pull out these two values.
    longitudes = point_df[file_dict['long']]
    latitudes = point_df[file_dict['lat']]
    point_df[GEOMETRY] = [geometry.Point(x, y) for x, y in zip(longitudes, latitudes)]
    return point_df