In [1]:
# Source: https://github.com/prestinomills/aqueduct/blob/Know_Your_Community_Pipelines/civis/geohub/ActiveBusinessBlockgroupAggregation/Active_Business_Finalized_Script.py
# %load active_business_script.py
"""
Created on Wed May  1 08:51:03 2019
@author: myrfid041
"""
import geopandas as gpd
import os
import pandas as pd

import esri_credentials

from sodapy import Socrata
from arcgis.gis import GIS
from arcgis.features.summarize_data import join_features
from IPython.display import display
from arcgis.features import FeatureLayer
from arcgis.features import FeatureLayerCollection

# Credentials for the hub account come from the environment; a missing
# variable raises KeyError here, before any work is done.
lahub_user = os.environ["LAHUB_ACC_USERNAME"]
lahub_pass = os.environ["LAHUB_ACC_PASSWORD"]


#---Setting the Outputs
# CSV path handed to geohub_updates() as OUTPUT_FILE.
OUTPUT_FILE = "./Listing_of_Active_Businesses.csv"
# Item id handed to geohub_updates() as feature_layer_id.
# NOTE(review): a later cell reassigns output_layer_name before the call.
output_layer_name = '067a9242fbef4afeb1ca0744952e5724'

# NOTE(review): only referenced from the disabled overwrite stage inside
# geohub_updates(); unused by the code that currently runs.
max_record_count = 250_000

#---Pulling Active Business Data
# The live Socrata pull is commented out; a local pickle snapshot is loaded
# instead.
# NOTE(review): unpickling executes arbitrary code -- only load a file that
# was produced by a trusted run of this pipeline.
#client = Socrata("data.lacity.org", None)
#abiz = pd.DataFrame(client.get('ngkp-kqkn', limit=10000000))
abiz = pd.read_pickle("../data/abiz.p")


#---Pull NAIC Industry Table
# URL template; the {} placeholder is filled with "active-business" below
# (presumably the repository branch -- confirm).
n_table=(
    'https://raw.githubusercontent.com/CityofLosAngeles/civis-gcp-transition/{}/'
    'data/naics_industry_table.csv'
)
naics_table=pd.read_csv(n_table.format("active-business"))



def dataprep(df, naics_table):
    """Aggregate business records into per-block-group industry counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Business listing. The columns read here are ``location_1``,
        ``naics`` and ``business_name``.
    naics_table : pandas.DataFrame
        Lookup with one row per ``naics_sector``; it must also supply the
        ``naics_industry`` column that becomes the pivot columns.

    Returns
    -------
    geopandas.GeoDataFrame
        The block-group layer inner-merged on ``GEOID10`` with one count
        column per ``naics_industry`` value. Block groups that contain no
        matched business are therefore absent.

    Raises
    ------
    pandas.errors.MergeError
        If ``naics_sector`` is not unique in ``naics_table``, or ``GEOID10``
        is not unique on either side of the final merge.
    """
    # Rows without a location or a NAICS code cannot be placed or classified.
    df = df.dropna(subset=['location_1', 'naics'])

    # The coordinates are cut out of the *string form* of location_1:
    # characters 34..-2 are expected to read "<longitude>, <latitude>".
    # NOTE(review): the fixed offsets assume one exact serialization of
    # location_1 -- confirm against the feed if parsing starts to fail.
    df = df.assign(location_2 = df.location_1.astype(str).str[34:-2])

    # Split once and reuse both halves.
    coords = df.location_2.str.split(",", expand=True)
    df = df.assign(
        longitude = coords[0].astype(float),
        latitude = coords[1].astype(float),
        naics_sector = df.naics.str[:2].astype(str),
    ).dropna(subset=["longitude", "latitude"])

    # Merge in NAICS sector (many businesses to one sector row)
    df2 = pd.merge(
        df,
        naics_table.assign(
            naics_sector = naics_table.naics_sector.astype(str)
        ),
        how = 'inner', on = 'naics_sector', validate = 'm:1',
    )

    # Create geometry column. The rows and the point array are taken from the
    # same filtered frame so their lengths can never disagree.
    located = df2.dropna(subset=['longitude', 'latitude'])
    gdf = gpd.GeoDataFrame(
        located,
        geometry = gpd.points_from_xy(located.longitude, located.latitude),
        crs = "EPSG:4326",
    ).to_crs("EPSG:2229") # Change to CA State Plane

    # Import block groups
    block_group_file=(
        'https://raw.githubusercontent.com/CityofLosAngeles/civis-gcp-transition/{}/'
        'data/LACounty_Blockgroup.geojson'
    )
    block = gpd.read_file(block_group_file.format("active-business"))

    # A spatial join is only meaningful when both layers share a CRS.
    # Reproject the points (not the polygons) so the returned geometry stays
    # in the block-group layer's own CRS.
    if block.crs is not None and block.crs != gdf.crs:
        gdf = gdf.to_crs(block.crs)

    # Aggregate. 'intersects' is sjoin's default test; the keyword that
    # selects it was renamed from `op` to `predicate` between geopandas
    # releases, so relying on the default works with either spelling.
    joined = gpd.sjoin(gdf, block, how='inner')

    # Left-pad GEOID10 with zeros to 12 characters.
    joined = joined.assign(
        GEOID10 = joined.GEOID10.astype(str).str.rjust(12, "0")
    )

    # One row per block group, one column per industry, cell = business count.
    counts = (joined.pivot_table(index='GEOID10',
                    values='business_name',
                    columns=['naics_industry'],
                    aggfunc=len)
        .reset_index()
        .fillna(0)
        .rename_axis(None, axis="columns")
    )

    # Merge geometry back in, since we lose the block group's polygon
    # geometry when we aggregate.
    # NOTE(review): block.GEOID10 is used as read from the file (not padded
    # or cast here) -- confirm it is already a 12-character string.
    return pd.merge(block, counts,
                    on = "GEOID10", how = "inner", validate = "1:1")


def top10(df):
    '''
    Rank industries by how many block groups they are predominant in and
    return the 10 highest-ranked, leaving out 2 catch-all categories.

    df: one row per GEOID10, holding the block-group attribute columns
        listed in exclude_cols plus one count column per industry.

    Return a list of at most 10 industry column names, most frequent first
    (used to update feature layer item property).
    '''

    # Exclude these cols because we can't use idxmax on them
    exclude_cols = ['CTBG10', 'CT10', 'AreaSqMil', 'LABEL', 'FIP10', 'FIP10RV',
       'CDP_NAME', 'CITYNAME', 'COMMNAME', 'Shape_STAr', 'Shape_STLe',
       'geometry',]

    # For every block group, the name of the column with the largest count
    predominant_industry = (
        df.drop(columns = exclude_cols)
        .set_index("GEOID10")
        .idxmax(axis=1)
    )

    # value_counts sorts descending, so the index is already the ranking
    ranked_industries = predominant_industry.value_counts().index.to_list()

    exclude_me = {'Professional, Scientific, and Technical Services',
                  'Other Services (except Public Administration)'}

    # Filter rather than list.remove(): remove() raises ValueError when an
    # excluded category is not the predominant industry of any block group.
    top10_industries = [
        industry for industry in ranked_industries
        if industry not in exclude_me
    ][:10]

    return top10_industries



'''
ESRI stores the column names slightly differently (subject to 10 char limits)
Use dict to map and rename (key-value pair)
Key: dataframe's existing column name
Value: ESRI column name
'''
# In the values below every space, comma and parenthesis of the key appears
# as an underscore, and the longest names stop at 31 characters
# (e.g. 'Administrative_and_Support_and_').
# NOTE(review): that does not match the "10 char" figure in the note above --
# confirm the real limit of the hosted layer before adding new industries.
layer_rename_columns_dict = {
    'Accommodation and Food Services': 'Accommodation_and_Food_Services',
    'Administrative and Support and Waste Management and Remediation Services': 'Administrative_and_Support_and_',
    'Agriculture, Forestry, Fishing and Hunting': 'Agriculture__Forestry__Fishing_',
    'Arts, Entertainment, and Recreation': 'Arts__Entertainment__and_Recrea',
    'Construction': 'Construction',
    'Educational Services': 'Educational_Services',
    'Finance and Insurance': 'Finance_and_Insurance',
    'Health Care and Social Assistance': 'Health_Care_and_Social_Assistan',
    'Information': 'Information',
    'Manufacturing': 'Manufacturing',
    'Medical Marijuana Collective': 'Medical_Marijuana_Collective',
    'Mining': 'Mining',
    'Not Classified': 'Not_Classified',
    'Other Services (except Public Administration)': 'Other_Services__except_Public_A',
    'Professional, Scientific, and Technical Services': 'Professional__Scientific__and_T',
    'Real Estate Rental and Leasing': 'Real_Estate_Rental_and_Leasing',
    'Retail Trade': 'Retail_Trade',
    'Transportation and Warehousing': 'Transportation_and_Warehousing',
    'Utilities': 'Utilities',
    'Wholesale Trade': 'Wholesale_Trade'                                               
}


# Build the block-group aggregate, then rank the industries from it.
df=dataprep(abiz,naics_table)
top10_industries = top10(df)

# NOTE(review): the output captured below this cell ends in an
# ArrowTypeError, so this parquet write apparently did not succeed in the
# recorded run; the following cells persist the frame with to_pickle instead.
df.to_parquet("./cleaned_df.parquet")


This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.



ArrowTypeError: ('Did not pass numpy.dtype object', 'Conversion failed for column None with type int64')

In [41]:
# Switch to the ESRI column names, then store every industry count as a
# whole number (gaps count as zero) before saving a snapshot.
df = df.rename(columns = layer_rename_columns_dict)
esri_count_columns = layer_rename_columns_dict.values()
for c in esri_count_columns:
    gap_free = df[c].fillna(0)
    df[c] = gap_free.astype(int)

df.to_pickle("./cleaned_df.p")

In [42]:
# Resume from the snapshot written by the previous cell.
df = pd.read_pickle("./cleaned_df.p")

# Replace the environment-based credentials set at the top of the script
# with the ones from the local esri_credentials module.
lahub_user = esri_credentials.tiffany_user
lahub_pass = esri_credentials.tiffany_pw
# NOTE(review): c_shapefile is not defined anywhere in the visible source;
# it presumably came from a notebook cell that is not part of this export --
# define it explicitly before running this top to bottom.
output_layer_name = c_shapefile

print(output_layer_name)

a6b364badfb748aaa6771eb77a48f2ff


In [44]:
import shutil

# Make zipped shapefile (used for AGOL web app)
def make_zipped_shapefile(df, path):
    """Export ``df`` as an ESRI Shapefile and pack its parts into one zip.

    The shapefile's component files are written into a staging folder named
    after ``path`` without its extension, zipped next to that folder, and the
    staging folder is then removed -- also when the export or zipping fails.

    Parameters
    ----------
    df : object with a ``to_file(driver=..., filename=...)`` method
        Typically a GeoDataFrame.
    path : str
        Target name, given either as ``filename.zip`` or ``filename``.

    Returns
    -------
    str
        Path of the archive, as reported by ``shutil.make_archive``.
    """
    # Drop the extension (can input filename.zip or filename)
    dirname = os.path.splitext(path)[0]
    print(f'Path name: {path}')
    print(f'Dirname (1st element of path): {dirname}')
    # Make sure there's no folder with the same name
    shutil.rmtree(dirname, ignore_errors = True)
    # makedirs rather than mkdir, so a target inside a folder that does not
    # exist yet works too
    os.makedirs(dirname)
    shapefile_name = f'{os.path.basename(dirname)}.shp'
    print(f'Shapefile name: {shapefile_name}')
    try:
        # Export shapefile into its own folder with the same name
        df.to_file(driver = 'ESRI Shapefile', filename = f'{dirname}/{shapefile_name}')
        print(f'Shapefile component parts folder: {dirname}/{shapefile_name}')
        # Zip it up
        archive_path = shutil.make_archive(dirname, 'zip', dirname)
    finally:
        # Remove the unzipped folder, even if the export above raised
        shutil.rmtree(dirname, ignore_errors = True)
    return archive_path
    
    
    
# Reproject the geometry to EPSG:4326 for the upload copy, then write
# ./upload_me.zip.
make_zipped_shapefile(
    gpd.GeoDataFrame(df, geometry=df.geometry.to_crs("EPSG:4326")), 
    "./upload_me.zip")

Path name: ./upload_me.zip
Dirname (1st element of path): ./upload_me
Shapefile name: upload_me.shp


  app.launch_new_instance()


Shapefile component parts folder: ./upload_me/upload_me.shp


In [None]:
def geohub_updates(x, user, pas, feature_layer_id,
        top10_industries, column_renaming_dict, OUTPUT_FILE):
    """Join fresh block-group counts onto the features of a hosted layer.

    Parameters
    ----------
    x : pandas.DataFrame
        Block-group table with a ``GEOID10`` column, the industry count
        columns and ``Shape_STAr`` / ``Shape_STLe`` (which are dropped).
    user, pas : str
        Credentials for https://lahub.maps.arcgis.com.
    feature_layer_id : str
        Item id of the hosted feature layer.
    top10_industries : list
        Only used by the disabled overwrite stage (see the end of the body).
    column_renaming_dict : dict
        Maps the column names of ``x`` to the layer's column names; its
        values are also the columns cast to int.
    OUTPUT_FILE : str
        Only used by the disabled overwrite stage.

    Returns
    -------
    pandas.DataFrame
        ``ObjectId``, ``GEOID10`` and ``SHAPE`` of the hosted layer,
        inner-merged on ``GEOID10`` with the renamed columns of ``x``.

    Raises
    ------
    ValueError
        If no item can be retrieved for ``feature_layer_id``.
    """
    gis = GIS('https://lahub.maps.arcgis.com',  username=user, password=pas)

    # Earlier, search-based lookup kept for reference:
    # #feature_layer_id = '067a9242fbef4afeb1ca0744952e5724' # Preston's layer
    # actbus=gis.content.search(feature_layer_id)
    # ActiveBusinesses_item = actbus[0]
    # ActiveBusinesses_flayer = ActiveBusinesses_item.layers[0]
    # ActiveBusinesses_fset = ActiveBusinesses_flayer.query() #querying without any conditions returns all the features
    # Possibly not all the block groups are mapped
    # Select the block groups that are existing in the map (inner merge), and save those to CSV

    item = gis.content.get(feature_layer_id)
    if item is None:
        # Fail with a clear message instead of an AttributeError further down.
        raise ValueError(
            f"No item could be retrieved for feature_layer_id={feature_layer_id!r}"
        )
    # Querying without any conditions returns all the features
    feature_set = item.layers[0].query()
    # Not used by the active code path; retained because the disabled
    # overwrite stage at the end of this function needs it.
    flayer_collection = FeatureLayerCollection.fromitem(item)

    existing_table = feature_set.sdf
    display(existing_table.head(10))
    print(existing_table.columns)
    print(existing_table.dtypes)

    # Keep only the identifying columns of the hosted layer. Any "A" is
    # removed from its GEOID10 values before matching.
    # NOTE(review): why the hosted ids carry an "A" is not visible here.
    #existing_table[["OBJECTID", "GEOID10", "SHAPE"]],
    hosted_keys = existing_table[["ObjectId", "GEOID10", "SHAPE"]].assign(
        GEOID10 = existing_table.GEOID10.str.replace("A", "")
    )

    # Rename columns to match, drop columns not in layer
    # ESRI renamed the geometry to be SHAPE...and it's stored slightly different than in geodataframe
    # Uses the `x` / `column_renaming_dict` arguments rather than the
    # module-level globals, so the function works on whatever it is given.
    incoming = (x.rename(columns = column_renaming_dict)
        .drop(columns = ['Shape_STAr','Shape_STLe'])
    )

    updated_table = pd.merge(hosted_keys, incoming, on = "GEOID10", how = "inner")

    # Integers, not floats
    integrify_me = list(column_renaming_dict.values())

    for c in integrify_me:
        updated_table[c] = updated_table[c].astype(int)

    #ActiveBusinesses_flayer.sdf = updated_table
    #print("updated table")
    return updated_table

    # TODO(review): the overwrite stage below was disabled by the original
    # author and never runs; it is kept as comments for reference. Note that
    # it removes ./upload_me.geojson although the line writing that file is
    # itself commented out.
    #
    # # Stage 2 files, one to check into GitHub (no geometry), one to use to back dashboard
    # #gpd.GeoDataFrame(updated_table).to_file(driver = "GeoJSON", filename = "./upload_me.geojson")
    # updated_table.to_csv('./upload_me.csv', index=False)
    # updated_table[["GEOID10"] + integrify_me].to_csv(OUTPUT_FILE, index=False)
    #
    # # Overwrite table
    # flayer_collection.manager.overwrite("./upload_me.csv")
    # flayer_collection.manager.update_definition({"maxRecordCount": max_record_count})
    #
    # os.remove("./upload_me.geojson")
    #
    # text = """
    # This layer is aggregating
    # <a href="https://data.lacity.org/A-Prosperous-City/Listing-of-Active-Businesses/6rrh-rzua">
    # Listing of Active Businesses Data</a>
    # that have geospatial information associated.
    # The top 10 most frequent industries in block groups are:
    # {}
    # """
    # x = ', '.join([str(elem) for elem in top10_industries])
    # item_props = {'title' : 'Active Businesses Data by Block Group', 'description':text.format(x)}
    # # Use flayer.update or flayer_collection.update?
    # flayer_collection.update(item_properties=item_props)
    # print("updates made!")

In [None]:
# Run the join against the hosted layer and keep the merged table for
# inspection; geohub_updates() currently returns before any upload happens.
test = geohub_updates(df, lahub_user, lahub_pass, output_layer_name,  top10_industries,
        layer_rename_columns_dict, OUTPUT_FILE)