In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import geopandas as gpd
from shapely.geometry import Point, Polygon

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
import zipfile
import requests
import os
import shutil
%matplotlib inline

In [52]:
#Load the BBL list
BBL12_17CSV = ['https://opendata.arcgis.com/datasets/82ab09c9541b4eb8ba4b537e131998ce_22.csv', 'https://opendata.arcgis.com/datasets/4c4d6b4defdf4561b737a594b6f2b0dd_23.csv',   'https://opendata.arcgis.com/datasets/d7aa6d3a3fdc42c4b354b9e90da443b7_1.csv',     'https://opendata.arcgis.com/datasets/a8434614d90e416b80fbdfe2cb2901d8_2.csv', 'https://opendata.arcgis.com/datasets/714d5f8b06914b8596b34b181439e702_36.csv',     'https://opendata.arcgis.com/datasets/c4368a66ce65455595a211d530facc54_3.csv',]

In [76]:
def data_pipeline(shapetype, bbl_links, supplement=None,
                 dex=None, ts_lst_range=None):
    #A pipeline for group_e dataframe operations
    
    #Test inputs
    if supplement:
        assert isinstance(supplement, dict)
    assert isinstance(bbl_links, list)
    if ts_lst_range:
        assert isinstance(ts_lst_range, list)
        assert len(ts_lst_range) == 2 #Must be list of format [start-yr, end-yr]
        

    
    #We'll need our addresspoints and our shapefile
    #for the time_unit_of_analysis
    if not dex:
        dex = addr_shape(shapetype)
    
    #We need a list of time_unit_of_analysis
    if ts_lst_range:
        ts_lst = [x+(i/100) for i in range(1,13,1) for x in range(1980, 2025)]
        ts_lst = [x for x in ts_lst if 
                  x >= ts_lst_range[0] and x <= ts_lst_range[1]]
        ts_lst = sorted(ts_lst)
    if not ts_lst_range:
        ts_lst = [x+(i/100) for i in range(1,13,1) for x in range(2012, 2017)]
        ts_lst = sorted(ts_lst)
    
    #Now we need to stack our BBL data
    
    #Begin by forming an empty DF
    bbl_df = pd.DataFrame()
    for i in bbl_links:
        bbl = pd.read_csv(i, encoding='utf-8', low_memory=False)
        col_len = len(bbl.columns)
        bbl_df = bbl_df.append(bbl)
        if len(bbl.columns) != col_len:
            print('Column Mismatch!')
        del bbl
    bbl_df.LICENSE_START_DATE = pd.to_datetime(bbl_df.LICENSE_START_DATE)

    bbl_df.sort_values('LICENSE_START_DATE')

    bbl_df['month'] = 0
    
    bbl_df['month'] = bbl_df['LICENSE_START_DATE'].dt.year + (
        bbl_df['LICENSE_START_DATE'].dt.month/100
    )
    
    bbl_df = bbl_df.dropna(subset=['month'])
    bbl_df = bbl_df.set_index(['MARADDRESSREPOSITORYID','month'])
    bbl_df = bbl_df.sort_index(ascending=True)
    bbl_df.reset_index(inplace=True)
        

dex = addr_shape('anc')
data_pipeline('anc', BBL12_17CSV, supplement=None, dex=dex, ts_lst_range=None)


Unnamed: 0,X,Y,OBJECTID,BBL_LICENSE_FACT_ID,LICENSESTATUS,LICENSECATEGORY,CUST_NUM,TRADE_NAME,LICENSE_START_DATE,LICENSE_EXPIRATION_DATE,...,WARD,ANC,SMD,DISTRICT,PSA,NEIGHBORHOODCLUSTER,HOTSPOT2006NAME,HOTSPOT2005NAME,HOTSPOT2004NAME,BUSINESSIMPROVEMENTDISTRICT
0,-77.036912,38.912172,27389990,313275,ACTIVE,Charitable Solicitation,400212000232,NATIONAL COALITION FOR ASIAN PACIFIC AMERICAN ...,2016-05-01T00:00:00.000Z,2018-04-30T00:00:00.000Z,...,2,2B,2B04,THIRD,301,Cluster 6,,,,
1,-77.030596,38.897613,27389991,313276,ACTIVE,Charitable Solicitation,400212000234,FIRST BOOK,2016-05-01T00:00:00.000Z,2018-04-30T00:00:00.000Z,...,2,2C,2C01,FIRST,101,Cluster 8,,,,Downtown BID
2,-77.040352,38.903515,27389993,313279,ACTIVE,Charitable Solicitation,400212000239,,2016-05-01T00:00:00.000Z,2018-04-30T00:00:00.000Z,...,2,2B,2B06,SECOND,207,Cluster 6,,,,Golden Triangle BID
3,-77.045249,38.912062,27389994,313280,ACTIVE,Charitable Solicitation,400212000240,INTERNATIONAL UNION FOR CONSERVATION OF NATURE...,2016-05-01T00:00:00.000Z,2018-04-30T00:00:00.000Z,...,2,2B,2B02,SECOND,208,Cluster 6,,,,
4,-77.042468,38.904010,27389995,313281,ACTIVE,Charitable Solicitation,400212000245,JOHN MANJIRO-WHITFIELD COMMEMORATIVE CENTER FO...,2016-05-01T00:00:00.000Z,2018-04-30T00:00:00.000Z,...,2,2B,2B06,SECOND,207,Cluster 6,,,,Golden Triangle BID
5,-77.032813,38.902150,27389996,313286,ACTIVE,Charitable Solicitation,400212000255,CULTURAL VISTAS INC.,2016-06-01T00:00:00.000Z,2018-05-31T00:00:00.000Z,...,2,2F,2F05,SECOND,207,Cluster 8,,,,Downtown BID
6,-76.999957,38.825099,27389997,313288,ACTIVE,Charitable Solicitation,400212000258,STR8-N-UP PRODUCTIONS INC,2016-06-01T00:00:00.000Z,2018-05-31T00:00:00.000Z,...,8,8D,8D02,SEVENTH,708,Cluster 39,,,,
7,-76.986598,38.880354,27389998,313289,ACTIVE,Charitable Solicitation,400212000260,WASHINGTON FEDERAL TRIANGLES SOCCER CLUB,2016-03-01T00:00:00.000Z,2018-02-28T00:00:00.000Z,...,6,6B,6B07,FIRST,106,Cluster 26,,,,Capitol Hill BID
8,-77.108831,38.936167,27389999,313290,ACTIVE,Charitable Solicitation,400212000263,SIBLEY MEMORIAL HOSPITAL FOUNDATION,2016-04-01T00:00:00.000Z,2018-03-31T00:00:00.000Z,...,3,3D,3D04,SECOND,205,Cluster 13,,,,
9,-77.040839,38.915374,27390000,313295,ACTIVE,Charitable Solicitation,400212000277,ROCK CREEK RIDERS,2016-06-01T00:00:00.000Z,2018-05-31T00:00:00.000Z,...,2,2B,2B08,THIRD,301,Cluster 6,,,,


In [62]:
def addr_shape(shapetype):
    
    #Process user-defined shapefile
    assert shapetype == 'census' or 'ward' or 'anc' #Currently only supports these
    
    crs='EPSG:4326' #Convenience assignment of crs throughout
    
    if shapetype == 'census':
        shp_fl = down_extract_zip(
            'https://opendata.arcgis.com/datasets/6969dd63c5cb4d6aa32f15effb8311f3_8.zip'
        ) #Download the zip file and extract it, then assign the shapefile path
        shp_census = gpd.read_file(shp_fl, crs=crs)
        shp_df     = gpd.GeoDataFrame(shp_census,
                                  crs=crs,
                                  geometry=shp_census['geometry']
                                 )

    
    if shapetype == 'ward':
        shp_fl = down_extract_zip(
            'https://opendata.arcgis.com/datasets/0ef47379cbae44e88267c01eaec2ff6e_31.zip'
        ) #Download the zip file and extract it, then assign the shapefile path
        shp_ward   = gpd.read_file(shp_fl, crs=crs)
        shp_df     = gpd.GeoDataFrame(shp_ward,
                                  crs=crs,
                                  geometry=shp_ward['geometry']
                                 )

    if shapetype == 'anc':
        shp_fl = down_extract_zip(
            'https://opendata.arcgis.com/datasets/fcfbf29074e549d8aff9b9c708179291_1.zip'
        ) #Download the zip file and extract it, then assign the shapefile path
        shp_anc    = gpd.read_file(shp_fl, crs=crs)
        shp_df     = gpd.GeoDataFrame(shp_anc,
                                  crs=crs,
                                  geometry=shp_anc['geometry']
                                 )
        
    
    adr_df   = pd.read_csv('https://opendata.arcgis.com/datasets/aa514416aaf74fdc94748f1e56e7cc8a_0.csv',
                     encoding = 'utf-8', low_memory= False)

    return [adr_df, shp_df]

In [61]:
def down_extract_zip(url):
    #Downloads and unzips a DCopendata shape file
    #and returns the filepath of the shape file
    
    #Usage: must have file named 'data' in cwd.
    #Then: flname = down_extract_zip(url_of_zipfile)
    #flname is now the path of the shpfile.
    
    local_filename = str('./data/' + url.split('/')[-1])
        
    r = requests.get(url)
    assert r.status_code == 200 #Check connection
    with open(local_filename, 'wb') as f:
        f.write(r.content)
    zip_fld = local_filename[:-4]
    if not os.path.exists(zip_fld):
        os.makedirs(zip_fld)
        with zipfile.ZipFile(local_filename, "r") as zip_pl:
            zip_pl.extractall(zip_fld)
    fld_arr = os.listdir(zip_fld) #get array of unzipped files
    for unzippedfile in fld_arr:  #find shapefile in unzipped files
        if str(unzippedfile[-3:]) == 'shp':
            flname = zip_fld + '/' + unzippedfile
    if not flname: 
        print('Shapefile not found!')
    return flname

In [51]:
def joinOurShapes(dex=[adr_df, shp_df], toJoin=None, transf=None):
    assert isinstance(toJoin, pd.DataFrame) #Must join a dataframe!
    
       
    

['Advisory_Neighborhood_Commissions_from_2013.cpg', 'Advisory_Neighborhood_Commissions_from_2013.dbf', 'Advisory_Neighborhood_Commissions_from_2013.shp', 'Advisory_Neighborhood_Commissions_from_2013.prj', 'Advisory_Neighborhood_Commissions_from_2013.shx', 'Advisory_Neighborhood_Commissions_from_2013.xml']
cpg
dbf
shp
prj
shx
xml


[                X          Y  OBJECTID_12  SITE_ADDRESS_PK  ADDRESS_ID  \
 0      -77.007681  38.967165       833017            40832       40832   
 1      -77.009064  38.833812       833018            29593       29593   
 2      -77.009933  38.834297       833019            29670       29670   
 3      -77.009680  38.829462       833020            56085       56085   
 4      -77.009620  38.829463       833021            60469       60469   
 5      -77.008261  38.828031       833022              458         458   
 6      -77.009033  38.827188       833023             5028        5028   
 7      -77.008894  38.826822       833024             6458        6458   
 8      -77.012338  38.826667       833025            23227       23227   
 9      -77.012310  38.827995       833026            25310       25310   
 10     -77.012309  38.826827       833027            54796       54796   
 11     -77.012255  38.826823       833028            54837       54837   
 12     -77.011575  38.82

In [5]:
teststr = 'jdwqadl;w'
teststr[:-3]

'jdwqad'