In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import geopandas as gpd
from shapely.geometry import Point, Polygon

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
import zipfile
import requests
import os
import shutil
%matplotlib inline

In [6]:
#CSV download links for the BBL extracts; data_pipeline reads and stacks
#them in the order listed here.
BBL12_17CSV = [
    'https://opendata.arcgis.com/datasets/82ab09c9541b4eb8ba4b537e131998ce_22.csv',
    'https://opendata.arcgis.com/datasets/4c4d6b4defdf4561b737a594b6f2b0dd_23.csv',
    'https://opendata.arcgis.com/datasets/d7aa6d3a3fdc42c4b354b9e90da443b7_1.csv',
    'https://opendata.arcgis.com/datasets/a8434614d90e416b80fbdfe2cb2901d8_2.csv',
    'https://opendata.arcgis.com/datasets/714d5f8b06914b8596b34b181439e702_36.csv',
    'https://opendata.arcgis.com/datasets/c4368a66ce65455595a211d530facc54_3.csv',
]

In [7]:
def addr_shape(shapetype):
    """Fetch the address-point table and the boundary shapes for one geography.

    Parameters
    ----------
    shapetype : str
        Which boundary layer to load: 'census', 'ward' or 'anc'.

    Returns
    -------
    list
        ``[adr_df, shp_df]`` where ``adr_df`` is the address CSV read with
        pandas and ``shp_df`` is a GeoDataFrame built from the downloaded
        shapefile.

    Raises
    ------
    ValueError
        If ``shapetype`` is not one of the supported names. The check runs
        before anything is downloaded.
    """
    #Zip archive holding the shapefile of each supported geography
    shape_urls = {
        'census': 'https://opendata.arcgis.com/datasets/6969dd63c5cb4d6aa32f15effb8311f3_8.zip',
        'ward': 'https://opendata.arcgis.com/datasets/0ef47379cbae44e88267c01eaec2ff6e_31.zip',
        'anc': 'https://opendata.arcgis.com/datasets/fcfbf29074e549d8aff9b9c708179291_1.zip',
    }

    #The old check (shapetype == 'census' or 'ward' or 'anc') was always
    #truthy because the bare strings are truthy, so nothing was validated.
    if shapetype not in shape_urls:
        raise ValueError(
            'Unsupported shapetype %r; expected one of %s'
            % (shapetype, sorted(shape_urls))
        )

    crs='EPSG:4326' #Convenience assignment of crs throughout

    #Download the zip file and extract it, then read the shapefile it contains
    shp_fl  = down_extract_zip(shape_urls[shapetype])
    shp_raw = gpd.read_file(shp_fl, crs=crs)
    shp_df  = gpd.GeoDataFrame(shp_raw,
                               crs=crs,
                               geometry=shp_raw['geometry']
                              )

    adr_df   = pd.read_csv('https://opendata.arcgis.com/datasets/aa514416aaf74fdc94748f1e56e7cc8a_0.csv',
                     encoding = 'utf-8', low_memory= False)

    return [adr_df, shp_df]

In [8]:
def down_extract_zip(url):
    """Download a zipped shapefile, unzip it and return the .shp path.

    The archive is saved as ``./data/<last url segment>`` and extracted into a
    sibling folder named after the archive without its 4-character suffix.
    Extraction is skipped when that folder already exists; the download itself
    always runs.

    Usage: flname = down_extract_zip(url_of_zipfile)
    flname is now the path of the shpfile.

    Parameters
    ----------
    url : str
        Address of the zip archive.

    Returns
    -------
    str or None
        Path of the extracted ``.shp`` file, or None (after printing
        'Shapefile not found!') when the archive contains none. If several
        are present, the last one returned by ``os.listdir`` wins.

    Raises
    ------
    AssertionError
        If the HTTP status code is not 200.
    """
    data_dir = './data'
    #Create the download folder on first use instead of failing in open()
    os.makedirs(data_dir, exist_ok=True)

    local_filename = str(data_dir + '/' + url.split('/')[-1])

    r = requests.get(url)
    assert r.status_code == 200 #Check connection
    with open(local_filename, 'wb') as f:
        f.write(r.content)

    zip_fld = local_filename[:-4] #Drop the '.zip' suffix to name the folder
    if not os.path.exists(zip_fld):
        os.makedirs(zip_fld)
        with zipfile.ZipFile(local_filename, "r") as zip_pl:
            zip_pl.extractall(zip_fld)

    #Start from None so a missing shapefile reaches the message below rather
    #than raising UnboundLocalError.
    flname = None
    for unzippedfile in os.listdir(zip_fld):  #find shapefile in unzipped files
        if unzippedfile.endswith('.shp'):
            flname = zip_fld + '/' + unzippedfile
    if not flname:
        print('Shapefile not found!')
    return flname

In [10]:
def joinOurShapes(dex=None, toJoin=None, transf=None):
    """Stub for the shape join: so far it only checks the type of ``toJoin``.

    ``dex`` and ``transf`` are accepted but not used yet. Returns None.
    Raises AssertionError when ``toJoin`` is not a pandas DataFrame.
    """
    #Must join a dataframe!
    is_frame = isinstance(toJoin, pd.DataFrame)
    assert is_frame
    
       
    

In [14]:
def _month_codes(ts_lst_range=None):
    """Return the sorted list of year.month codes (year + month/100)."""
    if ts_lst_range:
        low, high = ts_lst_range
        codes = [yr + (mo / 100) for yr in range(1980, 2025) for mo in range(1, 13)]
        #NOTE(review): the upper bound is compared with the code itself, so the
        #months of the end year (end + 0.01 ...) fall outside it -- confirm
        #whether end-yr is meant to be inclusive.
        codes = [code for code in codes if low <= code <= high]
    else:
        codes = [yr + (mo / 100) for yr in range(2012, 2017) for mo in range(1, 13)]
    return sorted(codes)


def _load_bbl(bbl_links):
    """Read every BBL CSV and stack them into one frame with a fresh index."""
    if not bbl_links:
        raise ValueError('bbl_links is empty; nothing to load')
    frames = []
    expected_cols = None
    for link in bbl_links:
        frame = pd.read_csv(link, encoding='utf-8', low_memory=False)
        if expected_cols is None:
            expected_cols = set(frame.columns)
        elif set(frame.columns) != expected_cols:
            #Compare against the first file; the old check compared a frame's
            #column count with itself and could never trigger.
            print('Column Mismatch!')
        frames.append(frame)
    #One concat instead of repeated DataFrame.append (quadratic, and removed
    #in pandas 2.x).
    return pd.concat(frames, ignore_index=True)


def data_pipeline(shapetype, bbl_links, supplement=None,
                 dex=None, ts_lst_range=None):
    """A pipeline for group_e dataframe operations.

    Stacks the BBL CSVs, derives a year.month code per licence, keeps only
    ACTIVE licences and attaches point geometries to the address table.

    Parameters
    ----------
    shapetype : str
        Passed to ``addr_shape`` when ``dex`` is not supplied.
    bbl_links : list
        Paths/URLs (or file-like objects) readable by ``pandas.read_csv``.
        The stacked data must contain the columns LICENSE_START_DATE,
        MARADDRESSREPOSITORYID and LICENSESTATUS.
    supplement : dict, optional
        Only type-checked for now; not used further.
    dex : list, optional
        ``[address_df, shape_df]`` as returned by ``addr_shape``. The address
        frame needs LONGITUDE and LATITUDE columns.
    ts_lst_range : list, optional
        ``[start-yr, end-yr]`` bounds for the month-code list.

    Returns
    -------
    None
        The function prints the BBL frame shape before and after the ACTIVE
        filter and adds a 'geometry' column to ``dex[0]`` in place.

    Raises
    ------
    AssertionError
        If an argument has the wrong type or ``ts_lst_range`` is not of
        length 2.
    ValueError
        If ``bbl_links`` is empty.
    """
    #Test inputs
    if supplement:
        assert isinstance(supplement, dict)
    assert isinstance(bbl_links, list)
    if ts_lst_range:
        assert isinstance(ts_lst_range, list)
        assert len(ts_lst_range) == 2 #Must be list of format [start-yr, end-yr]

    #We'll need our addresspoints and our shapefile
    #for the time_unit_of_analysis
    if not dex:
        dex = addr_shape(shapetype)

    #We need a list of time_unit_of_analysis
    #(built here but not consumed yet by the steps below)
    ts_lst = _month_codes(ts_lst_range)

    #Now we need to stack our BBL data
    bbl_df = _load_bbl(bbl_links)
    bbl_df['LICENSE_START_DATE'] = pd.to_datetime(bbl_df['LICENSE_START_DATE'])

    #Month code: year + month/100, e.g. March 2012 -> 2012.03
    start = bbl_df['LICENSE_START_DATE']
    bbl_df['month'] = start.dt.year + (start.dt.month / 100)

    #Rows without a start date have no month code; drop them, then order the
    #frame by address id and month.
    bbl_df = bbl_df.dropna(subset=['month'])
    bbl_df = (
        bbl_df
        .set_index(['MARADDRESSREPOSITORYID', 'month'])
        .sort_index(ascending=True)
        .reset_index()
    )
    print(bbl_df.shape)
    bbl_df = bbl_df[bbl_df['LICENSESTATUS'] == 'ACTIVE']
    #subset= is required here: the first positional argument of dropna is the
    #axis, not a column name.
    bbl_df = bbl_df.dropna(subset=['LICENSESTATUS'])
    print(bbl_df.shape)

    #Now that we have the BBL data, let's create our flag and points data.
    #This writes the new column into the caller's address frame (dex[0]).
    addr_df = dex[0]
    addr_df['geometry'] = [
        Point(xy)
        for xy in zip(addr_df.LONGITUDE.astype(float), addr_df.LATITUDE.astype(float))
    ]
    
    
        

#Fetch the [address points, ANC shapes] pair once and hand it to the pipeline,
#which only calls addr_shape itself when no dex is passed.
dex = addr_shape('anc')
data_pipeline('anc', BBL12_17CSV, dex=dex)


(80250, 35)
(43557, 35)


In [None]:
#NOTE(review): unexecuted scratch cell (its prompt is "In [None]"). Neither
#`df` nor `dffunctuionschei` is defined anywhere in the visible notebook, so
#this line raises NameError on a fresh run. The mask name looks like a typo --
#TODO confirm the intended boolean mask or delete the cell. If it is kept,
#df.loc[mask, 'flag'] = 1 avoids the chained assignment used here.
df['flag'][dffunctuionschei] = 1

In [5]:
#Scratch check of negative slicing: [:-3] drops the last three characters.
teststr = 'jdwqadl;w'
teststr[:-3]

'jdwqad'