In [121]:
import json
from dotenv import load_dotenv
load_dotenv()  # take environment variables from .env.

True

In [2]:
! printenv | grep ZGNY_GEOCODER_LOG_LEVEL

ZGNY_GEOCODER_LOG_LEVEL=DEBUG


In [3]:
# ! pip install zgny_geocoder_client==0.2.38

In [4]:
from zgny_geocoder_client import __version__
__version__

'0.1.0'

In [39]:
import pandas as pd
import os

from zgny_geocoder_client.georef import GeorefGeocoder
from zgny_geocoder_client.utils import dicts_to_jsonl

## Load Data

In [148]:
trxns = pd.read_parquet('./data/trxn_2024_03_08.parquet').set_index('id')

In [149]:
# trxns_georef.loc[162]

state                 NY
city            New York
zipcode            10000
num                  214
stname     N 11th Street
boro                <NA>
Name: 162, dtype: object

In [49]:
trxns = trxns.replace({'NO DATA PROVIDED':pd.np.nan})
trxns['state'] = trxns['state'].replace('New York', 'NY').str.upper()
trxns['sale_id'] = trxns['sale_id'].astype(pd.Int64Dtype())

  trxns = trxns.replace({'NO DATA PROVIDED':pd.np.nan})


In [43]:
mask = trxns[['addr_line' ]].notnull().all(1)

In [44]:
(~mask).sum()

3

In [45]:
trxns = trxns[mask]

In [46]:
def split_address(addr: pd.Series) -> pd.DataFrame:
    """
    Split an address into its components.
    """
    ptrrn = r"^(?P<num>\S+)\s(?P<stname>.+)$"
    return addr.str.extract(ptrrn, expand=True)

In [47]:
trxns[['num', 'stname']] = split_address(trxns['addr_line'])

In [79]:
boro_mapping = {
    'manhattan': 1,
    'brooklyn': 3,
    'bk': 3,
    'bronx': 2,
    'forest hills': 4,
    'astoria': 4,
    'woodside': 4,
    'jackson heights': 4,
    'queens': 4,
    'sunnyside': 4,
    'rego park': 4,
    'flushing': 4,
    'ridgewood': 4,
    'elmhurst': 4,
    'jamaica': 4,
    'williamsburg': 3,
    'long island city': 4
}

In [84]:
trxns['boro'] = trxns['city'].str.lower().map(boro_mapping).astype(pd.Int64Dtype())

In [60]:
# trxns['city'].value_counts().head(30)

In [85]:
trxns

Unnamed: 0_level_0,state,city,zip,secondary,addr_line,sale_id,num,stname,boro
trxn_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,NY,New York,10000,5F,61 West 62nd Street,1438036,61,West 62nd Street,
3,NY,New York,10000,9C,325 Lexington Ave,1399238,325,Lexington Ave,
4,NY,New York,10000,5D,280 Metropolitan Ave,1435614,280,Metropolitan Ave,
5,NY,Long Island City,11101,445,27-28 Thomson Avenue,1407899,27-28,Thomson Avenue,4
6,NY,New York,10000,2203,20 Pine Street,1413486,20,Pine Street,
...,...,...,...,...,...,...,...,...,...
7922,NY,Jackson Heights,11372,2E,34-15 74th St,1587656,34-15,74th St,4
7924,NY,Brooklyn,11221,3B,725 Lafayette Avenue,,725,Lafayette Avenue,3
7929,NY,Jackson Heights,11372,8P,37-31 73 st,1660776,37-31,73 st,4
7957,NY,Manhattan,10039,1,214 BRADHURST AVENUE,1688573,214,BRADHURST AVENUE,1


In [97]:
trxns['state'] = trxns['state'].fillna('NY').replace('', 'NY')

In [98]:
trxns.index.name = 'trxn_id'

In [99]:
trxns['city'] = trxns['city'].replace('', pd.NA)

In [100]:
COLS = ['state', 'city', 'zipcode', 'num', 'stname', 'boro']

In [101]:
columns = {
    'secondary': 'unit',
    'zip': 'zipcode'
}

trxns_georef = trxns.rename(columns=columns)

In [102]:
trxns_georef = trxns_georef[COLS]

## GeoRef


In [103]:
iteration_config = dict(
        tries=3,
        sleep_between_retries=1,
        sleep_between_rows=0,
        chunk_size=100,  # smaller chunks will be slower, but more finegrained to problematic rows
    )

In [88]:
gc = GeorefGeocoder(
    endpoint='prod', iteration_config=iteration_config
)
# geocoding

In [104]:
result, errors = gc.bulk_geocode(trxns_georef)

Executing attempt 0/3: 100%|███████████████████████████████████████████████████████████████████████| 6461/6461 [2:09:56<00:00,  1.21s/it]


In [28]:
# result[:2]

In [10]:
# set(trxns.columns).issubset({"addr_line", "zip", "city", "state", "secondary"})

In [126]:
with open('./data/georef_response.jsonl', 'w') as f:
    s = dicts_to_jsonl(result)
    f.write(s)

## Analysis

In [147]:
trxns_georef.loc[162]

state                 NY
city            New York
zipcode            10000
num                  214
stname     N 11th Street
boro                <NA>
Name: 162, dtype: object

In [115]:
georef_result = pd.DataFrame({el['trxn_id']: el.get('centroid_lot', {}) for el in result}).T

In [120]:
georef_result.reindex(trxns_georef.index)['lon'].isnull().sum()

211

In [119]:
georef_result.reindex(trxns_georef.index)['lon'].isnull().mean()

0.03265748336170871

In [144]:
# georef_result

# Compare to Malone

In [132]:
with open('./data/malone_response.jsonl', 'r') as f:
    r = []
    for line in f.readlines():
        r.append(json.loads(line))
    

In [140]:
r[0]['response']['address']['geo'].keys()

dict_keys(['latitude', 'longitude', 'precision', 'srid'])

In [145]:
malone_results = pd.DataFrame({
    el['trxn_id']: el['response']['address'].get('geo') for el in r
}).T