# Latest Full Run as of Spring 2020
- Uses the new output from ES, generated after the bug that prevented metaphone matching was fixed

In [2]:
import os
import sys
# Put the notebook's parent directory on the import path (once), presumably so
# the local `disambiguation` package imported below resolves -- confirm layout.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

%load_ext autoreload
%autoreload 2

import pandas as pd
import pyjarowinkler
import networkx as nx
from disambiguation import Disambiguator, Disambiguator1880
import disambiguation.linkage as dl
import disambiguation.processing as dp 
from disambiguation.processing import apply_confidence_score
import matplotlib.pyplot as plt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Candidate matches; the file is tab-separated despite its .csv extension.
es_matches_path = "../data/es-1880-21-5-2020.csv"
match = pd.read_csv(es_matches_path, sep='\t', engine='python')

In [4]:
match.columns

Index(['OBJECTID.x', 'CENSUS_NAMEFRSTB', 'CENSUS_NAMELASTB', 'CENSUS_AGE',
       'CENSUS_OCCLABELB', 'CENSUS_MATCH_ADDR', 'CENSUS_SEGMENT_ID',
       'WARD_NUM', 'CD_ED', 'OBJECTID', 'MATCH_ADDR', 'CD_FIRST_NAME',
       'CD_LAST_NAME', 'CD_OCCUPATION', 'CD_FINAL_HOUSENUM'],
      dtype='object')

In [5]:
# Score every candidate pair. The returned frame carries the `census_count`,
# `cd_count` and `confidence_score` columns that later cells read.
# NOTE(review): a later cell reads `confidence_score` from a frame built off
# `match`, although `match.columns` above does not list it -- presumably this
# call also adds its columns to `match` in place; confirm in
# disambiguation.processing.
census_columns = dict(
    cen_fn='CENSUS_NAMEFRSTB',
    cen_ln='CENSUS_NAMELASTB',
    cen_occ='CENSUS_OCCLABELB',
    cen_id='OBJECTID.x',
)
df = apply_confidence_score(match, **census_columns)

In [5]:
# Headline counts for the raw candidate matches and the scored frame.
print("No. of matches: {}".format(len(match)))
print("No. of unique CD records: {}".format(len(match['OBJECTID'].drop_duplicates())))
print("No. of unique Census records: {}".format(len(match['OBJECTID.x'].drop_duplicates())))

# A 1:1 match is a row whose census record and CD record each appear once.
one_to_one = (df['census_count'] == 1) & (df['cd_count'] == 1)
print("No. of 1:1 matches: {}".format(len(df[one_to_one])))

# Rows whose census person is aged 12 or younger.
age_12_or_under = df[df['CENSUS_AGE'] <= 12]
print("No. of matches where census record <= 12: {}".format(len(age_12_or_under)))
print("No. of unique matches where census record <= 12: {}".format(len(age_12_or_under['OBJECTID.x'].drop_duplicates())))

print("No. of anchors (confidence score = 1): {}".format(len(df[df['confidence_score'] == 1])))

No. of matches: 151046
No. of unique CD records: 107018
No. of unique Census records: 141594
No. of 1:1 matches: 72810
No. of matches where census record <= 12: 21179
No. of unique matches where census record <= 12: 19622
No. of anchors (confidence score = 1): 46550


## Disambiguation

In [6]:
# Build string IDs by prefixing the raw object IDs with their source
# (presumably so CD and census records get distinct labels downstream -- confirm).
# The columns are added to `df` in place.
for id_col, prefix, source_col in (
    ('CD_ID', 'CD_', 'OBJECTID'),
    ('CENSUS_ID', 'CENSUS_', 'OBJECTID.x'),
):
    df[id_col] = prefix + df[source_col].astype(str)

In [7]:
# Attach the CD record coordinates (LONG / LAT) to every candidate match.
# `validate` makes the merge fail loudly if OBJECTID is not unique in `latlng`.
# Note: `df` is rebound here, so re-running this cell merges the coordinates
# a second time.
latlng = pd.read_csv("../data/cd_1880.csv").loc[:, ['OBJECTID', 'LONG', 'LAT']]
df = pd.merge(df, latlng, on='OBJECTID', how='left', validate='many_to_one')

In [10]:
# Build the 1880 disambiguator from the scored matches (with CD_ID, CENSUS_ID
# and CD coordinates attached) and run the full pipeline. The run reports its
# four stages and a progress line every 1000 subgraphs (see output below).
disambiguate = Disambiguator1880(df)
disambiguate.run_disambiguation()

Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 46550
Reached: 0
Reached: 1000
Reached: 2000
Reached: 3000
Reached: 4000
Reached: 5000
Reached: 6000
Reached: 7000
Reached: 8000
Reached: 9000
Reached: 10000
Reached: 11000
Reached: 12000
Reached: 13000
Reached: 14000
Reached: 15000
Reached: 16000
Reached: 17000
Reached: 18000
Reached: 19000
Reached: 20000
Reached: 21000
Reached: 22000
Reached: 23000
Reached: 24000
Reached: 25000
Reached: 26000
Reached: 27000
Reached: 28000
Reached: 29000
Reached: 30000
Reached: 31000
Reached: 32000
Reached: 33000
Reached: 34000
Reached: 35000
Reached: 36000
Reached: 37000
Reached: 38000
Reached: 39000
Reached: 40000
Reached: 41000
Reached: 42000
Reached: 43000
Reached: 44000
Reached: 45000
Reached: 46000
Cleaning output (3/4)...
Disambiguating (4/4)...
Done! :)


In [11]:
result = disambiguate.get_result()

In [12]:
result.head()

Unnamed: 0,CD_ED,CD_FINAL_HOUSENUM,CD_FIRST_NAME,CD_ID,CD_LAST_NAME,CD_OCCUPATION,CENSUS_AGE,CENSUS_ID,CENSUS_MATCH_ADDR,CENSUS_NAMEFRSTB,...,jw_fn,jw_ln,jw_score,key,letter,node_ID,occ_listed,spatial_weight,selected,graph_ID
0,59.0,175,Julia,CD_170754,Dunphy,wid,50,CENSUS_100000,"175 ELIZABETH ST, NYC-Manhattan, NY",JULIA,...,1.0,1.0,1.0,,,,1,2.0,1.0,0
1,59.0,24,John,CD_12663,Walsh,police,5,CENSUS_100003,"175 ELIZABETH ST, NYC-Manhattan, NY",ANN,...,0.53,1.0,0.812,,,,1,1.72,0.0,1
2,59.0,24,John,CD_12663,Walsh,police,0,CENSUS_99318,"24 PRINCE ST, NYC-Manhattan, NY",JOHN,...,1.0,1.0,1.0,0.0,N3,N3_0,0,1.77,0.0,1
3,59.0,24,John,CD_12663,Walsh,police,46,CENSUS_99321,"24 PRINCE ST, NYC-Manhattan, NY",JOHN,...,1.0,1.0,1.0,0.0,N4,N4_0,1,1.87,1.0,1
4,59.0,175,Ann,CD_180152,Day,wid,55,CENSUS_100004,"175 ELIZABETH ST, NYC-Manhattan, NY",ANN,...,1.0,1.0,1.0,,,,1,2.0,1.0,2


In [13]:
# Persist the disambiguated matches (row index omitted).
disambiguated_path = "../data/disambiguated-21-5-2020.csv"
result.to_csv(disambiguated_path, index=False)

## Analysis of Results

In [14]:
# Census coordinates keyed by a prefixed CENSUS_ID.
census_latlng = (
    pd.read_csv("../data/census_1880_mn_v04.csv")
    .assign(CENSUS_ID=lambda frame: 'CENSUS_' + frame['OBJECTID.x'].astype(str))
    .loc[:, ['CENSUS_ID', 'CENSUS_X', 'CENSUS_Y']]
)

# Fix outlier: any CENSUS_Y above 1000 is overwritten with a hard-coded value.
# NOTE(review): the replacement constant is taken as-is from the original
# analysis; its provenance is not shown here.
y_out_of_range = census_latlng['CENSUS_Y'] > 1000
census_latlng.loc[y_out_of_range, 'CENSUS_Y'] = 40.799935

In [15]:
# Hand the census coordinates (CENSUS_ID, CENSUS_X, CENSUS_Y) to the
# disambiguator -- presumably joined onto its results by CENSUS_ID; confirm in
# Disambiguator.merge_census_var.
disambiguate.merge_census_var(census_latlng)
# NOTE(review): set_var() presumably derives the variables the metric getters
# in the next cell rely on, so it must run after the merge -- TODO confirm.
disambiguate.set_var()

In [16]:
# Collect the evaluation metrics that the next cell prints.
match_rate = disambiguate.get_match_rate()  # printed below with a "%" suffix
addr_success = disambiguate.get_addr_success()  # indexed below by 'n_perfect_match_chosen' / 'n_perfect_match'
dist_error = disambiguate.get_dist_error()  # its 'dist' column is summarised below
under12 = disambiguate.get_under12_selections()  # printed below as the under-12 proportion

In [19]:
# Report the evaluation metrics; `!s` keeps the exact str() rendering of each value.
print("Match Rate: {!s}%".format(match_rate))

# Share of perfect address matches that the algorithm selected, in percent.
perfect_chosen_pct = round(addr_success['n_perfect_match_chosen'] / addr_success['n_perfect_match'] * 100, 2)
print("Percentage of Perfect Address Matches Selected: {!s}".format(perfect_chosen_pct))

print("Descriptives of Distance Error:")
dist_summary = dist_error['dist'].describe()
print(dist_summary)

print("Proportion of Selected Matches under Age 12: {!s}".format(under12))

Match Rate: 98.62%
Percentage of Perfect Address Matches Selected: 78.09
Descriptives of Distance Error:
count    151046.000000
mean         93.609218
std         262.683888
min           0.126686
25%          38.203894
50%          54.146137
75%          70.446966
max        8375.839589
Name: dist, dtype: float64
Proportion of Selected Matches under Age 12: 2.64


### Benchmarking

In [20]:
# define address-parsing helper functions
import re
def get_hn(add):
    """Return the house number of an address: its first run of digits.

    Parameters
    ----------
    add : str
        Address such as "71 PEARL ST".

    Returns
    -------
    str or None
        The leftmost run of ASCII digits (as a string), or None when the
        address contains no digit or is not a string (e.g. a missing value).
        Returning None instead of raising mirrors `get_st`.
    """
    try:
        found = re.search('[0-9]+', add)
    except TypeError:
        # Non-string input such as NaN / None from a missing address.
        return None
    return found.group() if found is not None else None

def get_st(add):
    """Return the street part of an address, or None if it cannot be parsed.

    The street is the run of uppercase letters and whitespace that follows a
    digit plus one whitespace character and is immediately followed by a comma,
    e.g. "71 PEARL ST, NEW YORK, NY" -> "PEARL ST".

    Street names that contain digits or lowercase letters, and addresses with
    no comma after the street, do not match the pattern and yield None.

    Parameters
    ----------
    add : str
        Full address string.

    Returns
    -------
    str or None
        The matched street text, or None when there is no match or the input
        is not a string (e.g. a missing value).
    """
    try:
        found = re.search('(?<=[0-9]\\s)([A-Z]|\\s)+(?=,)', add)
    except TypeError:
        # Non-string input such as NaN / None; other errors now propagate
        # rather than being silently swallowed.
        return None
    return found.group() if found is not None else None

# Smoke-check both helpers on a sample address (expected output: 71, then PEARL ST).
print( get_hn("71 PEARL ST") )
print( get_st("71 PEARL ST, NEW YORK, NY") )

71
PEARL ST


In [23]:
# Census coordinates keyed by address this time (one row per distinct
# address / coordinate combination), rather than by CENSUS_ID.
addr_coord_cols = ['CENSUS_MATCH_ADDR', 'CENSUS_X', 'CENSUS_Y']
census_latlng = (
    pd.read_csv("../data/census_1880_mn_v04.csv")
    .loc[:, addr_coord_cols]
    .drop_duplicates()
)

# Same outlier patch as in the analysis section: CENSUS_Y above 1000 is
# overwritten with a hard-coded value.
y_out_of_range = census_latlng['CENSUS_Y'] > 1000
census_latlng.loc[y_out_of_range, 'CENSUS_Y'] = 40.799935

# `validate` fails the merge if an address maps to more than one coordinate row.
benchmark = pd.merge(match, census_latlng, on='CENSUS_MATCH_ADDR', how='left', validate='many_to_one')

In [25]:
# add address similarity as weight
import numpy as np

# Parse house number and street from the CD and census addresses.
# Series.map runs each helper on the one column it needs, which avoids
# building a full row object per record as DataFrame.apply(axis=1) does.
benchmark['cd_hn'] = benchmark['MATCH_ADDR'].map(get_hn)
benchmark['cen_hn'] = benchmark['CENSUS_MATCH_ADDR'].map(get_hn)
benchmark['cd_add_cln'] = benchmark['MATCH_ADDR'].map(get_st)
benchmark['cen_add_cln'] = benchmark['CENSUS_MATCH_ADDR'].map(get_st)

# Address weight: 0.1 for an equal house number plus 0.9 for an equal street.
benchmark['add_match'] = np.where(benchmark.cd_hn == benchmark.cen_hn, 0.1, 0) + np.where(benchmark.cen_add_cln == benchmark.cd_add_cln, 0.9, 0)

# `confidence_score` must already exist on `benchmark` at this point.
# NOTE: this update is not idempotent -- re-running the cell adds `add_match`
# to the score again, so rebuild `benchmark` before re-running.
benchmark['confidence_score'] = benchmark['confidence_score'] + benchmark['add_match']

In [29]:
# Attach the CD coordinates from `latlng` to the benchmark rows by OBJECTID.
benchmark = pd.merge(benchmark, latlng, on='OBJECTID', how='left', validate='many_to_one')

In [30]:
from disambiguation.analysis import get_dist_based_match

# Run the benchmark matcher and keep only the 'results' entry of its output.
dist_based_output = get_dist_based_match(benchmark)
benchmark = dist_based_output['results']

In [31]:
from disambiguation.analysis import compare_selections

# Compare the disambiguator's selections with the benchmark's and keep the
# 'confusion_matrix' entry of the comparison.
selection_comparison = compare_selections(result, benchmark)
confusion_matrix = selection_comparison['confusion_matrix']

In [32]:
print(confusion_matrix)

[[102012, 3528], [3528, 41978]]
