In [1]:
import os
import sys
# Put the parent directory on sys.path (once); presumably so the local
# `disambiguation` package imported in the next cell can be found.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import networkx as nx
import disambiguation.processing as dp
import disambiguation.linkage as dl

In [3]:
# Pandas display options used for the rendered tables in this notebook.
for option_name, option_value in (('display.min_rows', 1000), ('display.max_columns', 20)):
    pd.set_option(option_name, option_value)

In [4]:
# Load the candidate matches. Later cells read `confidence_score` and
# `census_count` from it (directly or via its sub-frames), and join back on
# `CD_ID` / `CENSUS_ID` to recover `MATCH_ADDR` and `CENSUS_MATCH_ADDR`.
match = pd.read_csv("../data/matches.csv")

## Preprocessing
- Data is currently too large to apply algorithms on
- Split into df, each df bounded by anchors
- Apply algorithms on sub dfs which require disambiguation (because some dfs do not have multiple matches)
- Combine after

In [5]:
# Flag anchors: rows whose confidence_score is exactly 1 get anchor = 1.0,
# every other row gets NaN. Done with a vectorised mask instead of a row-wise
# `apply(axis=1)`, which calls a Python lambda once per row.
is_anchor = match['confidence_score'] == 1
match['anchor'] = is_anchor.astype(float).where(is_anchor)

In [6]:
# Number of anchor rows (rows with a non-null `anchor`).
match['anchor'].notnull().sum()

48716

In [6]:
# Give every anchor row a consecutive id ('group'), indexed by the anchor's
# row label in `match` so the ids can be joined back by index.
anchor_rows = list(match.index[match['anchor'].notnull()])
sub_group = (
    pd.DataFrame({'index': anchor_rows, 'group': range(len(anchor_rows))})
    .set_index('index')
)
sub_group.head()

Unnamed: 0_level_0,group
index,Unnamed: 1_level_1
9,0
10,1
13,2
15,3
16,4


In [7]:
# join it back and fill down
# Index-aligned join: anchor rows receive their `group` id, all other rows
# get NaN until the fill in the next cell.
# NOTE(review): re-running this cell once `group` already exists on `match`
# looks like it would fail on the overlapping column name — confirm.
match = match.join(sub_group)

In [8]:
# Forward-fill so each row inherits the id of the nearest anchor above it;
# the back-fill then covers rows that sit before the first anchor.
# `.ffill()` / `.bfill()` replace `fillna(method=...)`, which is deprecated
# in recent pandas releases.
match['group'] = match['group'].ffill().bfill()

In [9]:
# One sub-frame per group id, keyed by that id.
sub_group_dict = dict(iter(match.groupby('group')))

In [10]:
# Append the first row of the following group to each group, so every
# sub-frame except the last also ends on the next group's opening row
# (its "bottom anchor").
last_key = len(sub_group_dict) - 1
for i in range(last_key):
    next_anchor = sub_group_dict[i + 1].iloc[:1]
    sub_group_dict[i] = pd.concat([sub_group_dict[i], next_anchor])

In [11]:
# Keys of the sub-frames for which spatial disambiguation needs to create
# weights: those containing at least one row with census_count > 1.
# The last group is not considered (the range stops one short).
sub_group_algos = [
    key
    for key in range(len(sub_group_dict) - 1)
    if (sub_group_dict[key].census_count > 1).any()
]

In [12]:
# Total number of sub-frames, then how many of them need disambiguation.
for size in (len(sub_group_dict), len(sub_group_algos)):
    print(size)

48716
3687


## Apply Algorithms
- First pass, minimal tuning
- Results will serve as benchmark for further tuning

In [13]:
error = []          # keys of sub-frames whose processing raised
error_details = {}  # key -> repr of the exception, for later inspection

# iteratively apply algorithms onto each sub df
for i in sub_group_algos:

    try:
        # save df and wrangle to necessary format
        df = sub_group_dict[i]
        path_df = dp.create_path_df(df)

        # apply density clustering and remove outlier nodes
        filtered = dl.apply_density_clustering(path_df)

        # create graph and k shortest paths centrality
        g = dp.create_path_graph(filtered)
        output = dl.apply_k_betweenness(filtered, g)['df']

        # replace the raw sub-frame with the processed one
        sub_group_dict[i] = output

    except Exception as exc:
        # Best-effort: one failing sub-frame must not stop the run, but keep
        # the reason. `Exception` (not a bare `except:`) lets
        # KeyboardInterrupt / SystemExit still abort this long loop.
        error.append(i)
        error_details[i] = repr(exc)

In [18]:
# Number of sub-frames whose processing raised in the loop above.
len(error)

0

In [19]:
# Treatment for rows which were not involved in spatial disambiguation
# (the anchor and spatial_weight defaults are applied in later cells):
# anchor: NaN, filled with 0
# key: 0
# in_cluster: NaN
# spatial_weight: confidence_score + 1
# Display one sub-frame for inspection (5918 is an arbitrary key).
sub_group_dict[5918]

Unnamed: 0,CD_ID,CENSUS_ID,LONG,LAT,confidence_score,MATCH_ADDR,anchor,node_ID,letter,in_cluster,key,spatial_weight
0,CD_10691,CENSUS_162526,-73.9846,40.712312,1.0,"190 MONROE ST, New York, NY",1,N0_0,N0,0,0,2.0
1,CD_56128,CENSUS_162536,-73.9846,40.712312,0.97,"190 MONROE ST, New York, NY",0,N1_0,N1,0,0,1.97
2,CD_102471,CENSUS_162546,-73.9846,40.712312,0.99,"190 MONROE ST, New York, NY",0,N2_0,N2,0,0,1.99
3,CD_137980,CENSUS_162552,-73.984672,40.712304,0.8,"188 MONROE ST, New York, NY",0,N3_0,N3,0,0,0.8
4,CD_137978,CENSUS_162552,-73.983897,40.711899,0.8,"372 CHERRY ST, New York, NY",0,N3_1,N3,0,0,1.8
5,CD_82866,CENSUS_162560,-73.984672,40.712304,0.97,"188 MONROE ST, New York, NY",0,N4_0,N4,0,0,1.97
6,CD_66597,CENSUS_162566,-73.984672,40.712304,0.96,"188 MONROE ST, New York, NY",0,N5_0,N5,0,0,1.96
7,CD_107671,CENSUS_162574,-73.984672,40.712304,0.97,"188 MONROE ST, New York, NY",0,N6_0,N6,0,0,1.97
8,CD_201993,CENSUS_162578,-73.9846,40.712312,0.74,"190 MONROE ST, New York, NY",0,N7_0,N7,0,0,0.74
9,CD_201973,CENSUS_162578,-73.983609,40.712074,0.74,"60 GOUVERNEUR ST, New York, NY",0,N7_1,N7,0,0,1.74


In [20]:
# Stack all sub-frames (processed and untouched) back into one frame.
sub_grp_list = list(sub_group_dict.values())
final = pd.concat(sub_grp_list)

In [21]:
# Keep only the columns needed for the matching step.
keep_columns = ['CD_ID', 'CENSUS_ID', 'anchor', 'confidence_score', 'in_cluster', 'spatial_weight']
final_processed = final[keep_columns]

In [22]:
# Row count before duplicate (CD_ID, CENSUS_ID) pairs are dropped.
len(final_processed)

186957

In [23]:
# A (CD_ID, CENSUS_ID) pair can appear more than once (e.g. the anchor rows
# that were appended to the preceding group); keep the first occurrence.
final_processed = final_processed.drop_duplicates(subset=['CD_ID', 'CENSUS_ID'], keep='first')

In [26]:
# Rows with no anchor flag are non-anchors: 0.
final_processed = final_processed.assign(anchor=final_processed['anchor'].fillna(0))

In [27]:
# Rows without a spatial weight default to confidence_score + 1.
# Vectorised `fillna` with an index-aligned Series replaces the row-wise
# `apply`, which was slow and breaks on an empty frame.
final_processed['spatial_weight'] = final_processed['spatial_weight'].fillna(
    final_processed['confidence_score'] + 1
)

In [25]:
# Row count after duplicate (CD_ID, CENSUS_ID) pairs were dropped.
len(final_processed)

138242

In [28]:
# Persist the weighted candidate pairs; the matching step below reloads this file.
# NOTE(review): the index is written too, which shows up as 'Unnamed: 0'
# column(s) when the file is read back — consider index=False once it is
# confirmed that nothing downstream relies on that column.
final_processed.to_csv('../data/matches_w_weights.csv')

**Bipartite Matching**
- process up to here is quite expensive, save output
- bipartite matching has to act on subgraphs, else will be too expensive

In [123]:
# Reload the saved weights so the matching step can start from here
# without re-running the expensive cells above.
final_processed = pd.read_csv("../data/matches_w_weights.csv")

In [124]:
# Prefix both id columns so CD and census ids cannot collide as graph nodes.
# Not idempotent: running this cell again prefixes the ids a second time.
for column, prefix in (('CD_ID', 'CD_'), ('CENSUS_ID', 'CENSUS_')):
    final_processed[column] = prefix + final_processed[column].astype(str)

In [125]:
# Build the graph: one weighted edge per candidate (CD_ID, CENSUS_ID) pair,
# weighted by spatial_weight. `itertuples(index=False, name=None)` yields
# plain tuples and avoids `iterrows`, which builds a Series for every row.
b_edges = list(
    final_processed[['CD_ID', 'CENSUS_ID', 'spatial_weight']].itertuples(index=False, name=None)
)
b = nx.Graph()
b.add_weighted_edges_from(b_edges)

In [126]:
# One independent subgraph per connected component, so matching can run on
# small graphs. `nx.connected_component_subgraphs` was removed in
# networkx 2.4; this is the documented replacement.
subgraphs = [b.subgraph(nodes).copy() for nodes in nx.connected_components(b)]

In [127]:
# Number of connected components in the candidate graph.
len(subgraphs)

102185

In [128]:
# Maximum-weight matching per component, preferring maximum cardinality;
# each entry is the list of matched node pairs for one component.
matches = []
for component in subgraphs:
    matching = nx.max_weight_matching(component, maxcardinality=True)
    matches.append(list(matching))

In [129]:
# Flatten to one [CD_ID, CENSUS_ID] pair per match. Sorting each pair relies on
# plain string order putting the 'CD_' id before the 'CENSUS_' id.
matches = [sorted(pair) for component_pairs in matches for pair in component_pairs]

In [130]:
# Total number of matched pairs across all components.
len(matches)

104010

In [131]:
# Turn the list of sorted pairs into a two-column frame.
matches = pd.DataFrame(matches, columns=['CD_ID', 'CENSUS_ID'])

In [132]:
# Every pair returned by the matching is marked as selected.
matches['selected'] = 1

In [133]:
# Quick look at the reloaded frame before joining the selection flag.
final_processed.head()

Unnamed: 0.1,Unnamed: 0,CD_ID,CENSUS_ID,anchor,confidence_score,in_cluster,spatial_weight
0,0,CD_74371,CENSUS_172,0.0,0.85,,1.85
1,1,CD_24242,CENSUS_306,0.0,0.66,,1.66
2,2,CD_80330,CENSUS_317,0.0,0.87,,1.87
3,3,CD_185397,CENSUS_359,0.0,0.96,,1.96
4,4,CD_137772,CENSUS_361,0.0,0.86,,1.86


In [134]:
# Attach the selection flag to every candidate pair; pairs that the matching
# did not pick have no counterpart in `matches` and get selected = 0.
final_processed = (
    final_processed
    .merge(matches, how='left', on=['CD_ID', 'CENSUS_ID'], validate='one_to_one')
    .fillna({'selected': 0})
)

In [135]:
# Number of selected pairs; should equal len(matches).
final_processed['selected'].sum()

104010.0

In [109]:
# Per the author, the bipartite-matching cells above are equivalent to the
# snippet below (kept as a string literal, so it is displayed, not executed):
"""
final_processed = pd.read_csv("../data/matches_w_weights.csv")
final_matched = dl.get_matches(final_processed)
"""

In [136]:
# Save the candidate pairs together with their `selected` flag.
# NOTE(review): the index is written as an extra column here as well.
final_processed.to_csv("../data/matches_disambiguated.csv")

## Performance
Some metrics to consider:
- how many matches retrieved
- accuracy of the matches
- some way to check false positive/false negative rate?

In [None]:
# to recap + demonstrate purpose
# Reload the saved weights and run the packaged matching step; the next cell
# reads the 'results' entry of what it returns.
final_processed = pd.read_csv("../data/matches_w_weights.csv")
final_matched = dl.get_matches(final_processed)

In [31]:
# Pull out the 'results' entry: going by the output below, the candidate
# pairs with their `selected` flag.
final_df = final_matched['results']
final_df.head()

Unnamed: 0.1,Unnamed: 0,CD_ID,CENSUS_ID,anchor,confidence_score,in_cluster,spatial_weight,selected
0,0,CD_74371,CENSUS_172,0.0,0.85,,1.85,1.0
1,1,CD_24242,CENSUS_306,0.0,0.66,,1.66,1.0
2,2,CD_80330,CENSUS_317,0.0,0.87,,1.87,0.0
3,3,CD_185397,CENSUS_359,0.0,0.96,,1.96,1.0
4,4,CD_137772,CENSUS_361,0.0,0.86,,1.86,0.0


**How many matches retrieved**
- as a proportion of CD records (since that's the max no. of matches)

In [33]:
# Selected matches as a share of distinct CD records.
n_selected = sum(final_df.selected.values)
n_cd_records = len(final_df.CD_ID.unique())
n_selected / n_cd_records

0.9897136767182727

**Accuracy**
- Currently based on whether the CD address equals the census address
    - how accurate & standardized is the address format?
- Another possibility is using lat lng & manhattan distance between CD address and census address
    - where to find census lat lng?

In [41]:
# need to join back street
# Bring the CD and census address strings back from the original `match`
# frame (loaded near the top of the notebook); the inner join keeps only
# pairs present in both frames, and validate= raises if a pair is duplicated.
final_df = final_df.merge(match[['CD_ID', 'CENSUS_ID', 'MATCH_ADDR', 'CENSUS_MATCH_ADDR']], how='inner', on=['CD_ID', 'CENSUS_ID'], validate='one_to_one')

In [46]:
# Keep only the street part of each address: the text before the first comma.
# The `.str` accessors are vectorised and tolerate missing addresses (result
# NaN) and addresses without a comma (whole string kept), both of which made
# the previous row-wise `str.index(',')` lambda raise.
final_df['cd_add_cln'] = final_df['MATCH_ADDR'].str.split(',', n=1).str[0]
final_df['cen_add_cln'] = final_df['CENSUS_MATCH_ADDR'].str.split(',', n=1).str[0]

Success rate based on address:
- how many matches (before disambiguation) have exactly same address?
- how many such matches have been selected via disambiguation?

In [48]:
# Candidate pairs whose CD and census street addresses are identical.
int((final_df.cd_add_cln == final_df.cen_add_cln).sum())

36480

In [50]:
# ... and of those, the pairs that disambiguation selected.
int(((final_df.cd_add_cln == final_df.cen_add_cln) & (final_df.selected == 1)).sum())

29140

In [51]:
# out of these perfect matches, how many were chosen
# Computed from final_df instead of hard-coded counts copied from earlier
# outputs, so the ratio cannot go stale when the notebook is re-run.
same_address = final_df.cd_add_cln == final_df.cen_add_cln
int((same_address & (final_df.selected == 1)).sum()) / int(same_address.sum())

0.7987938596491229

In [57]:
# out of the max no. of matches, how many were chosen
# The numerator is computed from final_df rather than hard-coded, so it stays
# in sync with the data on re-run.
same_address_selected = (final_df.cd_add_cln == final_df.cen_add_cln) & (final_df.selected == 1)
int(same_address_selected.sum()) / len(final_df.CD_ID.unique())

0.27728349715960454

In [56]:
# Why are some 'perfect matches' (identical street address) not selected?
# Initial analysis suggests the same CD record was matched to several
# census records that all look correct, e.g. a household head's CD record
# matched to both his own and his children's census records.
# TODO: this needs more exploration.
final_df.loc[(final_df.cd_add_cln == final_df.cen_add_cln) & (final_df.selected != 1), :]

Unnamed: 0.1,Unnamed: 0,CD_ID,CENSUS_ID,anchor,confidence_score,in_cluster,spatial_weight,selected,MATCH_ADDR,CENSUS_MATCH_ADDR,cd_add_cln,cen_add_cln
18,18,CD_175520,CENSUS_533,0.0,0.80,,1.80,0.0,"9 STATE ST, New York, NY","9 STATE ST, NYC-Manhattan, NY",9 STATE ST,9 STATE ST
19,19,CD_175515,CENSUS_534,0.0,0.80,,1.80,0.0,"9 STATE ST, New York, NY","9 STATE ST, NYC-Manhattan, NY",9 STATE ST,9 STATE ST
27,27,CD_46136,CENSUS_622,0.0,0.87,,1.87,0.0,"10 STATE ST, New York, NY","10 STATE ST, NYC-Manhattan, NY",10 STATE ST,10 STATE ST
43,4,CD_121432,CENSUS_701,0.0,0.85,0.0,1.85,0.0,"8 PEARL ST, New York, NY","8 PEARL ST, NYC-Manhattan, NY",8 PEARL ST,8 PEARL ST
47,47,CD_51560,CENSUS_743,0.0,0.83,,1.83,0.0,"19 PEARL ST, New York, NY","19 PEARL ST, NYC-Manhattan, NY",19 PEARL ST,19 PEARL ST
56,56,CD_35029,CENSUS_830,0.0,0.85,,1.85,0.0,"17 STATE ST, New York, NY","17 STATE ST, NYC-Manhattan, NY",17 STATE ST,17 STATE ST
59,59,CD_75579,CENSUS_851,0.0,0.90,,1.90,0.0,"18 STATE ST, New York, NY","18 STATE ST, NYC-Manhattan, NY",18 STATE ST,18 STATE ST
105,105,CD_182293,CENSUS_1219,0.0,0.80,,1.80,0.0,"27 BRIDGE ST, New York, NY","27 BRIDGE ST, NYC-Manhattan, NY",27 BRIDGE ST,27 BRIDGE ST
110,110,CD_74957,CENSUS_1310,0.0,0.80,,1.80,0.0,"43 WHITEHALL ST, New York, NY","43 WHITEHALL ST, NYC-Manhattan, NY",43 WHITEHALL ST,43 WHITEHALL ST
117,117,CD_39614,CENSUS_1362,0.0,0.80,,1.80,0.0,"5 WATER ST, New York, NY","5 WATER ST, NYC-Manhattan, NY",5 WATER ST,5 WATER ST


## EDA
- average confidence score of selected matches

In [54]:
# Mean confidence score of the selected matches.
selected_rows = final_df.loc[final_df.selected == 1, :]
sum(selected_rows.confidence_score.values) / len(selected_rows)

0.9558351120083621

In [55]:
# Mean confidence score over all candidate pairs, for comparison with the
# selected-only mean above.
sum(final_df.confidence_score.values) / len(final_df)

0.9215124202484423