In [None]:
!pip install pyjarowinkler
!pip install haversine
!pip install hdbscan

### write processing module

In [None]:
# processing
%%writefile processing.py

import pandas as pd
import networkx as nx
import numpy as np
from pyjarowinkler import distance
from haversine import haversine, Unit
import time


def col_for_disamb(df, cd_id, cen_id, cd_fn="CD_FIRST_NAME", cen_fn="CENSUS_FIRST_NAME", cd_ln="CD_LAST_NAME",
                           cen_ln="CENSUS_LAST_NAME", cen_occ="CENSUS_OCCUPATION", cen_age="CENSUS_AGE"):
    """
    Add the columns needed for disambiguation to a df of candidate matches.
    The df is modified in place and also returned.
    df: df with one row per potential CD-census match
    cd_id / cen_id: names of the city directory / census ID columns
    cd_fn / cen_fn / cd_ln / cen_ln: names of the first and last name columns
    cen_occ: name of census occupation column
    cen_age: name of census age column
    Columns added: jw_fn, jw_ln, jw_score, occ_listed, age_score, cd_count, census_count,
    census_count_inverse, cd_count_inverse, CD_ID, CENSUS_ID
    """
    def jaro_winkler(left_col, right_col):
        # Jaro-Winkler similarity of two name columns, computed row by row
        return df.apply(lambda rec: distance.get_jaro_distance(rec[left_col], rec[right_col], winkler=True, scaling=0.1), axis=1)

    # name similarity: the last name weighs more than the first name
    df["jw_fn"] = jaro_winkler(cd_fn, cen_fn)
    df["jw_ln"] = jaro_winkler(cd_ln, cen_ln)
    df["jw_score"] = 0.4 * df["jw_fn"] + 0.6 * df["jw_ln"]

    # occupation: 1 when the census lists one, 0 when it is missing or '*'
    occupation = df[cen_occ]
    df['occ_listed'] = np.where(occupation.notnull() & (occupation != '*'), 1, 0)

    # age: 0 for census records aged 12 or under, 1 otherwise (including missing ages)
    aged_12_or_under = df[cen_age] <= 12
    df['age_score'] = np.where(aged_12_or_under, 0, 1)

    # conflicts: number of candidate rows sharing the same CD record / the same census record
    df["cd_count"] = df.groupby(cd_id)[cen_id].transform('count')
    df["census_count"] = df.groupby(cen_id)[cd_id].transform('count')
    for count_col in ('census_count', 'cd_count'):
        df[count_col + '_inverse'] = 1 / df[count_col]

    # Prefixed IDs so the bipartite matching algorithm works the way we need it to
    df['CD_ID'] = 'CD_' + df[cd_id].astype(str)
    df['CENSUS_ID'] = 'CENSUS_' + df[cen_id].astype(str)

    return df


"""
Applies confidence score to df
"""
def apply_confidence_score(df, cd_fn = "CD_FIRST_NAME", cen_fn = "CENSUS_FIRST_NAME", cd_ln = "CD_LAST_NAME", cen_ln = "CENSUS_LAST_NAME", cen_occ = "CENSUS_OCCLABELB", cen_age = "CENSUS_AGE", cd_id="OBJECTID", cen_id="OBJECTID.x"):
    """
    Compute the component scores and the combined confidence score for each candidate match.
    The df is modified in place and also returned.
    cd_fn / cen_fn / cd_ln / cen_ln: names of the first and last name columns
    cen_occ: name of census occupation column
    cen_age: name of census age column
    cd_id / cen_id: names of the city directory / census ID columns
    Columns added: jw_fn, jw_ln, jw_score, occ_listed, age_score, cd_count, census_count, confidence_score
    """
    # name similarity (Jaro-Winkler), row by row
    name_pairs = (("jw_fn", cd_fn, cen_fn), ("jw_ln", cd_ln, cen_ln))
    for out_col, cd_col, cen_col in name_pairs:
        df[out_col] = df.apply(lambda rec, a=cd_col, b=cen_col: distance.get_jaro_distance(rec[a], rec[b], winkler=True, scaling=0.1), axis = 1)
    df["jw_score"] = 0.4 * df["jw_fn"] + 0.6 * df["jw_ln"]

    # occupation: 1 when the census lists one, 0 when it is missing or '*'
    occupation = df[cen_occ]
    df['occ_listed'] = np.where(occupation.notnull() & (occupation != '*'), 1, 0)

    # age: 0 for census records aged 12 or under, 1 otherwise (including missing ages)
    aged_12_or_under = df[cen_age] <= 12
    df['age_score'] = np.where(aged_12_or_under, 0, 1)

    # conflicts: number of candidate rows sharing the same CD record / the same census record
    df["cd_count"] = df.groupby(cd_id)[cen_id].transform('count')
    df["census_count"] = df.groupby(cen_id)[cd_id].transform('count')

    # weighted sum of the components, rounded to 2 decimals
    name_term = .5 * df.jw_score
    cd_term = .2 * (1 / df.cd_count)
    census_term = .2 * (1 / df.census_count)
    occ_term = .05 * df.occ_listed
    age_term = .05 * df.age_score
    combined = name_term + cd_term + census_term + occ_term + age_term
    df['confidence_score'] = combined.round(decimals = 2)

    return df


#not needed in new run
"""
Takes elastic search and census directory geocode file to create a dataframe 
ready for the disambiguation process.
Can add/incorporate new columns to include in confidence score here
elastic_search: either df with elastic search output or elastic search output file
city_directory: either df with city directory data or file name
file: boolean value, set to True if elastic_search/city_directory are file names otherwise set 
to false. Default false 
"""
def elastic_to_disamb(elastic_search, city_directory, file = False):
    """
    Build the match df used by the disambiguation process from the elastic search
    output and the city directory geocodes.
    elastic_search: df with elastic search output, or a tab-separated file name when file is True
    city_directory: df with city directory data, or a csv file name when file is True
    file: set to True if both inputs are file names. Default False
    Returns the scored matches with CD_ID, CENSUS_ID, the inverse count columns and LONG/LAT joined in
    """
    if not file:
        # work on copies so the caller's dfs are left untouched
        es_df = elastic_search.copy()
        cd_df = city_directory.copy()
    else:
        es_df = pd.read_csv(elastic_search, sep='\t', engine='python')
        cd_df = pd.read_csv(city_directory)

    coordinates = cd_df[['OBJECTID', 'LONG', 'LAT']]

    scored = apply_confidence_score(es_df, cen_fn='CENSUS_NAMEFRSTB', cen_ln='CENSUS_NAMELASTB',
                                    cen_occ='CENSUS_OCCLABELB', cen_id='OBJECTID.x')

    # prefixed IDs, required by the bipartite matching
    scored['CD_ID'] = 'CD_' + scored['OBJECTID'].astype(str)
    scored['CENSUS_ID'] = 'CENSUS_' + scored['OBJECTID.x'].astype(str)

    # Can remove this after finalizing the confidence score
    for count_col in ('census_count', 'cd_count'):
        scored[count_col + '_inverse'] = 1 / scored[count_col]

    # attach the CD coordinates; each OBJECTID must appear at most once in the city directory
    return scored.merge(coordinates, how='left', on='OBJECTID', validate='many_to_one')
"""
Create a list of dataframes where the top row is an anchor
Each dataframe is one where spatial disambiguation will be applied
This is necessary as else, algorithms take too long to run
Match: df of matches
confidence_score: name of confidence score column
"""

def split_dfs(match, sort_var="CENSUS_ID", confidence="confidence_score"):
    """
    Split the match df into a list of sub dfs, each starting at an anchor row.
    match: df of matches
    sort_var: column to sort by before splitting
    confidence: name of confidence score column; rows where it equals 1 are anchors
    Returns a list of dfs ordered by group. Each df has two extra columns:
        anchor: 1 for anchor rows, None otherwise
        group_ID: float group number, 0.0 for the first group
    Rows sorted before the first anchor are placed in the first group. When there is
    no anchor at all, the whole df is returned as one group. The input df is not modified.
    """
    match = match.sort_values(by=[sort_var])

    # identify anchors
    is_anchor = (match[confidence] == 1).to_numpy()
    match['anchor'] = np.where(is_anchor, 1, None)

    # Every anchor opens a new group: the running anchor count minus one is the group
    # number. Rows before the first anchor would get -1 and are folded into group 0.
    group_id = np.cumsum(is_anchor) - 1
    group_id[group_id < 0] = 0
    match['group_ID'] = group_id.astype(float)

    # split df into multiple df, each bounded by anchor
    return [sub for _, sub in match.groupby('group_ID')]

"""
Create node ID for each match, to be used by the shortest path algorithm
sub_graph must be a df with each row as a potential match between a CD and census record. This function only reads the census ID column (CENSUS_ID by default); later steps also expect CD_ID, LONG, LAT, confidence_score and MATCH_ADDR.
the census ID column name can be specified if it is named differently
Returns the dataframe with new columns 'node_ID' and 'letter'.
anchor: whether row is an anchor (confidence score = 1). This column is created by split_dfs(), not here
node_ID: unique node ID. each node is a match, so e.g. N0_0 and N0_1 refer to two potential CD matches for the same census record
letter: grouping for identical census records
prefixes: 'CD_' and 'CENSUS_' on cd_id and census_id respectively are required for subsequent bipartite matching. This function has no add_prefixes option, so they must be added beforehand
"""
def create_path_df(sub_graph, census_id = "CENSUS_ID"):
    """
    Give every candidate match a node ID for the shortest path graph.
    sub_graph: df with one row per potential CD-census match
    census_id: name of the census ID column (values are expected to be non-null)
    Returns a copy of sub_graph with a fresh 0..n-1 index and two new columns:
        letter: 'N<i>', where i is the order in which the census record first appears
        node_ID: '<letter>_<j>', where j numbers the candidates of that census record
    The input df is not modified and no column other than these two is added or
    renamed, whatever census_id is called.
    """
    path_df = sub_graph.copy()

    # running number of each candidate within its census record
    path_df['node_ID'] = path_df.groupby(census_id).cumcount()

    # factorize numbers the census records in order of first appearance
    census_order, _ = pd.factorize(path_df[census_id])
    path_df['letter'] = ['N' + str(position) for position in census_order]

    path_df['node_ID'] = path_df['letter'] + '_' + path_df['node_ID'].astype(str)

    return path_df.reset_index(drop=True)

"""
Creates a graph from the sub_graph dataframe
Each node being a potential CD-census match and 
    each edge being the link between the potential CD records of consecutive census records
The weight of each edge = the haversine distance between the two
cluster_col: name of column with cluster group. If does not exist, use None
Returns the graph object
"""

def create_path_graph(g, cluster_col='in_cluster_x', lat='CD_X', lon='CD_Y', outlier_penalty=999):
    """
    Build the directed graph linking the candidates of consecutive census records.
    g: df from create_path_df(), with 'letter' ('N<i>') and 'node_ID' columns plus the
       lat/lon columns (and 'in_cluster' when clustering is used)
    cluster_col: name of the cluster label column of the edge's start node. The self join
       appends '_x' to start node columns, hence 'in_cluster_x'. Use None if there are no clusters
    lat / lon: names of the latitude / longitude columns
    outlier_penalty: amount added to the weight of edges whose start node has cluster label -1
    Returns a networkx DiGraph with an edge from every candidate of census record i to every
    candidate of census record i + 1, weighted by the haversine distance in metres.
    The input df is not modified.
    """
    # NOTE(review): the defaults lat='CD_X' / lon='CD_Y' look swapped compared with
    # analysis.get_dist_error (lon='CD_X', lat='CD_Y') - confirm against the data.
    nodes = g.copy()
    nodes['_pos'] = range(len(nodes))
    # position of the census record in the sequence, parsed from letters such as 'N12'
    nodes['_step'] = nodes['letter'].str[1:].astype(int)

    # Join each node to the nodes of the following census record only, instead of
    # building the full n x n cross product and filtering it afterwards.
    followers = nodes.assign(_step=nodes['_step'] - 1)
    edges = nodes.merge(followers, on='_step', suffixes=('_x', '_y'))
    # same edge order as a row-by-row cross join, so that ties are broken the same way
    edges = edges.sort_values(['_pos_x', '_pos_y'])

    weights = [
        haversine((lat_to, lon_to), (lat_from, lon_from), unit=Unit.METERS)
        for lat_from, lon_from, lat_to, lon_to in zip(
            edges[lat + '_x'], edges[lon + '_x'], edges[lat + '_y'], edges[lon + '_y'])
    ]

    if cluster_col is not None:
        # make paths through outliers (cluster label -1) more expensive
        starts_at_outlier = (edges[cluster_col] == -1).tolist()
        weights = [weight + outlier_penalty if flagged else weight
                   for weight, flagged in zip(weights, starts_at_outlier)]

    graph = nx.DiGraph()
    graph.add_weighted_edges_from(zip(edges['node_ID_x'].tolist(), edges['node_ID_y'].tolist(), weights))

    return graph

Writing processing.py


### write disambiguation module

In [None]:
# disambiguation
%%writefile disambiguation.py

import pandas as pd
import networkx as nx
import hdbscan
from itertools import islice
#import disambiguation.processing as dp
import processing as dp
import time

"""
Wrapper function for everything below, including checks
Designed to work within list comprehension only! (refer to Disambiguator())
Works by applying algorithm to specified df (using index i) in the list
sub_groups: list of dfs, each df being a subset of the census bounded by 2 anchors
i: index of df in the list
"""
def apply_algo(sub_groups, i, cluster=True, k_between=True, census_id='CENSUS_ID', census_count="census_count", confidence='confidence_score', lat="CD_X", lon="CD_Y", cluster_kwargs=None, path_kwargs=None):
    """
    Run spatial disambiguation on one sub df of the list.
    sub_groups: list of dfs, each df being a subset of the census bounded by 2 anchors
    i: index of the df to process
    cluster: whether to apply density clustering before building the graph
    k_between: use k shortest paths betweenness if True, a single shortest path otherwise
    census_id / census_count / confidence / lat / lon: column names
    cluster_kwargs: extra keyword arguments for apply_density_clustering (default none)
    path_kwargs: extra keyword arguments for the path algorithm (default none)
    Returns the sub df unchanged when no census record has more than one candidate,
    otherwise the sub df (plus the next group's first row as bottom anchor) with a
    'spatial_weight' column.
    """
    # None instead of {} as default: a dict default is shared between calls
    cluster_kwargs = {} if cluster_kwargs is None else cluster_kwargs
    path_kwargs = {} if path_kwargs is None else path_kwargs

    if i % 1000 == 0:
        print("Reached: " + str(i))

    df = sub_groups[i]
    if not (df[census_count] > 1).any():  # no disambiguation needed
        return df

    if i + 1 < len(sub_groups):  # add bottom anchor
        df = pd.concat([df, sub_groups[i+1][0:1]])

    path_df = dp.create_path_df(df, census_id)

    if cluster:
        # apply density clustering; outlier nodes are penalised when the graph is built
        path_df = apply_density_clustering(path_df, lat, lon, **cluster_kwargs)
        cluster_arg = 'in_cluster_x'
    else:
        cluster_arg = None

    # create graph, then score the nodes along the shortest path(s)
    g = dp.create_path_graph(path_df, cluster_col=cluster_arg, lat=lat, lon=lon)

    if k_between:
        return apply_k_betweenness(path_df, g, confidence = confidence, **path_kwargs)
    return apply_shortest_path(path_df, g, confidence = confidence, **path_kwargs)

"""
Apply Dijkstra's algorithm to the graph and get spatial weights
Spatial weights are computed as confidence score +1 if match was included in shortest path, and confidence score + 0 otherwise
df: dataframe of records with confidence score and node ID, names can be modified via parameters
graph: graph object created from create_path_graph()
source: start node, e.g. 'A0'. By default it chooses first row in the table
target: end node, e.g. 'J0'. By default it chooses last row in the table
Returns a dataframe of matches with added 'spatial weight'
"""
def apply_shortest_path(df, graph, source = None, target = None, confidence = 'confidence_score', node_id = 'node_ID'):
    """
    Apply Dijkstra's algorithm to the graph and compute spatial weights.
    df: dataframe of records with confidence score and node ID columns
    graph: graph object created from create_path_graph()
    source: start node. By default the node of the first row of df
    target: end node. By default the node of the last row of df
    confidence: name of confidence score column
    node_id: name of node ID column
    Returns df (modified in place) with a 'spatial_weight' column: confidence score + 1
    for nodes on the shortest path, the confidence score alone otherwise.
    """
    if source is None:
        source = df[node_id].iloc[0]
    if target is None:
        target = df[node_id].iloc[-1]

    path = nx.dijkstra_path(graph, source, target)

    # one vectorised membership test instead of scanning the path list for every row
    on_path = df[node_id].isin(path)
    df['spatial_weight'] = df[confidence] + on_path.astype(int)

    return df

"""
Apply betweenness centrality using k shortest paths (as documented in spatial_disambiguation.ipynb)
df: df of matches
graph: graph object created from create_path_graph()
source: start node, e.g. 'A0'. By default it chooses first row in the table
target: end node, e.g. 'J0'. By default it chooses last row in the table
k: how many shortest paths to use (absolute number). By default 1 if there are 30 or fewer possible paths, half the number of possible paths if there are 31 to 50, and 50 if there are more than 50
scale: how much to scale the score by when adding it with confidence score. Default = 1 (equal weight of confidence score and spatial weight)
Returns
    df: df with spatial weights column
    (the k paths used for the calculation are not returned)
"""
def apply_k_betweenness(df, graph, confidence = "confidence_score", source=None, target=None, k=None, scale=1):
    """
    Score each node by the share of the k shortest paths that pass through it.
    df: df of matches with 'node_ID' and 'letter' columns
    graph: graph object created from create_path_graph()
    confidence: name of confidence score column
    source: start node. By default the node of the first row of df
    target: end node. By default the node of the last row of df
    k: how many shortest paths to use. By default 1 when get_n_paths(df) is 30 or less,
       half of it when it is 31 to 50, and 50 when it is above 50
    scale: multiplier applied to the path share before it is added to the confidence score
    Returns df restricted to the nodes of the graph, with a 'spatial_weight' column equal to
    confidence score + scale * (share of the paths used that contain the node).
    """
    if source is None:
        source = df["node_ID"].iloc[0]
    if target is None:
        target = df["node_ID"].iloc[-1]

    if k is None:
        n_possible = get_n_paths(df)
        if n_possible < 31:
            k = 1
        elif n_possible > 50:
            k = 50
        else:
            k = int(0.5 * n_possible)

    path_iter = nx.shortest_simple_paths(graph, source, target, weight="weight")
    k_paths = list(islice(path_iter, k))

    # Normalise by the number of paths actually found: fewer than k can exist, because
    # get_n_paths() also counts the alternatives of the source and target records.
    n_used = max(len(k_paths), 1)

    # initialize output: dict with nodes as keys
    visits = dict.fromkeys(graph.nodes, 0)
    for path in k_paths:
        for node in path:
            visits[node] += 1

    spatial_weights = [[node, round(count / n_used, 2) * scale] for node, count in visits.items()]
    spatial_df = pd.DataFrame(spatial_weights, columns=["node_ID", 'spatial_weight'])
    df = df.merge(spatial_df, how="inner", on="node_ID", validate="one_to_one")
    df['spatial_weight'] = df['spatial_weight'] + df[confidence]

    return df

"""
Helper method to count the number of possible paths in the graph
"""
def get_n_paths(df):
    """
    Count the possible paths: the product of the number of rows per 'letter'.
    Returns 1 when df has no rows.
    """
    n_paths = 1
    for layer_size in df['letter'].value_counts(sort=False).tolist():
        n_paths = n_paths * layer_size

    return n_paths

"""
Apply density based clustering to detect outliers. Requires `hdbscan` library
Refer to hdbscan documentation on parameters
Returns df with a column 'in_cluster' indicating which cluster the nodes are in
"""
def apply_density_clustering(df, lat='CD_X', lon="CD_Y", min_cluster_size=10, min_samples=10, allow_single_cluster=True, **kwargs):
    """
    Label each row with its HDBSCAN cluster, computed on the lon/lat columns.
    df: df of candidate matches (modified in place and returned)
    lat / lon: names of the coordinate columns
    min_cluster_size, min_samples, allow_single_cluster, **kwargs: passed to hdbscan.HDBSCAN
    Returns df with an 'in_cluster' column holding the fitted cluster labels
    """
    # NOTE(review): presumably fails when df has fewer rows than min_samples - confirm with hdbscan
    coordinates = df.loc[:, [lon, lat]]
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        allow_single_cluster=allow_single_cluster,
        **kwargs
    )
    fitted = clusterer.fit(coordinates)

    labels = pd.Series(fitted.labels_)
    df['in_cluster'] = labels.values
    return df

"""
A bipartite graph is created from the matches, with each node being either a census or CD record and each edge indicating a potential match. 
Note that subgraph MUST have prefixes on the cd_id ('CD_') and census_id ('CENSUS_') columns
The matching algorithm (maximum weighted matching) will 
    (1) select sets of matches that give the highest number of matches 
    (2) choose the match set that has the highest weight based on that
Returns a dictionary with 'graph' as the list of bipartite graphs and 'results' being the original df with an additional 'selected' column, indicating the correct match and 'graph_id' column, indicated subgraph.
"""
def get_matches(df, cd_id = 'CD_ID', census_id = 'CENSUS_ID', weight = 'spatial_weight'):
    """
    Select the final matches with maximum weighted bipartite matching.
    df: df of candidate matches, one row per unique (cd_id, census_id) pair
    cd_id / census_id: names of the ID columns. The two columns must not share any value
        (hence the 'CD_' / 'CENSUS_' prefixes), because both become nodes of one graph
    weight: name of the column used as edge weight
    Returns a dictionary with
        'graph': list of the connected bipartite subgraphs
        'results': df with 'selected' (1 for chosen matches, 0 otherwise) and 'graph_ID'
                   (index of the subgraph the CD record belongs to)
    """
    b = nx.Graph()
    b.add_weighted_edges_from(zip(df[cd_id].tolist(), df[census_id].tolist(), df[weight].tolist()))

    # the algorithm is too expensive on the entire graph, and the graph is disconnected anyway:
    # apply it to each connected subgraph instead
    subgraphs = [b.subgraph(c) for c in nx.connected_components(b)]

    # matched pairs come back in arbitrary order: put the CD node first
    cd_nodes = set(df[cd_id])
    pairs = [
        (u, v) if u in cd_nodes else (v, u)
        for graph in subgraphs
        for u, v in nx.max_weight_matching(graph, maxcardinality = True)
    ]
    matches = pd.DataFrame(pairs, columns=[cd_id, census_id])
    matches['selected'] = 1

    df = df.merge(matches, how='left', on=[cd_id, census_id], validate='one_to_one')
    df['selected'] = df['selected'].fillna(0)

    # add subgraph id, keyed on the cd_id column whatever its name
    subgraph_id = pd.DataFrame(
        [(node, i) for i, graph in enumerate(subgraphs) for node in graph.nodes if node in cd_nodes],
        columns=[cd_id, 'graph_ID'])
    df = df.merge(subgraph_id, how="inner", on=cd_id, validate="many_to_one")

    return {'graph': subgraphs, 'results': df}

Writing disambiguation.py


### write analysis module

In [None]:
# analysis
%%writefile analysis.py

from haversine import haversine, Unit
from numpy import log
#import disambiguation.disambiguation as dl
import disambiguation as dl

"""
Get number of selected matches, out of total possible (ie unique CD records)
df: df with "selected" column, after running get_matches()
cd_id: name of cd_id column
"""
def get_match_rate(df, cd_id='CD_ID'):
    """
    Percentage of unique CD records that were selected, rounded to 2 decimals.
    df: df with "selected" column, after running get_matches()
    cd_id: name of cd_id column
    """
    selected_total = sum(df["selected"].values)
    unique_cd_records = df[cd_id].unique()

    return round(selected_total / len(unique_cd_records) * 100, 2)

"""
Get number of perfect matches (in terms of address) selected
df: df with "selected" column, after running get_matches()
cd_add: name of cd address column
cen_add: name of cen_add column
"""
def get_addr_success(df, cd_add='MATCH_ADDR', cen_add='CENSUS_MATCH_ADDR'):
    """
    Count the perfect address matches, and how many of them were selected.
    Addresses are compared on the text before their first comma. An address without a
    comma is compared as a whole, and a missing address never counts as a match.
    df: df with "selected" column, after running get_matches(). The columns 'cd_add_cln'
        and 'cen_add_cln' are added to it in place
    cd_add: name of cd address column
    cen_add: name of cen_add column
    Returns a dictionary with 'n_perfect_match_chosen' and 'n_perfect_match'
    """
    # split once on the first comma and keep the left part
    df['cd_add_cln'] = df[cd_add].str.split(',', n=1).str[0]
    df['cen_add_cln'] = df[cen_add].str.split(',', n=1).str[0]

    same_address = df['cd_add_cln'] == df['cen_add_cln']
    n_perfect_match = int(same_address.sum())
    n_perfect_match_chosen = int((same_address & (df['selected'] == 1)).sum())

    return {'n_perfect_match_chosen': n_perfect_match_chosen, 'n_perfect_match': n_perfect_match}

"""
Get error rate based on distance (in metres) between matched and actual address
df: df with "selected" column, after running get_matches()
cen_lon: name of census long column
cen_lat: name of census lat column
lon: name of long column
lat: name of lat column
"""
def get_dist_error(df, cen_lon='CENSUS_X', cen_lat='CENSUS_Y', lon='CD_X', lat='CD_Y'):
    """
    Add a 'dist' column: haversine distance in metres between the census and CD coordinates.
    df: df of matches (modified in place and returned)
    cen_lon / cen_lat: names of census long / lat columns
    lon / lat: names of CD long / lat columns
    """
    def metres_apart(rec):
        census_point = (rec[cen_lat], rec[cen_lon])
        cd_point = (rec[lat], rec[lon])
        return haversine(census_point, cd_point, unit=Unit.METERS)

    df['dist'] = df.apply(metres_apart, axis=1)
    return df

"""
Get the percentage of selected matches whose census record is aged 12 or under
df: df with "selected" column, after running get_matches()
age: name of census age column
"""
def get_under12_selections(df, age='CENSUS_AGE'):
    """
    Percentage of selected matches whose census age is 12 or under, rounded to 2 decimals.
    df: df with "selected" column, after running get_matches()
    age: name of census age column
    """
    is_selected = df['selected'] == 1
    n_selected = int(is_selected.sum())
    n_under12 = int(((df[age] <= 12) & is_selected).sum())

    return round(n_under12 / n_selected * 100, 2)

"""
Get df containing selected matches based on actual distances and confidence score
df: any match df containing at least cd_id, census_id, census long/lat, cd long/lat and confidence score. preferably df with 'dist' column (after get_dist_error())
"""
def get_dist_based_match(df, cen_lon='CENSUS_X', cen_lat='CENSUS_Y', lon='CD_X', lat='CD_Y', cd_id='CD_ID', census_id='CENSUS_ID', confidence='confidence_score'):
    """
    Select matches using the actual census-to-CD distance together with the confidence score.
    df: match df with cd_id, census_id, census long/lat, cd long/lat and confidence score.
        The 'dist' column is computed with get_dist_error() when it is missing
    Returns the get_matches() dictionary ('graph' and 'results'), with
    'dist_weight' = round(1 / log(dist) + confidence, 2) used as weight
    """
    dist_missing = 'dist' not in df.columns
    if dist_missing:
        df = get_dist_error(df, cen_lon=cen_lon, cen_lat=cen_lat, lon=lon, lat=lat)

    # NOTE(review): 1 / log(dist) is infinite at dist == 1 and not positive for dist < 1,
    # so exact address matches (dist == 0) get no distance bonus - confirm this is intended
    closeness = 1 / log(df['dist'])
    df['dist_weight'] = round(closeness + df[confidence], 2)

    return dl.get_matches(df, cd_id = cd_id, census_id = census_id, weight = 'dist_weight')

"""
Get false positive and false negative rates
df_algo: df with "selected" column, after running get_matches()
df_dist: df with "selected" column, after running get_dist_based_match()
Returns confusion matrix and df_algo with 'selected' now called 'selected_algo', and an additional column, 'selected_dist' which indicates 'true' matches.
"""
def compare_selections(df_algo, df_dist, cd_id="CD_ID", census_id="CENSUS_ID"):
    """
    Compare the algorithm's selections with the distance based ('true') selections.
    df_algo: df with "selected" column, after running get_matches()
    df_dist: df with "selected" column, after running get_dist_based_match()
    Returns a dictionary with
        'confusion_matrix': [[true positive, false positive], [false negative, true negative]]
        'merged_df': df_algo inner joined with df_dist, with 'selected_algo' and 'selected_dist'
    """
    keys = [cd_id, census_id]
    truth = df_dist.loc[:, keys + ['selected']]
    merged = df_algo.merge(truth, how="inner", on=keys, validate='one_to_one', suffixes=('_algo', '_dist'))

    def tally(algo_value, dist_value):
        # number of rows with this combination of algorithm / distance selection
        hits = (merged['selected_algo'] == algo_value) & (merged['selected_dist'] == dist_value)
        return int(hits.sum())

    confusion_matrix = [[tally(1, 1), tally(1, 0)], [tally(0, 1), tally(0, 0)]]
    return {'confusion_matrix': confusion_matrix, 'merged_df': merged}

Writing analysis.py


### write benchmarking module

In [None]:
# benchmark
%%writefile benchmark.py

import pandas as pd
import re
import numpy as np
#import disambiguation.analysis as da
import analysis as da


#Static methods

def get_hn(add):
    """
    Return the first run of digits in an address (the house number) as a string.
    Returns None when add is not a string or contains no digit, like get_st().
    """
    if not isinstance(add, str):
        return None
    hn = re.search(r'[0-9]+', add)
    return hn.group() if hn else None

def get_st(add):
    """
    Return the street part of an address: the capital letters and whitespace that
    follow a digit and one whitespace character, up to the next comma.
    Returns None when add is not a string or has no such part.
    """
    if not isinstance(add, str):
        return None
    st = re.search(r'(?<=[0-9]\s)[A-Z\s]+(?=,)', add)
    return st.group() if st else None

Writing benchmark.py


### write init module

In [None]:
%%writefile __init__.py

import pandas as pd
import numpy as np
import processing as dp
import disambiguation as dl
import analysis as da
import benchmark as bm
import time

class Disambiguator:
    """
    Runs the spatial disambiguation pipeline on a df of candidate CD-census matches:
    split into anchor-bounded sub dfs, weight the candidates along shortest paths,
    then select matches with bipartite matching.
    """

    def __init__(self, match_df, cd_id="CD_ID", census_id="CENSUS_ID", confidence="confidence_score", census_count='census_count', lon="CD_Y", lat="CD_X", sort_var="CENSUS_ID"):
        """
        match_df: df of candidate matches
        cd_id / census_id: names of the (prefixed) ID columns
        confidence: name of confidence score column
        census_count: name of the column counting candidates per census record
        lon / lat: names of the CD coordinate columns
        sort_var: column used to order the records before splitting
        """
        # NOTE(review): the defaults lon="CD_Y" / lat="CD_X" look swapped compared with
        # analysis.get_dist_error (lon='CD_X', lat='CD_Y') - confirm against the data.
        # initialize input
        self.input = match_df

        # initialize col names
        self.sort_var = sort_var
        self.cd_id = cd_id
        self.census_id = census_id
        self.cen_count = census_count
        self.confidence = confidence
        self.lon = lon
        self.lat = lat

        # output variables
        self.bipartite = None
        self.results = None

    def run_disambiguation(self, cluster=True, k_between=True, cluster_kwargs = None, path_kwargs = None):
        """
        Run the whole pipeline and store the outcome in self.results / self.bipartite.
        cluster: whether to apply density clustering
        k_between: use k shortest paths betweenness if True, a single shortest path otherwise
        cluster_kwargs / path_kwargs: extra keyword arguments for the clustering / path step
        Raises ValueError when the input cannot be split into any sub df.
        """
        # None instead of {} as default: a dict default is shared between calls
        cluster_kwargs = {} if cluster_kwargs is None else cluster_kwargs
        path_kwargs = {} if path_kwargs is None else path_kwargs

        start_time = time.time()
        print("Running")

        print("Creating dictionary of sub dfs (1/4)...")
        sub_groups = dp.split_dfs(self.input, self.sort_var, self.confidence)
        if not sub_groups:
            raise ValueError("No sub dfs to disambiguate: the input has no rows or no anchor")

        print("Applying algorithms iteratively (2/4)...")
        print("Number of Subgraphs: " + str(len(sub_groups)))

        # iteratively apply algorithms onto each sub df, using the configured column names
        processed = [
            dl.apply_algo(sub_groups, i, cluster=cluster, k_between=k_between,
                          census_id=self.census_id, census_count=self.cen_count,
                          confidence=self.confidence, lat=self.lat, lon=self.lon,
                          cluster_kwargs=cluster_kwargs, path_kwargs=path_kwargs)
            for i in range(len(sub_groups))
        ]

        print("Cleaning output (3/4)...")
        final = pd.concat(processed)

        # the bottom anchor added to each sub df also heads the next one
        final = final.drop_duplicates([self.cd_id, self.census_id])

        final['anchor'] = final['anchor'].fillna(0)
        if 'spatial_weight' not in final.columns:
            # no sub df needed spatial disambiguation, so none of them has the column
            final['spatial_weight'] = np.nan
        # conf + 1 when weight is null - these are the rows that did not require spatial
        # disambiguation, hence would definitely be in the shortest path
        final['spatial_weight'] = np.where(final['spatial_weight'].isnull(), final[self.confidence] + 1, final['spatial_weight'])

        print("Disambiguating (4/4)...")
        disambiguated = dl.get_matches(final, cd_id=self.cd_id, census_id=self.census_id)
        self.bipartite = disambiguated['graph']
        self.results = disambiguated['results']

        end_time = time.time()
        print("Total time:", end_time - start_time)
        print("Done! :)")

    def get_result(self):
        """Return the results df, or None before run_disambiguation()."""
        return self.results

    def get_bgraph(self):
        """Return the list of bipartite subgraphs, or None before run_disambiguation()."""
        return self.bipartite

    def save_result(self, output_fn):
        """Write the results df to output_fn as csv, without the index."""
        self.results.to_csv(output_fn, index=False)

class Disambiguator1880(Disambiguator):
    """
    Disambiguator with helpers to join extra census / CD variables onto the results
    and to run the analysis functions on them.
    """

    def __init__(self, match_df, cd_id='CD_ID', census_id='CENSUS_ID', confidence='confidence_score', lon='LONG', lat='LAT', census_count='census_count', sort_var='CENSUS_ID'):
        """
        Same column name parameters as Disambiguator, with lon/lat defaulting to 'LONG'/'LAT'.
        census_count and sort_var are forwarded to the base class.
        """
        super().__init__(match_df, cd_id=cd_id, census_id=census_id, confidence=confidence,
                         census_count=census_count, lon=lon, lat=lat, sort_var=sort_var)
        # column names needed by the analysis functions, filled in by set_var()
        self.cen_lon = None
        self.cen_lat = None
        self.cen_add = None
        self.cd_add = None

    def _require_results(self):
        # explicit check, so that unrelated AttributeErrors raised while merging are not masked
        if getattr(self, 'results', None) is None:
            raise Exception("Please run run_disambiguation() first")

    # adding in variables needed for analysis
    def merge_census_var(self, df, merge_cen_id="CENSUS_ID"):
        """Left join df onto the results, matching the census ID column to df[merge_cen_id]."""
        self._require_results()
        self.results = self.results.merge(df, how="left", left_on=self.census_id, right_on=merge_cen_id)

    def merge_cd_var(self, df, merge_cd_id="CD_ID"):
        """Left join df onto the results, matching the CD ID column to df[merge_cd_id]."""
        self._require_results()
        self.results = self.results.merge(df, how="left", left_on=self.cd_id, right_on=merge_cd_id)

    def set_var(self, var= None):
        """
        Set attributes from a dictionary of attribute name -> value.
        By default sets cen_lon, cen_lat, cen_add and cd_add to their standard column names.
        """
        if var is None:
            var = {'cen_lon': 'CENSUS_X', 'cen_lat': 'CENSUS_Y', 'cen_add': 'CENSUS_MATCH_ADDR', 'cd_add': 'MATCH_ADDR'}

        for key, value in var.items():
            setattr(self, key, value)

    # functions for analysis
    def get_match_rate(self):
        """Percentage of unique CD records selected (see analysis.get_match_rate)."""
        return da.get_match_rate(self.results, cd_id=self.cd_id)

    def get_addr_success(self):
        """Perfect address match counts (see analysis.get_addr_success). Requires set_var()."""
        if self.cd_add is None or self.cen_add is None:
            raise Exception("Please check that cd_add and cen_add have been initialized through set_var")

        return da.get_addr_success(self.results, cd_add=self.cd_add, cen_add=self.cen_add)

    def get_dist_error(self):
        """Results with a 'dist' column (see analysis.get_dist_error). Requires set_var()."""
        if self.cen_lon is None or self.cen_lat is None:
            raise Exception("Please check that cen_lon and cen_lat have been initialized through set_var")

        return da.get_dist_error(self.results, cen_lon=self.cen_lon, cen_lat=self.cen_lat, lon=self.lon, lat=self.lat)

    def get_under12_selections(self, age='CENSUS_AGE'):
        """Percentage of selected matches aged 12 or under (see analysis.get_under12_selections)."""
        return da.get_under12_selections(self.results, age=age)

class Benchmark():
    """
    Builds a distance based benchmark from the match, census and city directory data
    and compares a disambiguation result against it.
    """

    # CENSUS_Y values above the threshold are overwritten with the replacement value.
    # NOTE(review): presumably repairs corrupted latitudes - confirm the origin of 40.799935
    BAD_CENSUS_Y_THRESHOLD = 1000
    CENSUS_Y_REPLACEMENT = 40.799935

    def __init__(self, match, census, cd):
        """
        match: df of candidate matches (needs MATCH_ADDR, CENSUS_MATCH_ADDR and OBJECTID)
        census: census df with CENSUS_MATCH_ADDR, CENSUS_Y and CENSUS_X
        cd: city directory df with OBJECTID, LONG, LAT and CD_BLOCK_NUM
        """
        # Format lat/long for census data
        self.census = self.set_census(census)
        self.cd = self.set_cd(cd)
        self.match = match
        self.benchmark = self.create_benchmark()

        # Variables to set
        self.disambiguated = None
        self.confidence = None

        # outputs
        self.benchmark_results = None
        self.confusion_matrix = None

    def set_census(self, census):
        """Return the unique address/coordinate rows of census, with bad CENSUS_Y values replaced."""
        census = census.loc[:, ['CENSUS_MATCH_ADDR', 'CENSUS_Y', 'CENSUS_X']].drop_duplicates()  # select diff variables
        census.loc[census.CENSUS_Y > self.BAD_CENSUS_Y_THRESHOLD, 'CENSUS_Y'] = self.CENSUS_Y_REPLACEMENT
        return census

    def set_cd(self, cd):
        """Return the city directory columns used for benchmarking."""
        return cd[['OBJECTID', 'LONG', 'LAT', "CD_BLOCK_NUM"]]

    def set_disambiguated(self, disambiguated):
        """Store the disambiguation results df to compare against the benchmark."""
        self.disambiguated = disambiguated

    def set_confidence(self, confidence):
        """Store the confidence column name and reset the benchmark's 'confidence_score' to 'add_match'."""
        self.confidence = confidence
        # Need to figure out what's going on here
        self.benchmark['confidence_score'] = self.benchmark['add_match'] #+ self.benchmark[self.confidence]

    def create_benchmark(self):
        """
        Join the census coordinates onto the matches and score the address agreement:
        add_match = 0.1 if the house numbers agree + 0.9 if the street names agree.
        City directory columns that the matches do not already have are joined on OBJECTID.
        """
        benchmark = self.match.merge(self.census, how='left', on='CENSUS_MATCH_ADDR', validate='many_to_one')

        benchmark['cd_hn'] = benchmark['MATCH_ADDR'].map(bm.get_hn)
        benchmark['cen_hn'] = benchmark['CENSUS_MATCH_ADDR'].map(bm.get_hn)
        benchmark['cd_add_cln'] = benchmark['MATCH_ADDR'].map(bm.get_st)
        benchmark['cen_add_cln'] = benchmark['CENSUS_MATCH_ADDR'].map(bm.get_st)

        benchmark['add_match'] = np.where(benchmark.cd_hn == benchmark.cen_hn, 0.1, 0) + np.where(
            benchmark.cen_add_cln == benchmark.cd_add_cln, 0.9, 0)

        # Keep the result of the join (it used to be thrown away). Only the columns the
        # matches lack are added, so existing LONG / LAT columns keep their names.
        missing = [col for col in self.cd.columns if col != 'OBJECTID' and col not in benchmark.columns]
        if missing:
            benchmark = benchmark.merge(self.cd[['OBJECTID'] + missing], how='left', on='OBJECTID', validate='many_to_one')

        return benchmark

    def run_benchmarking(self):
        """Compute the distance based matches and the confusion matrix against the stored results."""
        if self.confidence is None:
            raise Exception("Please set confidence first")
        if self.disambiguated is None:
            raise Exception("Please set disambiguated first")

        # NOTE(review): unlike Benchmark_v02 this uses the default 'confidence_score'
        # column (set to 'add_match' by set_confidence), not self.confidence - confirm
        self.benchmark_results = da.get_dist_based_match(self.benchmark, lon = "LONG", lat = "LAT")['results']
        self.confusion_matrix = da.compare_selections(self.disambiguated, self.benchmark_results)['confusion_matrix']

    def get_benchmark(self):
        """Return the benchmark df."""
        return self.benchmark

    def get_benchmark_results(self):
        """Return the distance based match results; raises if benchmarking has not been run."""
        if self.benchmark_results is None:
            raise Exception("Please run benchmarking first")
        return self.benchmark_results

    def get_confusion_matrix(self):
        """Return the confusion matrix; raises if benchmarking has not been run."""
        if self.confusion_matrix is None:
            raise Exception("Please run benchmarking first")
        return self.confusion_matrix

#version for new run where cd/census information does not need to be joined in
class Benchmark_v02():
    """
    Benchmark for the new run, where the match df already carries the CD and census
    information, so nothing needs to be joined in.
    """

    def __init__(self, match):
        """match: df of candidate matches with CD_H_ADDRESS and CENSUS_MATCH_ADDR columns"""
        self.match = match
        self.benchmark = self.create_benchmark()

        # set later through set_disambiguated() / set_confidence()
        self.disambiguated = None
        self.confidence = None

        # filled in by run_benchmarking()
        self.benchmark_results = None
        self.confusion_matrix = None

    def set_disambiguated(self, disambiguated):
        """Store the disambiguation results df to compare against the benchmark."""
        self.disambiguated = disambiguated

    def set_confidence(self, confidence):
        """Store the confidence column name and reset the benchmark's 'confidence_score' to 'add_match'."""
        self.confidence = confidence
        # Need to figure out what's going on here
        address_score = self.benchmark['add_match'] #+ self.benchmark[self.confidence]
        self.benchmark['confidence_score'] = address_score

    def create_benchmark(self):
        """
        Copy the matches and score the address agreement:
        add_match = 0.1 if the house numbers agree + 0.9 if the street names agree.
        """
        benchmark = self.match.copy()

        # change to use actual street/house numbers once they are sorted out for the census data
        derived_columns = (
            ('cd_hn', bm.get_hn, 'CD_H_ADDRESS'),
            ('cen_hn', bm.get_hn, 'CENSUS_MATCH_ADDR'),
            ('cd_add_cln', bm.get_st, 'CD_H_ADDRESS'),
            ('cen_add_cln', bm.get_st, 'CENSUS_MATCH_ADDR'),
        )
        for new_col, extract, source_col in derived_columns:
            benchmark[new_col] = benchmark.apply(lambda row, fn=extract, src=source_col: fn(row[src]), axis=1)

        house_number_part = np.where(benchmark.cd_hn == benchmark.cen_hn, 0.1, 0)
        street_part = np.where(benchmark.cen_add_cln == benchmark.cd_add_cln, 0.9, 0)
        benchmark['add_match'] = house_number_part + street_part

        return benchmark

    def run_benchmarking(self):
        """Compute the distance based matches and the confusion matrix against the stored results."""
        for value, label in ((self.confidence, "confidence"), (self.disambiguated, "disambiguated")):
            if value is None:
                raise Exception("Please set " + label + " first")

        dist_based = da.get_dist_based_match(self.benchmark, confidence = self.confidence)
        self.benchmark_results = dist_based['results']
        comparison = da.compare_selections(self.disambiguated, self.benchmark_results)
        self.confusion_matrix = comparison['confusion_matrix']

    def get_benchmark(self):
        """Return the benchmark df."""
        return self.benchmark

    def get_benchmark_results(self):
        """Return the distance based match results; raises if benchmarking has not been run."""
        if self.benchmark_results is not None:
            return self.benchmark_results
        raise Exception("Please run benchmarking first")

    def get_confusion_matrix(self):
        """Return the confusion matrix; raises if benchmarking has not been run."""
        if self.confusion_matrix is not None:
            return self.confusion_matrix
        raise Exception("Please run benchmarking first")

Writing __init__.py


### confidence score tuning

In [None]:
%%writefile confidence_score_tuning.py
from __init__ import Disambiguator, Disambiguator1880, Benchmark, Benchmark_v02

"""
Purpose: Generate confidence scores as a list
df: elastic search dataframe formatted for disambiguation
columns: columns we want to use to create confidence score
weights: corresponding weights we want to use to create confidence score, should sum to one
"""
def confidence_score(df, columns, weights):
    """
    Purpose: Generate confidence scores as a list, one weighted sum per row of df
    df: elastic search dataframe formatted for disambiguation
    columns: columns we want to use to create confidence score
    weights: corresponding weights we want to use to create confidence score, should sum to one
    Raises ValueError when columns and weights differ in length
    """
    columns = list(columns)
    weights = list(weights)
    if len(columns) != len(weights):
        # zip would silently drop the extra columns or weights
        raise ValueError("columns and weights must have the same length, got "
                         + str(len(columns)) + " and " + str(len(weights)))
    if not columns:
        return [0] * len(df)

    # column-wise weighted sum instead of a Python-level loop over the rows
    total = sum(df[col] * w for col, w in zip(columns, weights))
    return total.tolist()

"""
#Unneeded in new data run
Purpose: Format census data for benchmarking
census: census data 
"""
def census_for_disamb(census):
    """
    Purpose: Format census data for benchmarking (unneeded in new data run)
    census: census data with OBJECTID.x, CENSUS_X and CENSUS_Y columns
    Returns a new df with CENSUS_ID ('CENSUS_' + OBJECTID.x), CENSUS_X and CENSUS_Y,
    where CENSUS_Y values above 1000 are replaced with 40.799935
    """
    prefixed_ids = 'CENSUS_' + census['OBJECTID.x'].astype(str)
    formatted = census.assign(CENSUS_ID=prefixed_ids).loc[:, ['CENSUS_ID', 'CENSUS_X', 'CENSUS_Y']]

    # overwrite out-of-range CENSUS_Y values
    out_of_range = formatted['CENSUS_Y'] > 1000
    formatted['CENSUS_Y'] = formatted['CENSUS_Y'].mask(out_of_range, 40.799935)
    return formatted

"""
param_grid: list of dictionaries with names of columns to use for a trial cf score and corresponding weights
df_allcols: elastic search output formatted for disambiguation
df_census: census data for benchmarking
df_cd: city directory data for benchmarking
"""
def confidence_score_tuning(param_grid, df_allcols, df_census, df_cd):
    """
    Run the disambiguation and benchmarking once per entry of param_grid.
    param_grid: list of dictionaries with names of columns to use for a trial cf score and corresponding weights
    df_allcols: elastic search output formatted for disambiguation
    df_census: census data for benchmarking
    df_cd: city directory data for benchmarking
    Returns a dictionary keyed by 'confidence_score_<i>' with the columns, weights, match rate,
    address success, under 12 proportion and confusion matrix of each trial. A trial whose
    disambiguation fails is reported with a printed message and left out of the results.
    """
    # Store results
    results = {}
    df = df_allcols.copy()
    names = ["confidence_score_" + str(i) for i in range(len(param_grid))]

    # Get confidence score for each value in grid
    for name, params in zip(names, param_grid):
        df.loc[:, name] = confidence_score(df_allcols, params["columns"], params["weights"])

    # Create benchmark object
    benchmark = Benchmark(df, df_census, df_cd)

    # Format census data for tuning
    census_tuning = census_for_disamb(df_census)

    for name, params in zip(names, param_grid):
        # Run disambiguation process (use betweeness and clustering -- based on Jolene's work)
        basic = Disambiguator1880(df, confidence=name)

        try:
            basic.run_disambiguation()
        except Exception as err:
            # Skip this trial but say why. KeyboardInterrupt and SystemExit are not
            # caught, so a long tuning run can still be stopped.
            print("Skipping " + name + ": disambiguation failed (" + repr(err) + ")")
            continue

        result = basic.get_result()  # .to_csv("..data/confidence_score_tuning/confidence_score_"+str(i))

        # Results analysis
        basic.merge_census_var(census_tuning)
        basic.set_var()

        # benchmarking
        benchmark.set_confidence(name)
        benchmark.set_disambiguated(result)
        benchmark.run_benchmarking()

        # Store results
        results[name] = {"columns": params["columns"], "weights": params["weights"],
                         "Match Rate": basic.get_match_rate(), "Address Success": basic.get_addr_success(),
                         "Under 12": basic.get_under12_selections(),
                         "confusion matrix": benchmark.get_confusion_matrix()}

    # Spit out the best columns and weights (Add this in when decide what makes something the best)
    # For now simply output the analysis
    return results

def confidence_score_tuning_v02(param_grid, df_elastic_search):
    """Tune the confidence score using the newer benchmarking class.

    Uses the new version of benchmarking: with the elastic search output the
    x/y values do not need to be joined in separately.

    Args:
        param_grid: list of dicts, each with the "columns" to combine and the
            matching "weights" for one trial confidence score.
        df_elastic_search: elastic search output used both for scoring and
            for building the benchmark.

    Returns:
        dict keyed by "confidence_score_<i>" holding, per trial, the columns,
        weights, match rate, address success, under-12 selections and
        confusion matrix.
    """
    trial_names = ["confidence_score_" + str(i) for i in range(len(param_grid))]

    # Score every trial on a private copy of the input
    scored = df_elastic_search.copy()
    for trial, params in zip(trial_names, param_grid):
        scored.loc[:, trial] = confidence_score(df_elastic_search, params["columns"], params["weights"])

    # NOTE(review): the benchmark is built from the input frame, which does not
    # carry the confidence_score_<i> columns added to the copy above, whereas
    # confidence_score_tuning passes the scored copy -- confirm this is intended.
    benchmark = Benchmark_v02(df_elastic_search)

    results = {}
    for trial, params in zip(trial_names, param_grid):
        # Disambiguate on this trial's score (betweenness and clustering --
        # based on Jolene's work). Failures propagate to the caller.
        basic = Disambiguator1880(scored, confidence=trial)
        basic.run_disambiguation()
        result = basic.get_result()

        # Results analysis
        basic.set_var()

        # Benchmarking
        benchmark.set_confidence(trial)
        benchmark.set_disambiguated(result)
        benchmark.run_benchmarking()

        # Collect the metrics in the same order they are reported
        summary = {"columns": params["columns"], "weights": params["weights"]}
        summary["Match Rate"] = basic.get_match_rate()
        summary["Address Success"] = basic.get_addr_success()
        summary["Under 12"] = basic.get_under12_selections()
        summary["confusion matrix"] = benchmark.get_confusion_matrix()
        results[trial] = summary

    # The best columns/weights are not chosen here yet; the raw analysis is returned.
    return results



Writing confidence_score_tuning.py


## confidence scoring module

In [None]:

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

%load_ext autoreload
%autoreload 2
import pandas as pd
import networkx as nx
import numpy as np
import disambiguation
from __init__ import Disambiguator, Disambiguator1880
import analysis as da
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import random
import re
import processing as dp 
from __init__ import Benchmark

### Get and Format Data

In [None]:
# Previous input, kept for reference:
#elastic_match = pd.read_csv("../../Data/matches.csv")
# NOTE: this run uses the old 1880 files; confidence score tuning still needs
# to be re-run on the new 1880 matched file.

# Elastic search match output (tab-separated)
elastic_match = pd.read_csv("/content/es-1880-21-5-2020.csv", sep='\t', engine='python')

In [None]:
# Census table; passed to the tuning run below as the census benchmarking data
census = pd.read_csv("/content/census_1880_mn_v04.csv")
def census_for_disamb(census):
    """Return the census id and coordinates in the shape used for tuning.

    Works on a copy, so the caller's frame is left untouched. The result has
    exactly three columns: 'CENSUS_ID' ("CENSUS_" + the 'OBJECTID.x' value),
    'CENSUS_X' and 'CENSUS_Y'. Any 'CENSUS_Y' above 1000 is overwritten with
    40.799935.
    """
    wanted = ['CENSUS_ID', 'CENSUS_X', 'CENSUS_Y']

    formatted = census.copy()
    # Prefix the id as text so it matches the CENSUS_ID format used elsewhere
    object_ids = formatted['OBJECTID.x'].astype(str)
    formatted['CENSUS_ID'] = 'CENSUS_' + object_ids
    formatted = formatted.loc[:, wanted]

    # Values above 1000 are replaced by a fixed fallback value
    out_of_range = formatted.CENSUS_Y > 1000
    formatted.loc[out_of_range, 'CENSUS_Y'] = 40.799935
    return formatted

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# cd_latlng: city directory table; passed to the tuning run below as the
# city directory benchmarking data
latlng = pd.read_csv("/content/cd_1880_mn_v04.csv")

In [None]:
# Build the match table from the elastic search output and the city directory
# table. NOTE(review): elastic_to_disamb lives in processing.py; presumably it
# adds the score and LONG/LAT columns shown in the preview below -- verify.
match = dp.elastic_to_disamb(elastic_match, latlng)

##### create a sample of wards

In [None]:
match.head()

Unnamed: 0,OBJECTID.x,CENSUS_NAMEFRSTB,CENSUS_NAMELASTB,CENSUS_AGE,CENSUS_OCCLABELB,CENSUS_MATCH_ADDR,CENSUS_SEGMENT_ID,WARD_NUM,CD_ED,OBJECTID,MATCH_ADDR,CD_FIRST_NAME,CD_LAST_NAME,CD_OCCUPATION,CD_FINAL_HOUSENUM,jw_fn,jw_ln,jw_score,occ_listed,age_score,cd_count,census_count,confidence_score,CD_ID,CENSUS_ID,census_count_inverse,cd_count_inverse,LONG,LAT
0,862548,STEPHEN,ZELLER,40,IRON MOULDER,"504 55TH ST W, NYC-Manhattan, NY",3789,22,513.0,3,"504 W 55 ST, New York, NY",Stephen,Zoller,molder,504,1.0,0.84,0.904,1,1,1,1,0.95,CD_3,CENSUS_862548,1.0,1.0,-73.989856,40.767868
1,795510,OSCAR,ZOLLIKOFFER,70,"PRESIDENT, METROPOLITAN GAS CO.","210 46TH ST W, NYC-Manhattan, NY",3357,22,469.0,4,"210 W 46 ST, New York, NY",Oscar,Zollikoffer,pres,210,1.0,1.0,1.0,1,1,1,1,1.0,CD_4,CENSUS_795510,1.0,1.0,-73.985856,40.758819
2,978306,OSCAR,ZOLLIKOFFER,33,"SECRETARY, MET. GAS LIGHT CO.","65 54TH ST W, NYC-Manhattan, NY",3488,19,582.0,6,"65 W 54 ST, New York, NY",Oscar,Zollikoffer,sec,65,1.0,1.0,1.0,1,1,1,1,1.0,CD_6,CENSUS_978306,1.0,1.0,-73.977969,40.762224
3,56054,ROBERT,ZALLER,31,BIRD IMPORTER,"5 WILLIAM ST N, NYC-Manhattan, NY",531,4,36.0,8,"5 N William St, New York, NY",Robert,Zoller,birds,5,1.0,0.9,0.94,1,1,1,1,0.97,CD_8,CENSUS_56054,1.0,1.0,-74.004401,40.711743
4,522583,LEOPOLD,ZOLLMANN,30,SHOE MAKER,"342 HOUSTON ST E, NYC-Manhattan, NY",1283,11,305.0,13,"344 E HOUSTON ST, New York, NY",Leopold,Zollmann,shoes,344,1.0,1.0,1.0,1,1,1,2,0.9,CD_13,CENSUS_522583,0.5,1.0,-73.981913,40.720894


In [None]:
# Keep only the matches whose WARD_NUM is in the sampled wards; the tuning
# run below works on this subset rather than the full match table.
wards = [3,9,10,18,21,22]
match_sample = match[match.WARD_NUM.isin(wards)]

### confidence score tuning


In [None]:
#function to get confidence score including specified columns and weights
def confidence_score(df, columns, weights):
    """Return one weighted score per row of ``df`` as a plain list.

    For each row the values in ``columns`` are multiplied by the weight at
    the same position in ``weights`` and added up, left to right, starting
    from 0. Columns and weights are paired with ``zip``, so any surplus
    entries in the longer of the two are ignored.
    """
    scores = []
    for _, record in df.iterrows():
        total = 0
        for column, weight in zip(columns, weights):
            total = total + record[column] * weight
        scores.append(total)
    return scores

In [None]:
def confidence_score_tuning(param_grid, df_allcols, df_census, df_cd):
    """Run disambiguation and benchmarking for every trial confidence score.

    Args:
        param_grid: list of dicts; each holds the "columns" to combine and the
            matching "weights" for one trial confidence score.
        df_allcols: match table to score and disambiguate.
        df_census: census table, used for benchmarking and for the census
            variables merged into each disambiguation result.
        df_cd: city directory table used for benchmarking.

    Returns:
        dict keyed by "confidence_score_<i>". Each value records the trial's
        columns and weights together with its match rate, address success,
        under-12 selections and confusion matrix. A trial whose
        disambiguation raises is skipped (with a warning) and has no entry.
    """
    import warnings  # function-scope import keeps this cell self-contained

    #Store results
    results = {}
    df = df_allcols.copy()

    #Get confidence score for each value in grid; every column is added before
    #the Benchmark object is built from the scored frame
    for i, params in enumerate(param_grid):
        name = "confidence_score_"+str(i)
        df.loc[:,name] = confidence_score(df_allcols, params["columns"], params["weights"])

    #Create benchmark object
    benchmark = Benchmark(df, df_census, df_cd)

    #Format census data for tuning
    census_tuning = census_for_disamb(df_census)

    for i, params in enumerate(param_grid):
        name = "confidence_score_"+str(i)

        #Run disambiguation process (use betweenness and clustering -- based on Jolene's work)
        basic = Disambiguator1880(df, confidence = name)

        try:
            basic.run_disambiguation()
        except Exception as err:
            #Best-effort sweep: one failing trial must not discard the others.
            #Catching Exception (not a bare except) lets KeyboardInterrupt and
            #SystemExit stop the run, and the warning makes the gap visible.
            warnings.warn(
                "{}: disambiguation failed, trial skipped ({!r})".format(name, err),
                RuntimeWarning,
            )
            continue

        result = basic.get_result()

        #Results analysis
        basic.merge_census_var(census_tuning)
        basic.set_var()

        #Benchmarking
        benchmark.set_confidence(name)
        benchmark.set_disambiguated(result)
        benchmark.run_benchmarking()

        #Store results
        results[name] = {"columns": params["columns"], "weights": params["weights"],
                         "Match Rate": basic.get_match_rate(),
                         "Address Success": basic.get_addr_success(),
                         "Under 12": basic.get_under12_selections(),
                         "confusion matrix": benchmark.get_confusion_matrix()}

    #TODO: select the best columns and weights once "best" is defined;
    #for now simply return the analysis of every completed trial
    return results


In [None]:
#Columns and weights: each trial lists the score columns to combine and, in
#the same order, the weight given to each of them
param_grid = [
    {"columns": ['jw_score', 'cd_count_inverse', 'census_count_inverse', 'occ_listed', 'age_score'],
     "weights": [0.5, 0.2, 0.2, 0.05, 0.05]},
    {"columns": ['jw_score', 'cd_count_inverse', 'census_count_inverse', 'occ_listed', 'age_score'],
     "weights": [0.55, 0.18, 0.18, 0.05, 0.04]},
    #Best outcome
    {"columns": ['jw_score', 'cd_count_inverse', 'census_count_inverse', 'occ_listed', 'age_score'],
     "weights": [0.7, 0.1, 0.1, 0.05, 0.05]},
    {"columns": ['jw_score', 'cd_count_inverse', 'census_count_inverse', 'occ_listed', 'age_score'],
     "weights": [0.6, 0.1, 0.1, 0.1, 0.1]},
    {"columns": ['jw_score', 'occ_listed', 'age_score'],
     "weights": [0.8, 0.10, 0.10]},
    {"columns": ['jw_score', 'census_count_inverse', 'occ_listed', 'age_score'],
     "weights": [0.6, 0.15, 0.1, 0.15]},
    {"columns": ['jw_score', 'cd_count_inverse', 'census_count_inverse', 'occ_listed', 'age_score'],
     "weights": [0.8, 0.05, 0.05, 0.05, 0.05]},
    {"columns": ['jw_score', 'cd_count_inverse', 'census_count_inverse', 'occ_listed', 'age_score'],
     "weights": [0.6, 0.15, 0.15, 0.05, 0.05]},
]

#Run every trial on the ward sample
tuning_results = confidence_score_tuning(param_grid, match_sample, census, latlng)


   OBJECTID.x CENSUS_NAMEFRSTB  ...        LAT  confidence_score_0
0      862548          STEPHEN  ...  40.767868               0.952
1      795510            OSCAR  ...  40.758819               1.000
6      330206           HERMAN  ...  40.716711               1.000
7      855850          MICHAEL  ...  40.766168               1.000
8      796365            RICKA  ...  40.758155               0.967

[5 rows x 30 columns]
   OBJECTID.x CENSUS_NAMEFRSTB  ... confidence_score_0  confidence_score_1
0      862548          STEPHEN  ...              0.952              0.9472
1      795510            OSCAR  ...              1.000              1.0000
6      330206           HERMAN  ...              1.000              1.0000
7      855850          MICHAEL  ...              1.000              1.0000
8      796365            RICKA  ...              0.967              0.9637

[5 rows x 31 columns]
   OBJECTID.x CENSUS_NAMEFRSTB  ... confidence_score_1  confidence_score_2
0      862548          STEP

In [None]:
display(pd.DataFrame.from_dict(tuning_results))

Unnamed: 0,confidence_score_0,confidence_score_1,confidence_score_2,confidence_score_5,confidence_score_7
columns,"[jw_score, cd_count_inverse, census_count_inve...","[jw_score, cd_count_inverse, census_count_inve...","[jw_score, cd_count_inverse, census_count_inve...","[jw_score, census_count_inverse, occ_listed, a...","[jw_score, cd_count_inverse, census_count_inve..."
weights,"[0.5, 0.2, 0.2, 0.05, 0.05]","[0.55, 0.18, 0.18, 0.05, 0.04]","[0.7, 0.1, 0.1, 0.05, 0.05]","[0.6, 0.15, 0.1, 0.15]","[0.6, 0.15, 0.15, 0.05, 0.05]"
Match Rate,98.48,98.48,98.48,98.48,98.48
Address Success,"{'n_perfect_match_chosen': 8445, 'n_perfect_ma...","{'n_perfect_match_chosen': 8443, 'n_perfect_ma...","{'n_perfect_match_chosen': 8445, 'n_perfect_ma...","{'n_perfect_match_chosen': 8441, 'n_perfect_ma...","{'n_perfect_match_chosen': 8446, 'n_perfect_ma..."
Under 12,2.6,2.68,2.67,2.45,2.65
confusion matrix,"[[30642, 3776], [3776, 11333]]","[[30627, 3791], [3791, 11318]]","[[30633, 3785], [3785, 11324]]","[[30591, 3827], [3827, 11282]]","[[30634, 3784], [3784, 11325]]"


In [None]:
# Second run of the tuning with the same arguments as the tuning_results run,
# kept under a separate name
benchmark_test = confidence_score_tuning(param_grid, match_sample, census, latlng)

   OBJECTID.x CENSUS_NAMEFRSTB  ...        LAT  confidence_score_0
0      862548          STEPHEN  ...  40.767868               0.952
1      795510            OSCAR  ...  40.758819               1.000
6      330206           HERMAN  ...  40.716711               1.000
7      855850          MICHAEL  ...  40.766168               1.000
8      796365            RICKA  ...  40.758155               0.967

[5 rows x 30 columns]
   OBJECTID.x CENSUS_NAMEFRSTB  ... confidence_score_0  confidence_score_1
0      862548          STEPHEN  ...              0.952              0.9472
1      795510            OSCAR  ...              1.000              1.0000
6      330206           HERMAN  ...              1.000              1.0000
7      855850          MICHAEL  ...              1.000              1.0000
8      796365            RICKA  ...              0.967              0.9637

[5 rows x 31 columns]
   OBJECTID.x CENSUS_NAMEFRSTB  ... confidence_score_1  confidence_score_2
0      862548          STEP

In [None]:
display(pd.DataFrame.from_dict(benchmark_test))

Unnamed: 0,confidence_score_0,confidence_score_1,confidence_score_2,confidence_score_5,confidence_score_7
columns,"[jw_score, cd_count_inverse, census_count_inve...","[jw_score, cd_count_inverse, census_count_inve...","[jw_score, cd_count_inverse, census_count_inve...","[jw_score, census_count_inverse, occ_listed, a...","[jw_score, cd_count_inverse, census_count_inve..."
weights,"[0.5, 0.2, 0.2, 0.05, 0.05]","[0.55, 0.18, 0.18, 0.05, 0.04]","[0.7, 0.1, 0.1, 0.05, 0.05]","[0.6, 0.15, 0.1, 0.15]","[0.6, 0.15, 0.15, 0.05, 0.05]"
Match Rate,98.48,98.48,98.48,98.48,98.48
Address Success,"{'n_perfect_match_chosen': 8445, 'n_perfect_ma...","{'n_perfect_match_chosen': 8443, 'n_perfect_ma...","{'n_perfect_match_chosen': 8445, 'n_perfect_ma...","{'n_perfect_match_chosen': 8441, 'n_perfect_ma...","{'n_perfect_match_chosen': 8446, 'n_perfect_ma..."
Under 12,2.6,2.68,2.67,2.45,2.65
confusion matrix,"[[30642, 3776], [3776, 11333]]","[[30627, 3791], [3791, 11318]]","[[30633, 3785], [3785, 11324]]","[[30591, 3827], [3827, 11282]]","[[30634, 3784], [3784, 11325]]"


## 1850 disambiguation

In [None]:
# disambiguation on 22-09 summer file run
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
from __init__ import Disambiguator
import confidence_score_tuning as cf
import processing as dp


# Input (elastic search matches, tab-separated) and output (final match table)
elastic_search_file = "/content/es-1850-22-9-2020.csv"
match_file = "/content/1850_mn_match_v2.csv"

# Load the candidate matches and add the columns needed for disambiguation
elastic_match = dp.col_for_disamb(
    pd.read_csv(elastic_search_file, sep='\t', engine='python'),
    cd_id="CD_RECORD_ID",
    cen_id="CENSUS_IPUMS_UID",
)

# Weighted confidence score over the five score columns
elastic_match.loc[:, "confidence_score"] = cf.confidence_score(
    elastic_match,
    ['jw_score', 'cd_count_inverse', 'census_count_inverse', 'occ_listed', 'age_score'],
    [0.7, 0.1, 0.1, 0.05, 0.05],
)

# Summary counts of the candidate matches, computed first and reported below
n_matches = len(elastic_match)
n_unique_cd = len(elastic_match['OBJECTID'].drop_duplicates())
n_unique_census = len(elastic_match['CENSUS_IPUMS_UID'].drop_duplicates())
one_to_one = elastic_match[(elastic_match['census_count'] == 1) & (elastic_match['cd_count'] == 1)]
n_under_12 = len(elastic_match[elastic_match['CENSUS_AGE'] <= 12])
n_unique_under_12 = len(elastic_match[elastic_match['CENSUS_AGE'] <= 12]['CENSUS_IPUMS_UID'].drop_duplicates())
n_anchors = len(elastic_match[elastic_match['confidence_score'] == 1])

print("No. of matches: " + str(n_matches))
print("No. of unique CD records: " + str(n_unique_cd))
print("No. of unique Census records: " + str(n_unique_census))
print("No. of 1:1 matches: " + str(len(one_to_one)))
print("No. of matches where census record <= 12: " + str(n_under_12))
print("No. of unique matches where census record <= 12: " + str(n_unique_under_12))
print("No. of anchors (confidence score = 1): " + str(n_anchors))

# Disambiguate using the city directory coordinates, sorted by census index
disambiguate = Disambiguator(elastic_match, lon='CD_X', lat='CD_Y', sort_var='CENSUS_INDEX')
disambiguate.run_disambiguation()
result = disambiguate.get_result()

# Report how many records ended up with a final match
n_selected = sum(result.selected)
n_cd_matched = len(result['CD_ID'].drop_duplicates())

print("records with a final match:", n_selected)
print("all records matched:", len(result))
print("number of cd records matched:", n_cd_matched)
print("proportion of cd records in elastic search included in final match:", n_selected / n_cd_matched)

result.to_csv(match_file, index=False)


No. of matches: 63312
No. of unique CD records: 25089
No. of unique Census records: 47306
No. of 1:1 matches: 11973
No. of matches where census record <= 12: 14877
No. of unique matches where census record <= 12: 10627
No. of anchors (confidence score = 1): 6702
Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 6702
Reached: 0
Reached: 1000
Reached: 2000
Reached: 3000
Reached: 4000
Reached: 5000
Reached: 6000
Cleaning output (3/4)...
Disambiguating (4/4)...
Total time: 297.37249851226807
Done! :)
records with a final match: 24585.0
all records matched: 63312
number of cd records matched: 25089
proportion of cd records in elastic search included in final match: 0.9799115150065766


In [None]:
# disambiguation on 24-11 fall file run

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
from __init__ import Disambiguator
import confidence_score_tuning as cf
import processing as dp


# Input (elastic search matches, default comma-separated read) and output
# (final match table)
elastic_search_file = "/content/es-1850-24-11-2020.csv"
match_file = "/content/1850_mn_match_24-11-2020.csv"

# Load the candidate matches and add the columns needed for disambiguation
elastic_match = dp.col_for_disamb(
    pd.read_csv(elastic_search_file),
    cd_id="CD_RECORD_ID",
    cen_id="CENSUS_IPUMS_UID",
)

# Weighted confidence score over the five score columns
elastic_match.loc[:, "confidence_score"] = cf.confidence_score(
    elastic_match,
    ['jw_score', 'cd_count_inverse', 'census_count_inverse', 'occ_listed', 'age_score'],
    [0.7, 0.1, 0.1, 0.05, 0.05],
)

# Summary counts of the candidate matches, computed first and reported below
n_matches = len(elastic_match)
n_unique_cd = len(elastic_match['OBJECTID'].drop_duplicates())
n_unique_census = len(elastic_match['CENSUS_IPUMS_UID'].drop_duplicates())
one_to_one = elastic_match[(elastic_match['census_count'] == 1) & (elastic_match['cd_count'] == 1)]
n_under_12 = len(elastic_match[elastic_match['CENSUS_AGE'] <= 12])
n_unique_under_12 = len(elastic_match[elastic_match['CENSUS_AGE'] <= 12]['CENSUS_IPUMS_UID'].drop_duplicates())
n_anchors = len(elastic_match[elastic_match['confidence_score'] == 1])

print("No. of matches: " + str(n_matches))
print("No. of unique CD records: " + str(n_unique_cd))
print("No. of unique Census records: " + str(n_unique_census))
print("No. of 1:1 matches: " + str(len(one_to_one)))
print("No. of matches where census record <= 12: " + str(n_under_12))
print("No. of unique matches where census record <= 12: " + str(n_unique_under_12))
print("No. of anchors (confidence score = 1): " + str(n_anchors))

# Disambiguate using the city directory coordinates, sorted by census index
disambiguate = Disambiguator(elastic_match, lon='CD_X', lat='CD_Y', sort_var='CENSUS_INDEX')
disambiguate.run_disambiguation()
result = disambiguate.get_result()

# Report how many records ended up with a final match
n_selected = sum(result.selected)
n_cd_matched = len(result['CD_ID'].drop_duplicates())

print("records with a final match:", n_selected)
print("all records matched:", len(result))
print("number of cd records matched:", n_cd_matched)
print("proportion of cd records in elastic search included in final match:", n_selected / n_cd_matched)

result.to_csv(match_file, index=False)


No. of matches: 89294
No. of unique CD records: 32706
No. of unique Census records: 65162
No. of 1:1 matches: 14737
No. of matches where census record <= 12: 21681
No. of unique matches where census record <= 12: 15281
No. of anchors (confidence score = 1): 6725
Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 6725
Reached: 0
Reached: 1000
Reached: 2000
Reached: 3000
Reached: 4000
Reached: 5000
Reached: 6000
Cleaning output (3/4)...
Disambiguating (4/4)...
Total time: 458.5079038143158
Done! :)
records with a final match: 31818.0
all records matched: 89294
number of cd records matched: 32706
proportion of cd records in elastic search included in final match: 0.9728490185287103
