In [None]:
!pip install pyjarowinkler
!pip install haversine
!pip install hdbscan

### write processing module

In [None]:
# processing
# Contains functions related to dataframe manipulation, e.g. joining dwellings to all records, etc.
%%writefile processing.py

import pandas as pd
import networkx as nx
import numpy as np
from pyjarowinkler import distance
from haversine import haversine, Unit
import time


def col_for_disamb(df, cd_id, cen_id, cd_fn="CD_FIRST_NAME", cen_fn="CENSUS_FIRST_NAME", cd_ln="CD_LAST_NAME",
                           cen_ln="CENSUS_LAST_NAME", cen_occ="CENSUS_OCCUPATION", cen_age="CENSUS_AGE"):
    """
    Add the feature columns used for confidence scoring and disambiguation.

    df: table with one row per candidate CD-census match; modified in place and returned.
    cd_id, cen_id: names of the city-directory and census record ID columns.
    cd_fn, cen_fn, cd_ln, cen_ln: names of the first/last name columns.
    cen_occ, cen_age: names of the census occupation and age columns.

    Columns written: jw_fn, jw_ln, jw_score, occ_listed, age_score, cd_count,
    census_count, census_count_inverse, cd_count_inverse, CD_ID, CENSUS_ID.
    """
    def name_similarity(cd_col, cen_col):
        # Row-wise Jaro-Winkler similarity between a CD name column and a census name column.
        return df.apply(
            lambda rec: distance.get_jaro_distance(rec[cd_col], rec[cen_col], winkler=True, scaling=0.1),
            axis=1,
        )

    # name similarity: last name weighs more than first name
    df["jw_fn"] = name_similarity(cd_fn, cen_fn)
    df["jw_ln"] = name_similarity(cd_ln, cen_ln)
    df["jw_score"] = 0.4 * df["jw_fn"] + 0.6 * df["jw_ln"]

    # 1 when an occupation is recorded ('*' and missing both count as not recorded)
    occupation = df[cen_occ]
    df['occ_listed'] = np.where(occupation.notnull() & (occupation != '*'), 1, 0)

    # 0 for ages up to and including 12, 1 otherwise (missing ages also get 1)
    df['age_score'] = np.where(df[cen_age] <= 12, 0, 1)

    # number of candidate rows sharing the same CD record / the same census record
    df["cd_count"] = df.groupby(cd_id)[cen_id].transform('count')
    df["census_count"] = df.groupby(cen_id)[cd_id].transform('count')

    for count_col in ('census_count', 'cd_count'):
        df[count_col + '_inverse'] = 1 / df[count_col]

    # The prefixes keep CD and census nodes distinguishable in the bipartite matching step.
    df['CD_ID'] = 'CD_' + df[cd_id].astype(str)
    df['CENSUS_ID'] = 'CENSUS_' + df[cen_id].astype(str)

    return df


"""
Applies confidence score to df
"""
def apply_confidence_score(df, cd_fn = "CD_FIRST_NAME", cen_fn = "CENSUS_FIRST_NAME", cd_ln = "CD_LAST_NAME", cen_ln = "CENSUS_LAST_NAME", cen_occ = "CENSUS_OCCLABELB", cen_age = "CENSUS_AGE", cd_id="OBJECTID", cen_id="OBJECTID.x"):
    """
    Compute the confidence score of every candidate CD-census match.

    df: table with one row per candidate match; modified in place and returned.
    cd_fn, cen_fn, cd_ln, cen_ln: names of the first/last name columns.
    cen_occ, cen_age: names of the census occupation and age columns.
    cd_id, cen_id: names of the city-directory and census record ID columns.

    confidence_score = 0.5 * jw_score + 0.2 / cd_count + 0.2 / census_count
                       + 0.05 * occ_listed + 0.05 * age_score, rounded to 2 decimals.
    The intermediate columns (jw_fn, jw_ln, jw_score, occ_listed, age_score,
    cd_count, census_count) are written to df as well.
    """
    def jaro_winkler(cd_col, cen_col):
        # Row-wise Jaro-Winkler similarity between a CD name column and a census name column.
        return df.apply(
            lambda rec: distance.get_jaro_distance(rec[cd_col], rec[cen_col], winkler=True, scaling=0.1),
            axis = 1,
        )

    df["jw_fn"] = jaro_winkler(cd_fn, cen_fn)
    df["jw_ln"] = jaro_winkler(cd_ln, cen_ln)
    df["jw_score"] = 0.4 * df["jw_fn"] + 0.6 * df["jw_ln"]

    # 1 when an occupation is recorded ('*' and missing both count as not recorded)
    occupation = df[cen_occ]
    df['occ_listed'] = np.where(occupation.notnull() & (occupation != '*'), 1, 0)

    # 0 for ages up to and including 12, 1 otherwise (missing ages also get 1)
    df['age_score'] = np.where(df[cen_age] <= 12, 0, 1)

    # number of candidate rows sharing the same CD record / the same census record
    df["cd_count"] = df.groupby(cd_id)[cen_id].transform('count')
    df["census_count"] = df.groupby(cen_id)[cd_id].transform('count')

    raw_score = .5*df["jw_score"] + .2*(1/df["cd_count"]) + \
                .2*(1/df["census_count"]) + .05*df["occ_listed"] + \
                .05*df["age_score"]
    df['confidence_score'] = raw_score.round(decimals = 2)

    return df


#not needed in new run
"""
Takes elastic search and census directory geocode file to create a dataframe 
ready for the disambiguation process.
Can add/incorporate new columns to include in confidence score here
elastic_search: either df with elastic search output or elastic search output file
city_directory: either df with city directory data or file name
file: boolean value, set to True if elastic_search/city_directory are file names otherwise set 
to false. Default false 
"""
def elastic_to_disamb(elastic_search, city_directory, file = False):
    """
    Turn elastic-search output plus the geocoded city directory into a table
    ready for disambiguation.

    elastic_search: df of elastic-search matches, or a path to a tab-separated file.
    city_directory: df of city-directory records, or a path to a CSV file.
        Must provide 'OBJECTID', 'LONG' and 'LAT'.
    file: True when both arguments are file names; otherwise the given frames
        are copied so the caller's data is left untouched.

    Returns the scored matches with prefixed 'CD_ID' / 'CENSUS_ID' columns, the
    inverse conflict counts, and the city-directory coordinates joined on 'OBJECTID'.
    """
    if file:
        candidates = pd.read_csv(elastic_search, sep='\t', engine='python')
        directory = pd.read_csv(city_directory)
    else:
        candidates = elastic_search.copy()
        directory = city_directory.copy()

    coordinates = directory[['OBJECTID', 'LONG', 'LAT']]

    scored = apply_confidence_score(candidates, cen_fn='CENSUS_NAMEFRSTB', cen_ln='CENSUS_NAMELASTB',
                                    cen_occ='CENSUS_OCCLABELB', cen_id='OBJECTID.x')

    # Prefixed IDs keep CD and census nodes distinguishable in the bipartite matching step.
    scored['CD_ID'] = 'CD_' + scored['OBJECTID'].astype(str)
    scored['CENSUS_ID'] = 'CENSUS_' + scored['OBJECTID.x'].astype(str)

    # Can remove this after finalizing the confidence score
    for count_col in ('census_count', 'cd_count'):
        scored[count_col + '_inverse'] = 1 / scored[count_col]

    return scored.merge(coordinates, how='left', on='OBJECTID', validate='many_to_one')
"""
Create a list of dataframes where the top row is an anchor
Each dataframe is one where spatial disambiguation will be applied
This is necessary as else, algorithms take too long to run
Match: df of matches
confidence_score: name of confidence score column
"""

def split_dfs(match, sort_var="CENSUS_ID", confidence="confidence_score"):
    """
    Split the match table into consecutive chunks, each one headed by an anchor.

    match: df of candidate matches (not modified; a sorted copy is used).
    sort_var: column the table is sorted by before splitting.
    confidence: name of the confidence score column; rows whose score equals 1
        are anchors.

    Returns a list of dataframes ordered by 'group_ID'. Each returned frame has
    two extra columns: 'anchor' (1 for anchors, None otherwise) and 'group_ID'
    (float index of the chunk). Rows that precede the first anchor are placed in
    the first chunk. A table without any anchor is returned as one single chunk,
    and an empty table gives an empty list.
    """
    match = match.sort_values(by=[sort_var])

    is_anchor = (match[confidence] == 1).to_numpy()
    match['anchor'] = np.where(is_anchor, 1, None)

    # The running anchor count gives every row the number of the last anchor at or
    # above it. Working positionally keeps this correct when index labels repeat.
    # Rows above the first anchor (count 0 -> -1) are folded into chunk 0, which
    # also covers the case where there is no anchor at all.
    match['group_ID'] = np.maximum(is_anchor.cumsum() - 1, 0).astype(float)

    return [chunk for _, chunk in match.groupby('group_ID')]

"""
Create a node ID for each match, to be used by the shortest path algorithm.
sub_graph must be a df with each row as a potential match between a CD and census record. It must contain the census ID column (CENSUS_ID by default).
The name of the census ID column can be specified if it is named differently.
Returns the dataframe with new columns 'node_ID' and 'letter'.
node_ID: unique node ID. Each node is a match, so e.g. N0_0 and N0_1 refer to two potential CD matches for the same census record
letter: grouping for identical census records ('N0', 'N1', ... in order of first appearance)
Note: the prefixes 'CD_' and 'CENSUS_' required for the subsequent bipartite matching are not added here; they must already be present on the ID columns.
"""
def create_path_df(sub_graph, census_id = "CENSUS_ID"):
    """
    Label every candidate match with the node ID used by the path algorithms.

    sub_graph: df with one row per candidate match; it is not modified.
    census_id: name of the census record ID column.

    Returns a copy of sub_graph with a fresh 0..n-1 index and two new columns:
    letter: 'N<k>', where k numbers the distinct census records in order of
        first appearance.
    node_ID: '<letter>_<j>', where j numbers the candidates of one census
        record in row order (e.g. 'N0_0', 'N0_1').
    No other column is added, whatever census_id is called.
    """
    # Work on a re-indexed copy so the caller's table is left untouched.
    path_df = sub_graph.reset_index(drop=True)

    candidate_rank = path_df.groupby(census_id).cumcount()

    # factorize numbers the census records in order of first appearance.
    record_codes, _ = pd.factorize(path_df[census_id])
    letters = 'N' + pd.Series(record_codes, index=path_df.index).astype(str)

    path_df['node_ID'] = letters + '_' + candidate_rank.astype(str)
    path_df['letter'] = letters

    return path_df

"""
Creates a graph from the sub_graph dataframe
Each node being a potential CD-census match and 
    each edge being the link between the potential CD records of consecutive census records
The weight of each edge = the haversine distance between the two
cluster_col: name of column with cluster group. If does not exist, use None
Returns the graph object
"""

def create_path_graph(g, cluster_col='in_cluster_x', lat='CD_X', lon='CD_Y'):
    """
    Build the directed graph that links candidates of consecutive census records.

    Every candidate (row of g) of the k-th census record gets an edge to every
    candidate of the (k + 1)-th census record. The edge weight is the haversine
    distance in metres between the coordinates of the two candidates.

    g: match table from create_path_df(); needs 'node_ID', 'letter' ('N<k>'),
        the coordinate columns and, when cluster_col is given, the cluster
        column. It is not modified.
    cluster_col: cluster label column as named after the self-merge, i.e. with
        the '_x' suffix of the edge's source node. Edges whose source node is
        labelled -1 get 999 added to their weight. Use None to skip this.
    lat, lon: names of the coordinate columns, handed to haversine() as (lat, lon).
        NOTE(review): the defaults map lat to 'CD_X' and lon to 'CD_Y', while
        get_dist_error() in the analysis module treats X as longitude - confirm
        which one is intended.

    Returns a networkx.DiGraph whose edges carry a 'weight' attribute.
    """
    # Position of each census record along the path, parsed from its 'N<k>' letter.
    step = g['letter'].str[1:].astype(int)

    # Join record k (suffix _x) to record k + 1 (suffix _y) directly instead of
    # building the full cross product and filtering it row by row.
    sources = g.assign(_path_step=step + 1)
    targets = g.assign(_path_step=step)
    pairs = sources.merge(targets, on='_path_step', suffixes=('_x', '_y'))

    weights = [
        haversine((y_lat, y_lon), (x_lat, x_lon), unit=Unit.METERS)
        for x_lat, x_lon, y_lat, y_lon in zip(pairs[lat + '_x'], pairs[lon + '_x'],
                                              pairs[lat + '_y'], pairs[lon + '_y'])
    ]

    if cluster_col is not None:
        # Penalise edges that leave a node the clustering step labelled -1.
        weights = [w + 999 if label == -1 else w for w, label in zip(weights, pairs[cluster_col])]

    graph = nx.DiGraph()
    graph.add_weighted_edges_from(zip(pairs['node_ID_x'], pairs['node_ID_y'], weights))

    return graph

Writing processing.py


### write disambiguation module

In [None]:
# disambiguation
%%writefile disambiguation.py

import pandas as pd
import networkx as nx
import hdbscan
from itertools import islice
#import disambiguation.processing as dp
import processing as dp
import time

"""
Wrapper function for everything below, including checks
Designed to work within list comprehension only! (refer to Disambiguator())
Works by applying algorithm to specified df (using index i) in the list
sub_groups: list of dfs, each df being a subset of the census bounded by 2 anchors
i: index of df in the list
"""
def apply_algo(sub_groups, i, cluster=True, k_between=True, census_id='CENSUS_ID', census_count="census_count", confidence='confidence_score', lat="CD_X", lon="CD_Y", cluster_kwargs={}, path_kwargs={}):
    """
    Run spatial disambiguation on the i-th sub-group.

    sub_groups: list of dfs, each a slice of the match table headed by an anchor.
    i: position of the df to process.
    cluster: apply density clustering before building the path graph.
    k_between: score with k-shortest-paths betweenness (True) or with a single
        shortest path (False).
    census_id, census_count, confidence, lat, lon: column names.
    cluster_kwargs: extra keyword arguments for apply_density_clustering().
    path_kwargs: extra keyword arguments for the chosen scoring function.

    Returns the sub-group unchanged when no census record in it has more than
    one candidate; otherwise the scored table, which also contains the first
    row of the next sub-group (the bottom anchor) when there is one.
    """
    if i % 1000 == 0:
        print("Reached: " + str(i))

    group = sub_groups[i]

    # Nothing to disambiguate when every census record has a single candidate.
    n_conflicting_rows = sum(group[census_count] > 1)
    if n_conflicting_rows == 0:
        return group

    # Close the path with the first row of the following sub-group (bottom anchor).
    has_next_group = i + 1 < len(sub_groups)
    if has_next_group:
        group = pd.concat([group, sub_groups[i+1][0:1]])

    path_df = dp.create_path_df(group, census_id)

    cluster_arg = None
    if cluster:
        # label outlier nodes through density clustering
        path_df = apply_density_clustering(path_df, lat, lon, **cluster_kwargs)
        cluster_arg = 'in_cluster_x'

    g = dp.create_path_graph(path_df, cluster_col=cluster_arg, lat=lat, lon=lon)

    scorer = apply_k_betweenness if k_between else apply_shortest_path
    return scorer(path_df, g, confidence = confidence, **path_kwargs)

"""
Apply Dijkstra's algorithm to the graph and get spatial weights
Spatial weights are computed as confidence score +1 if match was included in shortest path, and confidence score + 0 otherwise
df: dataframe of records with confidence score and node ID, names can be modified via parameters
graph: graph object created from create_path_graph()
source: start node, e.g. 'A0'. By default it chooses first row in the table
target: end node, e.g. 'J0'. By default it chooses last row in the table
Returns a dataframe of matches with added 'spatial weight'
"""
def apply_shortest_path(df, graph, source = None, target = None, confidence = 'confidence_score', node_id = 'node_ID'):
    """
    Score matches with a single Dijkstra shortest path through the graph.

    df: match table with the confidence and node ID columns; a
        'spatial_weight' column is added to it in place.
    graph: graph built by create_path_graph().
    source: start node; defaults to the node of the first row of df.
    target: end node; defaults to the node of the last row of df.
    confidence: name of the confidence score column.
    node_id: name of the node ID column.

    Returns df, where spatial_weight = confidence + 1 for nodes on the shortest
    path and spatial_weight = confidence for all other nodes.
    """
    node_ids = list(df[node_id])
    if source is None:
        source = node_ids[0]
    if target is None:
        target = node_ids[-1]

    path = nx.dijkstra_path(graph, source, target)

    # One vectorised membership test instead of scanning the path list for every row.
    on_path = df[node_id].isin(path)
    df['spatial_weight'] = df[confidence] + on_path.astype(int)

    return df

"""
Apply betweenness centrality using k shortest paths (as documented in spatial_disambiguation.ipynb)
df: df of matches
graph: graph object created from create_path_graph()
source: start node, e.g. 'A0'. By default it chooses first row in the table
target: end node, e.g. 'J0'. By default it chooses last row in the table
k: how many shortest paths to use (absolute number). By default 1 if there are at most 30 possible paths, ~1/2 of the number of possible paths if there are 31 to 50, and 50 if there are more than 50
scale: how much to scale the score by when adding it with confidence score. Default = 1 (equal weight of confidence score and spatial weight)
Returns
    df: df with an added 'spatial_weight' column (scaled betweenness score + confidence score)
"""
def apply_k_betweenness(df, graph, confidence = "confidence_score", source=None, target=None, k=None, scale=1):
    """
    Score matches by how often they lie on the k shortest paths through the graph.

    df: match table with 'node_ID', 'letter' and the confidence column.
    graph: graph built by create_path_graph().
    confidence: name of the confidence score column.
    source: start node; defaults to the node of the first row of df.
    target: end node; defaults to the node of the last row of df.
    k: number of shortest paths to use. When None it is derived from the number
        of possible paths n reported by get_n_paths(): 1 if n < 31, 50 if
        n > 50, int(n / 2) otherwise.
    scale: factor applied to the betweenness share before it is added to the
        confidence score.

    Returns a new df (inner join of df with the graph's nodes) with a
    'spatial_weight' column = round(share, 2) * scale + confidence, where share
    is the fraction of the paths actually used that pass through the node.
    The share is taken over the paths actually found, so a graph offering fewer
    than k paths from source to target no longer deflates every score.
    """
    node_ids = list(df["node_ID"])
    if source is None:
        source = node_ids[0]
    if target is None:
        target = node_ids[-1]

    if k is None:
        n_possible = get_n_paths(df)
        if n_possible < 31:
            k = 1
        elif n_possible > 50:
            k = 50
        else:
            k = int(0.5 * n_possible)

    # The generator yields paths from shortest to longest; take at most k of them.
    k_paths = list(islice(nx.shortest_simple_paths(graph, source, target, weight="weight"), k))
    n_used = len(k_paths)

    # count how many of the selected paths visit each node
    visits = dict.fromkeys(graph.nodes, 0)
    for path in k_paths:
        for node in path:
            visits[node] += 1

    spatial_weights = [
        [node, round(count / n_used, 2) * scale if n_used else 0]
        for node, count in visits.items()
    ]
    spatial_df = pd.DataFrame(spatial_weights, columns=["node_ID", 'spatial_weight'])
    df = df.merge(spatial_df, how="inner", on="node_ID", validate="one_to_one")
    df['spatial_weight'] = df['spatial_weight'] + df[confidence]

    return df

"""
Helper method to count the number of possible paths in the graph
"""
def get_n_paths(df):
    """
    Count the possible paths: the product, over all 'letter' groups, of the
    number of rows in each group. Returns 1 for an empty table.
    """
    n_paths = 1
    for group_size in df['letter'].value_counts(sort=False).tolist():
        n_paths = n_paths * group_size

    return n_paths

"""
Apply density based clustering to detect outliers. Requires `hdbscan` library
Refer to hdbscan documentation on parameters
Returns df with a column 'in_cluster' indicating which cluster the nodes are in
"""
def apply_density_clustering(df, lat='CD_X', lon="CD_Y", min_cluster_size=10, min_samples=10, allow_single_cluster=True, **kwargs):
    """
    Cluster the candidates' coordinates with HDBSCAN.

    df: match table; an 'in_cluster' column is added to it in place.
    lat, lon: names of the coordinate columns (fed to the clusterer as [lon, lat]).
    min_cluster_size, min_samples, allow_single_cluster, **kwargs: passed on to
        hdbscan.HDBSCAN; see the hdbscan documentation.

    Returns df with 'in_cluster' holding the cluster label of every row.
    """
    coordinates = df[[lon, lat]]

    model = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        allow_single_cluster=allow_single_cluster,
        **kwargs
    )
    model.fit(coordinates)

    df['in_cluster'] = pd.Series(model.labels_).values
    return df

"""
A bipartite graph is created from the matches, with each node being either a census or CD record and each edge indicating a potential match. 
Note that subgraph MUST have prefixes on the cd_id ('CD_') and census_id ('CENSUS_') columns
The matching algorithm (maximum weighted matching) will 
    (1) select sets of matches that give the highest number of matches 
    (2) choose the match set that has the highest weight based on that
Returns a dictionary with 'graph' as the list of bipartite graphs and 'results' being the original df with an additional 'selected' column, indicating the correct match and 'graph_id' column, indicated subgraph.
"""
def get_matches(df, cd_id = 'CD_ID', census_id = 'CENSUS_ID', weight = 'spatial_weight'):
    """
    Select the final matches through maximum weighted bipartite matching.

    df: one row per candidate match, unique per (cd_id, census_id) pair.
    cd_id: name of the CD ID column; values must start with 'CD'.
    census_id: name of the census ID column; values must carry a prefix that
        sorts after the CD prefix (e.g. 'CENSUS_'), because each matched pair
        is sorted to decide which end is the CD record.
    weight: name of the column holding the edge weights.

    Returns {'graph': list of connected bipartite subgraphs,
             'results': df with 'selected' (1 for chosen matches, 0 otherwise)
                        and 'graph_ID' (index of the subgraph of the CD record)}.
    """
    b = nx.Graph()
    b.add_weighted_edges_from(zip(df[cd_id], df[census_id], df[weight]))

    # Matching the whole graph at once is too expensive, and the graph is made of
    # disconnected components anyway, so each component is matched on its own.
    subgraphs = [b.subgraph(c) for c in nx.connected_components(b)]

    # Each matched pair arrives in arbitrary order; sorting puts the CD node first.
    selected_pairs = [
        sorted(pair)
        for graph in subgraphs
        for pair in nx.max_weight_matching(graph, maxcardinality = True)
    ]
    matches = pd.DataFrame(selected_pairs, columns=[cd_id, census_id])
    matches['selected'] = 1

    df = df.merge(matches, how='left', on=[cd_id, census_id], validate='one_to_one')
    df['selected'] = df['selected'].fillna(0)

    # add subgraph id, keyed by the caller's CD ID column so no stray column is created;
    # explicit columns keep the merge valid when there are no subgraphs at all
    subgraph_id = pd.DataFrame(
        [{'graph_ID': i, cd_id: node}
         for i, graph in enumerate(subgraphs)
         for node in graph.nodes if node[:2] == 'CD'],
        columns=['graph_ID', cd_id],
    )
    df = df.merge(subgraph_id, how="inner", on=cd_id, validate="many_to_one")

    return {'graph': subgraphs, 'results': df}

Writing disambiguation.py


### write analysis module

In [None]:
# analysis
%%writefile analysis.py

from haversine import haversine, Unit
from numpy import log
#import disambiguation.disambiguation as dl
import disambiguation as dl

"""
Get number of selected matches, out of total possible (ie unique CD records)
df: df with "selected" column, after running get_matches()
cd_id: name of cd_id column
"""
def get_match_rate(df, cd_id='CD_ID'):
    """
    Percentage of distinct CD records that received a selected match.

    df: df with a "selected" column, after running get_matches().
    cd_id: name of the CD ID column.
    Returns the sum of "selected" divided by the number of distinct CD IDs,
    times 100, rounded to 2 decimals.
    """
    distinct_cd_records = df[cd_id].drop_duplicates().shape[0]
    selected_total = sum(df["selected"].to_numpy())

    return round(selected_total / distinct_cd_records * 100, 2)

"""
Get number of perfect matches (in terms of address) selected
df: df with "selected" column, after running get_matches()
cd_add: name of cd address column
cen_add: name of cen_add column
"""
def get_addr_success(df, cd_add='MATCH_ADDR', cen_add='CENSUS_MATCH_ADDR'):
    """
    Count candidate rows whose CD and census addresses agree, and how many of
    those rows were selected.

    df: df with a "selected" column, after running get_matches(). The columns
        'cd_add_cln' and 'cen_add_cln' are added to it in place.
    cd_add: name of the CD address column.
    cen_add: name of the census address column.

    Addresses are compared on the text before the first comma. An address
    without a comma is compared as a whole, and a missing address never matches.

    Returns {'n_perfect_match_chosen': ..., 'n_perfect_match': ...}.
    """
    df['cd_add_cln'] = df[cd_add].str.split(',', n=1).str[0]
    df['cen_add_cln'] = df[cen_add].str.split(',', n=1).str[0]

    same_address = df['cd_add_cln'] == df['cen_add_cln']
    n_perfect_match = int(same_address.sum())
    n_perfect_match_chosen = int((same_address & (df['selected'] == 1)).sum())

    return {'n_perfect_match_chosen': n_perfect_match_chosen, 'n_perfect_match': n_perfect_match}

"""
Get error rate based on distance (in metres) between matched and actual address
df: df with "selected" column, after running get_matches()
cen_lon: name of census long column
cen_lat: name of census lat column
lon: name of long column
lat: name of lat column
"""
def get_dist_error(df, cen_lon='CENSUS_X', cen_lat='CENSUS_Y', lon='CD_X', lat='CD_Y'):
    """
    Add a 'dist' column with the haversine distance in metres between the
    census coordinates and the CD coordinates of every row.

    df: match table; modified in place and returned.
    cen_lon, cen_lat: names of the census longitude / latitude columns.
    lon, lat: names of the CD longitude / latitude columns.
    """
    def metres_apart(row):
        census_point = (row[cen_lat], row[cen_lon])
        cd_point = (row[lat], row[lon])
        return haversine(census_point, cd_point, unit=Unit.METERS)

    df['dist'] = df.apply(metres_apart, axis=1)
    return df

"""
Get the percentage of selected matches whose census record has an age of 12 or under
df: df with "selected" column, after running get_matches()
age: name of the census age column
"""
def get_under12_selections(df, age='CENSUS_AGE'):
    """
    Percentage of selected matches whose census age is 12 or under.

    df: df with a "selected" column, after running get_matches().
    age: name of the census age column.
    Returns the percentage rounded to 2 decimals.
    """
    chosen = df[df['selected'] == 1]
    n_selected = len(chosen)
    n_under12 = len(chosen[chosen[age] <= 12])

    return round(n_under12 / n_selected * 100, 2)

"""
Get df containing selected matches based on actual distances and confidence score
df: any match df containing at least cd_id, census_id, census long/lat, cd long/lat and confidence score. preferably df with 'dist' column (after get_dist_error())
"""
def get_dist_based_match(df, cen_lon='CENSUS_X', cen_lat='CENSUS_Y', lon='CD_X', lat='CD_Y', cd_id='CD_ID', census_id='CENSUS_ID', confidence='confidence_score'):
    """
    Select matches from actual distances plus the confidence score.

    df: match table with the ID, coordinate and confidence columns. If it has no
        'dist' column, get_dist_error() adds one. A 'dist_weight' column is
        written to df in place.
    cen_lon, cen_lat, lon, lat: coordinate column names, used only when 'dist'
        has to be computed.
    cd_id, census_id: names of the ID columns handed to get_matches().
    confidence: name of the confidence score column.

    dist_weight = round(1 / ln(max(dist, e)) + confidence, 2). Flooring the
    distance at e metres caps the distance bonus at 1 and keeps it decreasing
    with distance. Without the floor, ln(dist) is zero or negative for
    dist <= 1 m, which gave the closest candidates an infinite, negative or
    zero bonus.

    Returns the dictionary produced by get_matches() ('graph' and 'results').
    """
    from math import e

    if 'dist' not in df.columns:
        df = get_dist_error(df, cen_lon=cen_lon, cen_lat=cen_lat, lon=lon, lat=lat)

    floored_dist = df['dist'].clip(lower=e)
    df['dist_weight'] = (1 / log(floored_dist) + df[confidence]).round(2)

    return dl.get_matches(df, cd_id = cd_id, census_id = census_id, weight = 'dist_weight')

"""
Get false positive and false negative rates
df_algo: df with "selected" column, after running get_matches()
df_dist: df with "selected" column, after running get_dist_based_match()
Returns confusion matrix and df_algo with 'selected' now called 'selected_algo', and an additional column, 'selected_dist' which indicates 'true' matches.
"""
def compare_selections(df_algo, df_dist, cd_id="CD_ID", census_id="CENSUS_ID"):
    """
    Compare the algorithm's selections with the distance-based selections.

    df_algo: df with a "selected" column, after running get_matches().
    df_dist: df with a "selected" column, after running get_dist_based_match().
    cd_id, census_id: names of the ID columns the two tables are joined on
        (inner join, one row per pair on both sides).

    Returns {'confusion_matrix': [[true_positive, false_positive],
                                  [false_negative, true_negative]],
             'merged_df': the joined table with 'selected_algo' and 'selected_dist'}.
    """
    reference = df_dist.loc[:, [cd_id, census_id, 'selected']]
    merged = df_algo.merge(reference, how="inner", on=[cd_id, census_id],
                           validate='one_to_one', suffixes=('_algo', '_dist'))

    def tally(algo_flag, dist_flag):
        # number of pairs with the given combination of algorithm / distance decisions
        hits = (merged['selected_algo'] == algo_flag) & (merged['selected_dist'] == dist_flag)
        return len(merged.loc[hits, :])

    confusion_matrix = [[tally(1, 1), tally(1, 0)],
                        [tally(0, 1), tally(0, 0)]]
    return {'confusion_matrix': confusion_matrix, 'merged_df': merged}

Writing analysis.py


### write benchmarking module

In [None]:
# benchmark
%%writefile benchmark.py

import pandas as pd
import re
import numpy as np
#import disambiguation.analysis as da
import analysis as da


#Static methods

def get_hn(add):
    """
    Return the first run of digits in an address (the house number) as a string.

    Returns None when add is not a string or contains no digit, mirroring
    get_st(), instead of raising.
    """
    if not isinstance(add, str):
        return None
    hn = re.search('[0-9]+', add)
    return hn.group() if hn else None

def get_st(add):
    """
    Return the street part of an address: the run of upper-case letters and
    whitespace that follows a digit and one whitespace character and is
    directly followed by a comma.

    Returns None when add is not a string or nothing matches.
    """
    # Explicit checks replace the former bare `except`, which also swallowed
    # unrelated errors such as KeyboardInterrupt.
    if not isinstance(add, str):
        return None
    st = re.search('(?<=[0-9]\\s)([A-Z]|\\s)+(?=,)', add)
    return st.group() if st else None

Writing benchmark.py


### write init module

In [None]:
%%writefile __init__.py

import pandas as pd
import numpy as np
import processing as dp
import disambiguation as dl
import analysis as da
import benchmark as bm
import time

class Disambiguator:
    """
    Runs the spatial disambiguation pipeline on a table of candidate matches.

    match_df: df with one row per candidate CD-census match.
    cd_id, census_id: names of the (prefixed) CD and census ID columns.
    confidence: name of the confidence score column.
    census_count: name of the column counting the candidates per census record.
    lon, lat: names of the CD coordinate columns.
        NOTE(review): the defaults map lon to 'CD_Y' and lat to 'CD_X', while
        get_dist_error() in the analysis module treats X as longitude - confirm.
    sort_var: column the table is sorted by before it is split at anchors.

    After run_disambiguation(), the selected matches are available through
    get_result() and the bipartite subgraphs through get_bgraph().
    """

    def __init__(self, match_df, cd_id="CD_ID", census_id="CENSUS_ID", confidence="confidence_score", census_count='census_count', lon="CD_Y", lat="CD_X", sort_var="CENSUS_ID"):
        # initialize input
        self.input = match_df

        # initialize col names
        self.sort_var = sort_var
        self.cd_id = cd_id
        self.census_id = census_id
        self.cen_count = census_count
        self.confidence = confidence
        self.lon = lon
        self.lat = lat

        # output variables
        self.bipartite = None
        self.results = None

    def run_disambiguation(self, cluster=True, k_between=True, cluster_kwargs = {}, path_kwargs = {}):
        """
        Split the input at anchors, score every sub-group spatially and pick the
        final matches with bipartite matching.

        cluster: apply density clustering before building the path graphs.
        k_between: use k-shortest-paths betweenness (True) or one shortest path (False).
        cluster_kwargs, path_kwargs: extra keyword arguments for the clustering
            and path scoring functions.

        Fills self.results and self.bipartite; returns nothing.
        """
        start_time = time.time()
        print("Running")

        print("Creating dictionary of sub dfs (1/4)...")
        sub_groups = dp.split_dfs(self.input, self.sort_var, self.confidence)

        print("Applying algorithms iteratively (2/4)...")
        print("Number of Subgraphs: " + str(len(sub_groups)))

        # iteratively apply algorithms onto each sub df; every configured column
        # name is forwarded, including the census count column
        sub_groups = [
            dl.apply_algo(sub_groups, i, cluster=cluster, k_between=k_between,
                          census_id=self.census_id, census_count=self.cen_count,
                          confidence=self.confidence, lat=self.lat, lon=self.lon,
                          cluster_kwargs=cluster_kwargs, path_kwargs=path_kwargs)
            for i in range(0, len(sub_groups))
        ]

        print("Cleaning output (3/4)...")
        final = pd.concat(sub_groups)

        # bottom anchors appear both in their own group and at the end of the previous one
        final = final.drop_duplicates([self.cd_id, self.census_id])

        final['anchor'] = final['anchor'].fillna(0)

        # When no group needed spatial disambiguation the column does not exist yet.
        if 'spatial_weight' not in final.columns:
            final['spatial_weight'] = np.nan
        # conf + 1 when weight is null - these are the rows that did not require
        # spatial disambiguation, hence would definitely be in the shortest path.
        # np.where works by position, so repeated index labels after concat are harmless.
        final['spatial_weight'] = np.where(final['spatial_weight'].isnull(), final[self.confidence] + 1, final['spatial_weight'])

        print("Disambiguating (4/4)...")
        disambiguated = dl.get_matches(final, cd_id=self.cd_id, census_id=self.census_id)
        self.bipartite = disambiguated['graph']
        self.results = disambiguated['results']

        end_time = time.time()
        print("Total time:", end_time - start_time)
        print("Done! :)")

    def get_result(self):
        """Return the df of matches with the 'selected' column (None before a run)."""
        return self.results

    def get_bgraph(self):
        """Return the list of bipartite subgraphs (None before a run)."""
        return self.bipartite

    def save_result(self, output_fn):
        """Write the results to output_fn as CSV, without the index."""
        self.results.to_csv(output_fn, index=False)

class Disambiguator1880(Disambiguator):
    """
    Disambiguator with 'LONG' / 'LAT' coordinate defaults plus helpers to merge
    in extra variables and to analyse the results.

    census_count and sort_var are forwarded to Disambiguator; their defaults
    equal the base-class defaults.
    """

    def __init__(self, match_df, cd_id='CD_ID', census_id='CENSUS_ID', confidence='confidence_score', lon='LONG', lat='LAT', census_count='census_count', sort_var='CENSUS_ID'):
        super().__init__(match_df, cd_id=cd_id, census_id=census_id, confidence=confidence,
                         census_count=census_count, lon=lon, lat=lat, sort_var=sort_var)
        # column names needed by the analysis helpers; filled in by set_var()
        self.cen_lon = None
        self.cen_lat = None
        self.cen_add = None
        self.cd_add = None

    def _require_results(self):
        # Explicit check instead of catching AttributeError, which could also hide
        # unrelated attribute errors raised while merging.
        if self.results is None:
            raise Exception("Please run run_disambiguation() first")

    # adding in variables needed for analysis
    def merge_census_var(self, df, merge_cen_id="CENSUS_ID"):
        """Left-join census variables from df onto the results (key: merge_cen_id in df)."""
        self._require_results()
        self.results = self.results.merge(df, how="left", left_on=self.census_id, right_on=merge_cen_id)

    def merge_cd_var(self, df, merge_cd_id="CD_ID"):
        """Left-join city-directory variables from df onto the results (key: merge_cd_id in df)."""
        self._require_results()
        self.results = self.results.merge(df, how="left", left_on=self.cd_id, right_on=merge_cd_id)

    def set_var(self, var= None):
        """
        Set analysis column names from a dict of attribute name -> column name.
        Without an argument the census coordinate and address defaults are used.
        """
        if var is None:
            var = {'cen_lon': 'CENSUS_X', 'cen_lat': 'CENSUS_Y', 'cen_add': 'CENSUS_MATCH_ADDR', 'cd_add': 'MATCH_ADDR'}

        for key, value in var.items():
            setattr(self, key, value)

    # functions for analysis
    def get_match_rate(self):
        """Percentage of distinct CD records with a selected match."""
        return da.get_match_rate(self.results, cd_id=self.cd_id)

    def get_addr_success(self):
        """Counts of address-identical candidates and of those that were selected."""
        if self.cd_add is None or self.cen_add is None:
            raise Exception("Please check that cd_add and cen_add have been initialized through set_var")

        return da.get_addr_success(self.results, cd_add=self.cd_add, cen_add=self.cen_add)

    def get_dist_error(self):
        """Results with an added 'dist' column (metres between census and CD coordinates)."""
        if self.cen_lon is None or self.cen_lat is None:
            raise Exception("Please check that cen_lon and cen_lat have been initialized through set_var")

        return da.get_dist_error(self.results, cen_lon=self.cen_lon, cen_lat=self.cen_lat, lon=self.lon, lat=self.lat)

    def get_under12_selections(self, age='CENSUS_AGE'):
        """Percentage of selected matches whose census age is 12 or under."""
        return da.get_under12_selections(self.results, age=age)

class Benchmark():
    """
    Builds a distance/address based reference selection and compares it with
    the disambiguation output.

    match: df of candidate matches; needs 'MATCH_ADDR', 'CENSUS_MATCH_ADDR' and 'OBJECTID'.
    census: census table with 'CENSUS_MATCH_ADDR', 'CENSUS_Y' and 'CENSUS_X'.
    cd: city-directory table with 'OBJECTID', 'LONG', 'LAT' and 'CD_BLOCK_NUM'.
    """

    def __init__(self, match, census, cd):

        #Format lat/long for census data
        self.census = self.set_census(census)
        self.cd = self.set_cd(cd)
        self.match = match
        self.benchmark = self.create_benchmark()

        #Variables to set
        self.disambiguated = None
        self.confidence = None

        #outputs
        self.benchmark_results = None
        self.confusion_matrix = None

    def set_census(self, census):
        """Return the distinct census addresses with their coordinates."""
        census = census.loc[:,['CENSUS_MATCH_ADDR', 'CENSUS_Y', 'CENSUS_X']].drop_duplicates()  # select diff variables
        # CENSUS_Y values above 1000 are replaced by a fixed value
        # (presumably a fallback latitude for bad coordinates - TODO confirm).
        census.loc[census.CENSUS_Y > 1000, 'CENSUS_Y'] = 40.799935
        return census

    def set_cd(self, cd):
        """Return the city-directory columns used for benchmarking."""
        return cd[['OBJECTID', 'LONG', 'LAT', "CD_BLOCK_NUM"]]

    def set_disambiguated(self, disambiguated):
        self.disambiguated = disambiguated

    def set_confidence(self, confidence):
        self.confidence = confidence
        # Need to figure out what's going on here
        # For now 'confidence_score' is the address-match score only.
        self.benchmark['confidence_score'] = self.benchmark['add_match'] #+ self.benchmark[self.confidence]

    def create_benchmark(self):
        """
        Join census coordinates and city-directory variables onto the matches
        and compute the address-match score 'add_match'
        (0.1 for equal house numbers + 0.9 for equal street names).
        """
        benchmark = self.match.merge(self.census, how='left', on='CENSUS_MATCH_ADDR', validate='many_to_one')

        address_features = (
            ('cd_hn', bm.get_hn, 'MATCH_ADDR'),
            ('cen_hn', bm.get_hn, 'CENSUS_MATCH_ADDR'),
            ('cd_add_cln', bm.get_st, 'MATCH_ADDR'),
            ('cen_add_cln', bm.get_st, 'CENSUS_MATCH_ADDR'),
        )
        for target, extract, source in address_features:
            benchmark[target] = benchmark.apply(lambda row, f=extract, c=source: f(row[c]), axis=1)

        benchmark['add_match'] = np.where(benchmark.cd_hn == benchmark.cen_hn, 0.1, 0) + np.where(
            benchmark.cen_add_cln == benchmark.cd_add_cln, 0.9, 0)

        # The result of this merge used to be thrown away. Only city-directory
        # columns the matches lack are joined, so coordinates already present in
        # the match table are not duplicated under _x/_y names.
        missing_cd_cols = [col for col in self.cd.columns
                           if col != 'OBJECTID' and col not in benchmark.columns]
        if missing_cd_cols:
            benchmark = benchmark.merge(self.cd[['OBJECTID'] + missing_cd_cols], how='left',
                                        on='OBJECTID', validate='many_to_one')

        return benchmark

    def run_benchmarking(self):
        """Compute the distance-based selection and the confusion matrix against the disambiguated output."""
        if self.confidence is None:
            raise Exception("Please set confidence first")
        if self.disambiguated is None:
            raise Exception("Please set disambiguated first")

        self.benchmark_results = da.get_dist_based_match(self.benchmark, lon = "LONG", lat = "LAT")['results']
        self.confusion_matrix = da.compare_selections(self.disambiguated, self.benchmark_results)['confusion_matrix']

    def get_benchmark(self):
        return self.benchmark

    def get_benchmark_results(self):
        if self.benchmark_results is None:
            raise Exception("Please run benchmarking first")
        return self.benchmark_results

    def get_confusion_matrix(self):
        if self.confusion_matrix is None:
            raise Exception("Please run benchmarking first")
        return self.confusion_matrix

#version for new run where cd/census information does not need to be joined in
class Benchmark_v02():
    """
    Benchmark for match tables that already carry the CD and census
    information, so nothing has to be joined in.

    match: df of candidate matches with 'CD_H_ADDRESS' and 'CENSUS_MATCH_ADDR'.
    """

    def __init__(self, match):
        self.match = match
        self.benchmark = self.create_benchmark()

        # set later through set_disambiguated() / set_confidence()
        self.disambiguated = None
        self.confidence = None

        # filled by run_benchmarking()
        self.benchmark_results = None
        self.confusion_matrix = None

    def set_disambiguated(self, disambiguated):
        self.disambiguated = disambiguated

    def set_confidence(self, confidence):
        self.confidence = confidence
        # Need to figure out what's going on here
        self.benchmark['confidence_score'] = self.benchmark['add_match'] #+ self.benchmark[self.confidence]

    def create_benchmark(self):
        """
        Copy the matches and add house number / street columns plus 'add_match'
        (0.1 for equal house numbers + 0.9 for equal street names).
        """
        benchmark = self.match.copy()

        #change to use actual street/house numbers once they are sorted out for the census data
        address_features = (
            ('cd_hn', bm.get_hn, 'CD_H_ADDRESS'),
            ('cen_hn', bm.get_hn, 'CENSUS_MATCH_ADDR'),
            ('cd_add_cln', bm.get_st, 'CD_H_ADDRESS'),
            ('cen_add_cln', bm.get_st, 'CENSUS_MATCH_ADDR'),
        )
        for target, extract, source in address_features:
            benchmark[target] = benchmark.apply(lambda row, f=extract, c=source: f(row[c]), axis=1)

        house_number_score = np.where(benchmark.cd_hn == benchmark.cen_hn, 0.1, 0)
        street_score = np.where(benchmark.cen_add_cln == benchmark.cd_add_cln, 0.9, 0)
        benchmark['add_match'] = house_number_score + street_score

        return benchmark

    def run_benchmarking(self):
        """Compute the distance-based selection and the confusion matrix against the disambiguated output."""
        prerequisites = (
            (self.confidence, "Please set confidence first"),
            (self.disambiguated, "Please set disambiguated first"),
        )
        for value, message in prerequisites:
            if value is None:
                raise Exception(message)

        dist_based = da.get_dist_based_match(self.benchmark, confidence = self.confidence)
        self.benchmark_results = dist_based['results']
        comparison = da.compare_selections(self.disambiguated, self.benchmark_results)
        self.confusion_matrix = comparison['confusion_matrix']

    def get_benchmark(self):
        return self.benchmark

    def get_benchmark_results(self):
        results = self.benchmark_results
        if results is None:
            raise Exception("Please run benchmarking first")
        return results

    def get_confusion_matrix(self):
        matrix = self.confusion_matrix
        if matrix is None:
            raise Exception("Please run benchmarking first")
        return matrix

Writing __init__.py


### confidence score tuning

In [None]:
%%writefile confidence_score_tuning.py
from __init__ import Disambiguator, Disambiguator1880, Benchmark, Benchmark_v02

"""
Purpose: Generate confidence scores as a list
df: elastic search dataframe formatted for disambiguation
columns: columns we want to use to create confidence score
weights: corresponding weights we want to use to create confidence score, should sum to one
"""
def confidence_score(df, columns, weights):
    """
    Weighted sum of the given columns, one value per row of df.

    df: elastic search dataframe formatted for disambiguation.
    columns: names of the columns that make up the score.
    weights: weight of each column, in the same order; should sum to one.

    Returns a list with one score per row, in row order.
    Raises ValueError when columns and weights differ in length (the extra
    entries used to be dropped silently).
    """
    columns, weights = list(columns), list(weights)
    if len(columns) != len(weights):
        raise ValueError("columns and weights must have the same length")
    if not columns:
        return [0] * len(df)

    # Accumulate column by column; the terms are added in the same order as in a
    # per-row loop, but without iterating over the rows in Python.
    total = 0
    for col, w in zip(columns, weights):
        total = total + df[col] * w

    return total.tolist()

"""
#Unneeded in new data run
Purpose: Format census data for benchmarking
census: census data 
"""
def census_for_disamb(census):
    """
    Format census data for benchmarking.

    census: census table with 'OBJECTID.x', 'CENSUS_X' and 'CENSUS_Y'; it is
        not modified.
    Returns a new table with the columns 'CENSUS_ID' ('CENSUS_' + OBJECTID.x),
    'CENSUS_X' and 'CENSUS_Y', where CENSUS_Y values above 1000 are replaced
    by 40.799935.
    """
    tuned = census.loc[:, ['CENSUS_X', 'CENSUS_Y']].copy()
    tuned.insert(0, 'CENSUS_ID', 'CENSUS_' + census['OBJECTID.x'].astype(str))

    out_of_range = tuned['CENSUS_Y'] > 1000
    tuned.loc[out_of_range, 'CENSUS_Y'] = 40.799935

    return tuned

"""
param_grid: list of dictionaries with names of columns to use for a trial cf score and corresponding weights
df_allcols: elastic search output formatted for disambiguation
df_census: census data for benchmarking
df_cd: city directory data for benchmarking
"""
def confidence_score_tuning(param_grid, df_allcols, df_census, df_cd):
    """
    Run disambiguation and benchmarking once per trial confidence score.

    param_grid: list of dicts, each with "columns" (names of the columns used
        for a trial score) and "weights" (the corresponding weights).
    df_allcols: elastic search output formatted for disambiguation (not modified).
    df_census: census data for benchmarking.
    df_cd: city directory data for benchmarking.

    Returns a dict keyed by "confidence_score_<i>" holding the columns, the
    weights, match rate, address success, under-12 share and confusion matrix
    of every trial whose disambiguation ran. A trial whose disambiguation
    raises is reported on stdout and left out of the results.
    """
    # Store results
    results = {}
    df = df_allcols.copy()

    # Get confidence score for each value in grid
    for i in range(len(param_grid)):
        name = "confidence_score_" + str(i)
        df.loc[:, name] = confidence_score(df_allcols, param_grid[i]["columns"], param_grid[i]["weights"])

    # Create benchmark object
    benchmark = Benchmark(df, df_census, df_cd)

    # Format census data for tuning
    census_tuning = census_for_disamb(df_census)

    for i in range(len(param_grid)):
        name = "confidence_score_" + str(i)

        # Run disambiguation process (use betweeness and clustering -- based on Jolene's work)
        basic = Disambiguator1880(df, confidence=name)

        try:
            basic.run_disambiguation()
        except Exception as err:
            # Keep going with the remaining trials, but say which one failed and
            # why. KeyboardInterrupt / SystemExit are no longer swallowed.
            print("Skipping " + name + ": " + repr(err))
            continue

        result = basic.get_result()  # .to_csv("..data/confidence_score_tuning/confidence_score_"+str(i))

        # Results analysis
        basic.merge_census_var(census_tuning)
        basic.set_var()

        # benchmarking
        benchmark.set_confidence(name)
        benchmark.set_disambiguated(result)
        benchmark.run_benchmarking()

        # Store results
        results[name] = {"columns": param_grid[i]["columns"], "weights": param_grid[i]["weights"],
                         "Match Rate": basic.get_match_rate(), "Address Success": basic.get_addr_success(),
                         "Under 12": basic.get_under12_selections(),
                         "confusion matrix": benchmark.get_confusion_matrix()}

        # Spit out the best columns and weights (Add this in when decide what makes something the best)
        # For now simply output the analysis
    return results

#Uses new version of benchmarking, bc elastic search output means we don't need to join in the x/ys separately
def confidence_score_tuning_v02(param_grid, df_elastic_search):
    """Score, disambiguate and benchmark every trial of ``param_grid``.

    Each trial is a dict with "columns" and "weights". Its score is stored as
    ``confidence_score_<position>`` on a copy of ``df_elastic_search`` and then
    used as the confidence measure of a ``Disambiguator1880`` run.

    Returns:
        dict: score column name -> the trial's "columns"/"weights" together with
        "Match Rate", "Address Success", "Under 12" and "confusion matrix".
    """
    scored = df_elastic_search.copy()

    # First pass: one confidence column per trial
    score_names = []
    for position, trial in enumerate(param_grid):
        score_name = "confidence_score_" + str(position)
        scored.loc[:, score_name] = confidence_score(df_elastic_search, trial["columns"], trial["weights"])
        score_names.append(score_name)

    # NOTE(review): the benchmark is built from the frame WITHOUT the score columns,
    # whereas confidence_score_tuning hands the scored copy to Benchmark -- confirm
    # this is what Benchmark_v02 expects.
    benchmark = Benchmark_v02(df_elastic_search)

    # Second pass: disambiguate and benchmark with each score in turn
    results = {}
    for score_name, trial in zip(score_names, param_grid):
        # Disambiguation uses betweenness and clustering (based on Jolene's work)
        disambiguator = Disambiguator1880(scored, confidence=score_name)
        disambiguator.run_disambiguation()
        selected = disambiguator.get_result()

        disambiguator.set_var()

        benchmark.set_confidence(score_name)
        benchmark.set_disambiguated(selected)
        benchmark.run_benchmarking()

        results[score_name] = {
            "columns": trial["columns"],
            "weights": trial["weights"],
            "Match Rate": disambiguator.get_match_rate(),
            "Address Success": disambiguator.get_addr_success(),
            "Under 12": disambiguator.get_under12_selections(),
            "confusion matrix": benchmark.get_confusion_matrix(),
        }

    # Picking the "best" columns/weights is still to be decided; return the analysis as is.
    return results



Writing confidence_score_tuning.py


## dwellings

Goal: Assign an address, block number, x coordinate, and y coordinate to each dwelling matched with an address through disambiguation.

Problem: Disambiguation occurs at the individual census record level, matching individuals between the city directory and census records. Because a dwelling has a single address, when there is a match its information can be assigned to every record within that dwelling. However, records within a dwelling could be matched to different addresses; these conflicts need to be resolved.

Method: When there's a single match among census records within a dwelling, assign the values from that match to every record in that dwelling. If there's a conflict within a dwelling, select one of the matches and assign those values to every record in that dwelling.

In this notebook: Illustrates different approaches to resolving dwelling conflicts and fills in dwellings accordingly.

In [None]:

import pandas as pd
import numpy as np
from collections import defaultdict
import re
import networkx as nx  
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
%load_ext autoreload
%autoreload 2
import disambiguation
import processing as dp
from networkx.algorithms import bipartite

### get data

In [None]:
census_1850 = pd.read_csv("/content/census_1850_indexUpdate.csv")
disambiguated_1850 = pd.read_csv("/content/1850_mn_match_24-11-2020.csv")

### Merge 1850 census data with the selected matches from disambiguation

In [None]:
disambiguated_1850.info()

In [None]:
# Keep only the selected matches and the columns needed for the join. The explicit
# copy makes the frame independent of disambiguated_1850 before it is modified.
disambiguated_selected = disambiguated_1850.loc[
    disambiguated_1850["selected"] == 1,
    ["CENSUS_ID", "CD_H_ADDRESS", "selected", "spatial_weight", "CD_X", "CD_Y", "CD_BLOCK_NUM"],
].copy()
# Drop the literal "CENSUS_" prefix added during disambiguation. str.strip("CENSUS_")
# would instead remove any of the characters C, E, N, S, U, _ from BOTH ends of the id.
disambiguated_selected["CENSUS_ID"] = disambiguated_selected["CENSUS_ID"].str.replace(r"^CENSUS_", "", regex=True)

Notes: Only done on the 1850 dataset, because the 1880 dataset lacks clearly indicated dwelling/household numbers. How well this works still needs to be checked manually. - Use CENSUS_ID from the disambiguated output and CENSUS_IPUMS_UID from the census data for joins; these hold the same values, but the "CENSUS_" prefix must first be removed from CENSUS_ID (it was added during the disambiguation process).

In [None]:
CensusDis1850 = census_1850.merge(disambiguated_selected, how = "left", left_on = "CENSUS_IPUMS_UID", right_on = "CENSUS_ID")
CensusDis1850.head()

Unnamed: 0,CENSUS_SERIALP,CENSUS_AGE,CENSUS_SEX,CENSUS_MARST,CENSUS_RACE,CENSUS_LABFORCE,CENSUS_IMPREL,CENSUS_OCCSTR,CENSUS_NAMELAST,CENSUS_NAMEFRST,CENSUS_SEQ_NUM,CENSUS_HH_NUM,CENSUS_IPUMS_UID,CENSUS_CITY,CENSUS_PAGENO_HOUSEHOLD,CENSUS_WARD_NUM,CENSUS_REEL_HOUSEHOLD,CENSUS_PLACE,CENSUS_DWELLING_NUM,CENSUS_DWELLING_SEQ,CENSUS_DWELLING_SIZE,CENSUS_GEOG,CENSUS_LINE,CENSUS_INDEX,CENSUS_ID,CD_H_ADDRESS,selected,spatial_weight,CD_X,CD_Y,CD_BLOCK_NUM
0,2044262,49,1,6,120,2,1,INN KEEPER,FLINT,JOHN M,1,401735,1a0cea81-aa4c-4e02-af29-37f3fdadd987,4610,70,1,534,MANHATTAN,1.0,1.0,84,NEW YORK WARD 1 EASTERN DIVISION,10,1,,,,,,,
1,2044263,36,2,6,120,0,1,,FLINT,MARY A,2,401735,29aed4d0-b649-4d6b-9e28-3db8f4da81d8,4610,70,1,534,MANHATTAN,1.0,1.0,84,NEW YORK WARD 1 EASTERN DIVISION,10,2,,,,,,,
2,2044264,12,2,6,120,0,1,,FLINT,MARY D,3,401735,8cfac447-5fdf-44a4-888c-ee1c7e2a8355,4610,70,1,534,MANHATTAN,1.0,1.0,84,NEW YORK WARD 1 EASTERN DIVISION,10,3,,,,,,,
3,2044265,40,1,6,120,2,1,COMB MERCHANT,OATMAN,JAMES C,4,401736,ec0bff21-a919-42bf-a22c-7c6ef9b656af,4610,70,1,534,MANHATTAN,1.0,1.0,84,NEW YORK WARD 1 EASTERN DIVISION,10,4,,,,,,,
4,2044266,28,2,6,120,0,1,,OATMAN,CAROLINE E,5,401736,61d38b86-d8bc-4b69-bb6b-3169fa0e594d,4610,70,1,534,MANHATTAN,1.0,1.0,84,NEW YORK WARD 1 EASTERN DIVISION,10,5,,,,,,,


In [None]:
census_1850.shape

(515630, 24)

In [None]:
print("Proportion of census data assigned addresses:", CensusDis1850.CD_H_ADDRESS.count()/515630)

Proportion of census data assigned addresses: 0.06170703799235886


In [None]:
def get_counts(x, one_add, no_add, more_add, col, counts1, counts2, counts3):
    """Classify one group by how many distinct addresses it was matched to.

    The group's ``(CENSUS_WARD_NUM, col)`` key is appended to ``no_add``,
    ``one_add`` or ``more_add`` (0, 1 or several distinct ``CD_H_ADDRESS``
    values) and the group's row count to ``counts1``, ``counts2`` or
    ``counts3`` respectively. Nothing is returned.
    """
    n_addresses = x["CD_H_ADDRESS"].nunique()

    # Pick the (keys, sizes) pair of output lists that matches the address count
    if n_addresses == 0:
        keys, sizes = no_add, counts1
    elif n_addresses == 1:
        keys, sizes = one_add, counts2
    else:
        keys, sizes = more_add, counts3

    ward = x["CENSUS_WARD_NUM"].iloc[0]
    unit = x[col].iloc[0]
    keys.append((ward, unit))
    sizes.append(len(x))

In [None]:
no_add = []
one_add = []
more_add = []
counts_no_add = []
counts_one_add = []
counts_more_add = []

for index, df in CensusDis1850.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]):
    get_counts(df, one_add, no_add, more_add, "CENSUS_DWELLING_NUM", counts_no_add, counts_one_add, counts_more_add)

In [None]:
print("Proportion of dwellings assigned one address:", len(one_add)/CensusDis1850.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]).ngroups)
print("Proportion of dwellings without an address:", len(no_add)/CensusDis1850.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]).ngroups)
print("Proportion of dwellings assigned more than one address:", len(more_add)/CensusDis1850.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]).ngroups)

Proportion of dwellings assigned one address: 0.27702395964691046
Proportion of dwellings without an address: 0.3470365699873897
Proportion of dwellings assigned more than one address: 0.37593947036569986


In [None]:
print("Proportion of census data that should be assigned an address as is:", sum(counts_one_add)/515630)
print("Proportion of census data that should be assigned an address after dealing with conflicts:", (sum(counts_one_add) + sum(counts_more_add))/515630)
print("Proportion of census data that we shouldn't be able to assign an address to:", sum(counts_no_add)/515630)

Proportion of census data that should be assigned an address as is: 0.25355972305723096
Proportion of census data that should be assigned an address after dealing with conflicts: 0.8010298081957993
Proportion of census data that we shouldn't be able to assign an address to: 0.19720148168260188


In [None]:

#Function for filling in households/dwelling numbers if relevant
def check_quant(x, exceptions, col, tuple = False):
    """Propagate a group's single matched address to all of its rows.

    Intended for ``groupby(...).apply``. ``x`` is one household/dwelling group.

    * more than one distinct ``CD_H_ADDRESS``: the group is recorded in
      ``exceptions`` -- as ``(ward, x[col])`` when ``tuple`` is true, else as
      ``x[col]`` -- and returned unchanged;
    * exactly one: address, coordinates and block number are copied to every
      row of the group;
    * none: returned unchanged.

    Returns:
        DataFrame: the group (a filled copy in the single-address case).
    """
    c = x["CD_H_ADDRESS"].nunique()
    if c > 1:
        if tuple:
            exceptions.append((x["CENSUS_WARD_NUM"].iloc[0], x[col].iloc[0]))
        else:
            exceptions.append(x[col].iloc[0])
    elif c == 1:
        # Fill a copy: mutating the object handed over by groupby.apply is not
        # supported by pandas and would also alter the caller's frame.
        x = x.copy()
        for field in ("CD_H_ADDRESS", "CD_X", "CD_Y", "CD_BLOCK_NUM"):
            x[field] = x[field].ffill().bfill()
    return x

In [None]:
dwellings_conflicts = []
base_fill = CensusDis1850.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]).apply(lambda x: check_quant(x, dwellings_conflicts, "CENSUS_DWELLING_NUM", tuple = True))

In [None]:
print("Proportion of census data assigned addresses:", base_fill.CD_H_ADDRESS.count()/515630)

Proportion of census data assigned addresses: 0.30232143203459844


In [None]:
base_fill.CD_H_ADDRESS.count()

155886

In [None]:
sum(counts_one_add)

130743

In [None]:
more_add == dwellings_conflicts

True

It's a little odd that more values are filled in than I would expect, but that the two lists are the same seems promising? Maybe missing dwelling numbers are causing this?

In [None]:
Dwelling_nums_nas = CensusDis1850[CensusDis1850["CENSUS_DWELLING_NUM"].isnull()]
nans_addresses = Dwelling_nums_nas.CD_H_ADDRESS.count()
Dwelling_nums_nas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 912 entries, 1307 to 495263
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   CENSUS_SERIALP           912 non-null    int64  
 1   CENSUS_AGE               912 non-null    int64  
 2   CENSUS_SEX               912 non-null    int64  
 3   CENSUS_MARST             912 non-null    int64  
 4   CENSUS_RACE              912 non-null    int64  
 5   CENSUS_LABFORCE          912 non-null    int64  
 6   CENSUS_IMPREL            912 non-null    int64  
 7   CENSUS_OCCSTR            367 non-null    object 
 8   CENSUS_NAMELAST          911 non-null    object 
 9   CENSUS_NAMEFRST          912 non-null    object 
 10  CENSUS_SEQ_NUM           912 non-null    int64  
 11  CENSUS_HH_NUM            912 non-null    int64  
 12  CENSUS_IPUMS_UID         912 non-null    object 
 13  CENSUS_CITY              912 non-null    int64  
 14  CENSUS_PAGENO_HOUSEH

So this is too small to explain the difference. For now it's not different enough to stop from continuing, but it's definitely worth keeping in mind.

### Fill in Addresses for Household/Dwelling, based on Disambiguation Match
Note: Does not resolve conflicts

#### Fill in addresses for census entries in the same household

In [None]:
#confirm that household/dwelling values are unique across the entire dataset
def uniqueness(df, col):
    """Check that no value of ``col`` occurs in more than one ward.

    Missing values are ignored (they never count as shared between wards).

    Returns:
        bool: ``True`` when every non-missing value of ``col`` belongs to a
        single ``CENSUS_WARD_NUM``.

    Raises:
        ValueError: as soon as two wards share a value; the offending pair of
        wards is printed first. (``ValueError`` is an ``Exception`` subclass,
        so handlers written for the previous generic ``Exception`` still work.)
    """
    # One set of values per ward makes each ward-vs-ward comparison a set
    # operation instead of a linear scan per value.
    values_by_ward = {
        ward: {value for value in values if not pd.isna(value)}
        for ward, values in df.groupby("CENSUS_WARD_NUM")[col].unique().items()
    }

    wards = list(values_by_ward)
    for position, ward_a in enumerate(wards):
        for ward_b in wards[position + 1:]:
            if not values_by_ward[ward_a].isdisjoint(values_by_ward[ward_b]):
                print(ward_a, ward_b)
                raise ValueError(str(col) + " numbers are not unique")
    return True

In [None]:
uniqueness(CensusDis1850, "CENSUS_HH_NUM")

True

In [None]:
uniqueness(CensusDis1850, "CENSUS_DWELLING_NUM")

In [None]:
uniqueness(CensusDis1850, "CENSUS_SERIALP")

True

In [None]:
#Filling addresses for people in the same household
households = [] # keep track of any households with multiple addresses 
Census_hh = CensusDis1850.groupby("CENSUS_HH_NUM").apply(lambda x: check_quant(x, households, "CENSUS_HH_NUM"))

In [None]:
#These households need to be inspected more carefully, it seems that they
#have been assigned multiple addresses
len(households)

1649

In [None]:
with open('/content/households.txt', 'w') as filehandle:
    for listitem in households:
        filehandle.write('%s\n' % listitem)

In [None]:

x = Census_hh[Census_hh["CENSUS_HH_NUM"] == households[7]]
x[["CENSUS_NAMEFRST", "CENSUS_NAMELAST", "CENSUS_AGE", "CENSUS_SEX", "CENSUS_OCCSTR", "CENSUS_HH_NUM","CENSUS_DWELLING_NUM", "CD_H_ADDRESS", "spatial_weight"]]

Unnamed: 0,CENSUS_NAMEFRST,CENSUS_NAMELAST,CENSUS_AGE,CENSUS_SEX,CENSUS_OCCSTR,CENSUS_HH_NUM,CENSUS_DWELLING_NUM,CD_H_ADDRESS,spatial_weight
4655,MICHAEL,RYAN,50,1,LABORER,403866,263.0,6 DEPEYSTER ST,1.772511
4656,MARY,RYAN,40,2,,403866,263.0,,
4657,ANN,RYAN,35,2,,403866,263.0,,
4658,ANN,RYAN,4,2,,403866,263.0,,
4659,JAMES,RYAN,47,1,LABORER,403866,263.0,81 GREENWICH ST,1.91
4660,MICHAEL,RYAN,10,1,,403866,263.0,,


In [None]:
Census_hh.to_csv("/content/Census_1850_household.csv", index = False)

In [None]:
# Denominator is the size of the census frame itself rather than a hard-coded
# count (522152 did not match the 515630 rows loaded above).
print("Proportion of census data assigned addresses:", Census_hh["CD_H_ADDRESS"].count()/len(census_1850))

Proportion of census data assigned addresses: 0.20924941396375002


#### use IPUMS household number

In [None]:
Census_hh = pd.read_csv("/content/Census_1850_household.csv")

In [None]:
households_IPUMS = []
Census_hh_IPUMS = Census_hh.groupby("CENSUS_SERIALP").apply(lambda x: check_quant(x, households_IPUMS, "CENSUS_SERIALP"))

In [None]:
len(households_IPUMS)

3072

In [None]:
with open('/content/housholds_IPUMS.txt', 'w') as filehandle:
    for listitem in households_IPUMS:
        filehandle.write('%s\n' % listitem)

In [None]:
Census_hh_IPUMS.to_csv("/content/Census_1850_household_IPUMS.csv", index = False)

In [None]:
# Denominator is the size of the census frame itself rather than a hard-coded
# count (522152 did not match the 515630 rows loaded above).
print("Proportion of census data assigned addresses:", Census_hh_IPUMS["CD_H_ADDRESS"].count()/len(census_1850))

Proportion of census data assigned addresses: 0.30396895923026246


#### Fill in address for census entries with same dwelling number

In [None]:
#Filling addresses for people in the same dwelling
dwellings = [] # keep track of any dwellings with multiple addresses 
Census_hh_dw = Census_hh_IPUMS.groupby(["CENSUS_WARD_NUM","CENSUS_DWELLING_NUM"]).apply(lambda x: check_quant(x, dwellings, "CENSUS_DWELLING_NUM", tuple = True))

In [None]:
#Dwellings that are assigned more than one address
len(dwellings)

7469

In [None]:
with open('/content/dwellings.txt', 'w') as filehandle:
    for listitem in dwellings:
        line = ' '.join(str(x) for x in listitem)
        filehandle.write(line + '\n')

In [None]:
Census_hh_dw.to_csv("/content/Census_1850_dwellings.csv", index = False)

In [None]:
# Denominator is the size of the census frame itself rather than a hard-coded
# count (522152 did not match the 515630 rows loaded above).
print("Proportion of census data assigned addresses:", Census_hh_dw["CD_H_ADDRESS"].count()/len(census_1850))

Proportion of census data assigned addresses: 0.47102376319539135


#### Check Number of Dwellings with Multiple addresses

In [None]:
#Function for recording dwellings/households that have no matched address
def no_address(x, col):
    """Append ``(ward, x[col])`` to the global ``nones`` list when the group
    ``x`` has no ``CD_H_ADDRESS`` value at all; otherwise do nothing."""
    if x["CD_H_ADDRESS"].nunique() != 0:
        return
    ward = x["CENSUS_WARD_NUM"].iloc[0]
    unit = x[col].iloc[0]
    nones.append((ward, unit))

In [None]:
nones = []
for index,df in Census_hh_dw.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]):
    no_address(df, "CENSUS_DWELLING_NUM")

In [None]:
len(nones)

6866

In [None]:

print("Proportion of dwellings with no match at all:",len(nones)/Census_hh_dw.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]).ngroups)

Proportion of dwellings with no match at all: 0.34633039092055484


This is a little less than what we'd expect from the earlier analysis, clearly there's some situation that's not being accounted for here

Issue: There are both households and dwellings that are assigned multiple addresses
Potential cause: an incorrect match -- it may make sense to incorporate that dwellings/households may have only a single match within the disambiguation process

possible approach: two levels of bipartite matching for household and dwellings
possible approach: incorporate into initial bipartite matching
Potential cause: dwellings/households that are referred to by multiple addresses, e.g. a corner building, a historical address change, or a street with two names (Avenue of the Americas/6th Ave).

standardization can help

### Use Bipartite Matching to Get A Single Address for Dwellings

census 1850 index file for census data and 1850 mn match 24-11 file for disambiguated output used for the analysis

In [None]:
census_1850 = pd.read_csv("/content/census_1850_indexUpdate.csv")
disambiguated_1850 = pd.read_csv("/content/1850_mn_match_24-11-2020.csv")

In [None]:
# selected = 1 disambiguated records selected
disambiguated_1850_selected = disambiguated_1850[disambiguated_1850["selected"] == 1].copy()

In [None]:
disambiguated_1850_selected.columns

In [None]:
disambiguated_1850_selected["unique_ward"] = disambiguated_1850_selected.apply(lambda row: str(row.CENSUS_WARD_NUM) + "_" + str(row.CENSUS_DWELLING_NUM), axis = 1)

In [None]:
disambiguated_1850_selected[["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM","unique_ward", "CD_H_ADDRESS"]].head()

Unnamed: 0,CENSUS_WARD_NUM,CENSUS_DWELLING_NUM,unique_ward,CD_H_ADDRESS
0,1,1.0,1_1.0,47 WALL ST
1,1,1.0,1_1.0,11 BRG ST
10,1,371.0,1_371.0,8 DEPEYSTER ST
24,1,1.0,1_1.0,70 LIBERTY ST
32,1,117.0,1_117.0,39 PEARL ST


In [None]:
def get_matches_dwelling(df, cd_id = 'CD_ID', census_id = 'CENSUS_ID', weight = 'spatial_weight'):
    """Pick one-to-one pairs between two id columns by maximum-weight matching.

    Every row of ``df`` is an edge ``df[cd_id] -- df[census_id]`` weighted by
    ``df[weight]`` in a bipartite graph. ``nx.max_weight_matching`` (with
    ``maxcardinality=True``) is run on each connected component separately.

    Returns:
        DataFrame: columns ``[cd_id, census_id]``, one row per matched pair,
        with ids rendered as strings. ``df`` itself is not modified.
    """
    # The prefixes keep the two node sets disjoint and make every cd_id node sort
    # before every census_id node, so sorted(pair) restores the column order.
    cd_prefix, census_prefix = "A_ ", "B_ "

    df = df.copy()
    df[cd_id] = cd_prefix + df[cd_id].astype(str)
    df[census_id] = census_prefix + df[census_id].astype(str)

    b = nx.Graph()
    b.add_nodes_from(df[cd_id].unique(), bipartite = 0)
    b.add_nodes_from(df[census_id].unique(), bipartite = 1)
    b.add_weighted_edges_from(zip(df[cd_id], df[census_id], df[weight]))

    # The algorithm is too expensive on the entire graph, which is in fact
    # disconnected; apply it to each connected component instead.
    matches = []
    for component in nx.connected_components(b):
        matching = nx.max_weight_matching(b.subgraph(component), maxcardinality = True)
        matches.extend(sorted(pair) for pair in matching)
    matches = pd.DataFrame(matches, columns=[cd_id, census_id])

    # Remove exactly the prefix added above. str.strip("B_ ") would strip any of the
    # characters 'B', '_' and ' ' from both ends ("12 AVENUE B" -> "12 AVENUE").
    matches[cd_id] = matches[cd_id].map(lambda node: node[len(cd_prefix):])
    matches[census_id] = matches[census_id].map(lambda node: node[len(census_prefix):])

    return matches

In [None]:
match = get_matches_dwelling(disambiguated_1850_selected, cd_id = "unique_ward", census_id = "CD_H_ADDRESS")

In [None]:
disambiguated_1850_selected = disambiguated_1850_selected[["CENSUS_IPUMS_UID", "unique_ward"]].copy()
disambiguated_1850_selected.drop_duplicates("unique_ward", inplace = True)
matched = disambiguated_1850_selected.merge(match, how = "left", on = "unique_ward", validate = "one_to_one")
dwelling_addresses = census_1850.merge(matched, how = "left", on = "CENSUS_IPUMS_UID")

In [None]:
print("Proportion of census data assigned addresses:", dwelling_addresses.CD_H_ADDRESS.count()/515630)

Proportion of census data assigned addresses: 0.024061827279250627


This is still higher than last run by tabhita

In [None]:
#Function for filling in households/dwelling numbers if relevant
def check_quant(x, nones, exceptions, col, tuple = False):
    """Propagate a group's single matched address and log the other cases.

    Intended for ``groupby(...).apply``. ``x`` is one household/dwelling group.
    The group is identified by ``(ward, x[col])`` when ``tuple`` is true, else
    by ``x[col]``.

    * no ``CD_H_ADDRESS`` at all: identifier appended to ``nones``;
    * more than one distinct address: identifier appended to ``exceptions``;
    * exactly one: the address is copied to every row of the group.

    Returns:
        DataFrame: the group (a filled copy in the single-address case).
    """
    c = x["CD_H_ADDRESS"].nunique()
    if c == 1:
        # Fill a copy: mutating the object handed over by groupby.apply is not
        # supported by pandas and would also alter the caller's frame.
        x = x.copy()
        x["CD_H_ADDRESS"] = x["CD_H_ADDRESS"].ffill().bfill()
        return x

    if tuple:
        identifier = (x["CENSUS_WARD_NUM"].iloc[0], x[col].iloc[0])
    else:
        identifier = x[col].iloc[0]
    (nones if c == 0 else exceptions).append(identifier)

    return x

In [None]:
dwellings_noadd = []
dwellings = [] # keep track of any dwellings with multiple addresses 
Census_hh_dw = dwelling_addresses.groupby(["CENSUS_WARD_NUM","CENSUS_DWELLING_NUM"]).apply(lambda x: check_quant(x, dwellings_noadd, dwellings, "CENSUS_DWELLING_NUM", tuple = True))

In [None]:
print("Proportion of census data assigned addresses:", Census_hh_dw.CD_H_ADDRESS.count()/515630)

Proportion of census data assigned addresses: 0.7715532455442856


In [None]:
print("Proportion of Dwellings Without Addresses:", len(dwellings_noadd)/Census_hh_dw.groupby(["CENSUS_WARD_NUM","CENSUS_DWELLING_NUM"]).ngroups)

Proportion of Dwellings Without Addresses: 0.37472887767969737


### Interlude: Geog Variable
Investigation shows that institutions have one address, as expected

In [None]:
Census_hh_dw["CENSUS_GEOG"].unique()

array(['NEW YORK WARD 1 EASTERN DIVISION', nan,
       'NEW YORK WARD 1 WESTERN DIVISION', 'NEW YORK WARD 2',
       'NEW YORK WARD 3', 'NEW YORK WARD 4', 'NEW YORK WARD 5',
       'NEW YORK WARD 6', 'NEW YORK WARD 7 DISTRICT 1',
       'NEW YORK WARD 7 DISTRICT 2', 'NEW YORK WARD 8 DISTRICT 1',
       'NEW YORK WARD 8', 'NEW YORK WARD 9 DISTRICT 1',
       'NEW YORK WARD 9 DISTRICT 2', 'NEW YORK WARD 9 DISTRICT 3',
       'NEW YORK WARD 10', 'NEW YORK WARD 11', 'NEW YORK WARD 12',
       'NEW YORK WARD 13', 'NEW YORK WARD 14',
       'NEW YORK WARD 15 WESTERN HALF', 'NEW YORK WARD 15 EASTERN HALF',
       'NEW YORK WARD 16 DISTRICT 1', 'NEW YORK WARD 16 DISTRICT 2',
       'NEW YORK WARD 16 DISTRICT 3', 'NEW YORK WARD 17',
       'NEW YORK WARD 18', 'NEW YORK WARD 18 DISTRICT 2',
       'NEW YORK WARD 18 BELLEVUE HOSPITAL',
       'NEW YORK WARD 18 HOUSE OF REFUGE', 'NEW YORK WARD 19'],
      dtype=object)

In [None]:
hospital = Census_hh_dw[Census_hh_dw["CENSUS_GEOG"] == 'NEW YORK WARD 18 BELLEVUE HOSPITAL']

In [None]:
hospital.nunique()

CENSUS_SERIALP             438
CENSUS_AGE                  61
CENSUS_SEX                   2
CENSUS_MARST                 2
CENSUS_RACE                  2
CENSUS_LABFORCE              2
CENSUS_IMPREL                1
CENSUS_OCCSTR               46
CENSUS_NAMELAST            322
CENSUS_NAMEFRST            117
CENSUS_SEQ_NUM             479
CENSUS_HH_NUM              422
CENSUS_IPUMS_UID           479
CENSUS_CITY                  1
CENSUS_PAGENO_HOUSEHOLD      3
CENSUS_WARD_NUM              1
CENSUS_REEL_HOUSEHOLD        1
CENSUS_PLACE                 1
CENSUS_DWELLING_NUM          1
CENSUS_DWELLING_SEQ          1
CENSUS_DWELLING_SIZE         3
CENSUS_GEOG                  1
CENSUS_LINE                  1
CENSUS_INDEX               479
unique_ward                  1
CD_H_ADDRESS                 1
dtype: int64

In [None]:
refuge = Census_hh_dw[Census_hh_dw["CENSUS_GEOG"] == 'NEW YORK WARD 18 HOUSE OF REFUGE']
refuge.nunique()

CENSUS_SERIALP             372
CENSUS_AGE                  29
CENSUS_SEX                   2
CENSUS_MARST                 2
CENSUS_RACE                  3
CENSUS_LABFORCE              2
CENSUS_IMPREL                1
CENSUS_OCCSTR               34
CENSUS_NAMELAST            330
CENSUS_NAMEFRST            142
CENSUS_SEQ_NUM             411
CENSUS_HH_NUM              394
CENSUS_IPUMS_UID           411
CENSUS_CITY                  1
CENSUS_PAGENO_HOUSEHOLD      1
CENSUS_WARD_NUM              1
CENSUS_REEL_HOUSEHOLD        1
CENSUS_PLACE                 1
CENSUS_DWELLING_NUM          1
CENSUS_DWELLING_SEQ          1
CENSUS_DWELLING_SIZE         2
CENSUS_GEOG                  1
CENSUS_LINE                  2
CENSUS_INDEX               411
unique_ward                  1
CD_H_ADDRESS                 1
dtype: int64


It seems that there are fewer NAs (no address at all for a dwelling) than when using the previous nested approach. Let's try the nested approach and see if that helps, though I am pretty confused about why it would.

### Choose Dwelling Address By Maximum Spatial Weight

In [None]:
#Function for giving a whole dwelling the address of its highest-weight match
def dwelling_weight_fill(x, nones):
    """Assign to every row of a group the match with the largest spatial weight.

    Intended for ``groupby(...).apply``. ``x`` is one dwelling group.

    * no ``CD_H_ADDRESS`` in the group: ``1`` is appended to ``nones`` and the
      group is returned unchanged;
    * otherwise: ``CD_H_ADDRESS``, ``CD_BLOCK_NUM``, ``CD_X`` and ``CD_Y`` of
      the row with the maximum ``spatial_weight`` are written to all rows.

    Returns:
        DataFrame: the group; in the filled case a new frame with a fresh
        0..n-1 index (the passed group is left untouched).
    """
    c = x["CD_H_ADDRESS"].nunique()
    if c == 0:
        nones.append(1)
        return x

    # Work on a re-indexed copy: mutating the object handed over by
    # groupby.apply is not supported by pandas and alters the caller's frame.
    x = x.reset_index(drop = True)
    fields = ["CD_H_ADDRESS", "CD_BLOCK_NUM", "CD_X", "CD_Y"]
    best = x.loc[x["spatial_weight"].idxmax(), fields]
    for field in fields:
        x[field] = best[field]

    return x

In [None]:
dwellings = []
Dwellings_weight = CensusDis1850.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]).apply(lambda x: dwelling_weight_fill(x, dwellings))

len(dwellings)

6880

In [None]:
print("Proportion of census data assigned addresses:", Dwellings_weight.CD_H_ADDRESS.count()/515630)

Proportion of census data assigned addresses: 0.8010298081957993


In [None]:

print("Proportion of Dwellings Without Addresses:", len(dwellings)/CensusDis1850.groupby(["CENSUS_WARD_NUM","CENSUS_DWELLING_NUM"]).ngroups)

Proportion of Dwellings Without Addresses: 0.3470365699873897


### Look at how often we have different dwellings with the same address

In [None]:
org_same_add = defaultdict(list)
def same_address(x, d, col):
    """Record the distinct non-null addresses of group ``x`` in ``d``.

    The array of unique ``CD_H_ADDRESS`` values is appended to the list stored
    in ``d`` under the key ``(ward, x[col])``.
    """
    ward = x["CENSUS_WARD_NUM"].iloc[0]
    unit = x[col].iloc[0]
    recorded = d[(ward, unit)]
    with_address = x.dropna(subset = ["CD_H_ADDRESS"])
    recorded.append(with_address.CD_H_ADDRESS.unique())
for index, group in CensusDis1850.groupby(["CENSUS_WARD_NUM","CENSUS_DWELLING_NUM"]):
    same_address(group, org_same_add, "CENSUS_DWELLING_NUM")

In [None]:
which_dwellings = []
count = 0

for index, ward_df in CensusDis1850.groupby("CENSUS_WARD_NUM"):
    org_same_add = defaultdict(list)
    for i, group in ward_df.groupby("CENSUS_DWELLING_NUM"):
        same_address(group, org_same_add, "CENSUS_DWELLING_NUM")

    # One set of matched addresses per dwelling. The previous test,
    # `addresses1.any() in addresses2`, collapsed the first array to a single
    # value before the membership check, so it never asked whether the two
    # dwellings share ANY address.
    address_sets = {key: set(found[0]) for key, found in org_same_add.items()}

    #check for shared addresses (limit to within ward for efficiency)
    # Every ordered pair is visited, so a sharing pair is counted once in each direction.
    for key1, addresses1 in address_sets.items():
        if not addresses1:
            continue
        for key2, addresses2 in address_sets.items():
            if key1 != key2 and not addresses1.isdisjoint(addresses2):
                count += 1
                which_dwellings.append((key1,key2))

    if index % 2 == 0:
        print("Finished up to index", str(index))

Finished up to index 2
Finished up to index 4
Finished up to index 6
Finished up to index 8
Finished up to index 10
Finished up to index 12
Finished up to index 14
Finished up to index 16
Finished up to index 18


In [None]:
with open('/content/dwellings_same_address.txt', 'w') as filehandle:
    for listitem in which_dwellings:
        line = ' '.join(str(x) for x in listitem)
        filehandle.write(line + '\n')

In [None]:
print("Number of Dwellings that could share the same address:", count)

Number of Dwellings that could share the same address: 11895


In [None]:
Dwellings_weight.reset_index(drop = True, inplace = True)
same_add_selected = Dwellings_weight.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]).first()
num_same = same_add_selected["CD_H_ADDRESS"].count() - same_add_selected["CD_H_ADDRESS"].nunique()
print("Number of Dwellings with Same Address Selected:", num_same)

Number of Dwellings with Same Address Selected: 1844


In [None]:
print("Proportion of Dwellings with Same Address:", num_same/Dwellings_weight.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]).ngroups)

Proportion of Dwellings with Same Address: 0.09301387137452712


In [None]:
print("Dwellings with same address selected out of possiblity:", num_same/count)

Dwellings with same address selected out of possiblity: 0.1550231189575452


### Choose Dwelling Address By Maximum Spatial Weight Sum 

In [None]:
#we can use this to compare between the two versions, because this is where they'll cause differences
# Keep the dwellings whose most frequent matched address occurs more than once.
# value_counts() is sorted by frequency and labelled by address, so the top count
# must be read by position (.iloc[0]); [0] relied on a deprecated positional fallback.
count_sames = CensusDis1850.groupby(["CENSUS_WARD_NUM","CENSUS_DWELLING_NUM"], as_index = False).filter(
    lambda df: df["CD_H_ADDRESS"].count() > 0 and df["CD_H_ADDRESS"].value_counts().iloc[0] > 1
)

In [None]:

count_sames.groupby(["CENSUS_WARD_NUM","CENSUS_DWELLING_NUM"]).ngroups

3407

In [None]:
def dwelling_weight_fill(x):
    """Assign to every row of a group the address with the largest summed weight.

    Intended for ``groupby(...).apply``. ``x`` is one dwelling group. When the
    group has at least one ``CD_H_ADDRESS``, ``spatial_weight`` is summed per
    address into a new ``spatial_weight_sum`` column, and ``CD_H_ADDRESS``,
    ``CD_BLOCK_NUM``, ``CD_X`` and ``CD_Y`` of the first row holding the largest
    sum are written to all rows. Groups without any address are returned as is.

    Returns:
        DataFrame: the group; in the filled case a new frame with a fresh
        0..n-1 index (the passed group is left untouched).
    """
    if x["CD_H_ADDRESS"].count() == 0:
        return x

    # Work on a re-indexed copy: mutating the object handed over by
    # groupby.apply is not supported by pandas and alters the caller's frame.
    x = x.reset_index(drop = True)
    x["spatial_weight_sum"] = x.groupby(["CD_H_ADDRESS"])['spatial_weight'].transform('sum')

    fields = ["CD_H_ADDRESS", "CD_BLOCK_NUM", "CD_X", "CD_Y"]
    best = x.loc[x["spatial_weight_sum"].idxmax(), fields]
    for field in fields:
        x[field] = best[field]

    return x

In [None]:
Dwelling_fill_max = Dwellings_weight.copy()

In [None]:
#We do it this way so that ordering differences don't cause arbitrary differences between the max and sum methods
Dwelling_fill_sum_part1 = count_sames.groupby(["CENSUS_WARD_NUM","CENSUS_DWELLING_NUM"]).apply(dwelling_weight_fill)
Dwelling_fill_sum_part2 = Dwelling_fill_max[~Dwelling_fill_max["CENSUS_IPUMS_UID"].isin(count_sames.CENSUS_IPUMS_UID.values)]
Dwelling_fill_sum = pd.concat([Dwelling_fill_sum_part1, Dwelling_fill_sum_part2])

In [None]:
Dwelling_fill_sum_alt = CensusDis1850.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]).apply(dwelling_weight_fill)

In [None]:
#Add back entries that were eliminated because they don't have a dwelling number 
no_dwelling = CensusDis1850[CensusDis1850["CENSUS_DWELLING_NUM"].isnull()]

In [None]:
Dwelling_fill_sum_alt = pd.concat([Dwelling_fill_sum_alt, no_dwelling])

In [None]:
# Max-weight result plus the records without a dwelling number.
# Built from Dwellings_weight (which Dwelling_fill_max was copied from) rather than
# from Dwelling_fill_max itself, so re-running this cell does not append no_dwelling
# a second time.
Dwelling_fill_max = pd.concat([Dwellings_weight, no_dwelling])
Dwelling_fill_max.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 515630 entries, 0 to 495263
Data columns (total 31 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   CENSUS_SERIALP           515630 non-null  int64  
 1   CENSUS_AGE               515630 non-null  int64  
 2   CENSUS_SEX               515630 non-null  int64  
 3   CENSUS_MARST             515630 non-null  int64  
 4   CENSUS_RACE              515630 non-null  int64  
 5   CENSUS_LABFORCE          515630 non-null  int64  
 6   CENSUS_IMPREL            515630 non-null  int64  
 7   CENSUS_OCCSTR            160426 non-null  object 
 8   CENSUS_NAMELAST          515546 non-null  object 
 9   CENSUS_NAMEFRST          515046 non-null  object 
 10  CENSUS_SEQ_NUM           515630 non-null  int64  
 11  CENSUS_HH_NUM            515630 non-null  int64  
 12  CENSUS_IPUMS_UID         515630 non-null  object 
 13  CENSUS_CITY              515630 non-null  int64  
 14  CENS

In [None]:
# Persist both fill variants (max weight sum and max weight) as CSV, without the index.
Dwelling_fill_sum_alt.to_csv("/content/dwelling_filled_sum_1850_mn.csv", index = False)
Dwelling_fill_max.to_csv("/content/dwelling_filled_max_1850_mn.csv", index = False)

In [None]:
# Share of census records that received an address. The denominator is the frame's own
# row count instead of a hard-coded total, so the figure stays correct if the input changes.
print("Proportion of census data assigned addresses:", Dwelling_fill_sum_alt.CD_H_ADDRESS.count() / len(Dwelling_fill_sum_alt))

# Number of (ward, dwelling) groups in which no record has an address,
# relative to the number of such groups in the full census frame.
n_dwellings_without_address = (
    Dwelling_fill_sum_alt
    .groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"])["CD_H_ADDRESS"]
    .count()
    .eq(0)
    .sum()
)
print("Proportion of Dwellings Without Addresses:", n_dwellings_without_address / CensusDis1850.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]).ngroups)

Proportion of census data assigned addresses: 0.8011655644551325
Proportion of Dwellings Without Addresses: 0.3470365699873897


These proportions match the expected values and are the same as those obtained when selecting the maximum weight instead of the maximum weight sum.

### Look at difference between selecting the maximum and selecting the max sum

In [None]:
# Keep only the records that were assigned an address, in both variants.
Dwelling_fill_sum_alt = Dwelling_fill_sum_alt.dropna(subset=["CD_H_ADDRESS"])
Dwelling_fill_max = Dwelling_fill_max.dropna(subset=["CD_H_ADDRESS"])

In [None]:
# Reduce both variants to the record id plus the assigned address and coordinates.
Dwelling_fill_sum_dropped = Dwelling_fill_sum_alt.loc[:, ["CENSUS_IPUMS_UID", "CD_H_ADDRESS", "CD_X", "CD_Y"]]
Dwelling_fill_max_dropped = Dwelling_fill_max.loc[:, ["CENSUS_IPUMS_UID", "CD_H_ADDRESS", "CD_X", "CD_Y"]]

In [None]:
# Put the two assignments side by side, one row per census record;
# validate raises if a record id is duplicated on either side.
Dwelling_filled_differences = pd.merge(
    Dwelling_fill_sum_dropped,
    Dwelling_fill_max_dropped,
    how="left",
    on="CENSUS_IPUMS_UID",
    suffixes=("_sum", "_max"),
    validate="one_to_one",
)
Dwelling_filled_differences.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 413105 entries, 0 to 413104
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   CENSUS_IPUMS_UID  413105 non-null  object 
 1   CD_H_ADDRESS_sum  413105 non-null  object 
 2   CD_X_sum          413105 non-null  float64
 3   CD_Y_sum          413105 non-null  float64
 4   CD_H_ADDRESS_max  413105 non-null  object 
 5   CD_X_max          413105 non-null  float64
 6   CD_Y_max          413105 non-null  float64
dtypes: float64(4), object(3)
memory usage: 25.2+ MB


In [None]:

# Records for which both methods picked the same address.
Dwelling_filled_same = Dwelling_filled_differences.loc[
    Dwelling_filled_differences["CD_H_ADDRESS_max"].eq(Dwelling_filled_differences["CD_H_ADDRESS_sum"])
]
Dwelling_filled_same.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 378676 entries, 0 to 413104
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   CENSUS_IPUMS_UID  378676 non-null  object 
 1   CD_H_ADDRESS_sum  378676 non-null  object 
 2   CD_X_sum          378676 non-null  float64
 3   CD_Y_sum          378676 non-null  float64
 4   CD_H_ADDRESS_max  378676 non-null  object 
 5   CD_X_max          378676 non-null  float64
 6   CD_Y_max          378676 non-null  float64
dtypes: float64(4), object(3)
memory usage: 23.1+ MB


In [None]:
# Census records that are not among those where the two methods agree.
Census1850_diff = CensusDis1850.loc[
    ~CensusDis1850["CENSUS_IPUMS_UID"].isin(Dwelling_filled_same["CENSUS_IPUMS_UID"].values)
]
Census1850_diff.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136954 entries, 161 to 515629
Data columns (total 31 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   CENSUS_SERIALP           136954 non-null  int64  
 1   CENSUS_AGE               136954 non-null  int64  
 2   CENSUS_SEX               136954 non-null  int64  
 3   CENSUS_MARST             136954 non-null  int64  
 4   CENSUS_RACE              136954 non-null  int64  
 5   CENSUS_LABFORCE          136954 non-null  int64  
 6   CENSUS_IMPREL            136954 non-null  int64  
 7   CENSUS_OCCSTR            41580 non-null   object 
 8   CENSUS_NAMELAST          136924 non-null  object 
 9   CENSUS_NAMEFRST          136765 non-null  object 
 10  CENSUS_SEQ_NUM           136954 non-null  int64  
 11  CENSUS_HH_NUM            136954 non-null  int64  
 12  CENSUS_IPUMS_UID         136954 non-null  object 
 13  CENSUS_CITY              136954 non-null  int64  
 14  CE

In [None]:
# Add the per-(ward, dwelling, address) sum of spatial weights as a display column.
# assign() returns a new frame, so the filtered slice above is not written to.
Census1850_diff = Census1850_diff.assign(
    spatial_weight_sum=Census1850_diff.groupby(
        ["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM", "CD_H_ADDRESS"]
    )["spatial_weight"].transform("sum")
)

In [None]:
# Attach the sum/max address choices to the full census columns of the differing records.
Dwelling_filled_differences_all = pd.merge(
    Census1850_diff,
    Dwelling_filled_differences,
    on="CENSUS_IPUMS_UID",
    how="left",
)
Dwelling_filled_differences_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136954 entries, 0 to 136953
Data columns (total 38 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   CENSUS_SERIALP           136954 non-null  int64  
 1   CENSUS_AGE               136954 non-null  int64  
 2   CENSUS_SEX               136954 non-null  int64  
 3   CENSUS_MARST             136954 non-null  int64  
 4   CENSUS_RACE              136954 non-null  int64  
 5   CENSUS_LABFORCE          136954 non-null  int64  
 6   CENSUS_IMPREL            136954 non-null  int64  
 7   CENSUS_OCCSTR            41580 non-null   object 
 8   CENSUS_NAMELAST          136924 non-null  object 
 9   CENSUS_NAMEFRST          136765 non-null  object 
 10  CENSUS_SEQ_NUM           136954 non-null  int64  
 11  CENSUS_HH_NUM            136954 non-null  int64  
 12  CENSUS_IPUMS_UID         136954 non-null  object 
 13  CENSUS_CITY              136954 non-null  int64  
 14  CENS

In [None]:
# Discard the records that have no spatial weight.
Dwelling_filled_differences_all = Dwelling_filled_differences_all.dropna(subset=["spatial_weight"])

In [None]:
# Show the first rows of the first five (ward, dwelling) groups as examples.
for i, (index, group) in enumerate(
    Dwelling_filled_differences_all.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]),
    start=1,
):
    display(group.head())
    if i == 5:
        break

Unnamed: 0,CENSUS_SERIALP,CENSUS_AGE,CENSUS_SEX,CENSUS_MARST,CENSUS_RACE,CENSUS_LABFORCE,CENSUS_IMPREL,CENSUS_OCCSTR,CENSUS_NAMELAST,CENSUS_NAMEFRST,CENSUS_SEQ_NUM,CENSUS_HH_NUM,CENSUS_IPUMS_UID,CENSUS_CITY,CENSUS_PAGENO_HOUSEHOLD,CENSUS_WARD_NUM,CENSUS_REEL_HOUSEHOLD,CENSUS_PLACE,CENSUS_DWELLING_NUM,CENSUS_DWELLING_SEQ,CENSUS_DWELLING_SIZE,CENSUS_GEOG,CENSUS_LINE,CENSUS_INDEX,CENSUS_ID,CD_H_ADDRESS,selected,spatial_weight,CD_X,CD_Y,CD_BLOCK_NUM,spatial_weight_sum,CD_H_ADDRESS_sum,CD_X_sum,CD_Y_sum,CD_H_ADDRESS_max,CD_X_max,CD_Y_max
1713,2045391,32,1,1,120,2,3,MERCHANT,CHURCHILL,WILLIAM H,5,405981,df227495-1eee-44f2-a4f4-2b28e523e819,4610,1760,1,534,MANHATTAN,6.0,12.0,52,NEW YORK WARD 1 WESTERN DIVISION,130,7114,df227495-1eee-44f2-a4f4-2b28e523e819,46 CEDAR ST,1.0,2.0,-74.008955,40.707471,mn1850_01_74.0094_40.7075,2.0,96 LIBERTY ST,-74.011527,40.709551,46 CEDAR ST,-74.008955,40.707471
1734,2045391,23,1,6,120,2,12,CLERK,COFFIN,C B,26,405999,7e9951c2-ad41-4798-a614-8e484c007230,4610,1760,1,534,MANHATTAN,6.0,12.0,52,NEW YORK WARD 1 WESTERN DIVISION,130,7135,7e9951c2-ad41-4798-a614-8e484c007230,96 LIBERTY ST,1.0,2.0,-74.011527,40.709551,mn1850_01_74.0115_40.7093,4.0,96 LIBERTY ST,-74.011527,40.709551,46 CEDAR ST,-74.008955,40.707471
1739,2045391,21,1,1,120,2,8,CLERK,HAINES,WILLIAM C,31,406004,faa0c4fd-6111-474b-b0f9-233ad86a9ec5,4610,1760,1,534,MANHATTAN,6.0,12.0,52,NEW YORK WARD 1 WESTERN DIVISION,130,7140,faa0c4fd-6111-474b-b0f9-233ad86a9ec5,96 LIBERTY ST,1.0,2.0,-74.011527,40.709551,mn1850_01_74.0115_40.7093,4.0,96 LIBERTY ST,-74.011527,40.709551,46 CEDAR ST,-74.008955,40.707471
1743,2045391,23,1,6,120,2,12,CLERK,WHITE,JOHN H,35,406007,07084d13-5bbc-499d-be5d-d2a636068c86,4610,1760,1,534,MANHATTAN,6.0,12.0,52,NEW YORK WARD 1 WESTERN DIVISION,130,7144,07084d13-5bbc-499d-be5d-d2a636068c86,132 LIBERTY ST,1.0,1.857692,-74.013174,40.710311,mn1850_01_74.0131_40.7101,1.857692,96 LIBERTY ST,-74.011527,40.709551,46 CEDAR ST,-74.008955,40.707471
1754,2045391,47,1,1,120,2,12,PHYSICIAN,PRATT,PETER,46,406017,042be81b-7121-4691-a63f-360a0194faac,4610,1760,1,534,MANHATTAN,6.0,12.0,52,NEW YORK WARD 1 WESTERN DIVISION,130,7155,042be81b-7121-4691-a63f-360a0194faac,98 LIBERTY ST,1.0,2.0,-74.011642,40.709606,mn1850_01_74.0115_40.7093,2.0,96 LIBERTY ST,-74.011527,40.709551,46 CEDAR ST,-74.008955,40.707471


Unnamed: 0,CENSUS_SERIALP,CENSUS_AGE,CENSUS_SEX,CENSUS_MARST,CENSUS_RACE,CENSUS_LABFORCE,CENSUS_IMPREL,CENSUS_OCCSTR,CENSUS_NAMELAST,CENSUS_NAMEFRST,CENSUS_SEQ_NUM,CENSUS_HH_NUM,CENSUS_IPUMS_UID,CENSUS_CITY,CENSUS_PAGENO_HOUSEHOLD,CENSUS_WARD_NUM,CENSUS_REEL_HOUSEHOLD,CENSUS_PLACE,CENSUS_DWELLING_NUM,CENSUS_DWELLING_SEQ,CENSUS_DWELLING_SIZE,CENSUS_GEOG,CENSUS_LINE,CENSUS_INDEX,CENSUS_ID,CD_H_ADDRESS,selected,spatial_weight,CD_X,CD_Y,CD_BLOCK_NUM,spatial_weight_sum,CD_H_ADDRESS_sum,CD_X_sum,CD_Y_sum,CD_H_ADDRESS_max,CD_X_max,CD_Y_max
1838,2045424,43,2,6,120,0,1,,DUTCHER,CATHARINE B,1,406124,db5340b3-1f19-4350-8a56-af1670443574,4610,1810,1,534,MANHATTAN,21.0,45.0,36,NEW YORK WARD 1 WESTERN DIVISION,190,7324,db5340b3-1f19-4350-8a56-af1670443574,94 LIBERTY ST,1.0,1.9,-74.011412,40.709496,mn1850_01_74.0115_40.7093,3.9,94 LIBERTY ST,-74.011412,40.709496,58 CEDAR ST,-74.009383,40.707751
1850,2045424,20,1,6,120,2,12,CLERK,ALLEN,HENRY S,13,406128,d09684f1-b2f9-4135-a827-308438b7a73f,4610,1810,1,534,MANHATTAN,21.0,45.0,36,NEW YORK WARD 1 WESTERN DIVISION,190,7336,d09684f1-b2f9-4135-a827-308438b7a73f,58 CEDAR ST,1.0,2.0,-74.009383,40.707751,mn1850_01_74.0094_40.7075,2.0,94 LIBERTY ST,-74.011412,40.709496,58 CEDAR ST,-74.009383,40.707751
1852,2045424,26,1,6,120,2,12,MERCHANT,ELLISON,BENJAMIN,15,406129,a8615f18-14a4-4ecc-909e-a4ba43dca6f5,4610,1810,1,534,MANHATTAN,21.0,45.0,36,NEW YORK WARD 1 WESTERN DIVISION,190,7338,a8615f18-14a4-4ecc-909e-a4ba43dca6f5,94 LIBERTY ST,1.0,2.0,-74.011412,40.709496,mn1850_01_74.0115_40.7093,3.9,94 LIBERTY ST,-74.011412,40.709496,58 CEDAR ST,-74.009383,40.707751


Unnamed: 0,CENSUS_SERIALP,CENSUS_AGE,CENSUS_SEX,CENSUS_MARST,CENSUS_RACE,CENSUS_LABFORCE,CENSUS_IMPREL,CENSUS_OCCSTR,CENSUS_NAMELAST,CENSUS_NAMEFRST,CENSUS_SEQ_NUM,CENSUS_HH_NUM,CENSUS_IPUMS_UID,CENSUS_CITY,CENSUS_PAGENO_HOUSEHOLD,CENSUS_WARD_NUM,CENSUS_REEL_HOUSEHOLD,CENSUS_PLACE,CENSUS_DWELLING_NUM,CENSUS_DWELLING_SEQ,CENSUS_DWELLING_SIZE,CENSUS_GEOG,CENSUS_LINE,CENSUS_INDEX,CENSUS_ID,CD_H_ADDRESS,selected,spatial_weight,CD_X,CD_Y,CD_BLOCK_NUM,spatial_weight_sum,CD_H_ADDRESS_sum,CD_X_sum,CD_Y_sum,CD_H_ADDRESS_max,CD_X_max,CD_Y_max
50,2044374,20,1,6,120,2,12,SAILOR,HARVEY,CHARLES,2,401916,a867d110-1453-413d-9e6a-831b168a71df,4610,120,1,534,MANHATTAN,28.0,28.0,3,NEW YORK WARD 1 EASTERN DIVISION,200,231,a867d110-1453-413d-9e6a-831b168a71df,51 WILLIAM ST,1.0,2.0,-74.009179,40.706848,mn1850_01_74.0098_40.7070,2.0,95 CEDAR ST,-74.011213,40.709056,51 WILLIAM ST,-74.009179,40.706848
1945,2045435,36,1,1,120,2,1,LABORER,SMITH,PATRICK,1,406226,18ab29cf-b879-4838-8f48-9a37040c2a8c,4610,1840,1,534,MANHATTAN,28.0,56.0,5,NEW YORK WARD 1 WESTERN DIVISION,250,7456,18ab29cf-b879-4838-8f48-9a37040c2a8c,95 CEDAR ST,1.0,1.883333,-74.011213,40.709056,mn1850_01_74.0110_40.7091,5.833333,95 CEDAR ST,-74.011213,40.709056,51 WILLIAM ST,-74.009179,40.706848
1950,2045436,45,1,1,120,2,1,TAILOR,SALTER,JOHN,1,406228,41e79e97-d423-429a-81d4-a1da2dc6a81a,4610,1840,1,534,MANHATTAN,28.0,57.0,2,NEW YORK WARD 1 WESTERN DIVISION,300,7461,41e79e97-d423-429a-81d4-a1da2dc6a81a,95 CEDAR ST,1.0,1.95,-74.011213,40.709056,mn1850_01_74.0110_40.7091,5.833333,95 CEDAR ST,-74.011213,40.709056,51 WILLIAM ST,-74.009179,40.706848
1952,2045437,30,1,1,120,2,1,PORTER,MURPHY,OWEN,1,406229,9cafeaec-bcf6-4620-9a94-d3e05e668933,4610,1840,1,534,MANHATTAN,28.0,58.0,9,NEW YORK WARD 1 WESTERN DIVISION,320,7463,9cafeaec-bcf6-4620-9a94-d3e05e668933,95 CEDAR ST,1.0,2.0,-74.011213,40.709056,mn1850_01_74.0110_40.7091,5.833333,95 CEDAR ST,-74.011213,40.709056,51 WILLIAM ST,-74.009179,40.706848
1958,2045437,28,1,6,120,2,12,HACK DRIVER,BOYD,WILLIAM,7,406231,f74b012d-6216-404a-9113-5f9f70493bd1,4610,1840,1,534,MANHATTAN,28.0,58.0,9,NEW YORK WARD 1 WESTERN DIVISION,320,7469,f74b012d-6216-404a-9113-5f9f70493bd1,132 BROADWAY,1.0,1.95,-74.010699,40.708885,mn1850_01_74.0101_40.7086,1.95,95 CEDAR ST,-74.011213,40.709056,51 WILLIAM ST,-74.009179,40.706848


Unnamed: 0,CENSUS_SERIALP,CENSUS_AGE,CENSUS_SEX,CENSUS_MARST,CENSUS_RACE,CENSUS_LABFORCE,CENSUS_IMPREL,CENSUS_OCCSTR,CENSUS_NAMELAST,CENSUS_NAMEFRST,CENSUS_SEQ_NUM,CENSUS_HH_NUM,CENSUS_IPUMS_UID,CENSUS_CITY,CENSUS_PAGENO_HOUSEHOLD,CENSUS_WARD_NUM,CENSUS_REEL_HOUSEHOLD,CENSUS_PLACE,CENSUS_DWELLING_NUM,CENSUS_DWELLING_SEQ,CENSUS_DWELLING_SIZE,CENSUS_GEOG,CENSUS_LINE,CENSUS_INDEX,CENSUS_ID,CD_H_ADDRESS,selected,spatial_weight,CD_X,CD_Y,CD_BLOCK_NUM,spatial_weight_sum,CD_H_ADDRESS_sum,CD_X_sum,CD_Y_sum,CD_H_ADDRESS_max,CD_X_max,CD_Y_max
1961,2045438,33,1,1,120,2,1,MERCHANT,JOHNSON,JEREMIAH,1,406234,7f1291e5-4ac8-478e-bde6-fb5095cbe159,4610,1840,1,534,MANHATTAN,29.0,59.0,12,NEW YORK WARD 1 WESTERN DIVISION,410,7472,7f1291e5-4ac8-478e-bde6-fb5095cbe159,97 CEDAR ST,1.0,1.95,-74.011379,40.709132,mn1850_01_74.0110_40.7091,3.883333,97 CEDAR ST,-74.011379,40.709132,"97 CEDAR """" ST",-74.011379,40.709132
1977,2045441,26,1,1,120,2,1,TAILOR,CULLIN,JAMES,1,406241,7c9f8635-8953-49b1-bcc2-9db215fb6978,4610,1850,1,534,MANHATTAN,29.0,62.0,5,NEW YORK WARD 1 WESTERN DIVISION,150,7488,7c9f8635-8953-49b1-bcc2-9db215fb6978,97 CEDAR ST,1.0,1.933333,-74.011379,40.709132,mn1850_01_74.0110_40.7091,3.883333,97 CEDAR ST,-74.011379,40.709132,"97 CEDAR """" ST",-74.011379,40.709132
1982,2045442,46,1,1,120,2,1,GOLDSMITH,SELIGER,EDWARD,1,406243,a3a16d8c-0fe4-45a0-9aab-b0004fff2c75,4610,1850,1,534,MANHATTAN,29.0,63.0,3,NEW YORK WARD 1 WESTERN DIVISION,200,7493,a3a16d8c-0fe4-45a0-9aab-b0004fff2c75,"97 CEDAR """" ST",1.0,2.0,-74.011379,40.709132,mn1850_01_74.0110_40.7091,2.0,97 CEDAR ST,-74.011379,40.709132,"97 CEDAR """" ST",-74.011379,40.709132


Unnamed: 0,CENSUS_SERIALP,CENSUS_AGE,CENSUS_SEX,CENSUS_MARST,CENSUS_RACE,CENSUS_LABFORCE,CENSUS_IMPREL,CENSUS_OCCSTR,CENSUS_NAMELAST,CENSUS_NAMEFRST,CENSUS_SEQ_NUM,CENSUS_HH_NUM,CENSUS_IPUMS_UID,CENSUS_CITY,CENSUS_PAGENO_HOUSEHOLD,CENSUS_WARD_NUM,CENSUS_REEL_HOUSEHOLD,CENSUS_PLACE,CENSUS_DWELLING_NUM,CENSUS_DWELLING_SEQ,CENSUS_DWELLING_SIZE,CENSUS_GEOG,CENSUS_LINE,CENSUS_INDEX,CENSUS_ID,CD_H_ADDRESS,selected,spatial_weight,CD_X,CD_Y,CD_BLOCK_NUM,spatial_weight_sum,CD_H_ADDRESS_sum,CD_X_sum,CD_Y_sum,CD_H_ADDRESS_max,CD_X_max,CD_Y_max
90,2044406,27,1,1,120,2,1,PORTER,LUCEY,DENNIS,1,402021,3bd314e4-14f4-4dba-969f-11bf9f63afd9,4610,150,1,534,MANHATTAN,53.0,60.0,4,NEW YORK WARD 1 EASTERN DIVISION,370,373,3bd314e4-14f4-4dba-969f-11bf9f63afd9,46 PEARL ST,1.0,1.9108,-74.011901,40.703302,mn1850_01_74.0132_40.7030,3.8104,46 PEARL ST,-74.011901,40.703302,112 CEDAR ST,-74.012473,40.709475
94,2044407,36,1,1,120,2,1,BAKER,FINERTY,THOMAS,1,402022,9ba10e85-2d1a-4783-9606-b48104a278be,4610,150,1,534,MANHATTAN,53.0,61.0,7,NEW YORK WARD 1 EASTERN DIVISION,410,377,9ba10e85-2d1a-4783-9606-b48104a278be,46 PEARL ST,1.0,1.8996,-74.011901,40.703302,mn1850_01_74.0132_40.7030,3.8104,46 PEARL ST,-74.011901,40.703302,112 CEDAR ST,-74.012473,40.709475
2205,2045525,28,1,6,120,2,1,TAILOR,DIMOND,MICHAEL,1,406438,975a81d8-87ce-4769-89d9-eb75a54e06e3,4610,1950,1,534,MANHATTAN,53.0,147.0,1,NEW YORK WARD 1 WESTERN DIVISION,50,7898,975a81d8-87ce-4769-89d9-eb75a54e06e3,112 CEDAR ST,1.0,2.0,-74.012473,40.709475,mn1850_01_74.0124_40.7093,2.0,46 PEARL ST,-74.011901,40.703302,112 CEDAR ST,-74.012473,40.709475


In [None]:
# Number of distinct (ward, dwelling) groups present in the differences frame.
Dwelling_filled_differences_all.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]).ngroups

765

In [None]:
# Persist the differences frame as CSV, without the index.
Dwelling_filled_differences_all.to_csv("/content/dwelling_filled_differences_all_mn_1850.csv", index = False)

### Conclusion
Filling in addresses based on either the maximum spatial weight or the maximum spatial weight sum would make sense; however, it is somewhat unclear which one is the better choice. In addition, ties between spatial weights are common. The code currently selects the first option when there is a tie, but it is unclear whether that is the best approach.

### Quick sanity check

In [None]:
# Reload the outputs written by the two methods (max weight and max weight sum).
dwell_max = pd.read_csv("/content/dwelling_filled_max_1850_mn.csv")
dwell_sum = pd.read_csv("/content/dwelling_filled_sum_1850_mn.csv")

In [None]:
# Row count and column summary of the reloaded max-weight output.
dwell_max.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 515630 entries, 0 to 515629
Data columns (total 31 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   CENSUS_SERIALP           515630 non-null  int64  
 1   CENSUS_AGE               515630 non-null  int64  
 2   CENSUS_SEX               515630 non-null  int64  
 3   CENSUS_MARST             515630 non-null  int64  
 4   CENSUS_RACE              515630 non-null  int64  
 5   CENSUS_LABFORCE          515630 non-null  int64  
 6   CENSUS_IMPREL            515630 non-null  int64  
 7   CENSUS_OCCSTR            160426 non-null  object 
 8   CENSUS_NAMELAST          515546 non-null  object 
 9   CENSUS_NAMEFRST          515046 non-null  object 
 10  CENSUS_SEQ_NUM           515630 non-null  int64  
 11  CENSUS_HH_NUM            515630 non-null  int64  
 12  CENSUS_IPUMS_UID         515630 non-null  object 
 13  CENSUS_CITY              515630 non-null  int64  
 14  CENS

In [None]:
# Row count and column summary of the reloaded max-sum output.
# NOTE(review): presumably the extra column relative to dwell_max is the
# spatial_weight_sum column added by dwelling_weight_fill; confirm.
dwell_sum.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 515630 entries, 0 to 515629
Data columns (total 32 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   CENSUS_SERIALP           515630 non-null  int64  
 1   CENSUS_AGE               515630 non-null  int64  
 2   CENSUS_SEX               515630 non-null  int64  
 3   CENSUS_MARST             515630 non-null  int64  
 4   CENSUS_RACE              515630 non-null  int64  
 5   CENSUS_LABFORCE          515630 non-null  int64  
 6   CENSUS_IMPREL            515630 non-null  int64  
 7   CENSUS_OCCSTR            160426 non-null  object 
 8   CENSUS_NAMELAST          515546 non-null  object 
 9   CENSUS_NAMEFRST          515046 non-null  object 
 10  CENSUS_SEQ_NUM           515630 non-null  int64  
 11  CENSUS_HH_NUM            515630 non-null  int64  
 12  CENSUS_IPUMS_UID         515630 non-null  object 
 13  CENSUS_CITY              515630 non-null  int64  
 14  CENS