In [None]:
!pip install pyjarowinkler
!pip install haversine
!pip install hdbscan

### write processing module

In [2]:
# processing
%%writefile processing.py

import pandas as pd
import networkx as nx
import numpy as np
from pyjarowinkler import distance
from haversine import haversine, Unit
import time


def col_for_disamb(df, cd_id, cen_id, cd_fn="CD_FIRST_NAME", cen_fn="CENSUS_FIRST_NAME", cd_ln="CD_LAST_NAME",
                           cen_ln="CENSUS_LAST_NAME", cen_occ="CENSUS_OCCUPATION", cen_age="CENSUS_AGE"):
    # name jw dist
    df["jw_fn"] = df.apply(lambda x: distance.get_jaro_distance(x[cd_fn], x[cen_fn], winkler=True, scaling=0.1), axis=1)
    df["jw_ln"] = df.apply(lambda x: distance.get_jaro_distance(x[cd_ln], x[cen_ln], winkler=True, scaling=0.1), axis=1)
    df["jw_score"] = 0.4 * df["jw_fn"] + 0.6 * df["jw_ln"]

    # occ
    df['occ_listed'] = np.where((df[cen_occ].isnull()) | (df[cen_occ] == '*'), 0, 1)

    # age
    df['age_score'] = np.where(df[cen_age] <= 12, 0, 1)

    # cd conflicts
    df["cd_count"] = df.groupby(cd_id)[cen_id].transform('count')
    df["census_count"] = df.groupby(cen_id)[cd_id].transform('count')

    df['census_count_inverse'] = 1 / df['census_count']
    df['cd_count_inverse'] = 1 / df['cd_count']

    #This is so the bipartite matching algorthm works the way we need it to
    df['CD_ID'] = 'CD_' + df[cd_id].astype(str)
    df['CENSUS_ID'] = 'CENSUS_' + df[cen_id].astype(str)

    return df


"""
Applies confidence score to df
"""
def apply_confidence_score(df, cd_fn = "CD_FIRST_NAME", cen_fn = "CENSUS_FIRST_NAME", cd_ln = "CD_LAST_NAME", cen_ln = "CENSUS_LAST_NAME", cen_occ = "CENSUS_OCCLABELB", cen_age = "CENSUS_AGE", cd_id="OBJECTID", cen_id="OBJECTID.x"):
    
    # name jw dist
    df["jw_fn"] = df.apply(lambda x: distance.get_jaro_distance(x[cd_fn], x[cen_fn], winkler=True, scaling=0.1), axis = 1)
    df["jw_ln"] = df.apply(lambda x: distance.get_jaro_distance(x[cd_ln], x[cen_ln], winkler=True, scaling=0.1), axis = 1)
    df["jw_score"] = 0.4 * df["jw_fn"] + 0.6 * df["jw_ln"]

    # occ
    df['occ_listed'] = np.where((df[cen_occ].isnull()) | (df[cen_occ] == '*'), 0, 1)

    # age
    df['age_score'] = np.where(df[cen_age] <= 12, 0, 1)

    # cd conflicts
    df["cd_count"] = df.groupby(cd_id)[cen_id].transform('count')
    df["census_count"] = df.groupby(cen_id)[cd_id].transform('count')

    df['confidence_score'] = .5*df.jw_score + .2*(1/df.cd_count) + \
                             .2*(1/df.census_count) + .05*df.occ_listed + \
                             .05*df.age_score
    df['confidence_score'] = df['confidence_score'].round(decimals = 2)

    return df


#not needed in new run
"""
Takes elastic search and census directory geocode file to create a dataframe 
ready for the disambiguation process.
Can add/incorporate new columns to include in confidence score here
elastic_search: either df with elastic search output or elastic search output file
city_directory: either df with city directory data or file name
file: boolean value, set to True if elastic_search/city_directory are file names otherwise set 
to false. Default false 
"""
def elastic_to_disamb(elastic_search, city_directory, file = False):

    if file:
        elastic_search = pd.read_csv(elastic_search, sep='\t', engine='python')
        city_directory = pd.read_csv(city_directory)

    else:
        elastic_search = elastic_search.copy()
        city_directory = city_directory.copy()

    latlng = city_directory[['OBJECTID', 'LONG', 'LAT']]

    #print(elastic_search.head())
    #print(latlng.head())

    match = apply_confidence_score(elastic_search, cen_fn='CENSUS_NAMEFRSTB', cen_ln='CENSUS_NAMELASTB',
                                      cen_occ='CENSUS_OCCLABELB', cen_id='OBJECTID.x')

    #print(match.head())
    match['CD_ID'] = 'CD_' + match['OBJECTID'].astype(str)
    match['CENSUS_ID'] = 'CENSUS_' + match['OBJECTID.x'].astype(str)

    #Can remove this after finalizing the confidence score
    match['census_count_inverse'] = 1 / match['census_count']
    match['cd_count_inverse'] = 1 / match['cd_count']

    match = match.merge(latlng, how='left', on='OBJECTID', validate='many_to_one')
    #print(match.head())
    return match
"""
Create a list of dataframes where the top row is an anchor
Each dataframe is one where spatial disambiguation will be applied
This is necessary as else, algorithms take too long to run
Match: df of matches
confidence_score: name of confidence score column
"""

def split_dfs(match, sort_var="CENSUS_ID", confidence="confidence_score"):

    match = match.sort_values(by=[sort_var])

    # identify anchors and assign anchor ID
    match['anchor'] = np.where(match[confidence] == 1, 1, None)
    sub_group = pd.DataFrame({'index': list(match.loc[match.anchor.notnull(), :].index), 'group_ID': range(0, sum(match['anchor'].notnull()))}).set_index('index')
    match = match.join(sub_group)
    match['group_ID'] = match['group_ID'].fillna(method='ffill').fillna(method='backfill')

    # split df into multiple df, each bounded by anchor

    # sub_group_dict = {group: df for group, df in match.groupby('group_ID')}
    sub_groups = [df for group, df in match.groupby('group_ID')]
    
    # add bottom anchor back
    """
    for i in range(0, len(sub_group_dict) - 1):
        sub_group_dict[i] = pd.concat([sub_group_dict[i], sub_group_dict[i+1][0:1]])
    """
    return sub_groups

"""
Create node ID for each match, to be using the shortest path algorithm 
sub must be a df with each row as a potential match between a CD and census record. It must contain the columns CD_ID, CENSUS_ID, LONG, LAT, confidence_score and MATCH_ADDR.
the column names can be specified individually if they are named differently
Returns the dataframe with new columns, 'anchor', 'node_ID' and 'letter'.
anchor: whether row is an anchor (confidence score = 1)
node_ID: unique node ID. each node is a match, so e.g. A0 and A1 refers to two potential CD matches for the same census record
letter: grouping for identical census records 
add_prefixes: whether to add prefixes 'CD_' and 'CENSUS_' to cd_id and census_id respectively. prefixes are required for subsequent bipartite matching
"""
def create_path_df(sub_graph, census_id = "CENSUS_ID"):

    sub_graph['node_ID'] = sub_graph.groupby(census_id).cumcount()

    letter_id = sub_graph[census_id].unique().tolist()
    letters = ['N' + str(x) for x in range(0, len(letter_id))]
    letter_id = pd.DataFrame({'CENSUS_ID': letter_id, 'letter': letters})

    sub_graph = sub_graph.merge(letter_id, how='left', left_on=census_id, right_on="CENSUS_ID", validate='many_to_one')

    sub_graph['node_ID'] = sub_graph.apply(lambda row: row.letter + '_' + str(row.node_ID), axis=1)

    return sub_graph

"""
Creates a graph from the sub_graph dataframe
Each node being a potential CD-census match and 
    each edge being the link between the potential CD records of consecutive census records
The weight of each edge = the haversine distance between the two
cluster_col: name of column with cluster group. If does not exist, use None
Returns the graph object
"""

def create_path_graph(g, cluster_col='in_cluster_x', lat='CD_X', lon='CD_Y'):

    g.loc[:, 'key'] = 0
    g = g.merge(g, on='key')

    #This is time consuming
    g['key'] = g.apply(lambda row: 1 if int(row.letter_x[1:]) - int(row.letter_y[1:]) == -1 else 0, axis = 1)

    g = g[g.key == 1]

    g['weight'] = g.apply(lambda row: haversine((row[lat + '_y'], row[lon + '_y']), (row[lat + '_x'], row[lon + '_x']), unit=Unit.METERS), axis=1)

    if cluster_col != None:
        g['weight'] = g.apply(lambda row: row.weight + 999 if row[cluster_col] == -1 else row.weight, axis=1)

    g_edges = [(row.node_ID_x, row.node_ID_y, row.weight) for index, row in g.iterrows()]
    graph = nx.DiGraph()
    graph.add_weighted_edges_from(g_edges)

    
    return graph

Writing processing.py


### write disambiguation module

In [3]:
# disambiguation
%%writefile disambiguation.py

import pandas as pd
import networkx as nx
import hdbscan
from itertools import islice
#import disambiguation.processing as dp
import processing as dp
import time

"""
Wrapper function for everything below, including checks
Designed to work within list comprehension only! (refer to Disambiguator())
Works by applying algorithm to specified df (using index i) in the list
sub_groups: list of dfs, each df being a subset of the census bounded by 2 anchors
i: index of df in the list
"""
def apply_algo(sub_groups, i, cluster=True, k_between=True, census_id='CENSUS_ID', census_count="census_count", confidence='confidence_score', lat="CD_X", lon="CD_Y", cluster_kwargs={}, path_kwargs={}):

    if i % 1000 == 0:
        print("Reached: " + str(i))
    df = sub_groups[i]
    if sum(df[census_count] > 1) == 0: # no disambiguation needed
        return df

    if i + 1 < len(sub_groups): # add bottom anchor
        df = pd.concat([df, sub_groups[i+1][0:1]]) 

    path_df = dp.create_path_df(df, census_id)


    if cluster:
        # apply density clustering and remove outlier nodes
        path_df = apply_density_clustering(path_df, lat, lon, **cluster_kwargs)
        cluster_arg = 'in_cluster_x'
    else:
        cluster_arg = None

    # create graph and k shortest paths centrality

    g = dp.create_path_graph(path_df, cluster_col=cluster_arg, lat=lat, lon=lon)

    if k_between:

        output = apply_k_betweenness(path_df, g, confidence = confidence, **path_kwargs)
    else:
        output = apply_shortest_path(path_df, g, confidence = confidence, **path_kwargs)

    return output

"""
Apply Dijkstra's algorithm to the graph and get spatial weights
Spatial weights are computed as confidence score +1 if match was included in shortest path, and confidence score + 0 otherwise
df: dataframe of records with confidence score and node ID, names can be modified via parameters
graph: graph object created from create_path_graph()
source: start node, e.g. 'A0'. By default it chooses first row in the table
target: end node, e.g. 'J0'. By default it chooses last row in the table
Returns a dataframe of matches with added 'spatial weight'
"""
def apply_shortest_path(df, graph, source = None, target = None, confidence = 'confidence_score', node_id = 'node_ID'):
    if source == None:
        source = list(df[node_id])[0]
    if target == None:
        target = list(df[node_id])[-1]

    path = nx.dijkstra_path(graph, source, target)
    df['spatial_weight'] = df.apply(lambda row: row[confidence] + 1 if row[node_id] in path else row[confidence], axis = 1)

    return df

"""
Apply betweenness centrality using k shortest paths (as documented in spatial_disambiguation.ipynb)
df: df of matches
graph: graph object created from create_path_graph()
source: start node, e.g. 'A0'. By default it chooses first row in the table
target: end node, e.g. 'J0'. By default it chooses last row in the table
k: how many shortest paths to choose from (absolute number). By default 1 or ~ 1/2 of number of possible paths if there are more than 30 paths
scale: how much to scale the score by when adding it with confidence score. Default = 1 (equal weight of confidence score and spatial weight)
Returns
    df: df with spatial weights column
    k_paths: paths used for calculation
"""
def apply_k_betweenness(df, graph, confidence = "confidence_score", source=None, target=None, k=None, scale=1):
    if source == None:
        source = list(df["node_ID"])[0]
    if target == None:
        target = list(df["node_ID"])[-1]

    k_paths = nx.shortest_simple_paths(graph, source, target, weight="weight")

    length = get_n_paths(df)
    if k == None:
        if length < 31:
            k = 1
        elif length > 50:
            k = 50
        else:
            k = int(0.5 * length)

    k_paths = list(islice(k_paths, k))

    # initialize output: dict with nodes as keys
    spatial_weights = dict.fromkeys(graph.nodes, 0)
    
    # count
    for path in k_paths:
        for node in path:
            spatial_weights[node] += 1
    
    spatial_weights = [[key , round(value / k, 2) * scale] for key, value in spatial_weights.items()]
    spatial_df = pd.DataFrame(spatial_weights, columns=["node_ID", 'spatial_weight'])
    df = df.merge(spatial_df, how="inner", on="node_ID", validate="one_to_one")
    df['spatial_weight'] = df['spatial_weight'] + df[confidence]

    return df

"""
Helper method to count the number of possible paths in the graph
"""
def get_n_paths(df):
    k = 1
    counts = df.groupby('letter')['letter'].size().to_list()
    for count in counts:
        k *= count
    
    return k

"""
Apply density based clustering to detect outliers. Requires `hdbscan` library
Refer to hdbscan documentation on parameters
Returns df with a column 'in_cluster' indicating which cluster the nodes are in
"""
def apply_density_clustering(df, lat='CD_X', lon="CD_Y", min_cluster_size=10, min_samples=10, allow_single_cluster=True, **kwargs):
    cluster_sub = df.loc[:, [lon, lat]]
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples, allow_single_cluster=allow_single_cluster, **kwargs).fit(cluster_sub)

    df['in_cluster'] = pd.Series(clusterer.labels_).values
    return df

"""
A bipartite graph is created from the matches, with each node being either a census or CD record and each edge indicating a potential match. 
Note that subgraph MUST have prefixes on the cd_id ('CD_') and census_id ('CENSUS_') columns
The matching algorithm (maximum weighted matching) will 
    (1) select sets of matches that give the highest number of matches 
    (2) choose the match set that has the highest weight based on that
Returns a dictionary with 'graph' as the list of bipartite graphs and 'results' being the original df with an additional 'selected' column, indicating the correct match and 'graph_id' column, indicated subgraph.
"""
def get_matches(df, cd_id = 'CD_ID', census_id = 'CENSUS_ID', weight = 'spatial_weight'):
    b_edges = [(row[cd_id], row[census_id], row[weight]) for index, row in df.iterrows()]
    b = nx.Graph()
    b.add_weighted_edges_from(b_edges)

    # algorithm is too expensive if we perform it on entire graph. moreover, graph is actually disconnected into sub_graphs. apply algorithm on subgraphs instead
    subgraphs = [b.subgraph(c) for c in nx.connected_components(b)]
    matches = [list(nx.max_weight_matching(graph, maxcardinality = True)) for graph in subgraphs]
    matches = [sorted(list(item)) for sublist in matches for item in sublist] # unnest and convert pairs from tuple to list
    matches = pd.DataFrame(matches, columns=[cd_id, census_id])
    matches['selected'] = 1

    df = df.merge(matches, how='left', on=[cd_id, census_id], validate='one_to_one')
    df['selected'] = df['selected'].fillna(0)

    # add subgraph id
    subgraph_id = [{'graph_ID': i, 'CD_ID': node} for i in range(0, len(subgraphs)) for node in list(subgraphs[i].nodes) if node[:2] == 'CD']
    subgraph_id = pd.DataFrame(subgraph_id)
    df = df.merge(subgraph_id, how="inner", left_on=cd_id, right_on="CD_ID", validate="many_to_one")

    return {'graph': subgraphs, 'results': df}

Writing disambiguation.py


### write unique dwelling module

In [4]:
# dataprocessing
%%writefile dataprocessing.py

import pandas as pd

"""
Purpose: generate dwelling id that's unique for every dwelling
df: dataframe with census data
dwelling_col: column with dwelling number information from census
"""

def create_unique_dwelling(df, dwelling_col = "CENSUS_DWELLING_NUM"):
    dwelling = df[dwelling_col].iloc[0]
    dwelling_num = 1
    dwelling_id = []
    for row in df.itertuples():
        row_dwelling = getattr(row, dwelling_col)
        if row_dwelling == dwelling:
            dwelling_id.append(dwelling_num)
        if row_dwelling != dwelling:
            dwelling_num += 1
            dwelling_id.append(dwelling_num)
            dwelling = row_dwelling

    df["dwelling_id"] = dwelling_id
    return df

Writing dataprocessing.py


## dwellings

In [5]:

import pandas as pd
import numpy as np
from collections import defaultdict
import re
import networkx as nx  
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
%load_ext autoreload
%autoreload 2
import disambiguation
import processing as dp
from networkx.algorithms import bipartite
import dataprocessing

Goal: Assign address, block number, x coordinate, and y coordinate to each dwelling matched with an address through disambiguation\ Problem: Disambiguation occurs at the individual census record level, matching individuals between the city directory and census records. Because a dwelling has a single address this means that when there's a match's information can be assigned to every record within that dwelling. However, records within a dwelling could be matched to different addresses, these conflicts need to be resolved\ Method: When there's a single match among census records within a dwelling assign values from that match to every record in that dwelling. If there's a conflict within a dwelling select one of the matches and assign those values to every record in that dwelling.\ In this notebook: Illustrates different approaches to resolving dwelling conflicts and fills in dwellings accordingly.

### get data

In [6]:
# used 22-09 disambiguated output file for dwelling address conflicts
census_1850 = pd.read_csv("/content/census_1850_indexUpdate.csv")
#disambiguated_1850 = pd.read_csv("/content/1850_mn_match_24-11-2020.csv")
disambiguated_1850 = pd.read_csv("/content/1850_mn_match_v2.csv")

### Merge 1850 census data with the selected matches from disambiguation

Notes: Only done on 1850 dataset due to lack of clearly indicated dwelling numbers/household numbers on 1880 dataset. Need to check how well this works manually. -Use CENSUS_ID from disambiguated and CENSUS_IPUMSUID from census data for joins, these have the same values, but need to strip the word CENSUS from CENSUS_ID data (this was added in during the disambiguation process)

create unique dwelling ids grouped by ward no. these unique dwelling ids are further used to resolve dwelling address conflicts istead of 'census_dwelling_num'

In [7]:
disambiguated_1850.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63312 entries, 0 to 63311
Data columns (total 54 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   CENSUS_INDEX          63312 non-null  int64  
 1   CENSUS_IPUMS_UID      63312 non-null  object 
 2   CENSUS_SERIAL         63312 non-null  int64  
 3   CENSUS_HH_NUM         63312 non-null  int64  
 4   CENSUS_SEQ_NUM        63312 non-null  int64  
 5   CENSUS_REEL           63312 non-null  int64  
 6   CENSUS_PAGENUM        63312 non-null  int64  
 7   CENSUS_LINE           63312 non-null  int64  
 8   CENSUS_AGE            63312 non-null  int64  
 9   CENSUS_GENDER         63312 non-null  int64  
 10  CENSUS_RACE           63312 non-null  int64  
 11  CENSUS_LABFORCE       63312 non-null  int64  
 12  CENSUS_OCCUPATION     37973 non-null  object 
 13  CENSUS_IMPREL         63312 non-null  int64  
 14  CENSUS_FIRST_NAME     63312 non-null  object 
 15  CENSUS_LAST_NAME   

In [8]:
disambiguated_selected = disambiguated_1850[["CENSUS_ID", "CD_H_ADDRESS", "selected", "spatial_weight", "CD_X", "CD_Y", "CD_BLOCK_NUM"]]
disambiguated_selected = disambiguated_selected[disambiguated_selected["selected"] == 1]
disambiguated_selected.loc[:,"CENSUS_ID"] = disambiguated_selected["CENSUS_ID"].apply(lambda x: x.strip("CENSUS_"))

In [9]:
#create unique dwelling id
census_1850 = census_1850.groupby('CENSUS_WARD_NUM', as_index = False).apply(lambda x: dataprocessing.create_unique_dwelling(x, 'CENSUS_DWELLING_NUM'))

In [10]:
# merge census and disambiguated output
CensusDis1850 = census_1850.merge(disambiguated_selected, how = "left", left_on = "CENSUS_IPUMS_UID", right_on = "CENSUS_ID")
CensusDis1850.head()

Unnamed: 0,CENSUS_SERIALP,CENSUS_AGE,CENSUS_SEX,CENSUS_MARST,CENSUS_RACE,CENSUS_LABFORCE,CENSUS_IMPREL,CENSUS_OCCSTR,CENSUS_NAMELAST,CENSUS_NAMEFRST,CENSUS_SEQ_NUM,CENSUS_HH_NUM,CENSUS_IPUMS_UID,CENSUS_CITY,CENSUS_PAGENO_HOUSEHOLD,CENSUS_WARD_NUM,CENSUS_REEL_HOUSEHOLD,CENSUS_PLACE,CENSUS_DWELLING_NUM,CENSUS_DWELLING_SEQ,CENSUS_DWELLING_SIZE,CENSUS_GEOG,CENSUS_LINE,CENSUS_INDEX,dwelling_id,CENSUS_ID,CD_H_ADDRESS,selected,spatial_weight,CD_X,CD_Y,CD_BLOCK_NUM
0,2044262,49,1,6,120,2,1,INN KEEPER,FLINT,JOHN M,1,401735,1a0cea81-aa4c-4e02-af29-37f3fdadd987,4610,70,1,534,MANHATTAN,1.0,1.0,84,NEW YORK WARD 1 EASTERN DIVISION,10,1,1,,,,,,,
1,2044263,36,2,6,120,0,1,,FLINT,MARY A,2,401735,29aed4d0-b649-4d6b-9e28-3db8f4da81d8,4610,70,1,534,MANHATTAN,1.0,1.0,84,NEW YORK WARD 1 EASTERN DIVISION,10,2,1,,,,,,,
2,2044264,12,2,6,120,0,1,,FLINT,MARY D,3,401735,8cfac447-5fdf-44a4-888c-ee1c7e2a8355,4610,70,1,534,MANHATTAN,1.0,1.0,84,NEW YORK WARD 1 EASTERN DIVISION,10,3,1,,,,,,,
3,2044265,40,1,6,120,2,1,COMB MERCHANT,OATMAN,JAMES C,4,401736,ec0bff21-a919-42bf-a22c-7c6ef9b656af,4610,70,1,534,MANHATTAN,1.0,1.0,84,NEW YORK WARD 1 EASTERN DIVISION,10,4,1,,,,,,,
4,2044266,28,2,6,120,0,1,,OATMAN,CAROLINE E,5,401736,61d38b86-d8bc-4b69-bb6b-3169fa0e594d,4610,70,1,534,MANHATTAN,1.0,1.0,84,NEW YORK WARD 1 EASTERN DIVISION,10,5,1,,,,,,,


In [11]:
census_1850.shape

(515630, 25)

In [12]:
print("Proportion of census data assigned addresses:", CensusDis1850.CD_H_ADDRESS.count()/515630)

Proportion of census data assigned addresses: 0.047679537652968215


In [13]:
def get_counts(x, one_add, no_add, more_add, col, counts1, counts2, counts3):

    c = x["CD_H_ADDRESS"].nunique()
    if c == 0:
        no_add.append((x["CENSUS_WARD_NUM"].iloc[0], x[col].iloc[0]))
        counts1.append(len(x))
    elif c == 1:
        one_add.append((x["CENSUS_WARD_NUM"].iloc[0], x[col].iloc[0]))
        counts2.append(len(x))
    
    elif c > 1:
        more_add.append((x["CENSUS_WARD_NUM"].iloc[0], x[col].iloc[0]))
        counts3.append(len(x))

In [14]:
no_add = []
one_add = []
more_add = []
counts_no_add = []
counts_one_add = []
counts_more_add = []

for index, df in CensusDis1850.groupby(["CENSUS_WARD_NUM", "dwelling_id"]):
    get_counts(df, one_add, no_add, more_add, "dwelling_id", counts_no_add, counts_one_add, counts_more_add)

In [15]:
print("Proportion of dwellings assigned one address:", len(one_add)/CensusDis1850.groupby(["CENSUS_WARD_NUM", "dwelling_id"]).ngroups)
print("Proportion of dwellings without an address:", len(no_add)/CensusDis1850.groupby(["CENSUS_WARD_NUM", "dwelling_id"]).ngroups)
print("Proportion of dwellings assigned more than one address:", len(more_add)/CensusDis1850.groupby(["CENSUS_WARD_NUM", "dwelling_id"]).ngroups)

Proportion of dwellings assigned one address: 0.3304719917012448
Proportion of dwellings without an address: 0.5768153526970954
Proportion of dwellings assigned more than one address: 0.09271265560165975


In [16]:
print("Proportion of census data that should be assigned an address as is:", sum(counts_one_add)/515630)
print("Proportion of census data that should be assigned an address after dealing with conflicts:", (sum(counts_one_add) + sum(counts_more_add))/515630)
print("Proportion of census data that we shouldn't be able to assign an address to:", sum(counts_no_add)/515630)

Proportion of census data that should be assigned an address as is: 0.3624478792932917
Proportion of census data that should be assigned an address after dealing with conflicts: 0.533052770397378
Proportion of census data that we shouldn't be able to assign an address to: 0.466947229602622


In [17]:

#Function for filling in households/dwelling numbers if relevant
def check_quant(x, exceptions, col, tuple = False):

    c = x["CD_H_ADDRESS"].nunique()
    if c > 1:
        if tuple:
            exceptions.append((x["CENSUS_WARD_NUM"].iloc[0], x[col].iloc[0]))
        else:
            exceptions.append(x[col].iloc[0])
    elif c == 1:
        x["CD_H_ADDRESS"] = x["CD_H_ADDRESS"].ffill().bfill()
        x["CD_X"] = x["CD_X"].ffill().bfill()
        x["CD_Y"] = x["CD_Y"].ffill().bfill()
        x["CD_BLOCK_NUM"] = x["CD_BLOCK_NUM"].ffill().bfill()
    return x

In [18]:

dwellings_conflicts = []
base_fill = CensusDis1850.groupby(["CENSUS_WARD_NUM", "dwelling_id"]).apply(lambda x: check_quant(x, dwellings_conflicts, "dwelling_id", tuple = True))

In [19]:
print("Proportion of census data assigned addresses:", base_fill.CD_H_ADDRESS.count()/515630)

Proportion of census data assigned addresses: 0.38036770552527976


In [20]:
base_fill.CD_H_ADDRESS.count()

196129

In [21]:
sum(counts_one_add)

186889

In [22]:
more_add == dwellings_conflicts

True

In [23]:
Dwelling_nums_nas = CensusDis1850[CensusDis1850["CENSUS_DWELLING_NUM"].isnull()]
nans_addresses = Dwelling_nums_nas.CD_H_ADDRESS.count()
Dwelling_nums_nas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 912 entries, 1307 to 495263
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   CENSUS_SERIALP           912 non-null    int64  
 1   CENSUS_AGE               912 non-null    int64  
 2   CENSUS_SEX               912 non-null    int64  
 3   CENSUS_MARST             912 non-null    int64  
 4   CENSUS_RACE              912 non-null    int64  
 5   CENSUS_LABFORCE          912 non-null    int64  
 6   CENSUS_IMPREL            912 non-null    int64  
 7   CENSUS_OCCSTR            367 non-null    object 
 8   CENSUS_NAMELAST          911 non-null    object 
 9   CENSUS_NAMEFRST          912 non-null    object 
 10  CENSUS_SEQ_NUM           912 non-null    int64  
 11  CENSUS_HH_NUM            912 non-null    int64  
 12  CENSUS_IPUMS_UID         912 non-null    object 
 13  CENSUS_CITY              912 non-null    int64  
 14  CENSUS_PAGENO_HOUSEH

So this is too small to explain the difference. For now it's not different enough to stop from continuing, but it's definitely worth keeping in mind.

### Fill in Addresses for Household/Dwelling, based on Disambiguation Match
Note: Does not resolve conflicts

#### Fill in addresses for census entries in the same household

In [24]:
#confirm that household/dwelling values are unique accross the entire dataset
def uniqueness(df, col):
    df_check = df[["CENSUS_WARD_NUM", col]]
    df2 = df_check.groupby("CENSUS_WARD_NUM").apply(lambda x: x[col].unique())
    d = df2.to_dict()
    for key1 in d:
        for key2 in d:
            if key1 != key2:
                check = any(item in d[key1] for item in d[key2])
                if check is True:
                    print(key1, key2)
                    raise Exception(str(col) + " numbers are not unique")
    return True

In [25]:
uniqueness(CensusDis1850, "CENSUS_HH_NUM")

True

In [26]:
uniqueness(CensusDis1850, "CENSUS_SERIALP")

True

In [27]:
#Filling addresses for people in the same household
households = [] # keep track of any households with multiple addresses 
Census_hh = CensusDis1850.groupby("CENSUS_HH_NUM").apply(lambda x: check_quant(x, households, "CENSUS_HH_NUM"))

In [28]:
#These households need to be inspected more carefully, it seems that they
#have been assigned multiple addresses
len(households)

966

In [29]:
with open('/content/households.txt', 'w') as filehandle:
    for listitem in households:
        filehandle.write('%s\n' % listitem)

In [30]:

x = Census_hh[Census_hh["CENSUS_HH_NUM"] == households[7]]
x[["CENSUS_NAMEFRST", "CENSUS_NAMELAST", "CENSUS_AGE", "CENSUS_SEX", "CENSUS_OCCSTR", "CENSUS_HH_NUM","dwelling_id", "CD_H_ADDRESS", "spatial_weight"]]

Unnamed: 0,CENSUS_NAMEFRST,CENSUS_NAMELAST,CENSUS_AGE,CENSUS_SEX,CENSUS_OCCSTR,CENSUS_HH_NUM,dwelling_id,CD_H_ADDRESS,spatial_weight
15031,MICHAEL,MORAN,48,1,LABORER,409771,858,46 TRINITY PL,2.0
15032,CECILIA,MORAN,42,2,,409771,858,,
15033,CECILIA,MORAN,15,2,,409771,858,,
15034,FRANCIS,MORAN,8,1,,409771,858,42 WHITEHALL ST,1.9
15035,JULIAN,MORAN,4,2,,409771,858,,


In [31]:
Census_hh.to_csv("/content/Census_1850_household.csv", index = False)

In [32]:
print("Proportion of census data assigned addresses:", Census_hh["CD_H_ADDRESS"].count()/522152)

Proportion of census data assigned addresses: 0.1690044278294443


#### use IPUMS household number

In [33]:
Census_hh = pd.read_csv("/content/Census_1850_household.csv")

In [34]:
households_IPUMS = []
Census_hh_IPUMS = Census_hh.groupby("CENSUS_SERIALP").apply(lambda x: check_quant(x, households_IPUMS, "CENSUS_SERIALP"))

In [35]:
len(households_IPUMS)

1788

In [36]:
with open('/content/housholds_IPUMS.txt', 'w') as filehandle:
    for listitem in households_IPUMS:
        filehandle.write('%s\n' % listitem)

In [37]:
Census_hh_IPUMS.to_csv("/content/Census_1850_household_IPUMS.csv", index = False)

In [38]:
print("Proportion of census data assigned addresses:", Census_hh_IPUMS["CD_H_ADDRESS"].count()/522152)

Proportion of census data assigned addresses: 0.2539911749835297


#### Fill in address for census entries with same dwelling number

In [39]:
#Filling addresses for people in the same household
dwellings = [] # keep track of any dwellings with multiple addresses 
Census_hh_dw = Census_hh_IPUMS.groupby(["CENSUS_WARD_NUM","dwelling_id"]).apply(lambda x: check_quant(x, dwellings, "dwelling_id", tuple = True))

In [40]:
#Dwellings that are assigned more than one address
len(dwellings)

3584

In [41]:
with open('/content/dwellings.txt', 'w') as filehandle:
    for listitem in dwellings:
        line = ' '.join(str(x) for x in listitem)
        filehandle.write(line + '\n')

In [42]:
Census_hh_dw.to_csv("/content/Census_1850_dwellings.csv", index = False)

In [43]:
print("Proportion of census data assigned addresses:", Census_hh_dw["CD_H_ADDRESS"].count()/522152)

Proportion of census data assigned addresses: 0.42507928725735034


#### Check Number of Dwellings with Multiple addresses

In [44]:
#Function for filling in households/dwelling numbers if relevant
def no_address(x, col):
    c = x["CD_H_ADDRESS"].nunique()
    if c == 0:
            nones.append((x["CENSUS_WARD_NUM"].iloc[0], x[col].iloc[0]))

In [45]:
nones = []
for index,df in Census_hh_dw.groupby(["CENSUS_WARD_NUM", "dwelling_id"]):
    no_address(df, "dwelling_id")

In [46]:
len(nones)

22002

In [47]:

print("Proportion of dwellings with no match at all:",len(nones)/Census_hh_dw.groupby(["CENSUS_WARD_NUM", "dwelling_id"]).ngroups)

Proportion of dwellings with no match at all: 0.5705912863070539


### Use Bipartite Matching to Get A Single Address for Dwellings

This is a little less than what we'd expect from the earlier analysis, clearly there's some situation that's not being accounted for here

Issue: There are both households and dwellings that are assigned multiple addresses

Potential cause: an incorrect match -- it may make sense to incorporate that dwellings/households may have only a single match within the disambiguation process

* possible approach: two levels of bipartite matching for household and dwellings
* possible approach: incorporate into initial bipartite matching

Potential cause: dwellings/households that are referred to by multiple addresses - ei corner building, maybe a historical address change, streets with two names (Avenue of the Americas/6th Ave), etc.

* standardization can help
* create unique dwelling ids 

In [75]:
# import census file and 1850 disambiguated output
census_1850 = pd.read_csv("/content/census_1850_indexUpdate.csv")
#disambiguated_1850 = pd.read_csv("/content/1850_mn_match_24-11-2020.csv")
disambiguated_1850 = pd.read_csv("/content/1850_mn_match_v2.csv")

In [76]:
#create unique dwelling id
census_1850 = census_1850.groupby('CENSUS_WARD_NUM', as_index = False).apply(lambda x: dataprocessing.create_unique_dwelling(x, 'CENSUS_DWELLING_NUM'))

In [77]:
census_1850.head()

Unnamed: 0,CENSUS_SERIALP,CENSUS_AGE,CENSUS_SEX,CENSUS_MARST,CENSUS_RACE,CENSUS_LABFORCE,CENSUS_IMPREL,CENSUS_OCCSTR,CENSUS_NAMELAST,CENSUS_NAMEFRST,CENSUS_SEQ_NUM,CENSUS_HH_NUM,CENSUS_IPUMS_UID,CENSUS_CITY,CENSUS_PAGENO_HOUSEHOLD,CENSUS_WARD_NUM,CENSUS_REEL_HOUSEHOLD,CENSUS_PLACE,CENSUS_DWELLING_NUM,CENSUS_DWELLING_SEQ,CENSUS_DWELLING_SIZE,CENSUS_GEOG,CENSUS_LINE,CENSUS_INDEX,dwelling_id
0,2044262,49,1,6,120,2,1,INN KEEPER,FLINT,JOHN M,1,401735,1a0cea81-aa4c-4e02-af29-37f3fdadd987,4610,70,1,534,MANHATTAN,1.0,1.0,84,NEW YORK WARD 1 EASTERN DIVISION,10,1,1
1,2044263,36,2,6,120,0,1,,FLINT,MARY A,2,401735,29aed4d0-b649-4d6b-9e28-3db8f4da81d8,4610,70,1,534,MANHATTAN,1.0,1.0,84,NEW YORK WARD 1 EASTERN DIVISION,10,2,1
2,2044264,12,2,6,120,0,1,,FLINT,MARY D,3,401735,8cfac447-5fdf-44a4-888c-ee1c7e2a8355,4610,70,1,534,MANHATTAN,1.0,1.0,84,NEW YORK WARD 1 EASTERN DIVISION,10,3,1
3,2044265,40,1,6,120,2,1,COMB MERCHANT,OATMAN,JAMES C,4,401736,ec0bff21-a919-42bf-a22c-7c6ef9b656af,4610,70,1,534,MANHATTAN,1.0,1.0,84,NEW YORK WARD 1 EASTERN DIVISION,10,4,1
4,2044266,28,2,6,120,0,1,,OATMAN,CAROLINE E,5,401736,61d38b86-d8bc-4b69-bb6b-3169fa0e594d,4610,70,1,534,MANHATTAN,1.0,1.0,84,NEW YORK WARD 1 EASTERN DIVISION,10,5,1


In [78]:
# join disambiguated output with census again to get unique dwelling id rather than create it again
disambiguated_1850 = disambiguated_1850.merge(census_1850[['CENSUS_IPUMS_UID','CENSUS_WARD_NUM','dwelling_id']], how = "left", left_on = "CENSUS_IPUMS_UID", right_on = "CENSUS_IPUMS_UID")

In [79]:
disambiguated_1850.columns

Index(['CENSUS_INDEX', 'CENSUS_IPUMS_UID', 'CENSUS_SERIAL', 'CENSUS_HH_NUM',
       'CENSUS_SEQ_NUM', 'CENSUS_REEL', 'CENSUS_PAGENUM', 'CENSUS_LINE',
       'CENSUS_AGE', 'CENSUS_GENDER', 'CENSUS_RACE', 'CENSUS_LABFORCE',
       'CENSUS_OCCUPATION', 'CENSUS_IMPREL', 'CENSUS_FIRST_NAME',
       'CENSUS_LAST_NAME', 'CENSUS_DWELLING_NUM', 'CENSUS_DWELLING_SEQ',
       'CENSUS_DWELLING_SIZE', 'CENSUS_GEOG', 'OBJECTID', 'CD_INDEX',
       'CD_RECORD_ID', 'CD_FIRST_NAME', 'CD_LAST_NAME', 'CD_OCCUPATION_STD',
       'CD_H_ADDRESS', 'CD_H_HOUSE_NUMBER', 'CD_H_STREET_NAME', 'CD_BLOCK_NUM',
       'CD_WARD_NUM', 'CD_X', 'CD_Y', 'jw_fn', 'jw_ln', 'jw_score',
       'occ_listed', 'age_score', 'cd_count', 'census_count',
       'census_count_inverse', 'cd_count_inverse', 'CD_ID', 'CENSUS_ID',
       'confidence_score', 'anchor', 'group_ID', 'node_ID', 'letter',
       'in_cluster', 'key', 'spatial_weight', 'selected', 'graph_ID',
       'CENSUS_WARD_NUM', 'dwelling_id'],
      dtype='object')

In [80]:
disambiguated_1850_selected = disambiguated_1850[disambiguated_1850["selected"] == 1].copy()

In [81]:
disambiguated_1850_selected.shape

(24585, 56)

In [82]:
# create unique ward no
disambiguated_1850_selected["unique_ward"] = disambiguated_1850_selected.apply(lambda row: str(row.CENSUS_WARD_NUM) + "_" + str(row.dwelling_id), axis = 1)

In [83]:
disambiguated_1850_selected.shape

(24585, 57)

In [84]:
def get_matches_dwelling(df, cd_id = 'CD_ID', census_id = 'CENSUS_ID', weight = 'spatial_weight'):
    
    #This is to preserve order
    df = df.copy()
    df[cd_id] = "A_ " + df[cd_id].astype(str)
    df[census_id] = "B_ " + df[census_id].astype(str)
    
    b_edges = [(row[cd_id], row[census_id], row[weight]) for index, row in df.iterrows()]
    b = nx.Graph()
    b.add_nodes_from(df[cd_id].unique(), bipartite = 0)
    b.add_nodes_from(df[census_id].unique(), bipartite = 1)
    b.add_weighted_edges_from(b_edges)

    # algorithm is too expensive if we perform it on entire graph. moreover, graph is actually disconnected into sub_graphs. apply algorithm on subgraphs instead
    subgraphs = [b.subgraph(c) for c in nx.connected_components(b)]
    matches = [list(nx.max_weight_matching(graph, maxcardinality = True)) for graph in subgraphs]
    matches = [sorted(list(item)) for sublist in matches for item in sublist] # unnest and convert pairs from tuple to list
    matches = pd.DataFrame(matches, columns=[cd_id, census_id])
    
    matches.loc[:,cd_id] = matches[cd_id].apply(lambda x: x.strip("A_ "))
    matches.loc[:,census_id] = matches[census_id].apply(lambda x: x.strip("B_ "))

    return matches

In [85]:
match = get_matches_dwelling(disambiguated_1850_selected, cd_id = "unique_ward", census_id = "CD_H_ADDRESS")

In [86]:
disambiguated_1850_selected = disambiguated_1850_selected[["CENSUS_IPUMS_UID", "unique_ward"]].copy()
disambiguated_1850_selected.drop_duplicates("unique_ward", inplace = True)
matched = disambiguated_1850_selected.merge(match, how = "left", on = "unique_ward", validate = "one_to_one")
dwelling_addresses = census_1850.merge(matched, how = "left", on = "CENSUS_IPUMS_UID")

In [87]:
print("Proportion of census data assigned addresses:", dwelling_addresses.CD_H_ADDRESS.count()/515630)

Proportion of census data assigned addresses: 0.02757403564571495


In [88]:
#Function for filling in households/dwelling numbers if relevant
def check_quant(x, nones, exceptions, col, tuple = False):

    c = x["CD_H_ADDRESS"].nunique()
    if c == 0:
        if tuple:
            nones.append((x["CENSUS_WARD_NUM"].iloc[0], x[col].iloc[0]))
        else:
            nones.append(x[col].iloc[0])
    elif c > 1:
        if tuple:
            exceptions.append((x["CENSUS_WARD_NUM"].iloc[0], x[col].iloc[0]))
        else:
            exceptions.append(x[col].iloc[0])
    elif c == 1:
        x["CD_H_ADDRESS"] = x["CD_H_ADDRESS"].ffill().bfill()
  
    return x

In [89]:
dwellings_noadd = []
dwellings = [] # keep track of any dwellings with multiple addresses 
Census_hh_dw = dwelling_addresses.groupby(["CENSUS_WARD_NUM","dwelling_id"]).apply(lambda x: check_quant(x, dwellings_noadd, dwellings, "dwelling_id", tuple = True))

In [90]:
# according to understanding it should be lower as on resolving dwelling conflicts less people should get same addresses
print("Proportion of census data assigned addresses:", Census_hh_dw.CD_H_ADDRESS.count()/515630)

Proportion of census data assigned addresses: 0.4620852161433586


In [91]:
print("Proportion of Dwellings Without Addresses:", len(dwellings_noadd)/Census_hh_dw.groupby(["CENSUS_WARD_NUM","dwelling_id"]).ngroups)

Proportion of Dwellings Without Addresses: 0.6312759336099585


### Interlude: Geog Variable
Invesitgation shows that institutions have one address as expected

In [92]:
Census_hh_dw["CENSUS_GEOG"].unique()

array(['NEW YORK WARD 1 EASTERN DIVISION',
       'NEW YORK WARD 1 WESTERN DIVISION', nan, 'NEW YORK WARD 2',
       'NEW YORK WARD 3', 'NEW YORK WARD 4', 'NEW YORK WARD 5',
       'NEW YORK WARD 6', 'NEW YORK WARD 7 DISTRICT 1',
       'NEW YORK WARD 7 DISTRICT 2', 'NEW YORK WARD 8 DISTRICT 1',
       'NEW YORK WARD 8', 'NEW YORK WARD 9 DISTRICT 1',
       'NEW YORK WARD 9 DISTRICT 2', 'NEW YORK WARD 9 DISTRICT 3',
       'NEW YORK WARD 10', 'NEW YORK WARD 11', 'NEW YORK WARD 12',
       'NEW YORK WARD 13', 'NEW YORK WARD 14',
       'NEW YORK WARD 15 WESTERN HALF', 'NEW YORK WARD 15 EASTERN HALF',
       'NEW YORK WARD 16 DISTRICT 1', 'NEW YORK WARD 16 DISTRICT 2',
       'NEW YORK WARD 16 DISTRICT 3', 'NEW YORK WARD 17',
       'NEW YORK WARD 18', 'NEW YORK WARD 18 DISTRICT 2',
       'NEW YORK WARD 18 BELLEVUE HOSPITAL',
       'NEW YORK WARD 18 HOUSE OF REFUGE', 'NEW YORK WARD 19'],
      dtype=object)

In [93]:
hospital = Census_hh_dw[Census_hh_dw["CENSUS_GEOG"] == 'NEW YORK WARD 18 BELLEVUE HOSPITAL']

In [94]:
hospital.nunique()

CENSUS_SERIALP             438
CENSUS_AGE                  61
CENSUS_SEX                   2
CENSUS_MARST                 2
CENSUS_RACE                  2
CENSUS_LABFORCE              2
CENSUS_IMPREL                1
CENSUS_OCCSTR               46
CENSUS_NAMELAST            322
CENSUS_NAMEFRST            117
CENSUS_SEQ_NUM             479
CENSUS_HH_NUM              422
CENSUS_IPUMS_UID           479
CENSUS_CITY                  1
CENSUS_PAGENO_HOUSEHOLD      3
CENSUS_WARD_NUM              1
CENSUS_REEL_HOUSEHOLD        1
CENSUS_PLACE                 1
CENSUS_DWELLING_NUM          1
CENSUS_DWELLING_SEQ          1
CENSUS_DWELLING_SIZE         3
CENSUS_GEOG                  1
CENSUS_LINE                  1
CENSUS_INDEX               479
dwelling_id                  1
unique_ward                  1
CD_H_ADDRESS                 1
dtype: int64

In [95]:
refuge = Census_hh_dw[Census_hh_dw["CENSUS_GEOG"] == 'NEW YORK WARD 18 HOUSE OF REFUGE']
refuge.nunique()

CENSUS_SERIALP             372
CENSUS_AGE                  29
CENSUS_SEX                   2
CENSUS_MARST                 2
CENSUS_RACE                  3
CENSUS_LABFORCE              2
CENSUS_IMPREL                1
CENSUS_OCCSTR               34
CENSUS_NAMELAST            330
CENSUS_NAMEFRST            142
CENSUS_SEQ_NUM             411
CENSUS_HH_NUM              394
CENSUS_IPUMS_UID           411
CENSUS_CITY                  1
CENSUS_PAGENO_HOUSEHOLD      1
CENSUS_WARD_NUM              1
CENSUS_REEL_HOUSEHOLD        1
CENSUS_PLACE                 1
CENSUS_DWELLING_NUM          1
CENSUS_DWELLING_SEQ          1
CENSUS_DWELLING_SIZE         2
CENSUS_GEOG                  1
CENSUS_LINE                  2
CENSUS_INDEX               411
dwelling_id                  1
unique_ward                  1
CD_H_ADDRESS                 1
dtype: int64


It seems that there are fewer NA's (no address at all for a dwelling then when using the previous nested approach). Let's try nested and see if that helps, though I am pretty confused about why it would?

### Choose Dwelling Address By Maximum Spatial Weight

In [96]:
CensusDis1850.columns

Index(['CENSUS_SERIALP', 'CENSUS_AGE', 'CENSUS_SEX', 'CENSUS_MARST',
       'CENSUS_RACE', 'CENSUS_LABFORCE', 'CENSUS_IMPREL', 'CENSUS_OCCSTR',
       'CENSUS_NAMELAST', 'CENSUS_NAMEFRST', 'CENSUS_SEQ_NUM', 'CENSUS_HH_NUM',
       'CENSUS_IPUMS_UID', 'CENSUS_CITY', 'CENSUS_PAGENO_HOUSEHOLD',
       'CENSUS_WARD_NUM', 'CENSUS_REEL_HOUSEHOLD', 'CENSUS_PLACE',
       'CENSUS_DWELLING_NUM', 'CENSUS_DWELLING_SEQ', 'CENSUS_DWELLING_SIZE',
       'CENSUS_GEOG', 'CENSUS_LINE', 'CENSUS_INDEX', 'dwelling_id',
       'CENSUS_ID', 'CD_H_ADDRESS', 'selected', 'spatial_weight', 'CD_X',
       'CD_Y', 'CD_BLOCK_NUM'],
      dtype='object')

In [97]:
#Function for filling in households/dwelling numbers if relevant
def dwelling_weight_fill(x, nones):
    
    c = x["CD_H_ADDRESS"].nunique()
    if c == 0:
        nones.append(1)
        #nones.append((x.loc["CENSUS_WARD_NUM"].iloc[0], x[col].iloc[0]))

    elif c >= 1:
        x.reset_index(drop = True, inplace = True)
        index = x["spatial_weight"].idxmax()
        x["CD_H_ADDRESS"] = x.iloc[index].loc["CD_H_ADDRESS"]
        x["CD_BLOCK_NUM"] = x.iloc[index].loc["CD_BLOCK_NUM"]
        x["CD_X"] = x.iloc[index].loc["CD_X"]
        x["CD_Y"] = x.iloc[index].loc["CD_Y"]
  
    return x

In [98]:
dwellings = []
Dwellings_weight = CensusDis1850.groupby(["CENSUS_WARD_NUM", "dwelling_id"]).apply(lambda x: dwelling_weight_fill(x, dwellings))

len(dwellings)

22242

In [99]:
print("Proportion of census data assigned addresses:", Dwellings_weight.CD_H_ADDRESS.count()/515630)

Proportion of census data assigned addresses: 0.533052770397378


In [100]:

print("Proportion of Dwellings Without Addresses:", len(dwellings)/CensusDis1850.groupby(["CENSUS_WARD_NUM","dwelling_id"]).ngroups)

Proportion of Dwellings Without Addresses: 0.5768153526970954


### Look at how often we have different dwellings with the same address

In [101]:
# taking more time to run if dwelling id 

In [102]:
org_same_add = defaultdict(list)
def same_address(x, d, col):
    d[(x["CENSUS_WARD_NUM"].iloc[0], x[col].iloc[0])].append(x.dropna(subset = ["CD_H_ADDRESS"]).CD_H_ADDRESS.unique())
for index, group in CensusDis1850.groupby(["CENSUS_WARD_NUM","dwelling_id"]):
    same_address(group, org_same_add, "dwelling_id")

In [103]:
which_dwellings = []
count = 0

for index,df in CensusDis1850.groupby("CENSUS_WARD_NUM"):
    org_same_add = defaultdict(list)
    for i, group in df.groupby("dwelling_id"):
        same_address(group, org_same_add, "dwelling_id")
    
    #check for same name (limit to within ward for efficiency)
    for key1 in org_same_add:
        for key2 in org_same_add:
            if key1 != key2:
                if org_same_add[key1][0].any() in org_same_add[key2][0]:
                    count += 1 
                    which_dwellings.append((key1,key2))
                    
    if index % 2 == 0:
        print("Finished up to index", str(index))

Finished up to index 2
Finished up to index 4
Finished up to index 6
Finished up to index 8
Finished up to index 10
Finished up to index 12
Finished up to index 14
Finished up to index 16
Finished up to index 18


In [104]:
with open('/content/dwellings_same_address.txt', 'w') as filehandle:
    for listitem in which_dwellings:
        line = ' '.join(str(x) for x in listitem)
        filehandle.write(line + '\n')

In [105]:
print("Number of Dwellings that could share the same address:", count)

Number of Dwellings that could share the same address: 10663


In [106]:
Dwellings_weight.reset_index(drop = True, inplace = True)
same_add_selected = Dwellings_weight.groupby(["CENSUS_WARD_NUM", "dwelling_id"]).first()
num_same = same_add_selected["CD_H_ADDRESS"].count() - same_add_selected["CD_H_ADDRESS"].nunique()
print("Number of Dwellings with Same Address Selected:", num_same)

Number of Dwellings with Same Address Selected: 2872


In [107]:
print("Proportion of Dwellings with Same Address:", num_same/Dwellings_weight.groupby(["CENSUS_WARD_NUM", "dwelling_id"]).ngroups)

Proportion of Dwellings with Same Address: 0.07448132780082987


In [108]:
print("Dwellings with same address selected out of possiblity:", num_same/count)

Dwellings with same address selected out of possiblity: 0.26934258651411425


### Choose Dwelling Address By Maximum Spatial Weight Sum 

In [109]:
#we can use this to compare between the two versions, because this is where they'll cause differences
count_sames = CensusDis1850.groupby(["CENSUS_WARD_NUM","dwelling_id"], as_index = False).filter(lambda df: df["CD_H_ADDRESS"].count() > 0 and df["CD_H_ADDRESS"].value_counts()[0] > 1)

In [110]:

count_sames.groupby(["CENSUS_WARD_NUM","dwelling_id"]).ngroups

2918

In [111]:
def dwelling_weight_fill(x):
    #x["CD_ADDRESS"] = x.groupby(["CD_ADDRESS"])['spatial_weight'].agg('sum').idxmax()
    if x["CD_H_ADDRESS"].count() > 0:
        x["spatial_weight_sum"] = x.groupby(["CD_H_ADDRESS"])['spatial_weight'].transform('sum')
        x.reset_index(drop = True, inplace = True)
        index = x["spatial_weight_sum"].idxmax()

        x["CD_H_ADDRESS"] = x.iloc[index].loc["CD_H_ADDRESS"]
        x["CD_BLOCK_NUM"] = x.iloc[index].loc["CD_BLOCK_NUM"]
        x["CD_X"] = x.iloc[index].loc["CD_X"]
        x["CD_Y"] = x.iloc[index].loc["CD_Y"]
    
    return x

In [112]:
Dwelling_fill_max = Dwellings_weight.copy()

In [113]:
#We do it this way so that ordering differences don't cause arbitrary differences between the max and sum methods
Dwelling_fill_sum_part1 = count_sames.groupby(["CENSUS_WARD_NUM","dwelling_id"]).apply(dwelling_weight_fill)
Dwelling_fill_sum_part2 = Dwelling_fill_max[~Dwelling_fill_max["CENSUS_IPUMS_UID"].isin(count_sames.CENSUS_IPUMS_UID.values)]
Dwelling_fill_sum = pd.concat([Dwelling_fill_sum_part1, Dwelling_fill_sum_part2])

In [114]:
Dwelling_fill_sum_alt = CensusDis1850.groupby(["CENSUS_WARD_NUM", "dwelling_id"]).apply(dwelling_weight_fill)

In [115]:
#Add back entries that were eliminated because they don't have a dwelling number 
no_dwelling = CensusDis1850[CensusDis1850["dwelling_id"].isnull()]

In [116]:
Dwelling_fill_sum_alt = pd.concat([Dwelling_fill_sum_alt, no_dwelling])

In [117]:
#Dwelling_fill_max = Dwellings_weight.copy()
Dwelling_fill_max =  pd.concat([Dwelling_fill_max, no_dwelling])
Dwelling_fill_max.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 515630 entries, 0 to 515629
Data columns (total 32 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   CENSUS_SERIALP           515630 non-null  int64  
 1   CENSUS_AGE               515630 non-null  int64  
 2   CENSUS_SEX               515630 non-null  int64  
 3   CENSUS_MARST             515630 non-null  int64  
 4   CENSUS_RACE              515630 non-null  int64  
 5   CENSUS_LABFORCE          515630 non-null  int64  
 6   CENSUS_IMPREL            515630 non-null  int64  
 7   CENSUS_OCCSTR            160426 non-null  object 
 8   CENSUS_NAMELAST          515546 non-null  object 
 9   CENSUS_NAMEFRST          515046 non-null  object 
 10  CENSUS_SEQ_NUM           515630 non-null  int64  
 11  CENSUS_HH_NUM            515630 non-null  int64  
 12  CENSUS_IPUMS_UID         515630 non-null  object 
 13  CENSUS_CITY              515630 non-null  int64  
 14  CENS

In [118]:
Dwelling_fill_sum_alt.to_csv("/content/dwelling_filled_sum_1850_mn.csv", index = False)
Dwelling_fill_max.to_csv("/content/dwelling_filled_max_1850_mn.csv", index = False)

In [119]:
print("Proportion of census data assigned addresses:", Dwelling_fill_sum_alt.CD_H_ADDRESS.count()/515630)
#print("Proportion of Dwellings Without Addresses:", Dwelling_fill_sum_alt.groupby(["CENSUS_WARD_NUM", "dwelling_id"], as_index = False).filter(lambda df: df["CD_H_ADDRESS"].count() == 0).groupby(["CENSUS_WARD_NUM", "dwelling_id"]).ngroups/CensusDis1850.groupby(["CENSUS_WARD_NUM","dwelling_id"]).ngroups)

Proportion of census data assigned addresses: 0.533052770397378


This matches with the expected proportions, the same as those for selecting the max weight instead of the max sum

### Look at difference between selecting the maximum and selecting the max sum

In [120]:
Dwelling_fill_sum_alt.dropna(subset = ["CD_H_ADDRESS"], inplace = True)
Dwelling_fill_max.dropna(subset = ["CD_H_ADDRESS"], inplace = True)

In [121]:
Dwelling_fill_sum_dropped = Dwelling_fill_sum_alt[["CENSUS_IPUMS_UID", "CD_H_ADDRESS", "CD_X", "CD_Y"]]
Dwelling_fill_max_dropped = Dwelling_fill_max[["CENSUS_IPUMS_UID", "CD_H_ADDRESS", "CD_X", "CD_Y"]]

In [122]:
Dwelling_fill_sum_dropped.columns

Index(['CENSUS_IPUMS_UID', 'CD_H_ADDRESS', 'CD_X', 'CD_Y'], dtype='object')

In [123]:
Dwelling_filled_differences = Dwelling_fill_sum_dropped.merge(Dwelling_fill_max_dropped, on = "CENSUS_IPUMS_UID",suffixes = ('_sum', '_max'), how = "left")
Dwelling_filled_differences.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 274858 entries, 0 to 274857
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   CENSUS_IPUMS_UID  274858 non-null  object 
 1   CD_H_ADDRESS_sum  274858 non-null  object 
 2   CD_X_sum          274858 non-null  float64
 3   CD_Y_sum          274858 non-null  float64
 4   CD_H_ADDRESS_max  274858 non-null  object 
 5   CD_X_max          274858 non-null  float64
 6   CD_Y_max          274858 non-null  float64
dtypes: float64(4), object(3)
memory usage: 16.8+ MB


In [124]:

Dwelling_filled_same = Dwelling_filled_differences[Dwelling_filled_differences["CD_H_ADDRESS_max"] == Dwelling_filled_differences["CD_H_ADDRESS_sum"]]
Dwelling_filled_same.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 270352 entries, 0 to 274857
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   CENSUS_IPUMS_UID  270352 non-null  object 
 1   CD_H_ADDRESS_sum  270352 non-null  object 
 2   CD_X_sum          270352 non-null  float64
 3   CD_Y_sum          270352 non-null  float64
 4   CD_H_ADDRESS_max  270352 non-null  object 
 5   CD_X_max          270352 non-null  float64
 6   CD_Y_max          270352 non-null  float64
dtypes: float64(4), object(3)
memory usage: 16.5+ MB


In [125]:
Census1850_diff = CensusDis1850[~CensusDis1850["CENSUS_IPUMS_UID"].isin(Dwelling_filled_same.CENSUS_IPUMS_UID.values)]
Census1850_diff.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 245278 entries, 161 to 515629
Data columns (total 32 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   CENSUS_SERIALP           245278 non-null  int64  
 1   CENSUS_AGE               245278 non-null  int64  
 2   CENSUS_SEX               245278 non-null  int64  
 3   CENSUS_MARST             245278 non-null  int64  
 4   CENSUS_RACE              245278 non-null  int64  
 5   CENSUS_LABFORCE          245278 non-null  int64  
 6   CENSUS_IMPREL            245278 non-null  int64  
 7   CENSUS_OCCSTR            74158 non-null   object 
 8   CENSUS_NAMELAST          245241 non-null  object 
 9   CENSUS_NAMEFRST          245010 non-null  object 
 10  CENSUS_SEQ_NUM           245278 non-null  int64  
 11  CENSUS_HH_NUM            245278 non-null  int64  
 12  CENSUS_IPUMS_UID         245278 non-null  object 
 13  CENSUS_CITY              245278 non-null  int64  
 14  CE

In [126]:
#Create a column to display spatial weights
Census1850_diff = Census1850_diff.copy()
Census1850_diff["spatial_weight_sum"] = Census1850_diff.groupby(["CENSUS_WARD_NUM","dwelling_id", "CD_H_ADDRESS"])["spatial_weight"].transform('sum')

In [127]:
Dwelling_filled_differences_all = Census1850_diff.merge(Dwelling_filled_differences, how = "left", on = "CENSUS_IPUMS_UID")
Dwelling_filled_differences_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 245278 entries, 0 to 245277
Data columns (total 39 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   CENSUS_SERIALP           245278 non-null  int64  
 1   CENSUS_AGE               245278 non-null  int64  
 2   CENSUS_SEX               245278 non-null  int64  
 3   CENSUS_MARST             245278 non-null  int64  
 4   CENSUS_RACE              245278 non-null  int64  
 5   CENSUS_LABFORCE          245278 non-null  int64  
 6   CENSUS_IMPREL            245278 non-null  int64  
 7   CENSUS_OCCSTR            74158 non-null   object 
 8   CENSUS_NAMELAST          245241 non-null  object 
 9   CENSUS_NAMEFRST          245010 non-null  object 
 10  CENSUS_SEQ_NUM           245278 non-null  int64  
 11  CENSUS_HH_NUM            245278 non-null  int64  
 12  CENSUS_IPUMS_UID         245278 non-null  object 
 13  CENSUS_CITY              245278 non-null  int64  
 14  CENS

In [128]:
Dwelling_filled_differences_all.dropna(subset = ["spatial_weight"], inplace = True)

In [129]:
i = 0
for index, group in Dwelling_filled_differences_all.groupby(["CENSUS_WARD_NUM", "dwelling_id"]):
    display(group.head())
    i += 1
    if i == 5:
        break

Unnamed: 0,CENSUS_SERIALP,CENSUS_AGE,CENSUS_SEX,CENSUS_MARST,CENSUS_RACE,CENSUS_LABFORCE,CENSUS_IMPREL,CENSUS_OCCSTR,CENSUS_NAMELAST,CENSUS_NAMEFRST,CENSUS_SEQ_NUM,CENSUS_HH_NUM,CENSUS_IPUMS_UID,CENSUS_CITY,CENSUS_PAGENO_HOUSEHOLD,CENSUS_WARD_NUM,CENSUS_REEL_HOUSEHOLD,CENSUS_PLACE,CENSUS_DWELLING_NUM,CENSUS_DWELLING_SEQ,CENSUS_DWELLING_SIZE,CENSUS_GEOG,CENSUS_LINE,CENSUS_INDEX,dwelling_id,CENSUS_ID,CD_H_ADDRESS,selected,spatial_weight,CD_X,CD_Y,CD_BLOCK_NUM,spatial_weight_sum,CD_H_ADDRESS_sum,CD_X_sum,CD_Y_sum,CD_H_ADDRESS_max,CD_X_max,CD_Y_max
3722,2045424,43,2,6,120,0,1,,DUTCHER,CATHARINE B,1,406124,db5340b3-1f19-4350-8a56-af1670443574,4610,1810,1,534,MANHATTAN,21.0,45.0,36,NEW YORK WARD 1 WESTERN DIVISION,190,7324,471,db5340b3-1f19-4350-8a56-af1670443574,94 LIBERTY ST,1.0,1.9,-74.011412,40.709496,mn1850_01_74.0115_40.7093,3.9,94 LIBERTY ST,-74.011412,40.709496,58 CEDAR ST,-74.009383,40.707751
3734,2045424,20,1,6,120,2,12,CLERK,ALLEN,HENRY S,13,406128,d09684f1-b2f9-4135-a827-308438b7a73f,4610,1810,1,534,MANHATTAN,21.0,45.0,36,NEW YORK WARD 1 WESTERN DIVISION,190,7336,471,d09684f1-b2f9-4135-a827-308438b7a73f,58 CEDAR ST,1.0,2.0,-74.009383,40.707751,mn1850_01_74.0094_40.7075,2.0,94 LIBERTY ST,-74.011412,40.709496,58 CEDAR ST,-74.009383,40.707751
3736,2045424,26,1,6,120,2,12,MERCHANT,ELLISON,BENJAMIN,15,406129,a8615f18-14a4-4ecc-909e-a4ba43dca6f5,4610,1810,1,534,MANHATTAN,21.0,45.0,36,NEW YORK WARD 1 WESTERN DIVISION,190,7338,471,a8615f18-14a4-4ecc-909e-a4ba43dca6f5,94 LIBERTY ST,1.0,2.0,-74.011412,40.709496,mn1850_01_74.0115_40.7093,3.9,94 LIBERTY ST,-74.011412,40.709496,58 CEDAR ST,-74.009383,40.707751


Unnamed: 0,CENSUS_SERIALP,CENSUS_AGE,CENSUS_SEX,CENSUS_MARST,CENSUS_RACE,CENSUS_LABFORCE,CENSUS_IMPREL,CENSUS_OCCSTR,CENSUS_NAMELAST,CENSUS_NAMEFRST,CENSUS_SEQ_NUM,CENSUS_HH_NUM,CENSUS_IPUMS_UID,CENSUS_CITY,CENSUS_PAGENO_HOUSEHOLD,CENSUS_WARD_NUM,CENSUS_REEL_HOUSEHOLD,CENSUS_PLACE,CENSUS_DWELLING_NUM,CENSUS_DWELLING_SEQ,CENSUS_DWELLING_SIZE,CENSUS_GEOG,CENSUS_LINE,CENSUS_INDEX,dwelling_id,CENSUS_ID,CD_H_ADDRESS,selected,spatial_weight,CD_X,CD_Y,CD_BLOCK_NUM,spatial_weight_sum,CD_H_ADDRESS_sum,CD_X_sum,CD_Y_sum,CD_H_ADDRESS_max,CD_X_max,CD_Y_max
3854,2045438,33,1,1,120,2,1,MERCHANT,JOHNSON,JEREMIAH,1,406234,7f1291e5-4ac8-478e-bde6-fb5095cbe159,4610,1840,1,534,MANHATTAN,29.0,59.0,12,NEW YORK WARD 1 WESTERN DIVISION,410,7472,479,7f1291e5-4ac8-478e-bde6-fb5095cbe159,97 CEDAR ST,1.0,1.95,-74.011379,40.709132,mn1850_01_74.0110_40.7091,3.883333,97 CEDAR ST,-74.011379,40.709132,"97 CEDAR """" ST",-74.011379,40.709132
3870,2045441,26,1,1,120,2,1,TAILOR,CULLIN,JAMES,1,406241,7c9f8635-8953-49b1-bcc2-9db215fb6978,4610,1850,1,534,MANHATTAN,29.0,62.0,5,NEW YORK WARD 1 WESTERN DIVISION,150,7488,479,7c9f8635-8953-49b1-bcc2-9db215fb6978,97 CEDAR ST,1.0,1.933333,-74.011379,40.709132,mn1850_01_74.0110_40.7091,3.883333,97 CEDAR ST,-74.011379,40.709132,"97 CEDAR """" ST",-74.011379,40.709132
3875,2045442,46,1,1,120,2,1,GOLDSMITH,SELIGER,EDWARD,1,406243,a3a16d8c-0fe4-45a0-9aab-b0004fff2c75,4610,1850,1,534,MANHATTAN,29.0,63.0,3,NEW YORK WARD 1 WESTERN DIVISION,200,7493,479,a3a16d8c-0fe4-45a0-9aab-b0004fff2c75,"97 CEDAR """" ST",1.0,2.0,-74.011379,40.709132,mn1850_01_74.0110_40.7091,2.0,97 CEDAR ST,-74.011379,40.709132,"97 CEDAR """" ST",-74.011379,40.709132


Unnamed: 0,CENSUS_SERIALP,CENSUS_AGE,CENSUS_SEX,CENSUS_MARST,CENSUS_RACE,CENSUS_LABFORCE,CENSUS_IMPREL,CENSUS_OCCSTR,CENSUS_NAMELAST,CENSUS_NAMEFRST,CENSUS_SEQ_NUM,CENSUS_HH_NUM,CENSUS_IPUMS_UID,CENSUS_CITY,CENSUS_PAGENO_HOUSEHOLD,CENSUS_WARD_NUM,CENSUS_REEL_HOUSEHOLD,CENSUS_PLACE,CENSUS_DWELLING_NUM,CENSUS_DWELLING_SEQ,CENSUS_DWELLING_SIZE,CENSUS_GEOG,CENSUS_LINE,CENSUS_INDEX,dwelling_id,CENSUS_ID,CD_H_ADDRESS,selected,spatial_weight,CD_X,CD_Y,CD_BLOCK_NUM,spatial_weight_sum,CD_H_ADDRESS_sum,CD_X_sum,CD_Y_sum,CD_H_ADDRESS_max,CD_X_max,CD_Y_max
5745,2046255,23,1,1,120,2,1,BOARDING HOUSE,BAST,WILLIAM,1,408367,2996ea47-3c83-4639-8e21-b6afab12fbce,4610,2890,1,534,MANHATTAN,278.0,878.0,4,NEW YORK WARD 1 WESTERN DIVISION,150,11856,728,2996ea47-3c83-4639-8e21-b6afab12fbce,135 WASHINGTON ST,1.0,1.937,-74.013788,40.709614,mn1850_01_74.0133_40.7096,3.9034,135 WASHINGTON ST,-74.013788,40.709614,42 WHITEHALL ST,-74.012984,40.702799
5786,2046263,48,1,1,120,2,1,LABORER,TRY,MORRIS,1,408380,fda6f081-033e-4c5e-9c94-2f93cba7f2f4,4610,2900,1,534,MANHATTAN,278.0,886.0,7,NEW YORK WARD 1 WESTERN DIVISION,140,11897,728,fda6f081-033e-4c5e-9c94-2f93cba7f2f4,135 WASHINGTON ST,1.0,1.9664,-74.013788,40.709614,mn1850_01_74.0133_40.7096,3.9034,135 WASHINGTON ST,-74.013788,40.709614,42 WHITEHALL ST,-74.012984,40.702799
5802,2046266,23,1,6,120,2,3,LABORER,FARLEY,JAMES,3,408383,c1ce4a90-1a6b-4944-99c5-6e62bcb25687,4610,2900,1,534,MANHATTAN,278.0,889.0,5,NEW YORK WARD 1 WESTERN DIVISION,280,11913,728,c1ce4a90-1a6b-4944-99c5-6e62bcb25687,42 WHITEHALL ST,1.0,1.979,-74.012984,40.702799,mn1850_01_74.0132_40.7030,1.979,135 WASHINGTON ST,-74.013788,40.709614,42 WHITEHALL ST,-74.012984,40.702799


Unnamed: 0,CENSUS_SERIALP,CENSUS_AGE,CENSUS_SEX,CENSUS_MARST,CENSUS_RACE,CENSUS_LABFORCE,CENSUS_IMPREL,CENSUS_OCCSTR,CENSUS_NAMELAST,CENSUS_NAMEFRST,CENSUS_SEQ_NUM,CENSUS_HH_NUM,CENSUS_IPUMS_UID,CENSUS_CITY,CENSUS_PAGENO_HOUSEHOLD,CENSUS_WARD_NUM,CENSUS_REEL_HOUSEHOLD,CENSUS_PLACE,CENSUS_DWELLING_NUM,CENSUS_DWELLING_SEQ,CENSUS_DWELLING_SIZE,CENSUS_GEOG,CENSUS_LINE,CENSUS_INDEX,dwelling_id,CENSUS_ID,CD_H_ADDRESS,selected,spatial_weight,CD_X,CD_Y,CD_BLOCK_NUM,spatial_weight_sum,CD_H_ADDRESS_sum,CD_X_sum,CD_Y_sum,CD_H_ADDRESS_max,CD_X_max,CD_Y_max
8644,2047618,28,1,6,120,2,1,SOLDIER,GARDNER,JOHN B,2,411653,c1cbdcb0-dcb7-45ef-af3d-c89c6aaefbd1,4610,4470,1,534,MANHATTAN,549.0,1899.0,269,NEW YORK WARD 1 WESTERN DIVISION,80,18485,1006,c1cbdcb0-dcb7-45ef-af3d-c89c6aaefbd1,87 CEDAR ST,1.0,1.933333,-74.010652,40.70876,mn1850_01_74.0101_40.7086,1.933333,38 TRINITY PL,-74.01276,40.708196,58 FRONT ST,-74.009119,40.703362
8652,2047626,29,1,6,120,2,1,SOLDIER,ALLEN,JOSEPH,10,411661,1407a120-7742-4785-9f05-69cb978ba384,4610,4470,1,534,MANHATTAN,549.0,1899.0,269,NEW YORK WARD 1 WESTERN DIVISION,80,18493,1006,1407a120-7742-4785-9f05-69cb978ba384,58 FRONT ST,1.0,2.0,-74.009119,40.703362,mn1850_01_74.0091_40.7036,2.0,38 TRINITY PL,-74.01276,40.708196,58 FRONT ST,-74.009119,40.703362
8698,2047672,24,1,6,120,2,1,SOLDIER,DELANEY,JOHN,56,411706,9cf3730e-676f-41fe-b90c-a6b624da58df,4610,4470,1,534,MANHATTAN,549.0,1899.0,269,NEW YORK WARD 1 WESTERN DIVISION,80,18539,1006,9cf3730e-676f-41fe-b90c-a6b624da58df,38 TRINITY PL,1.0,1.916667,-74.01276,40.708196,mn1850_01_74.0129_40.7085,3.85,38 TRINITY PL,-74.01276,40.708196,58 FRONT ST,-74.009119,40.703362
8722,2047696,42,1,6,120,2,1,SOLDIER,GALLAGHER,JOHN,80,411729,8dbf8ce7-aa92-486c-8fff-eddb48ce77d5,4610,4470,1,534,MANHATTAN,549.0,1899.0,269,NEW YORK WARD 1 WESTERN DIVISION,80,18563,1006,8dbf8ce7-aa92-486c-8fff-eddb48ce77d5,38 TRINITY PL,1.0,1.933333,-74.01276,40.708196,mn1850_01_74.0129_40.7085,3.85,38 TRINITY PL,-74.01276,40.708196,58 FRONT ST,-74.009119,40.703362
8747,2047721,31,1,6,120,2,1,SOLDIER,KANE,JOHN,105,411752,fb05e8cf-fa87-4eb3-a77f-adfe951a2acf,4610,4470,1,534,MANHATTAN,549.0,1899.0,269,NEW YORK WARD 1 WESTERN DIVISION,80,18588,1006,fb05e8cf-fa87-4eb3-a77f-adfe951a2acf,32 TRINITY PL,1.0,1.91,-74.012628,40.708392,mn1850_01_74.0129_40.7085,1.91,38 TRINITY PL,-74.01276,40.708196,58 FRONT ST,-74.009119,40.703362


Unnamed: 0,CENSUS_SERIALP,CENSUS_AGE,CENSUS_SEX,CENSUS_MARST,CENSUS_RACE,CENSUS_LABFORCE,CENSUS_IMPREL,CENSUS_OCCSTR,CENSUS_NAMELAST,CENSUS_NAMEFRST,CENSUS_SEQ_NUM,CENSUS_HH_NUM,CENSUS_IPUMS_UID,CENSUS_CITY,CENSUS_PAGENO_HOUSEHOLD,CENSUS_WARD_NUM,CENSUS_REEL_HOUSEHOLD,CENSUS_PLACE,CENSUS_DWELLING_NUM,CENSUS_DWELLING_SEQ,CENSUS_DWELLING_SIZE,CENSUS_GEOG,CENSUS_LINE,CENSUS_INDEX,dwelling_id,CENSUS_ID,CD_H_ADDRESS,selected,spatial_weight,CD_X,CD_Y,CD_BLOCK_NUM,spatial_weight_sum,CD_H_ADDRESS_sum,CD_X_sum,CD_Y_sum,CD_H_ADDRESS_max,CD_X_max,CD_Y_max
10932,2048800,30,1,1,120,2,1,BOATMAN,ISAACKS,GEORGE,1,414983,2c790c4a-18b6-4c4a-8375-6e6d07d976fa,4610,880,2,535,MANHATTAN,220.0,604.0,7,NEW YORK WARD 2,10,23238,220,2c790c4a-18b6-4c4a-8375-6e6d07d976fa,299 PEARL ST,1.0,1.9332,-74.003002,40.708507,mn1850_02_74.0032_40.7086,3.9164,299 PEARL ST,-74.003002,40.708507,61 ANN ST,-74.006395,40.710161
10939,2048801,36,1,1,120,2,1,TAILOR,GILLMORE,FREDERICK,1,414986,5b22ae5e-d6e3-4595-a47c-c49884407fbe,4610,880,2,535,MANHATTAN,220.0,605.0,5,NEW YORK WARD 2,80,23245,220,5b22ae5e-d6e3-4595-a47c-c49884407fbe,299 PEARL ST,1.0,1.9832,-74.003002,40.708507,mn1850_02_74.0032_40.7086,3.9164,299 PEARL ST,-74.003002,40.708507,61 ANN ST,-74.006395,40.710161
10947,2048803,22,1,1,120,2,1,JUNK KEEPER,ATHERDEN,JOHN,1,414992,9f3f1ca0-4f5b-425e-9ffc-fe0dbabd09c2,4610,880,2,535,MANHATTAN,220.0,607.0,2,NEW YORK WARD 2,160,23253,220,9f3f1ca0-4f5b-425e-9ffc-fe0dbabd09c2,61 ANN ST,1.0,2.0,-74.006395,40.710161,mn1850_02_74.0064_40.7105,2.0,299 PEARL ST,-74.003002,40.708507,61 ANN ST,-74.006395,40.710161
10956,2048804,72,1,6,120,2,12,SHIP JOINER,ELLSWORTH,DANIEL,8,414994,fadb4579-ddd0-4e41-9795-6b504082287e,4610,880,2,535,MANHATTAN,220.0,608.0,25,NEW YORK WARD 2,180,23262,220,fadb4579-ddd0-4e41-9795-6b504082287e,305 PEARL ST,1.0,2.0,-74.002779,40.708624,mn1850_02_74.0032_40.7086,2.0,299 PEARL ST,-74.003002,40.708507,61 ANN ST,-74.006395,40.710161
10967,2048804,18,2,6,120,0,3,,MAHONEY,MARY,19,415005,db76bcd2-66ac-42a3-be5f-0bc8f3c09d39,4610,880,2,535,MANHATTAN,220.0,608.0,25,NEW YORK WARD 2,180,23273,220,db76bcd2-66ac-42a3-be5f-0bc8f3c09d39,30 ANN ST,1.0,1.8832,-74.007604,40.710644,mn1850_02_74.0082_40.7107,1.8832,299 PEARL ST,-74.003002,40.708507,61 ANN ST,-74.006395,40.710161


In [130]:
Dwelling_filled_differences_all.groupby(["CENSUS_WARD_NUM", "dwelling_id"]).ngroups

141

In [131]:
Dwelling_filled_differences_all.to_csv("/content/dwelling_filled_differences_all_mn_1850.csv", index = False)

### Conclusion
Either filling in based on maximum spatial weight or maximum spatial weight sum would make sense, however which one is the better choice is somewhat unclear. Another aspect is that there are often ties between the spatial weights, currently, it's set up to simply select the first option when there's a tie, but it's unclear whether that's the best approach.

### Quick sanity check

In [132]:
dwell_max = pd.read_csv("/content/dwelling_filled_max_1850_mn.csv")
dwell_sum = pd.read_csv("/content/dwelling_filled_sum_1850_mn.csv")

In [133]:
dwell_max.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 515630 entries, 0 to 515629
Data columns (total 32 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   CENSUS_SERIALP           515630 non-null  int64  
 1   CENSUS_AGE               515630 non-null  int64  
 2   CENSUS_SEX               515630 non-null  int64  
 3   CENSUS_MARST             515630 non-null  int64  
 4   CENSUS_RACE              515630 non-null  int64  
 5   CENSUS_LABFORCE          515630 non-null  int64  
 6   CENSUS_IMPREL            515630 non-null  int64  
 7   CENSUS_OCCSTR            160426 non-null  object 
 8   CENSUS_NAMELAST          515546 non-null  object 
 9   CENSUS_NAMEFRST          515046 non-null  object 
 10  CENSUS_SEQ_NUM           515630 non-null  int64  
 11  CENSUS_HH_NUM            515630 non-null  int64  
 12  CENSUS_IPUMS_UID         515630 non-null  object 
 13  CENSUS_CITY              515630 non-null  int64  
 14  CENS

In [134]:
dwell_sum.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 515630 entries, 0 to 515629
Data columns (total 33 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   CENSUS_SERIALP           515630 non-null  int64  
 1   CENSUS_AGE               515630 non-null  int64  
 2   CENSUS_SEX               515630 non-null  int64  
 3   CENSUS_MARST             515630 non-null  int64  
 4   CENSUS_RACE              515630 non-null  int64  
 5   CENSUS_LABFORCE          515630 non-null  int64  
 6   CENSUS_IMPREL            515630 non-null  int64  
 7   CENSUS_OCCSTR            160426 non-null  object 
 8   CENSUS_NAMELAST          515546 non-null  object 
 9   CENSUS_NAMEFRST          515046 non-null  object 
 10  CENSUS_SEQ_NUM           515630 non-null  int64  
 11  CENSUS_HH_NUM            515630 non-null  int64  
 12  CENSUS_IPUMS_UID         515630 non-null  object 
 13  CENSUS_CITY              515630 non-null  int64  
 14  CENS