In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import re
import networkx as nx
import disambiguation
from networkx.algorithms import bipartite

### Fill in Household/Dwelling Addresses Based on Addresses Selected in Disambiguation

In [2]:
# census_1880 = pd.read_csv("data/census_1880_mn_v04.csv")
census_1850 = pd.read_csv("data/census_1850_indexUpdate.csv")

In [3]:
disambiguated_1880 = pd.read_csv("data/disambiguated-21-5-2020.csv")
disambiguated_1850 = pd.read_csv("data/1850_disambiguated.csv")

### Format Data and Get Some Basic Information

Notes: Only done on 1850 dataset due to lack of clearly indicated dwelling numbers/household numbers on 1880 dataset. Need to check how well this works manually. 
-Use CENSUS_ID from disambiguated and CENSUS_IPUMS_UID from census data for joins, these have the same values, but need to strip the word CENSUS_ from CENSUS_ID data (this was added in during the disambiguation process)

#### Merge 1850 census data with the selected matches from disambiguation 

In [4]:
census_1850.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 515630 entries, 0 to 515629
Data columns (total 24 columns):
CENSUS_SERIALP             515630 non-null int64
CENSUS_AGE                 515630 non-null int64
CENSUS_SEX                 515630 non-null int64
CENSUS_MARST               515630 non-null int64
CENSUS_RACE                515630 non-null int64
CENSUS_LABFORCE            515630 non-null int64
CENSUS_IMPREL              515630 non-null int64
CENSUS_OCCSTR              160426 non-null object
CENSUS_NAMELAST            515546 non-null object
CENSUS_NAMEFRST            515046 non-null object
CENSUS_SEQ_NUM             515630 non-null int64
CENSUS_HH_NUM              515630 non-null int64
CENSUS_IPUMS_UID           515630 non-null object
CENSUS_CITY                515630 non-null int64
CENSUS_PAGENO_HOUSEHOLD    515630 non-null int64
CENSUS_WARD_NUM            515630 non-null int64
CENSUS_REEL_HOUSEHOLD      515630 non-null int64
CENSUS_PLACE               515630 non-null object
CE

In [5]:
disambiguated_1850.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33042 entries, 0 to 33041
Data columns (total 36 columns):
BLOCK_NUM              33042 non-null int64
CD_ADDRESS             33042 non-null object
CD_FIRST_NAME          33042 non-null object
CD_ID                  33042 non-null object
CD_LAST_NAME           33042 non-null object
CD_OCCUPATION          31686 non-null object
CD_X                   33042 non-null float64
CD_Y                   33042 non-null float64
CENSUS_AGE             33042 non-null int64
CENSUS_DWELLING_SEQ    32922 non-null float64
CENSUS_DWELLIN_NUM     32946 non-null float64
CENSUS_ID              33042 non-null object
CENSUS_INDEX           33042 non-null int64
CENSUS_IPUMS_UID       33042 non-null object
CENSUS_NAMEFRST        33042 non-null object
CENSUS_NAMELAST        33042 non-null object
CENSUS_OCCSTR          19420 non-null object
CENSUS_WARD_NUM        33042 non-null int64
OBJECTID               33042 non-null int64
age_score              33042 non-null

In [6]:
disambiguated_selected = disambiguated_1850[["CENSUS_ID", "CD_ADDRESS", "selected", "spatial_weight" ]]
disambiguated_selected = disambiguated_selected[disambiguated_selected["selected"] == 1]
disambiguated_selected.loc[:,"CENSUS_ID"] = disambiguated_selected["CENSUS_ID"].apply(lambda x: x.strip("CENSUS_"))

In [7]:
disambiguated_selected.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12348 entries, 0 to 33041
Data columns (total 4 columns):
CENSUS_ID         12348 non-null object
CD_ADDRESS        12348 non-null object
selected          12348 non-null float64
spatial_weight    12348 non-null float64
dtypes: float64(2), object(2)
memory usage: 482.3+ KB


In [8]:
CensusDis1850 = census_1850.merge(disambiguated_selected, how = "left", left_on = "CENSUS_IPUMS_UID", right_on = "CENSUS_ID")
CensusDis1850.head()

Unnamed: 0,CENSUS_SERIALP,CENSUS_AGE,CENSUS_SEX,CENSUS_MARST,CENSUS_RACE,CENSUS_LABFORCE,CENSUS_IMPREL,CENSUS_OCCSTR,CENSUS_NAMELAST,CENSUS_NAMEFRST,...,CENSUS_DWELLING_NUM,CENSUS_DWELLING_SEQ,CENSUS_DWELLING_SIZE,CENSUS_GEOG,CENSUS_LINE,CENSUS_INDEX,CENSUS_ID,CD_ADDRESS,selected,spatial_weight
0,2044262,49,1,6,120,2,1,INN KEEPER,FLINT,JOHN M,...,1.0,1.0,84,NEW YORK WARD 1 EASTERN DIVISION,10,1,,,,
1,2044263,36,2,6,120,0,1,,FLINT,MARY A,...,1.0,1.0,84,NEW YORK WARD 1 EASTERN DIVISION,10,2,,,,
2,2044264,12,2,6,120,0,1,,FLINT,MARY D,...,1.0,1.0,84,NEW YORK WARD 1 EASTERN DIVISION,10,3,,,,
3,2044265,40,1,6,120,2,1,COMB MERCHANT,OATMAN,JAMES C,...,1.0,1.0,84,NEW YORK WARD 1 EASTERN DIVISION,10,4,,,,
4,2044266,28,2,6,120,0,1,,OATMAN,CAROLINE E,...,1.0,1.0,84,NEW YORK WARD 1 EASTERN DIVISION,10,5,,,,


In [9]:
CensusDis1850.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 515630 entries, 0 to 515629
Data columns (total 28 columns):
CENSUS_SERIALP             515630 non-null int64
CENSUS_AGE                 515630 non-null int64
CENSUS_SEX                 515630 non-null int64
CENSUS_MARST               515630 non-null int64
CENSUS_RACE                515630 non-null int64
CENSUS_LABFORCE            515630 non-null int64
CENSUS_IMPREL              515630 non-null int64
CENSUS_OCCSTR              160426 non-null object
CENSUS_NAMELAST            515546 non-null object
CENSUS_NAMEFRST            515046 non-null object
CENSUS_SEQ_NUM             515630 non-null int64
CENSUS_HH_NUM              515630 non-null int64
CENSUS_IPUMS_UID           515630 non-null object
CENSUS_CITY                515630 non-null int64
CENSUS_PAGENO_HOUSEHOLD    515630 non-null int64
CENSUS_WARD_NUM            515630 non-null int64
CENSUS_REEL_HOUSEHOLD      515630 non-null int64
CENSUS_PLACE               515630 non-null object
CE

In [10]:
print("Proportion of census data assigned addresses:", CensusDis1850.CD_ADDRESS.count()/515630)

Proportion of census data assigned addresses: 0.023947404146384035


In [11]:
def get_counts(x, one_add, no_add, more_add, col, counts1, counts2, counts3):

    c = x["CD_ADDRESS"].nunique()
    if c == 0:
        no_add.append((x["CENSUS_WARD_NUM"].iloc[0], x[col].iloc[0]))
        counts1.append(len(x))
    elif c == 1:
        one_add.append((x["CENSUS_WARD_NUM"].iloc[0], x[col].iloc[0]))
        counts2.append(len(x))
    
    elif c > 1:
        more_add.append((x["CENSUS_WARD_NUM"].iloc[0], x[col].iloc[0]))
        counts3.append(len(x))

In [12]:
no_add = []
one_add = []
more_add = []
counts_no_add = []
counts_one_add = []
counts_more_add = []

for index, df in CensusDis1850.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]):
    get_counts(df, one_add, no_add, more_add, "CENSUS_DWELLING_NUM", counts_no_add, counts_one_add, counts_more_add)

In [13]:
print("Proportion of dwellings assigned one address:", len(one_add)/CensusDis1850.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]).ngroups)
print("Proportion of dwellings without an address:", len(no_add)/CensusDis1850.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]).ngroups)
print("Proportion of dwellings assigned more than one address:", len(more_add)/CensusDis1850.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]).ngroups)

Proportion of dwellings assigned one address: 0.247515762925599
Proportion of dwellings without an address: 0.6204791929382093
Proportion of dwellings assigned more than one address: 0.13200504413619168


In [14]:
print("Proportion of census data that should be assigned an address as is:", sum(counts_one_add)/515630)
print("Proportion of census data that should be assigned an address after dealing with conflicts:", (sum(counts_one_add) + sum(counts_more_add))/515630)
print("Proportion of census data that we shouldn't be able to assign an address to:", sum(counts_no_add)/515630)

Proportion of census data that should be assigned an address as is: 0.2948199290188701
Proportion of census data that should be assigned an address after dealing with conflicts: 0.5070050229815953
Proportion of census data that we shouldn't be able to assign an address to: 0.4912262668968059


So from dealing with filling in the dwelling information alone, we should be able to give 0.50 of census entries an address and 0.38 of dwellings an address.

#### Let's see if this holds for what we can do right now

In [15]:
#Function for filling in households/dwelling numbers if relevant
def check_quant(x, exceptions, col, tuple = False):

    c = x["CD_ADDRESS"].nunique()
    if c > 1:
        if tuple:
            exceptions.append((x["CENSUS_WARD_NUM"].iloc[0], x[col].iloc[0]))
        else:
            exceptions.append(x[col].iloc[0])
    elif c == 1:
        x["CD_ADDRESS"] = x["CD_ADDRESS"].ffill().bfill()
  
    return x

In [16]:
dwellings_conflicts = []
base_fill = CensusDis1850.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]).apply(lambda x: check_quant(x, dwellings_conflicts, "CENSUS_DWELLING_NUM", tuple = True))

In [17]:
print("Proportion of census data assigned addresses:", base_fill.CD_ADDRESS.count()/515630)

Proportion of census data assigned addresses: 0.3082966468203945


In [18]:
base_fill.CD_ADDRESS.count()

158967

In [19]:
sum(counts_one_add)

152018

In [20]:
more_add == dwellings_conflicts

True

It's a little odd that more values are filled in than I would expect, but that the two lists are the same seems promising? Maybe missing dwelling numbers are causing this?

In [21]:
Dwelling_nums_nas = CensusDis1850[CensusDis1850["CENSUS_DWELLING_NUM"].isnull()]
nans_addresses = Dwelling_nums_nas.CD_ADDRESS.count()
Dwelling_nums_nas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 912 entries, 1307 to 495263
Data columns (total 28 columns):
CENSUS_SERIALP             912 non-null int64
CENSUS_AGE                 912 non-null int64
CENSUS_SEX                 912 non-null int64
CENSUS_MARST               912 non-null int64
CENSUS_RACE                912 non-null int64
CENSUS_LABFORCE            912 non-null int64
CENSUS_IMPREL              912 non-null int64
CENSUS_OCCSTR              367 non-null object
CENSUS_NAMELAST            911 non-null object
CENSUS_NAMEFRST            912 non-null object
CENSUS_SEQ_NUM             912 non-null int64
CENSUS_HH_NUM              912 non-null int64
CENSUS_IPUMS_UID           912 non-null object
CENSUS_CITY                912 non-null int64
CENSUS_PAGENO_HOUSEHOLD    912 non-null int64
CENSUS_WARD_NUM            912 non-null int64
CENSUS_REEL_HOUSEHOLD      912 non-null int64
CENSUS_PLACE               912 non-null object
CENSUS_DWELLING_NUM        0 non-null float64
CENSUS_DWE

So this is too small to explain the difference. For now it's not different enough to stop from continuing, but it's definitely worth keeping in mind.

### Fill in Addresses for Household/Dwelling, based on Disambiguation Match

#### Fill in addresses for census entries in the same household

In [22]:
#confirm that household numbers are unique accross the entire dataset
def uniqueness(df, col):
    df_check = df[["CENSUS_WARD_NUM", col]]
    df2 = df_check.groupby("CENSUS_WARD_NUM").apply(lambda x: x[col].unique())
    d = df2.to_dict()
    for key1 in d:
        for key2 in d:
            if key1 != key2:
                check = any(item in d[key1] for item in d[key2])
                if check is True:
                    print(key1, key2)
                    raise Exception(str(col) + " numbers are not unique")
    return True         

In [23]:
uniqueness(CensusDis1850, "CENSUS_HH_NUM")

True

In [24]:
uniqueness(CensusDis1850, "CENSUS_DWELLING_NUM")

1 2


Exception: CENSUS_DWELLING_NUM numbers are not unique

In [25]:
uniqueness(CensusDis1850, "CENSUS_SERIALP")

True

In [17]:
#Filling addresses for people in the same household
households = [] # keep track of any households with multiple addresses 
Census_hh = CensusDis1850.groupby("CENSUS_HH_NUM").apply(lambda x: check_quant(x, households, "CENSUS_HH_NUM"))

In [18]:
Census_hh.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 515630 entries, 0 to 515629
Data columns (total 27 columns):
CENSUS_SERIALP             515630 non-null int64
CENSUS_AGE                 515630 non-null int64
CENSUS_SEX                 515630 non-null int64
CENSUS_MARST               515630 non-null int64
CENSUS_RACE                515630 non-null int64
CENSUS_LABFORCE            515630 non-null int64
CENSUS_IMPREL              515630 non-null int64
CENSUS_OCCSTR              160426 non-null object
CENSUS_NAMELAST            515546 non-null object
CENSUS_NAMEFRST            515046 non-null object
CENSUS_SEQ_NUM             515630 non-null int64
CENSUS_HH_NUM              515630 non-null int64
CENSUS_IPUMS_UID           515630 non-null object
CENSUS_CITY                515630 non-null int64
CENSUS_PAGENO_HOUSEHOLD    515630 non-null int64
CENSUS_WARD_NUM            515630 non-null int64
CENSUS_REEL_HOUSEHOLD      515630 non-null int64
CENSUS_PLACE               515630 non-null object
CE

In [19]:
#These households need to be inspected more carefully, it seems that they
#have been assigned multiple addresses
len(households)

372

In [32]:
with open('data/housholds.txt', 'w') as filehandle:
    for listitem in households:
        filehandle.write('%s\n' % listitem)

In [22]:
x = Census_hh[Census_hh["CENSUS_HH_NUM"] == households[7]]
x[["CENSUS_NAMEFRST", "CENSUS_NAMELAST", "CENSUS_AGE", "CENSUS_SEX", "CENSUS_OCCSTR", "CENSUS_HH_NUM","CENSUS_DWELLING_NUM", "CD_ADDRESS"]]

Unnamed: 0,CENSUS_NAMEFRST,CENSUS_NAMELAST,CENSUS_AGE,CENSUS_SEX,CENSUS_OCCSTR,CENSUS_HH_NUM,CENSUS_DWELLING_NUM,CD_ADDRESS
15031,MICHAEL,MORAN,48,1,LABORER,409771,405.0,46 Trinity pl
15032,CECILIA,MORAN,42,2,,409771,405.0,
15033,CECILIA,MORAN,15,2,,409771,405.0,
15034,FRANCIS,MORAN,8,1,,409771,405.0,42 Whitehall
15035,JULIAN,MORAN,4,2,,409771,405.0,


In [23]:
print("Proportion of census data assigned addresses:", Census_hh["CD_ADDRESS"].count()/522152)

Proportion of census data assigned addresses: 0.08624691660665859


In [24]:
Census_hh.to_csv("data/Census_1850_household.csv", index = False)

#### Use IPUMs household number

In [25]:
Census_hh = pd.read_csv("data/Census_1850_household.csv")

In [26]:
households_IPUMS = []
Census_hh_IPUMS = Census_hh.groupby("CENSUS_SERIALP").apply(lambda x: check_quant(x, households_IPUMS, "CENSUS_SERIALP"))

In [27]:
print("Proportion of census data assigned addresses:", Census_hh_IPUMS["CD_ADDRESS"].count()/522152)

Proportion of census data assigned addresses: 0.1384175489129602


In [28]:
len(households_IPUMS)

751

In [31]:
with open('data/housholds_IPUMS.txt', 'w') as filehandle:
    for listitem in households_IPUMS:
        filehandle.write('%s\n' % listitem)

In [35]:
Census_hh_IPUMS.to_csv("data/Census_1850_household_IPUMS.csv", index = False)

#### Fill in address for census entries with same dwelling number

In [36]:
#Filling addresses for people in the same household
dwellings = [] # keep track of any dwellings with multiple addresses 
Census_hh_dw = Census_hh_IPUMS.groupby(["CENSUS_WARD_NUM","CENSUS_DWELLING_NUM"]).apply(lambda x: check_quant(x, dwellings, "CENSUS_DWELLING_NUM", tuple = True))

In [37]:
print("Proportion of census data assigned addresses:", Census_hh_dw["CD_ADDRESS"].count()/522152)

Proportion of census data assigned addresses: 0.3579436639139561


In [38]:
#Dwellings that are assigned more than one address
len(dwellings)

2624

In [41]:
with open('data/dwellings.txt', 'w') as filehandle:
    for listitem in dwellings:
        line = ' '.join(str(x) for x in listitem)
        filehandle.write(line + '\n')

In [42]:
Census_hh_dw.to_csv("data/Census_1850_dwellings.csv", index = False)

#### Check Number of Dwellings with Multiple addresses

In [44]:
Census_hh = pd.read_csv("data/Census_1850_household_IPUMS.csv")

In [45]:
dwellings = [] # keep track of any dwellings with multiple addresses 
Census_hh_dw = Census_hh.groupby(["CENSUS_WARD_NUM","CENSUS_DWELLING_NUM"]).apply(lambda x: check_quant(x, dwellings, "CENSUS_DWELLING_NUM", tuple = True))

In [47]:
len(dwellings)/Census_hh_dw.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]).ngroups

0.13235813366960908

This about holds with expectations from analysis in the beginning

#### Check Number of Dwellings with No Address at All

In [48]:
#Function for filling in households/dwelling numbers if relevant
def no_address(x, col):
    c = x["CD_ADDRESS"].nunique()
    if c == 0:
            nones.append((x["CENSUS_WARD_NUM"].iloc[0], x[col].iloc[0]))

In [49]:
nones = []
for index,df in Census_hh_dw.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]):
    no_address(df, "CENSUS_DWELLING_NUM")

In [50]:
len(nones)

12285

In [51]:
len(nones)/Census_hh_dw.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]).ngroups

0.6196721311475409

This is a little less than what we'd expect from the earlier analysis, clearly there's some situation that's not being accounted for here

#### Issue: There are both households and dwellings that are assigned multiple addresses
- Potential cause: an incorrect match -- it may make sense to incorporate that dwellings/households may have only a single match within the disambiguation process
    - possible approach: two levels of bipartite matching for household and dwellings
    - possible approach: incorporate into initial bipartite matching

- Potential cause: dwellings/households that are referred to by multiple addresses - ei corner building, maybe a historical address change, streets with two names (Avenue of the Americas/6th Ave), etc.
    - standardization can help 

### Use Bipartite Matching to Get A Single Address for Dwellings 

In [2]:
census_1850 = pd.read_csv("data/census_1850_indexUpdate.csv")
disambiguated_1850 = pd.read_csv("data/1850_disambiguated.csv")

In [3]:
disambiguated_1850_selected = disambiguated_1850[disambiguated_1850["selected"] == 1] 

In [4]:
disambiguated_1850_selected["unique_ward"] = disambiguated_1850_selected.apply(lambda row: str(row.CENSUS_WARD_NUM) + "_" + str(row.CENSUS_DWELLIN_NUM), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [5]:
disambiguated_1850_selected.nunique()

BLOCK_NUM               1054
CD_ADDRESS              9038
CD_FIRST_NAME            647
CD_ID                  12348
CD_LAST_NAME            3866
CD_OCCUPATION           1734
CD_X                    7696
CD_Y                    7737
CENSUS_AGE                92
CENSUS_DWELLING_SEQ     3043
CENSUS_DWELLIN_NUM      1333
CENSUS_ID              12348
CENSUS_INDEX           12348
CENSUS_IPUMS_UID       12348
CENSUS_NAMEFRST          695
CENSUS_NAMELAST         4018
CENSUS_OCCSTR           1140
CENSUS_WARD_NUM           18
OBJECTID               12348
age_score                  2
anchor                     2
cd_count                  38
census_count               6
confidence_score          46
group_ID                3236
in_cluster                 3
jw_fn                     23
jw_ln                     32
jw_score                 100
key                        1
letter                   107
node_ID                  215
occ_listed                 2
spatial_weight           147
selected      

In [6]:
disambiguated_1850_selected[["CENSUS_WARD_NUM", "CENSUS_DWELLIN_NUM","unique_ward", "CD_ADDRESS"]].head(30)

Unnamed: 0,CENSUS_WARD_NUM,CENSUS_DWELLIN_NUM,unique_ward,CD_ADDRESS
0,1,1.0,1_1.0,37 South
1,1,1.0,1_1.0,47 Wall
2,1,1.0,1_1.0,12 Coenties slip
9,1,354.0,1_354.0,8 Depeyster
25,1,1.0,1_1.0,25 South
26,1,1.0,1_1.0,30 Washington
27,1,1.0,1_1.0,76 Wall
33,1,403.0,1_403.0,10 Wall
38,1,404.0,1_404.0,57 Pine
42,1,254.0,1_254.0,54 Wall


In [7]:
disambiguated_1850_selected.head()

Unnamed: 0,BLOCK_NUM,CD_ADDRESS,CD_FIRST_NAME,CD_ID,CD_LAST_NAME,CD_OCCUPATION,CD_X,CD_Y,CENSUS_AGE,CENSUS_DWELLING_SEQ,...,jw_ln,jw_score,key,letter,node_ID,occ_listed,spatial_weight,selected,graph_ID,unique_ward
0,3794,37 South,Henry,CD_26119,Brookman,com,-74.00825,40.703128,32,1.0,...,1.0,1.0,,,,1,2.0,1.0,0,1_1.0
1,716,47 Wall,M,CD_1932,Bingham,com,-74.009808,40.706465,55,1.0,...,1.0,1.0,,,,1,2.0,1.0,1,1_1.0
2,3802,12 Coenties slip,Thomas,CD_17221,Mayhew,com,-74.013287,40.703637,42,1.0,...,1.0,1.0,,,,1,1.9,1.0,2,1_1.0
9,687,8 Depeyster,John,CD_3399,Williams,porterhouse,-74.006297,40.70563,36,877.0,...,1.0,1.0,0.0,N13,N13_0,1,1.81,1.0,3,1_354.0
25,3781,25 South,Lorenzo,CD_31204,Nickerson,merchant,-74.008971,40.702672,30,1.0,...,1.0,1.0,,,,1,2.0,1.0,4,1_1.0


In [8]:
disambiguated_1850_selected.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12348 entries, 0 to 33041
Data columns (total 37 columns):
BLOCK_NUM              12348 non-null int64
CD_ADDRESS             12348 non-null object
CD_FIRST_NAME          12348 non-null object
CD_ID                  12348 non-null object
CD_LAST_NAME           12348 non-null object
CD_OCCUPATION          11784 non-null object
CD_X                   12348 non-null float64
CD_Y                   12348 non-null float64
CENSUS_AGE             12348 non-null int64
CENSUS_DWELLING_SEQ    12306 non-null float64
CENSUS_DWELLIN_NUM     12315 non-null float64
CENSUS_ID              12348 non-null object
CENSUS_INDEX           12348 non-null int64
CENSUS_IPUMS_UID       12348 non-null object
CENSUS_NAMEFRST        12348 non-null object
CENSUS_NAMELAST        12348 non-null object
CENSUS_OCCSTR          9701 non-null object
CENSUS_WARD_NUM        12348 non-null int64
OBJECTID               12348 non-null int64
age_score              12348 non-null 

In [61]:
def get_matches_dwelling(df, cd_id = 'CD_ID', census_id = 'CENSUS_ID', weight = 'spatial_weight'):
    
    #This is to preserve order
    df = df.copy()
    df[cd_id] = "A_ " + df[cd_id].astype(str)
    df[census_id] = "B_ " + df[census_id].astype(str)
    
    b_edges = [(row[cd_id], row[census_id], row[weight]) for index, row in df.iterrows()]
    b = nx.Graph()
    b.add_nodes_from(df[cd_id].unique(), bipartite = 0)
    b.add_nodes_from(df[census_id].unique(), bipartite = 1)
    b.add_weighted_edges_from(b_edges)

    # algorithm is too expensive if we perform it on entire graph. moreover, graph is actually disconnected into sub_graphs. apply algorithm on subgraphs instead
    subgraphs = [b.subgraph(c) for c in nx.connected_components(b)]
    matches = [list(nx.max_weight_matching(graph, maxcardinality = True)) for graph in subgraphs]
    matches = [sorted(list(item)) for sublist in matches for item in sublist] # unnest and convert pairs from tuple to list
    matches = pd.DataFrame(matches, columns=[cd_id, census_id])
    
    matches.loc[:,cd_id] = matches[cd_id].apply(lambda x: x.strip("A_ "))
    matches.loc[:,census_id] = matches[census_id].apply(lambda x: x.strip("B_ "))

    return matches

In [62]:
match = get_matches_dwelling(disambiguated_1850_selected, cd_id = "unique_ward", census_id = "CD_ADDRESS")

In [63]:
match.head(30)

Unnamed: 0,unique_ward,CD_ADDRESS
0,1_356.0,34 Beaver
1,1_171.0,95 Cedar
2,1_141.0,13 Water
3,17_192.0,373} Bow» ery
4,1_545.0,41 Beaver
5,1_523.0,24 Thames
6,1_244.0,7 S . William
7,1_97.0,49 Pearl & 36 Bridge
8,1_28.0,132 Broadway
9,1_134.0,42 Whitehall


In [64]:
match.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6953 entries, 0 to 6952
Data columns (total 2 columns):
unique_ward    6953 non-null object
CD_ADDRESS     6953 non-null object
dtypes: object(2)
memory usage: 108.8+ KB


In [65]:
disambiguated_1850_selected.nunique()

BLOCK_NUM               1054
CD_ADDRESS              9038
CD_FIRST_NAME            647
CD_ID                  12348
CD_LAST_NAME            3866
CD_OCCUPATION           1734
CD_X                    7696
CD_Y                    7737
CENSUS_AGE                92
CENSUS_DWELLING_SEQ     3043
CENSUS_DWELLIN_NUM      1333
CENSUS_ID              12348
CENSUS_INDEX           12348
CENSUS_IPUMS_UID       12348
CENSUS_NAMEFRST          695
CENSUS_NAMELAST         4018
CENSUS_OCCSTR           1140
CENSUS_WARD_NUM           18
OBJECTID               12348
age_score                  2
anchor                     2
cd_count                  38
census_count               6
confidence_score          46
group_ID                3236
in_cluster                 3
jw_fn                     23
jw_ln                     32
jw_score                 100
key                        1
letter                   107
node_ID                  215
occ_listed                 2
spatial_weight           147
selected      

In [66]:
disambiguated_1850_selected = disambiguated_1850_selected[["CENSUS_IPUMS_UID", "unique_ward"]]
disambiguated_1850_selected.drop_duplicates("unique_ward", inplace = True)
matched = disambiguated_1850_selected.merge(match, how = "left", on = "unique_ward", validate = "one_to_one")
dwelling_addresses = census_1850.merge(matched, how = "left", on = "CENSUS_IPUMS_UID")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [67]:
matched.nunique()

CENSUS_IPUMS_UID    7533
unique_ward         7533
CD_ADDRESS          6939
dtype: int64

In [68]:
dwelling_addresses.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 515630 entries, 0 to 515629
Data columns (total 26 columns):
CENSUS_SERIALP             515630 non-null int64
CENSUS_AGE                 515630 non-null int64
CENSUS_SEX                 515630 non-null int64
CENSUS_MARST               515630 non-null int64
CENSUS_RACE                515630 non-null int64
CENSUS_LABFORCE            515630 non-null int64
CENSUS_IMPREL              515630 non-null int64
CENSUS_OCCSTR              160426 non-null object
CENSUS_NAMELAST            515546 non-null object
CENSUS_NAMEFRST            515046 non-null object
CENSUS_SEQ_NUM             515630 non-null int64
CENSUS_HH_NUM              515630 non-null int64
CENSUS_IPUMS_UID           515630 non-null object
CENSUS_CITY                515630 non-null int64
CENSUS_PAGENO_HOUSEHOLD    515630 non-null int64
CENSUS_WARD_NUM            515630 non-null int64
CENSUS_REEL_HOUSEHOLD      515630 non-null int64
CENSUS_PLACE               515630 non-null object
CE

In [69]:
print("Proportion of census data assigned addresses:", dwelling_addresses.CD_ADDRESS.count()/515630)

Proportion of census data assigned addresses: 0.013484475302057677


In [75]:
#Function for filling in households/dwelling numbers if relevant
def check_quant(x, nones, exceptions, col, tuple = False):

    c = x["CD_ADDRESS"].nunique()
    if c == 0:
        if tuple:
            nones.append((x["CENSUS_WARD_NUM"].iloc[0], x[col].iloc[0]))
        else:
            nones.append(x[col].iloc[0])
    elif c > 1:
        if tuple:
            exceptions.append((x["CENSUS_WARD_NUM"].iloc[0], x[col].iloc[0]))
        else:
            exceptions.append(x[col].iloc[0])
    elif c == 1:
        x["CD_ADDRESS"] = x["CD_ADDRESS"].ffill().bfill()
  
    return x

In [76]:
dwellings_noadd = []
dwellings = [] # keep track of any dwellings with multiple addresses 
Census_hh_dw = dwelling_addresses.groupby(["CENSUS_WARD_NUM","CENSUS_DWELLING_NUM"]).apply(lambda x: check_quant(x, dwellings_noadd, dwellings, "CENSUS_DWELLING_NUM", tuple = True))

In [77]:
print("Proportion of census data assigned addresses:", Census_hh_dw.CD_ADDRESS.count()/515630)

Proportion of census data assigned addresses: 0.4702383492038865


In [78]:
#Four dwellings less than the earlier interpolation one
len(dwellings)

0

In [79]:
len(dwellings_noadd)

12881

In [82]:
Census_hh_dw.groupby(["CENSUS_WARD_NUM","CENSUS_DWELLING_NUM"]).ngroups

19825

In [83]:
print("Proportion of Dwellings Without Addresses:", len(dwellings_noadd)/Census_hh_dw.groupby(["CENSUS_WARD_NUM","CENSUS_DWELLING_NUM"]).ngroups)

Proportion of Dwellings Without Addresses: 0.6497351828499369


In [80]:
Census_hh_dw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 515630 entries, 0 to 515629
Data columns (total 26 columns):
CENSUS_SERIALP             514718 non-null float64
CENSUS_AGE                 514718 non-null float64
CENSUS_SEX                 514718 non-null float64
CENSUS_MARST               514718 non-null float64
CENSUS_RACE                514718 non-null float64
CENSUS_LABFORCE            514718 non-null float64
CENSUS_IMPREL              514718 non-null float64
CENSUS_OCCSTR              160059 non-null object
CENSUS_NAMELAST            514635 non-null object
CENSUS_NAMEFRST            514134 non-null object
CENSUS_SEQ_NUM             514718 non-null float64
CENSUS_HH_NUM              514718 non-null float64
CENSUS_IPUMS_UID           514718 non-null object
CENSUS_CITY                514718 non-null float64
CENSUS_PAGENO_HOUSEHOLD    514718 non-null float64
CENSUS_WARD_NUM            514718 non-null float64
CENSUS_REEL_HOUSEHOLD      514718 non-null float64
CENSUS_PLACE              

### Interlude: Geog Variable

- Invesitgation shows that institutions have one address as expected

In [85]:
Census_hh_dw["CENSUS_GEOG"].unique()

array(['NEW YORK WARD 1 EASTERN DIVISION', nan,
       'NEW YORK WARD 1 WESTERN DIVISION', 'NEW YORK WARD 2',
       'NEW YORK WARD 3', 'NEW YORK WARD 4', 'NEW YORK WARD 5',
       'NEW YORK WARD 6', 'NEW YORK WARD 7 DISTRICT 1',
       'NEW YORK WARD 7 DISTRICT 2', 'NEW YORK WARD 8 DISTRICT 1',
       'NEW YORK WARD 8', 'NEW YORK WARD 9 DISTRICT 1',
       'NEW YORK WARD 9 DISTRICT 2', 'NEW YORK WARD 9 DISTRICT 3',
       'NEW YORK WARD 10', 'NEW YORK WARD 11', 'NEW YORK WARD 12',
       'NEW YORK WARD 13', 'NEW YORK WARD 14',
       'NEW YORK WARD 15 WESTERN HALF', 'NEW YORK WARD 15 EASTERN HALF',
       'NEW YORK WARD 16 DISTRICT 1', 'NEW YORK WARD 16 DISTRICT 2',
       'NEW YORK WARD 16 DISTRICT 3', 'NEW YORK WARD 17',
       'NEW YORK WARD 18', 'NEW YORK WARD 18 DISTRICT 2',
       'NEW YORK WARD 18 BELLEVUE HOSPITAL',
       'NEW YORK WARD 18 HOUSE OF REFUGE', 'NEW YORK WARD 19'],
      dtype=object)

In [86]:
hospital = Census_hh_dw[Census_hh_dw["CENSUS_GEOG"] == 'NEW YORK WARD 18 BELLEVUE HOSPITAL']

In [87]:
hospital.nunique()

CENSUS_SERIALP             438
CENSUS_AGE                  61
CENSUS_SEX                   2
CENSUS_MARST                 2
CENSUS_RACE                  2
CENSUS_LABFORCE              2
CENSUS_IMPREL                1
CENSUS_OCCSTR               46
CENSUS_NAMELAST            322
CENSUS_NAMEFRST            117
CENSUS_SEQ_NUM             479
CENSUS_HH_NUM              422
CENSUS_IPUMS_UID           479
CENSUS_CITY                  1
CENSUS_PAGENO_HOUSEHOLD      3
CENSUS_WARD_NUM              1
CENSUS_REEL_HOUSEHOLD        1
CENSUS_PLACE                 1
CENSUS_DWELLING_NUM          1
CENSUS_DWELLING_SEQ          1
CENSUS_DWELLING_SIZE         3
CENSUS_GEOG                  1
CENSUS_LINE                  1
CENSUS_INDEX               479
unique_ward                  1
CD_ADDRESS                   1
dtype: int64

In [88]:
refuge = Census_hh_dw[Census_hh_dw["CENSUS_GEOG"] == 'NEW YORK WARD 18 HOUSE OF REFUGE']
refuge.nunique()

CENSUS_SERIALP             372
CENSUS_AGE                  29
CENSUS_SEX                   2
CENSUS_MARST                 2
CENSUS_RACE                  3
CENSUS_LABFORCE              2
CENSUS_IMPREL                1
CENSUS_OCCSTR               34
CENSUS_NAMELAST            330
CENSUS_NAMEFRST            142
CENSUS_SEQ_NUM             411
CENSUS_HH_NUM              394
CENSUS_IPUMS_UID           411
CENSUS_CITY                  1
CENSUS_PAGENO_HOUSEHOLD      1
CENSUS_WARD_NUM              1
CENSUS_REEL_HOUSEHOLD        1
CENSUS_PLACE                 1
CENSUS_DWELLING_NUM          1
CENSUS_DWELLING_SEQ          1
CENSUS_DWELLING_SIZE         2
CENSUS_GEOG                  1
CENSUS_LINE                  2
CENSUS_INDEX               411
unique_ward                  1
CD_ADDRESS                   1
dtype: int64

It seems that there are fewer NA's (no address at all for a dwelling then when using the previous nested approach). Let's try nested and see if that helps, though I am pretty confused about why it would?

### Use Weights Directly to Get Single Address for Dwelling

In [38]:
#Function for filling in households/dwelling numbers if relevant
#Check how idxmax is working
def dwelling_weight_fill(x, nones):

    c = x["CD_ADDRESS"].nunique()
    if c == 0:
        nones.append(1)
        #nones.append((x.loc["CENSUS_WARD_NUM"].iloc[0], x[col].iloc[0]))

    elif c >= 1:
        x.reset_index(drop = True, inplace = True)
        index = x["spatial_weight"].idxmax()
        x["CD_ADDRESS"] = x.iloc[index].loc["CD_ADDRESS"]
  
    return x

In [39]:
dwellings = []
Dwellings_weight = CensusDis1850.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]).apply(lambda x: dwelling_weight_fill(x, dwellings)) 

In [40]:
len(dwellings)

12301

In [41]:
print("Proportion of census data assigned addresses:", Dwellings_weight.CD_ADDRESS.count()/515630)

Proportion of census data assigned addresses: 0.5070050229815953


In [42]:
print("Proportion of Dwellings Without Addresses:", len(dwellings)/CensusDis1850.groupby(["CENSUS_WARD_NUM","CENSUS_DWELLING_NUM"]).ngroups)

Proportion of Dwellings Without Addresses: 0.6204791929382093


This matches exactly with the initial calculations

### Look At How Often We Have Dwellings With the Same Address

In [88]:
org_same_add = defaultdict(list)
def same_address(x, d, col):
    d[(x["CENSUS_WARD_NUM"].iloc[0], x[col].iloc[0])].append(x.dropna(subset = ["CD_ADDRESS"]).CD_ADDRESS.unique())
for index, group in CensusDis1850.groupby(["CENSUS_WARD_NUM","CENSUS_DWELLING_NUM"]):
    same_address(group, org_same_add, "CENSUS_DWELLING_NUM")

In [104]:
CensusDis1850.groupby("CENSUS_WARD_NUM").ngroups

19

In [105]:
which_dwellings = []
count = 0

for index,df in CensusDis1850.groupby("CENSUS_WARD_NUM"):
    org_same_add = defaultdict(list)
    for i, group in df.groupby("CENSUS_DWELLING_NUM"):
        same_address(group, org_same_add, "CENSUS_DWELLING_NUM")
    
    #check for same name (limit to within ward for efficiency)
    for key1 in org_same_add:
        for key2 in org_same_add:
            if key1 != key2:
                if org_same_add[key1][0].any() in org_same_add[key2][0]:
                    count += 1 
                    which_dwellings.append((key1,key2))
                    
    if index % 2 == 0:
        print("Finished up to index", str(index))

Finished up to index 2
Finished up to index 4
Finished up to index 6
Finished up to index 8
Finished up to index 10
Finished up to index 12
Finished up to index 14
Finished up to index 16
Finished up to index 18


In [108]:
with open('data/dwellings_same_address.txt', 'w') as filehandle:
    for listitem in which_dwellings:
        line = ' '.join(str(x) for x in listitem)
        filehandle.write(line + '\n')

In [109]:
print("Number of Dwellings that could share the same address:", count)

Number of Dwellings that could share the same address: 4039


In [93]:
Dwellings_weight.reset_index(drop = True, inplace = True)
same_add_selected = Dwellings_weight.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]).first()
num_same = same_add_selected["CD_ADDRESS"].count() - same_add_selected["CD_ADDRESS"].nunique()
print("Number of Dwellings with Same Address Selected:", num_same)

Number of Dwellings with Same Address Selected: 1065


In [110]:
print("Proportion of Dwellings with Same Address:", num_same/Dwellings_weight.groupby(["CENSUS_WARD_NUM", "CENSUS_DWELLING_NUM"]).ngroups)

Proportion of Dwellings with Same Address: 0.05372005044136192


In [111]:
print("Dwellings with same address selected out of possiblity:", num_same/count)

Dwellings with same address selected out of possiblity: 0.26367912849715275


### Conclusion

Although some level of manual checking is warranted, based on these results, filling in dwelling number based on height weight of the possible options seems like the best approach.