In [1]:
import os
import sys
# Put the parent directory on sys.path (once) — presumably so the local
# `disambiguation` package imported below can be resolved.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

%load_ext autoreload
%autoreload 2
import pandas as pd
import networkx as nx
import numpy as np
import disambiguation
import disambiguation.analysis as da
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import random
import re
import disambiguation.processing as dp 
from disambiguation.confidence_score_tuning import confidence_score_tuning_v02
from disambiguation.confidence_score_tuning import confidence_score
import caffeine
caffeine.on(display = False)

#### Get and Format Data

In [2]:
# Earlier input, kept for reference:
# elastic_match = pd.read_csv("../../Data/matches.csv")

# Load the tab-separated match export into a DataFrame.
elastic_match = pd.read_csv(
    "../data/es-1880-28-9-2020.csv",
    sep="\t",
    engine="python",
)

In [26]:
# Load a second tab-separated match export and print its column summary.
elastic_match_2 = pd.read_csv(
    "../data/es-1850-22-9-2020.csv",
    sep="\t",
    engine="python",
)
elastic_match_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63312 entries, 0 to 63311
Data columns (total 33 columns):
CENSUS_INDEX            63312 non-null int64
CENSUS_IPUMS_UID        63312 non-null object
CENSUS_SERIAL           63312 non-null int64
CENSUS_HH_NUM           63312 non-null int64
CENSUS_SEQ_NUM          63312 non-null int64
CENSUS_REEL             63312 non-null int64
CENSUS_PAGENUM          63312 non-null int64
CENSUS_LINE             63312 non-null int64
CENSUS_AGE              63312 non-null int64
CENSUS_GENDER           63312 non-null int64
CENSUS_RACE             63312 non-null int64
CENSUS_LABFORCE         63312 non-null int64
CENSUS_OCCUPATION       37973 non-null object
CENSUS_IMPREL           63312 non-null int64
CENSUS_FIRST_NAME       63312 non-null object
CENSUS_LAST_NAME        63312 non-null object
CENSUS_DWELLING_NUM     63178 non-null float64
CENSUS_DWELLING_SEQ     63080 non-null float64
CENSUS_DWELLING_SIZE    63312 non-null int64
CENSUS_GEOG             6322

In [27]:
# Number of distinct CD_RECORD_ID values in the export.
elastic_match_2["CD_RECORD_ID"].nunique()

25089

In [32]:
# Distinct CD_RECORD_ID values as a fraction of all rows.
elastic_match_2["CD_RECORD_ID"].nunique() / len(elastic_match_2)

0.3962755875663381

In [29]:
# Load the index-update export (default comma separator) and print its column summary.
elastic_match_3 = pd.read_csv(
    "../data/es_1850_mn_IndexUpdate.csv"
)
elastic_match_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33042 entries, 0 to 33041
Data columns (total 17 columns):
CENSUS_IPUMS_UID       33042 non-null object
CENSUS_NAMEFRST        33042 non-null object
CENSUS_NAMELAST        33042 non-null object
CENSUS_WARD_NUM        33042 non-null int64
CENSUS_AGE             33042 non-null int64
CENSUS_DWELLIN_NUM     32946 non-null float64
CENSUS_DWELLING_SEQ    32922 non-null float64
CENSUS_OCCSTR          19420 non-null object
OBJECTID               33042 non-null int64
CD_FIRST_NAME          33042 non-null object
CD_LAST_NAME           33042 non-null object
CD_ADDRESS             33042 non-null object
BLOCK_NUM              33042 non-null int64
CD_OCCUPATION          31686 non-null object
CENSUS_INDEX           33042 non-null int64
CD_X                   33042 non-null float64
CD_Y                   33042 non-null float64
dtypes: float64(4), int64(5), object(8)
memory usage: 4.3+ MB


In [31]:
# Distinct OBJECTID values as a fraction of all rows.
elastic_match_3["OBJECTID"].nunique() / len(elastic_match_3)

0.3786998365716361

#### Create a sample (a small number of wards)

In [4]:
# Build the sample from wards chosen for their historical context.
wards = [3, 9, 10, 18, 21, 22]
in_sample_wards = elastic_match["CD_WARD_NUM"].isin(wards)
# .copy() so later column assignments do not touch elastic_match.
match_sample = elastic_match.loc[in_sample_wards].copy()

In [5]:
# Number of rows in the ward sample.
print(match_sample.shape[0])

48684


#### Run Confidence Score Tuning

In [14]:
# Columns and weights: one entry per configuration handed to the tuning run.
# The i-th weight applies to the i-th column.
param_grid = [
    dict(
        columns=['jw_score', 'cd_count_inverse', 'census_count_inverse', 'occ_listed', 'age_score'],
        weights=[0.7, 0.0, 0.05, 0.15, 0.1],
    ),
]
# Alternative weightings for the same columns, currently disabled:
#   [0.55, 0.18, 0.18, 0.05, 0.04]
#   [0.7, 0.1, 0.1, 0.05, 0.05]
#   [0.6, 0.1, 0.1, 0.1, 0.1]
#   [0.6, 0.15, 0.15, 0.05, 0.05]

In [15]:
# Run dp.col_for_disamb on the sample, passing the "CD_RECORD_ID" and "CENSUS_MERGEID"
# column names, then print the resulting column summary.
# NOTE(review): match_sample is reassigned from itself, so re-running this cell feeds the
# already-processed frame back in — confirm col_for_disamb is safe to apply twice.
match_sample = dp.col_for_disamb(match_sample, "CD_RECORD_ID", "CENSUS_MERGEID")
match_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48684 entries, 2 to 170836
Data columns (total 43 columns):
CENSUS_INDEX            48684 non-null int64
CENSUS_OBJECTID         48684 non-null int64
CENSUS_MERGEID          48684 non-null object
CENSUS_SERIAL           48684 non-null int64
CENSUS_REEL             48684 non-null int64
CENSUS_VOLUME           48684 non-null int64
CENSUS_PAGENUM          48684 non-null int64
CENSUS_FIRST_NAME       48684 non-null object
CENSUS_LAST_NAME        48684 non-null object
CENSUS_RELATE           48684 non-null int64
CENSUS_AGE              48684 non-null int64
CENSUS_GENDER           48684 non-null int64
CENSUS_RACE             48684 non-null object
CENSUS_OCCUPATION       48679 non-null object
CENSUS_MATCH_ADDR       48684 non-null object
CENSUS_AD               48684 non-null float64
CENSUS_X                41988 non-null float64
CENSUS_Y                41988 non-null float64
OBJECTID                48684 non-null int64
CD_INDEX               

In [21]:
# Number of distinct CD_ID values in the sample.
match_sample["CD_ID"].nunique()

5825

In [22]:
# Number of distinct CENSUS_ID values in the sample.
match_sample["CENSUS_ID"].nunique()

30442

In [None]:
# Row count of the ward sample.
# `match` is not defined by any cell shown in this notebook, so `len(match)` would
# raise NameError on a fresh kernel; the frame in use at this point is match_sample.
len(match_sample)

In [16]:
# Preview the first five rows of the sample.
# NOTE(review): the saved preview already lists confidence_score_0, which appears to be
# added only by the following cell — the output looks like it came from an out-of-order run.
match_sample.head(5)

Unnamed: 0,CENSUS_INDEX,CENSUS_OBJECTID,CENSUS_MERGEID,CENSUS_SERIAL,CENSUS_REEL,CENSUS_VOLUME,CENSUS_PAGENUM,CENSUS_FIRST_NAME,CENSUS_LAST_NAME,CENSUS_RELATE,...,jw_score,occ_listed,age_score,cd_count,census_count,census_count_inverse,cd_count_inverse,CD_ID,CENSUS_ID,confidence_score_0
2,306184,306184,4610188284721WILLIAMASH,6410006,875,1,289,WILLIAM,ASH,1200,...,1.0,1,1,1,1,1.0,1.0,CD_MN_1880_WILLIAM_ASH_16,CENSUS_4610188284721WILLIAMASH,1.0
7,120522,120522,461072007THOMAS W.BRITT,6371449,870,1,51,THOMAS,BRITT,301,...,0.886,0,0,13,2,0.5,0.076923,CD_MN_1880_THOMAS_BARRATT_25,CENSUS_461072007THOMAS W.BRITT,0.6202
8,699731,699731,461041318183THOMASBRADY,6497824,887,1,87,THOMAS,BRADY,301,...,0.766,1,1,13,2,0.5,0.076923,CD_MN_1880_THOMAS_BARRATT_25,CENSUS_461041318183THOMASBRADY,0.8362
9,705643,705643,461041736442THOMASMC BRIDE,6499154,887,1,148,THOMAS,BRIDE,301,...,0.766,1,1,13,2,0.5,0.076923,CD_MN_1880_THOMAS_BARRATT_25,CENSUS_461041736442THOMASMC BRIDE,0.8362
10,710232,710232,4610419104THOMASBRADY,6500243,887,1,198,THOMAS,BRADY,301,...,0.766,0,0,13,2,0.5,0.076923,CD_MN_1880_THOMAS_BARRATT_25,CENSUS_4610419104THOMASBRADY,0.5362


In [17]:
# Add one confidence_score_<i> column per entry in param_grid.
for idx, params in enumerate(param_grid):
    score_col = "confidence_score_{}".format(idx)
    match_sample.loc[:, score_col] = confidence_score(
        match_sample, params["columns"], params["weights"]
    )

In [18]:
# Frequency table of the first confidence score, most frequent value first.
score_counts = match_sample["confidence_score_0"].value_counts()
print(score_counts)
# Share of rows that carry the single most frequent score.
print(score_counts.iloc[0] / len(match_sample))

1.000000    3644
0.975000    1705
0.966667     937
0.900000     637
0.962500     591
            ... 
0.358267       1
0.670200       1
0.656943       1
0.634267       1
0.658200       1
Name: confidence_score_0, Length: 3443, dtype: int64
0.07485005340563634


In [19]:
# For each scoring column: print its frequency table, then the share of rows
# taken by its most frequent value.
for column in param_grid[0]["columns"]:
    column_counts = match_sample[column].value_counts()
    print(column_counts)
    print(column, ":", column_counts.iloc[0] / len(match_sample))

1.000    10634
0.792     1338
0.880     1255
0.802     1230
0.820     1154
         ...  
0.700        1
0.712        1
0.900        1
0.526        1
0.300        1
Name: jw_score, Length: 461, dtype: int64
jw_score : 0.21842905266617368
0.500000    1772
1.000000    1707
0.333333    1674
0.250000    1640
0.166667    1500
            ... 
0.013158      76
0.013333      75
0.015873      63
0.017241      58
0.017857      56
Name: cd_count_inverse, Length: 91, dtype: int64
cd_count_inverse : 0.03639799523457399
1.000000    20570
0.500000    11372
0.333333     6051
0.250000     4168
0.200000     3195
0.142857     1960
0.166667     1032
0.111111      252
0.100000       60
0.083333       24
Name: census_count_inverse, dtype: int64
census_count_inverse : 0.42252074603565853
1    41411
0     7273
Name: occ_listed, dtype: int64
occ_listed : 0.8506080026292006
1    36644
0    12040
Name: age_score, dtype: int64
age_score : 0.7526908224467997


In [81]:
# Presumably the share of sample rows whose census_count_inverse equals 1.
# NOTE(review): `[1]` is applied to a value_counts result with a float-valued index; it
# is presumably resolved as the label 1.0 rather than a position — confirm for the pandas
# version in use. The saved output below is the full value_counts table, not this ratio,
# so it appears to be stale.
match_sample.census_count_inverse.value_counts()[1]/len(match_sample)

1.000000    20570
0.500000    11372
0.333333     6051
0.250000     4168
0.200000     3195
0.142857     1960
0.166667     1032
0.111111      252
0.100000       60
0.083333       24
Name: census_count_inverse, dtype: int64

In [20]:
# Run the confidence-score tuning for every entry in param_grid on the ward sample.
# NOTE(review): the saved output for this run ends with KeyError: 'CD_ADDRESS' after the
# "Done" message — confirm match_sample carries that column before relying on the result.
tuning_results = confidence_score_tuning_v02(param_grid, match_sample)

Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 3644
Reached: 0
Reached: 1000
Reached: 2000
Reached: 3000
Cleaning output (3/4)...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  final = pd.concat(sub_groups)


Disambiguating (4/4)...
Total time: 372.17486691474915
Done! :)


KeyError: ('CD_ADDRESS', 'occurred at index 0')

In [45]:
# Repeat of the tuning call. NOTE(review): the saved output shows this run was stopped
# with KeyboardInterrupt, so tuning_results was not assigned by it.
tuning_results = confidence_score_tuning_v02(param_grid, match_sample)

Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 348
Reached: 0
40
24
17
213
32
72
101
24
95
234


KeyboardInterrupt: 

In [29]:
# Repeat of the tuning call. NOTE(review): the saved output (per-subgraph timing lines)
# ends in KeyboardInterrupt, so tuning_results was not assigned by it.
tuning_results = confidence_score_tuning_v02(param_grid, match_sample)

Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 348
Reached: 0
part 1 time: 0.07006597518920898 	 length of df: 1600
part 1 time: 0.03008103370666504 	 length of df: 576
part 1 time: 0.014436721801757812 	 length of df: 289
part 1 time: 2.1014790534973145 	 length of df: 45369
part 1 time: 0.047982215881347656 	 length of df: 1024
part 1 time: 0.22450613975524902 	 length of df: 5184
part 1 time: 0.42418599128723145 	 length of df: 10201
part 1 time: 0.029577970504760742 	 length of df: 576
part 1 time: 0.43213510513305664 	 length of df: 9025
part 1 time: 2.4902751445770264 	 length of df: 54756
part 1 time: 0.6221218109130859 	 length of df: 14641
part 1 time: 0.04511094093322754 	 length of df: 961
part 1 time: 0.02347421646118164 	 length of df: 529
part 1 time: 0.15934109687805176 	 length of df: 3721
part 1 time: 5.264247894287109 	 length of df: 77284
part 1 time: 1.659060001373291 	 length of df: 34225
part 1 time: 0

KeyboardInterrupt: 

In [None]:
# Render the tuning results as a table.
tuning_results_df = pd.DataFrame.from_dict(tuning_results)
display(tuning_results_df)

In [20]:
# Dump the raw tuning results. NOTE(review): the saved output is an empty dict ({}),
# so no tuning result was captured in the recorded session.
print(tuning_results)

{}
