In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

%load_ext autoreload
%autoreload 2
import pandas as pd
import networkx as nx
import numpy as np
import disambiguation
from disambiguation import Disambiguator, Disambiguator1880
import disambiguation.analysis as da
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import random
import re
import disambiguation.processing as dp 
from disambiguation import Benchmark

#### Get and Format Data

In [2]:
# Load the match export used as input for the whole notebook.
# The file is tab-separated and is parsed with the (slower, more lenient) Python engine.
# Previous input file, kept for reference:
#elastic_match = pd.read_csv("../../Data/matches.csv")
elastic_match = pd.read_csv("../data/es-1880-21-5-2020.csv", sep='\t', engine='python')

In [3]:
# Raw census table; later passed to the tuning function, which hands it to
# Benchmark and to census_for_disamb.
census = pd.read_csv("../data/census_1880_mn_v04.csv")
def census_for_disamb(census):
    """Build the slim census table (ID plus coordinates) used during tuning.

    The input frame is left untouched. The returned frame has exactly three
    columns:

    * ``CENSUS_ID`` -- the string ``'CENSUS_'`` followed by ``OBJECTID.x``
    * ``CENSUS_X``  -- copied from the input
    * ``CENSUS_Y``  -- copied from the input, except that every value above
      1000 is overwritten with the fixed value 40.799935

    NOTE(review): the 1000 cut-off and the 40.799935 replacement look like a
    patch for corrupt coordinates -- confirm where these numbers come from.
    """
    kept_columns = ['CENSUS_ID', 'CENSUS_X', 'CENSUS_Y']

    # Derive the ID on a new frame so the caller's table is never modified.
    census_ids = 'CENSUS_' + census['OBJECTID.x'].astype(str)
    slim = census.assign(CENSUS_ID=census_ids)[kept_columns].copy()

    # Overwrite out-of-range Y values with the fixed fallback.
    y_out_of_range = slim['CENSUS_Y'] > 1000
    slim.loc[y_out_of_range, 'CENSUS_Y'] = 40.799935
    return slim

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
#cd_latlng
# Table read from cd_1880.csv; it is passed to dp.elastic_to_disamb and, as df_cd,
# to the tuning function. (Presumably the directory records with coordinates -- confirm.)
latlng = pd.read_csv("../data/cd_1880.csv")

#### Format data

In [5]:
# Combine the raw match export with the latlng table via the project helper in
# disambiguation.processing. The result is the table every later cell works on;
# the cells below read WARD_NUM and the score columns named in param_grid from it.
match = dp.elastic_to_disamb(elastic_match, latlng)

#### Create sample (a small number of Wards)

In [6]:
# def get_sample(df, number = 5):
#     wards = df.WARD_NUM.unique()
#     wards_selected = random.sample(range(min(wards),max(wards)+1), number)
#     return df[df.WARD_NUM.isin(wards_selected)]
    
# match_sample = get_sample(match, number = 1)

In [7]:
# #Print wards in sample for documentation notes
# print("Wards Included in Sample:",match_sample.WARD_NUM.unique())

In [6]:
#get sample based on historical context
# A fixed, hand-picked ward list (instead of the random draw in the commented-out
# cell above) keeps the sample identical on every run.
wards = [3,9,10,18,21,22]
# Keep only the rows of the match table whose WARD_NUM is one of the chosen wards.
match_sample = match[match.WARD_NUM.isin(wards)]

#### Confidence Score Tuning Functions

#### Note: The version used here does not incorporate the confidence score in the benchmark weights

In [7]:
#function to get confidence score including specified columns and weights
def confidence_score(df, columns, weights):
    """Return the weighted sum of the given columns for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the score components.
    columns : sequence of str
        Names of the numeric columns to combine.
    weights : sequence of float
        One weight per entry in ``columns``, in the same order.

    Returns
    -------
    list of float
        One value per row of ``df``, in row order:
        ``sum(df[col] * weight for col, weight in zip(columns, weights))``.
        Missing values (NaN) in any used column propagate to the result.

    Raises
    ------
    ValueError
        If ``columns`` and ``weights`` differ in length. Pairing them with
        ``zip`` would otherwise drop the surplus entries without any warning.
    KeyError
        If a name in ``columns`` is not a column of ``df``.
    """
    columns = list(columns)
    weights = list(weights)
    if len(columns) != len(weights):
        raise ValueError(
            "columns and weights must have the same length "
            "(got {} columns and {} weights)".format(len(columns), len(weights))
        )

    # Accumulate one column at a time: this is vectorised over the rows (no
    # per-row Python loop) while keeping the same left-to-right summation order
    # as a per-row sum, so the floating point results are unchanged.
    total = np.zeros(len(df), dtype=float)
    for col, weight in zip(columns, weights):
        total = total + np.asarray(df[col], dtype=float) * weight
    return total.tolist()

In [8]:
def confidence_score_tuning(param_grid, df_allcols, df_census, df_cd):
    """Run disambiguation and benchmarking once per confidence-score weighting.

    For every entry of ``param_grid`` a column ``confidence_score_<i>`` is added
    to a copy of ``df_allcols``. Each of these columns is then used in turn as
    the confidence measure of a ``Disambiguator1880`` run, and the outcome is
    scored with one shared ``Benchmark`` object.

    Parameters
    ----------
    param_grid : sequence of dict
        Each dict provides ``"columns"`` (column names of ``df_allcols``) and
        ``"weights"`` (one weight per column).
    df_allcols : pandas.DataFrame
        Match table containing every column named in ``param_grid``. It is not
        modified; the score columns are added to a copy.
    df_census : pandas.DataFrame
        Census table, passed to ``Benchmark`` and to ``census_for_disamb``.
    df_cd : pandas.DataFrame
        Passed to ``Benchmark`` unchanged.

    Returns
    -------
    dict
        Keyed by ``confidence_score_<i>``. Each value is a dict with the keys
        ``"columns"``, ``"weights"``, ``"Match Rate"``, ``"Address Success"``,
        ``"Under 12"`` and ``"confusion matrix"``. Grid entries whose
        disambiguation run raised an exception are reported on stdout and are
        absent from the dict.
    """
    df = df_allcols.copy()
    score_names = ["confidence_score_" + str(i) for i in range(len(param_grid))]

    #Get confidence score for each value in grid
    for name, params in zip(score_names, param_grid):
        df[name] = confidence_score(df_allcols, params["columns"], params["weights"])

    #Create benchmark object (built after all score columns exist on df)
    benchmark = Benchmark(df, df_census, df_cd)

    #Format census data for tuning
    census_tuning = census_for_disamb(df_census)

    #Store results
    results = {}
    for name, params in zip(score_names, param_grid):

        #Run disambiguation process (use betweenness and clustering -- based on Jolene's work)
        basic = disambiguation.Disambiguator1880(df, confidence=name)

        try:
            basic.run_disambiguation()
        except Exception as exc:
            # Best effort: one failing weighting must not abort the whole grid,
            # but the failure has to be visible. Catching Exception (not a bare
            # except) lets KeyboardInterrupt / SystemExit stop a long run.
            print("Skipping {}: run_disambiguation failed with {!r}".format(name, exc))
            continue

        result = basic.get_result() #.to_csv("..data/confidence_score_tuning/confidence_score_"+str(i))

        #Results analysis
        basic.merge_census_var(census_tuning)
        basic.set_var()

        #benchmarking
        benchmark.set_confidence(name)
        benchmark.set_disambiguated(result)
        benchmark.run_benchmarking()

        #Store results
        results[name] = {
            "columns": params["columns"],
            "weights": params["weights"],
            "Match Rate": basic.get_match_rate(),
            "Address Success": basic.get_addr_success(),
            "Under 12": basic.get_under12_selections(),
            "confusion matrix": benchmark.get_confusion_matrix(),
        }

    #Spit out the best columns and weights (Add this in when decide what makes something the best)
    #For now simply output the analysis
    return results
    

#### Run Confidence Score Tuning

In [17]:
#Columns and weights
# Each grid entry pairs a list of score columns with one weight per column.
ALL_FEATURES = ['jw_score', 'cd_count_inverse', 'census_count_inverse', 'occ_listed', 'age_score']

grid_spec = [
    (ALL_FEATURES, [0.5, 0.2, 0.2, 0.05, 0.05]),
    (ALL_FEATURES, [0.55, 0.18, 0.18, 0.05, 0.04]),
    (ALL_FEATURES, [0.7, 0.1, 0.1, 0.05, 0.05]),  #Best outcome
    (ALL_FEATURES, [0.6, 0.1, 0.1, 0.1, 0.1]),
    (['jw_score', 'occ_listed', 'age_score'], [0.8, 0.10, 0.10]),
    (['jw_score', 'census_count_inverse', 'occ_listed', 'age_score'], [0.6, 0.15, 0.1, 0.15]),
    (ALL_FEATURES, [0.8, 0.05, 0.05, 0.05, 0.05]),
    (ALL_FEATURES, [0.6, 0.15, 0.15, 0.05, 0.05]),
]

# list(...) gives every entry its own column list, so entries never share state.
param_grid = [
    {"columns": list(feature_names), "weights": feature_weights}
    for feature_names, feature_weights in grid_spec
]

In [20]:
# Long-running cell: one full disambiguation per grid entry on the ward sample.
# The logged "Total time" lines below show roughly 200-260 seconds per completed run.
tuning_results = confidence_score_tuning(param_grid, match_sample, census, latlng)

Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 15333
Reached: 0
Reached: 1000
Reached: 2000
Reached: 3000
Reached: 4000
Reached: 5000
Reached: 6000
Reached: 7000
Reached: 8000
Reached: 9000
Reached: 10000
Reached: 11000
Reached: 12000
Reached: 13000
Reached: 14000
Reached: 15000
Cleaning output (3/4)...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  final = pd.concat(sub_groups)


Disambiguating (4/4)...
Total time: 206.41312289237976
Done! :)
Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 15333
Reached: 0
Reached: 1000
Reached: 2000
Reached: 3000
Reached: 4000
Reached: 5000
Reached: 6000
Reached: 7000
Reached: 8000
Reached: 9000
Reached: 10000
Reached: 11000
Reached: 12000
Reached: 13000
Reached: 14000
Reached: 15000
Cleaning output (3/4)...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  final = pd.concat(sub_groups)


Disambiguating (4/4)...
Total time: 217.6718089580536
Done! :)
Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 15333
Reached: 0
Reached: 1000
Reached: 2000
Reached: 3000
Reached: 4000
Reached: 5000
Reached: 6000
Reached: 7000
Reached: 8000
Reached: 9000
Reached: 10000
Reached: 11000
Reached: 12000
Reached: 13000
Reached: 14000
Reached: 15000
Cleaning output (3/4)...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  final = pd.concat(sub_groups)


Disambiguating (4/4)...
Total time: 217.9502329826355
Done! :)
Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 0
Cleaning output (3/4)...
Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 26289
Reached: 0
Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 23671
Reached: 0
Reached: 1000
Reached: 2000
Reached: 3000
Reached: 4000
Reached: 5000
Reached: 6000
Reached: 7000
Reached: 8000
Reached: 9000
Reached: 10000
Reached: 11000
Reached: 12000
Reached: 13000
Reached: 14000
Reached: 15000
Reached: 16000
Reached: 17000
Reached: 18000
Reached: 19000
Reached: 20000
Reached: 21000
Reached: 22000
Reached: 23000
Cleaning output (3/4)...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  final = pd.concat(sub_groups)


Disambiguating (4/4)...
Total time: 257.7328791618347
Done! :)
Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 0
Cleaning output (3/4)...
Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 15333
Reached: 0
Reached: 1000
Reached: 2000
Reached: 3000
Reached: 4000
Reached: 5000
Reached: 6000
Reached: 7000
Reached: 8000
Reached: 9000
Reached: 10000
Reached: 11000
Reached: 12000
Reached: 13000
Reached: 14000
Reached: 15000
Cleaning output (3/4)...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  final = pd.concat(sub_groups)


Disambiguating (4/4)...
Total time: 210.45189690589905
Done! :)


In [21]:
# One column per grid entry that completed, one row per stored metric.
# Entries whose disambiguation run raised are missing from the table.
display(pd.DataFrame.from_dict(tuning_results))

Unnamed: 0,confidence_score_0,confidence_score_1,confidence_score_2,confidence_score_5,confidence_score_7
columns,"[jw_score, cd_count_inverse, census_count_inve...","[jw_score, cd_count_inverse, census_count_inve...","[jw_score, cd_count_inverse, census_count_inve...","[jw_score, census_count_inverse, occ_listed, a...","[jw_score, cd_count_inverse, census_count_inve..."
weights,"[0.5, 0.2, 0.2, 0.05, 0.05]","[0.55, 0.18, 0.18, 0.05, 0.04]","[0.7, 0.1, 0.1, 0.05, 0.05]","[0.6, 0.15, 0.1, 0.15]","[0.6, 0.15, 0.15, 0.05, 0.05]"
Match Rate,98.48,98.48,98.48,98.48,98.48
Address Success,"{'n_perfect_match_chosen': 8446, 'n_perfect_ma...","{'n_perfect_match_chosen': 8444, 'n_perfect_ma...","{'n_perfect_match_chosen': 8445, 'n_perfect_ma...","{'n_perfect_match_chosen': 8442, 'n_perfect_ma...","{'n_perfect_match_chosen': 8448, 'n_perfect_ma..."
Under 12,2.6,2.68,2.68,2.45,2.65
confusion matrix,"[[30643, 3775], [3775, 11334]]","[[30652, 3766], [3766, 11343]]","[[30653, 3765], [3765, 11344]]","[[30627, 3791], [3791, 11318]]","[[30648, 3770], [3770, 11339]]"


#### Explore the effect of not including the confidence score in the benchmark

Leaving the confidence score out of the benchmarking process changes the benchmark results. In general, more errors are reported, and the best weighting scheme becomes the one we used originally.
- To me, leaving the confidence score out of the benchmark makes more sense, but I am concerned: I don't understand why it was included in the first place, and there must have been a reason.

In [15]:
# Same call as the one that produced tuning_results, stored under a separate name.
# NOTE(review): the stored output below lists weights for confidence_score_3 that do
# not match the current param_grid[3], so it comes from an earlier state of the
# notebook -- restart the kernel and run all cells before comparing the two tables.
benchmark_test = confidence_score_tuning(param_grid, match_sample, census, latlng)

Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 15333
Reached: 0
Reached: 1000
Reached: 2000
Reached: 3000
Reached: 4000
Reached: 5000
Reached: 6000
Reached: 7000
Reached: 8000
Reached: 9000
Reached: 10000
Reached: 11000
Reached: 12000
Reached: 13000
Reached: 14000
Reached: 15000
Cleaning output (3/4)...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  final = pd.concat(sub_groups)


Disambiguating (4/4)...
Done! :)
Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 15333
Reached: 0
Reached: 1000
Reached: 2000
Reached: 3000
Reached: 4000
Reached: 5000
Reached: 6000
Reached: 7000
Reached: 8000
Reached: 9000
Reached: 10000
Reached: 11000
Reached: 12000
Reached: 13000
Reached: 14000
Reached: 15000
Cleaning output (3/4)...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  final = pd.concat(sub_groups)


Disambiguating (4/4)...
Done! :)
Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 15333
Reached: 0
Reached: 1000
Reached: 2000
Reached: 3000
Reached: 4000
Reached: 5000
Reached: 6000
Reached: 7000
Reached: 8000
Reached: 9000
Reached: 10000
Reached: 11000
Reached: 12000
Reached: 13000
Reached: 14000
Reached: 15000
Cleaning output (3/4)...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  final = pd.concat(sub_groups)


Disambiguating (4/4)...
Done! :)
Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 15333
Reached: 0
Reached: 1000
Reached: 2000
Reached: 3000
Reached: 4000
Reached: 5000
Reached: 6000
Reached: 7000
Reached: 8000
Reached: 9000
Reached: 10000
Reached: 11000
Reached: 12000
Reached: 13000
Reached: 14000
Reached: 15000
Cleaning output (3/4)...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  final = pd.concat(sub_groups)


Disambiguating (4/4)...
Done! :)


In [16]:
# Same layout as the tuning_results table: one column per completed grid entry,
# one row per stored metric.
display(pd.DataFrame.from_dict(benchmark_test))

Unnamed: 0,confidence_score_0,confidence_score_1,confidence_score_2,confidence_score_3
columns,"[jw_score, cd_count_inverse, census_count_inve...","[jw_score, cd_count_inverse, census_count_inve...","[jw_score, cd_count_inverse, census_count_inve...","[jw_score, cd_count_inverse, census_count_inve..."
weights,"[0.5, 0.2, 0.2, 0.05, 0.05]","[0.55, 0.18, 0.18, 0.05, 0.04]","[0.7, 0.1, 0.1, 0.05, 0.05]","[0.6, 0.15, 0.15, 0.05, 0.05]"
Match Rate,98.48,98.48,98.48,98.48
Address Success,"{'n_perfect_match_chosen': 8434, 'n_perfect_ma...","{'n_perfect_match_chosen': 8429, 'n_perfect_ma...","{'n_perfect_match_chosen': 8428, 'n_perfect_ma...","{'n_perfect_match_chosen': 8431, 'n_perfect_ma..."
Under 12,2.6,2.66,2.7,2.63
confusion matrix,"[[30536, 3882], [3882, 11227]]","[[30533, 3885], [3885, 11224]]","[[30517, 3901], [3901, 11208]]","[[30531, 3887], [3887, 11222]]"
