In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

%load_ext autoreload
%autoreload 2
import pandas as pd
import networkx as nx
import numpy as np
import disambiguation
from disambiguation import Disambiguator, Disambiguator1880
import disambiguation.analysis as da
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import random
import re
import disambiguation.processing as dp 
from disambiguation import Benchmark

#### Get and Format Data

In [2]:
#elastic_match = pd.read_csv("../../Data/matches.csv")
elastic_match = pd.read_csv("../data/es-1880-21-5-2020.csv", sep='\t', engine='python')

In [3]:
elastic_match.columns

Index(['OBJECTID.x', 'CENSUS_NAMEFRSTB', 'CENSUS_NAMELASTB', 'CENSUS_AGE',
       'CENSUS_OCCLABELB', 'CENSUS_MATCH_ADDR', 'CENSUS_SEGMENT_ID',
       'WARD_NUM', 'CD_ED', 'OBJECTID', 'MATCH_ADDR', 'CD_FIRST_NAME',
       'CD_LAST_NAME', 'CD_OCCUPATION', 'CD_FINAL_HOUSENUM'],
      dtype='object')

In [3]:
census = pd.read_csv("../data/census_1880_mn_v04.csv")
def census_for_disamb(census, max_valid_y=1000, fallback_y=40.799935):
    """Build the three-column census frame used when analysing disambiguation results.

    Parameters
    ----------
    census : pandas.DataFrame
        Must contain the columns 'OBJECTID.x', 'CENSUS_X' and 'CENSUS_Y'.
        The frame is not modified.
    max_valid_y : float, optional
        Any CENSUS_Y strictly greater than this is treated as invalid and
        overwritten. Defaults to 1000 (the value previously hard-coded).
    fallback_y : float, optional
        Replacement written into the invalid CENSUS_Y cells. Defaults to
        40.799935 (the value previously hard-coded).
        NOTE(review): presumably a representative latitude for records whose
        CENSUS_Y is not a latitude at all -- confirm where this value comes from.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ['CENSUS_ID', 'CENSUS_X', 'CENSUS_Y'],
        where CENSUS_ID is 'CENSUS_' followed by the string form of 'OBJECTID.x'.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``census``.
    """
    # Select the two coordinate columns first and copy only those, instead of
    # duplicating the whole census table just to discard most of it.
    census_latlng_tuning = census.loc[:, ['CENSUS_X', 'CENSUS_Y']].copy()
    census_latlng_tuning.insert(0, 'CENSUS_ID', 'CENSUS_' + census['OBJECTID.x'].astype(str))

    # Overwrite out-of-range Y values; NaN compares False and is left untouched.
    out_of_range = census_latlng_tuning['CENSUS_Y'] > max_valid_y
    census_latlng_tuning.loc[out_of_range, 'CENSUS_Y'] = fallback_y
    return census_latlng_tuning

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
#cd_latlng
latlng = pd.read_csv("../data/cd_1880.csv")

#### Format data

In [5]:
match = dp.elastic_to_disamb(elastic_match, latlng)

#### Create sample (a small number of Wards)

In [6]:
# def get_sample(df, number = 5):
#     wards = df.WARD_NUM.unique()
#     wards_selected = random.sample(range(min(wards),max(wards)+1), number)
#     return df[df.WARD_NUM.isin(wards_selected)]
    
# match_sample = get_sample(match, number = 1)

In [7]:
# #Print wards in sample for documentation notes
# print("Wards Included in Sample:",match_sample.WARD_NUM.unique())

In [8]:
# Build the working sample from a fixed, hand-picked list of wards
# (chosen based on historical context rather than drawn at random).
wards = [3,9,10,18,21,22]
# Keep only the match rows whose WARD_NUM is one of the selected wards.
match_sample = match[match.WARD_NUM.isin(wards)]

#### Confidence Score Tuning Functions

#### Notes:
- Start with the Elasticsearch output, with added columns relevant for the confidence score
- Goal: set this up so it is easy to reuse later if we add more inputs to the confidence score, experiment with different string measures, etc.
- Once functions are finalized, put them into a script for easy access

In [6]:
def confidence_score(df, columns, weights):
    """Return the weighted sum of the given columns for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the score inputs. Columns not named in ``columns`` are ignored.
    columns : sequence of str
        Names of the columns to combine.
    weights : sequence of float
        Weight applied to each column, position by position.

    Returns
    -------
    list
        One value per row of ``df``, in row order:
        ``sum(df[col] * w for col, w in zip(columns, weights))``.

    Raises
    ------
    ValueError
        If ``columns`` and ``weights`` differ in length. Previously the extra
        entries were silently dropped by ``zip``, giving a wrong score.
    KeyError
        If a name in ``columns`` is not a column of ``df``.
    """
    columns = list(columns)
    weights = list(weights)
    if len(columns) != len(weights):
        raise ValueError(
            "columns and weights must have the same length "
            "(got %d columns and %d weights)" % (len(columns), len(weights))
        )
    if not columns:
        # Nothing to combine: every row sums to 0, matching sum() over no terms.
        return [0] * len(df)

    # Work on whole columns instead of iterating row by row (iterrows builds a
    # Series per row and is very slow on large frames). Terms are added left to
    # right so each row sees the same sequence of additions as a per-row sum().
    total = 0
    for col, w in zip(columns, weights):
        total = total + df[col] * w
    return total.tolist()

In [7]:
def confidence_score_tuning(param_grid, df_allcols, df_census, df_cd):
    """Run disambiguation and benchmarking once per weighting scheme.

    Parameters
    ----------
    param_grid : list of dict
        One dict per scheme, each with a "columns" entry and a "weights" entry;
        both are passed straight to ``confidence_score``.
    df_allcols : pandas.DataFrame
        Match frame holding the score input columns. It is copied; the score
        columns are added to the copy only.
    df_census
        Census data, handed to ``Benchmark`` and to ``census_for_disamb``.
    df_cd
        City-directory data, handed to ``Benchmark``.

    Returns
    -------
    dict
        Keyed by "confidence_score_<i>" (i is the scheme's position in
        ``param_grid``). Each value is a dict with the scheme's "columns" and
        "weights" plus "Match Rate", "Address Success", "Under 12" and
        "confusion matrix" as reported by the disambiguator and the benchmark.

    Notes
    -----
    There is no error handling here: if any scheme raises, the exception
    propagates and the results gathered so far are not returned.
    Choosing a "best" scheme is left to the caller; this only reports the analysis.
    """
    scored = df_allcols.copy()
    score_names = ["confidence_score_" + str(i) for i in range(len(param_grid))]

    # Add every scheme's score column up front, before the frame is handed to
    # the benchmark or to any disambiguator.
    for score_name, scheme in zip(score_names, param_grid):
        scored.loc[:, score_name] = confidence_score(
            df_allcols, scheme["columns"], scheme["weights"]
        )

    # One benchmark object is shared by all schemes.
    benchmark = Benchmark(scored, df_census, df_cd)

    # Census frame in the shape expected by merge_census_var.
    census_tuning = census_for_disamb(df_census)

    results = {}
    for score_name, scheme in zip(score_names, param_grid):
        # Disambiguate using this scheme's score column as the confidence.
        # (Original note: uses betweenness and clustering, based on Jolene's work.)
        basic = disambiguation.Disambiguator1880(scored, confidence=score_name)
        basic.run_disambiguation()
        result = basic.get_result()

        # Attach the census variables needed by the result-analysis getters below.
        basic.merge_census_var(census_tuning)
        basic.set_var()

        # Benchmark this scheme's disambiguated output.
        benchmark.set_confidence(score_name)
        benchmark.set_disambiguated(result)
        benchmark.run_benchmarking()

        results[score_name] = {
            "columns": scheme["columns"],
            "weights": scheme["weights"],
            "Match Rate": basic.get_match_rate(),
            "Address Success": basic.get_addr_success(),
            "Under 12": basic.get_under12_selections(),
            "confusion matrix": benchmark.get_confusion_matrix(),
        }

    return results
    

#### Run Confidence Score Tuning

In [11]:
# Grid of confidence-score weighting schemes to compare.
# Each entry names the input columns and gives the weight applied to each
# column, position by position. Every scheme here uses the same five columns
# and weights that sum to 1; they differ mainly in how much weight goes to
# jw_score versus the two count-inverse columns.
param_grid = [{"columns": ['jw_score','cd_count_inverse','census_count_inverse', 'occ_listed', 'age_score'], "weights":[0.5,0.2,0.2,0.05,0.05]},
              {"columns": ['jw_score','cd_count_inverse','census_count_inverse', 'occ_listed', 'age_score'], "weights":[0.55,0.18,0.18,0.05,0.04]},
             {"columns": ['jw_score','cd_count_inverse','census_count_inverse', 'occ_listed', 'age_score'], "weights":[0.7,0.1,0.1,0.05,0.05]},
             #{"columns": ['jw_score','cd_count_inverse','census_count_inverse', 'occ_listed', 'age_score'], "weights":[0.6,0.1,0.1,0.1,0.1]}
             {"columns": ['jw_score','cd_count_inverse','census_count_inverse', 'occ_listed', 'age_score'], "weights":[0.6,0.15,0.15,0.05,0.05]}]

In [12]:
tuning_results = confidence_score_tuning(param_grid, match_sample, census, latlng)

Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 15333
Reached: 0
Reached: 1000
Reached: 2000
Reached: 3000
Reached: 4000
Reached: 5000
Reached: 6000
Reached: 7000
Reached: 8000
Reached: 9000
Reached: 10000
Reached: 11000
Reached: 12000
Reached: 13000
Reached: 14000
Reached: 15000
Cleaning output (3/4)...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  final = pd.concat(sub_groups)


Disambiguating (4/4)...
Done! :)
Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 15333
Reached: 0
Reached: 1000
Reached: 2000
Reached: 3000
Reached: 4000
Reached: 5000
Reached: 6000
Reached: 7000
Reached: 8000
Reached: 9000
Reached: 10000
Reached: 11000
Reached: 12000
Reached: 13000
Reached: 14000
Reached: 15000
Cleaning output (3/4)...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  final = pd.concat(sub_groups)


Disambiguating (4/4)...
Done! :)
Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 15333
Reached: 0
Reached: 1000
Reached: 2000
Reached: 3000
Reached: 4000
Reached: 5000
Reached: 6000
Reached: 7000
Reached: 8000
Reached: 9000
Reached: 10000
Reached: 11000
Reached: 12000
Reached: 13000
Reached: 14000
Reached: 15000
Cleaning output (3/4)...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  final = pd.concat(sub_groups)


Disambiguating (4/4)...
Done! :)
Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 15333
Reached: 0
Reached: 1000
Reached: 2000
Reached: 3000
Reached: 4000
Reached: 5000
Reached: 6000
Reached: 7000
Reached: 8000
Reached: 9000
Reached: 10000
Reached: 11000
Reached: 12000
Reached: 13000
Reached: 14000
Reached: 15000
Cleaning output (3/4)...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  final = pd.concat(sub_groups)


Disambiguating (4/4)...
Done! :)


In [14]:
display(pd.DataFrame.from_dict(tuning_results))

Unnamed: 0,confidence_score_0,confidence_score_1,confidence_score_2,confidence_score_3
columns,"[jw_score, cd_count_inverse, census_count_inve...","[jw_score, cd_count_inverse, census_count_inve...","[jw_score, cd_count_inverse, census_count_inve...","[jw_score, cd_count_inverse, census_count_inve..."
weights,"[0.5, 0.2, 0.2, 0.05, 0.05]","[0.55, 0.18, 0.18, 0.05, 0.04]","[0.7, 0.1, 0.1, 0.05, 0.05]","[0.6, 0.15, 0.15, 0.05, 0.05]"
Match Rate,98.48,98.48,98.48,98.48
Address Success,"{'n_perfect_match_chosen': 8446, 'n_perfect_ma...","{'n_perfect_match_chosen': 8438, 'n_perfect_ma...","{'n_perfect_match_chosen': 8436, 'n_perfect_ma...","{'n_perfect_match_chosen': 8440, 'n_perfect_ma..."
Under 12,2.6,2.66,2.71,2.64
confusion matrix,"[[33268, 1150], [1150, 13959]]","[[33264, 1154], [1154, 13955]]","[[33260, 1158], [1158, 13951]]","[[33279, 1139], [1139, 13970]]"


#### Explore the effect of not including the confidence score in the benchmark

Not including the confidence score in the benchmarking process means changing the results of the benchmarking process. Generally, more errors are indicated, and the best weighting scheme becomes what we used originally.
- To me, not including the confidence score in the benchmark makes more sense, but I'm concerned because I don't understand why it was included in the first place, and there must have been a reason.

In [15]:
benchmark_test = confidence_score_tuning(param_grid, match_sample, census, latlng)

Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 15333
Reached: 0
Reached: 1000
Reached: 2000
Reached: 3000
Reached: 4000
Reached: 5000
Reached: 6000
Reached: 7000
Reached: 8000
Reached: 9000
Reached: 10000
Reached: 11000
Reached: 12000
Reached: 13000
Reached: 14000
Reached: 15000
Cleaning output (3/4)...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  final = pd.concat(sub_groups)


Disambiguating (4/4)...
Done! :)
Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 15333
Reached: 0
Reached: 1000
Reached: 2000
Reached: 3000
Reached: 4000
Reached: 5000
Reached: 6000
Reached: 7000
Reached: 8000
Reached: 9000
Reached: 10000
Reached: 11000
Reached: 12000
Reached: 13000
Reached: 14000
Reached: 15000
Cleaning output (3/4)...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  final = pd.concat(sub_groups)


Disambiguating (4/4)...
Done! :)
Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 15333
Reached: 0
Reached: 1000
Reached: 2000
Reached: 3000
Reached: 4000
Reached: 5000
Reached: 6000
Reached: 7000
Reached: 8000
Reached: 9000
Reached: 10000
Reached: 11000
Reached: 12000
Reached: 13000
Reached: 14000
Reached: 15000
Cleaning output (3/4)...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  final = pd.concat(sub_groups)


Disambiguating (4/4)...
Done! :)
Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 15333
Reached: 0
Reached: 1000
Reached: 2000
Reached: 3000
Reached: 4000
Reached: 5000
Reached: 6000
Reached: 7000
Reached: 8000
Reached: 9000
Reached: 10000
Reached: 11000
Reached: 12000
Reached: 13000
Reached: 14000
Reached: 15000
Cleaning output (3/4)...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  final = pd.concat(sub_groups)


Disambiguating (4/4)...
Done! :)


In [16]:
display(pd.DataFrame.from_dict(benchmark_test))

Unnamed: 0,confidence_score_0,confidence_score_1,confidence_score_2,confidence_score_3
columns,"[jw_score, cd_count_inverse, census_count_inve...","[jw_score, cd_count_inverse, census_count_inve...","[jw_score, cd_count_inverse, census_count_inve...","[jw_score, cd_count_inverse, census_count_inve..."
weights,"[0.5, 0.2, 0.2, 0.05, 0.05]","[0.55, 0.18, 0.18, 0.05, 0.04]","[0.7, 0.1, 0.1, 0.05, 0.05]","[0.6, 0.15, 0.15, 0.05, 0.05]"
Match Rate,98.48,98.48,98.48,98.48
Address Success,"{'n_perfect_match_chosen': 8434, 'n_perfect_ma...","{'n_perfect_match_chosen': 8429, 'n_perfect_ma...","{'n_perfect_match_chosen': 8428, 'n_perfect_ma...","{'n_perfect_match_chosen': 8431, 'n_perfect_ma..."
Under 12,2.6,2.66,2.7,2.63
confusion matrix,"[[30536, 3882], [3882, 11227]]","[[30533, 3885], [3885, 11224]]","[[30517, 3901], [3901, 11208]]","[[30531, 3887], [3887, 11222]]"


#### Tune Focusing on Weighting the inverses less

In [9]:
#Columns and weights
param_grid = [#{"columns": ['jw_score','cd_count_inverse', 'occ_listed', 'age_score'], "weights":[0.6,0.15,0.1,0.15]},
             {"columns": ['jw_score', 'occ_listed', 'age_score'], "weights":[0.8,0.10,0.10]},
             {"columns": ['jw_score','census_count_inverse', 'occ_listed', 'age_score'], "weights":[0.6,0.15,0.1,0.15]},
             {"columns": ['jw_score','cd_count_inverse','census_count_inverse', 'occ_listed', 'age_score'], "weights":[0.8,0.05,0.05,0.05,0.05]}]

In [22]:
tuning_results_inverses = confidence_score_tuning(param_grid, match_sample, census, latlng)

Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 26289
Reached: 0
Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 23671
Reached: 0
Reached: 1000
Reached: 2000
Reached: 3000
Reached: 4000
Reached: 5000
Reached: 6000
Reached: 7000
Reached: 8000
Reached: 9000
Reached: 10000
Reached: 11000
Reached: 12000
Reached: 13000
Reached: 14000
Reached: 15000
Reached: 16000
Reached: 17000
Reached: 18000
Reached: 19000
Reached: 20000
Reached: 21000
Reached: 22000
Reached: 23000
Cleaning output (3/4)...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  final = pd.concat(sub_groups)


Disambiguating (4/4)...
Done! :)


In [23]:
display(pd.DataFrame.from_dict(tuning_results_inverses))

Unnamed: 0,confidence_score_1
Address Success,"{'n_perfect_match_chosen': 8431, 'n_perfect_ma..."
Match Rate,98.48
Under 12,2.53
columns,"[jw_score, census_count_inverse, occ_listed, a..."
confusion matrix,"[[33097, 1321], [1321, 13788]]"
weights,"[0.6, 0.15, 0.1, 0.15]"


#### Explore issues with the disambiguation process at some weighting schemes

In [32]:
debugging = confidence_score_tuning(param_grid, match_sample, census, latlng)

Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 26289
Reached: 0


Unnamed: 0,LONG,LAT,letter
0,-73.986717,40.734097,N0
1,-73.98673,40.734094,N1
2,-73.986717,40.734097,N1
3,-73.98673,40.734094,N2


Index(['OBJECTID.x', 'CENSUS_NAMEFRSTB', 'CENSUS_NAMELASTB', 'CENSUS_AGE',
       'CENSUS_OCCLABELB', 'CENSUS_MATCH_ADDR', 'CENSUS_SEGMENT_ID',
       'WARD_NUM', 'CD_ED', 'OBJECTID', 'MATCH_ADDR', 'CD_FIRST_NAME',
       'CD_LAST_NAME', 'CD_OCCUPATION', 'CD_FINAL_HOUSENUM', 'jw_fn', 'jw_ln',
       'jw_score', 'occ_listed', 'age_score', 'cd_count', 'census_count',
       'confidence_score', 'CD_ID', 'CENSUS_ID', 'census_count_inverse',
       'cd_count_inverse', 'LONG', 'LAT', 'confidence_score_0',
       'confidence_score_1', 'confidence_score_2', 'anchor', 'group_ID',
       'node_ID', 'letter', 'in_cluster'],
      dtype='object')
post merge Index(['OBJECTID.x_x', 'CENSUS_NAMEFRSTB_x', 'CENSUS_NAMELASTB_x',
       'CENSUS_AGE_x', 'CENSUS_OCCLABELB_x', 'CENSUS_MATCH_ADDR_x',
       'CENSUS_SEGMENT_ID_x', 'WARD_NUM_x', 'CD_ED_x', 'OBJECTID_x',
       'MATCH_ADDR_x', 'CD_FIRST_NAME_x', 'CD_LAST_NAME_x', 'CD_OCCUPATION_x',
       'CD_FINAL_HOUSENUM_x', 'jw_fn_x', 'jw_ln_x', 'jw_score_x

Unnamed: 0,LONG_x,LONG_y,LAT_x,LAT_y,letter_x,letter_y
0,-73.986717,-73.986717,40.734097,40.734097,N0,N0
1,-73.986717,-73.98673,40.734097,40.734094,N0,N1
2,-73.986717,-73.986717,40.734097,40.734097,N0,N1
3,-73.986717,-73.98673,40.734097,40.734094,N0,N2
4,-73.98673,-73.986717,40.734094,40.734097,N1,N0


Unnamed: 0,LONG_x,LONG_y,LAT_x,LAT_y,letter_x,letter_y
1,-73.986717,-73.98673,40.734097,40.734094,N0,N1
2,-73.986717,-73.986717,40.734097,40.734097,N0,N1
7,-73.98673,-73.98673,40.734094,40.734094,N1,N2
11,-73.986717,-73.98673,40.734097,40.734094,N1,N2


Index(['OBJECTID.x_x', 'CENSUS_NAMEFRSTB_x', 'CENSUS_NAMELASTB_x',
       'CENSUS_AGE_x', 'CENSUS_OCCLABELB_x', 'CENSUS_MATCH_ADDR_x',
       'CENSUS_SEGMENT_ID_x', 'WARD_NUM_x', 'CD_ED_x', 'OBJECTID_x',
       'MATCH_ADDR_x', 'CD_FIRST_NAME_x', 'CD_LAST_NAME_x', 'CD_OCCUPATION_x',
       'CD_FINAL_HOUSENUM_x', 'jw_fn_x', 'jw_ln_x', 'jw_score_x',
       'occ_listed_x', 'age_score_x', 'cd_count_x', 'census_count_x',
       'confidence_score_x', 'CD_ID_x', 'CENSUS_ID_x',
       'census_count_inverse_x', 'cd_count_inverse_x', 'LONG_x', 'LAT_x',
       'confidence_score_0_x', 'confidence_score_1_x', 'confidence_score_2_x',
       'anchor_x', 'group_ID_x', 'node_ID_x', 'letter_x', 'in_cluster_x',
       'key', 'OBJECTID.x_y', 'CENSUS_NAMEFRSTB_y', 'CENSUS_NAMELASTB_y',
       'CENSUS_AGE_y', 'CENSUS_OCCLABELB_y', 'CENSUS_MATCH_ADDR_y',
       'CENSUS_SEGMENT_ID_y', 'WARD_NUM_y', 'CD_ED_y', 'OBJECTID_y',
       'MATCH_ADDR_y', 'CD_FIRST_NAME_y', 'CD_LAST_NAME_y', 'CD_OCCUPATION_y',
       'C

Unnamed: 0,LONG,LAT,letter
0,-73.98673,40.734094,N0
1,-73.986717,40.734097,N0


Index(['OBJECTID.x', 'CENSUS_NAMEFRSTB', 'CENSUS_NAMELASTB', 'CENSUS_AGE',
       'CENSUS_OCCLABELB', 'CENSUS_MATCH_ADDR', 'CENSUS_SEGMENT_ID',
       'WARD_NUM', 'CD_ED', 'OBJECTID', 'MATCH_ADDR', 'CD_FIRST_NAME',
       'CD_LAST_NAME', 'CD_OCCUPATION', 'CD_FINAL_HOUSENUM', 'jw_fn', 'jw_ln',
       'jw_score', 'occ_listed', 'age_score', 'cd_count', 'census_count',
       'confidence_score', 'CD_ID', 'CENSUS_ID', 'census_count_inverse',
       'cd_count_inverse', 'LONG', 'LAT', 'confidence_score_0',
       'confidence_score_1', 'confidence_score_2', 'anchor', 'group_ID',
       'node_ID', 'letter', 'in_cluster'],
      dtype='object')
post merge Index(['OBJECTID.x_x', 'CENSUS_NAMEFRSTB_x', 'CENSUS_NAMELASTB_x',
       'CENSUS_AGE_x', 'CENSUS_OCCLABELB_x', 'CENSUS_MATCH_ADDR_x',
       'CENSUS_SEGMENT_ID_x', 'WARD_NUM_x', 'CD_ED_x', 'OBJECTID_x',
       'MATCH_ADDR_x', 'CD_FIRST_NAME_x', 'CD_LAST_NAME_x', 'CD_OCCUPATION_x',
       'CD_FINAL_HOUSENUM_x', 'jw_fn_x', 'jw_ln_x', 'jw_score_x

Unnamed: 0,LONG_x,LONG_y,LAT_x,LAT_y,letter_x,letter_y
0,-73.98673,-73.98673,40.734094,40.734094,N0,N0
1,-73.98673,-73.986717,40.734094,40.734097,N0,N0
2,-73.986717,-73.98673,40.734097,40.734094,N0,N0
3,-73.986717,-73.986717,40.734097,40.734097,N0,N0


Unnamed: 0,LONG_x,LONG_y,LAT_x,LAT_y,letter_x,letter_y


Index(['OBJECTID.x_x', 'CENSUS_NAMEFRSTB_x', 'CENSUS_NAMELASTB_x',
       'CENSUS_AGE_x', 'CENSUS_OCCLABELB_x', 'CENSUS_MATCH_ADDR_x',
       'CENSUS_SEGMENT_ID_x', 'WARD_NUM_x', 'CD_ED_x', 'OBJECTID_x',
       'MATCH_ADDR_x', 'CD_FIRST_NAME_x', 'CD_LAST_NAME_x', 'CD_OCCUPATION_x',
       'CD_FINAL_HOUSENUM_x', 'jw_fn_x', 'jw_ln_x', 'jw_score_x',
       'occ_listed_x', 'age_score_x', 'cd_count_x', 'census_count_x',
       'confidence_score_x', 'CD_ID_x', 'CENSUS_ID_x',
       'census_count_inverse_x', 'cd_count_inverse_x', 'LONG_x', 'LAT_x',
       'confidence_score_0_x', 'confidence_score_1_x', 'confidence_score_2_x',
       'anchor_x', 'group_ID_x', 'node_ID_x', 'letter_x', 'in_cluster_x',
       'key', 'OBJECTID.x_y', 'CENSUS_NAMEFRSTB_y', 'CENSUS_NAMELASTB_y',
       'CENSUS_AGE_y', 'CENSUS_OCCLABELB_y', 'CENSUS_MATCH_ADDR_y',
       'CENSUS_SEGMENT_ID_y', 'WARD_NUM_y', 'CD_ED_y', 'OBJECTID_y',
       'MATCH_ADDR_y', 'CD_FIRST_NAME_y', 'CD_LAST_NAME_y', 'CD_OCCUPATION_y',
       'C

ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series

In [10]:
debugging = confidence_score_tuning(param_grid, match, census, latlng)

Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 78742
Reached: 0


ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series

This issue is not caused by sampling since it occurs over the entire dataset too.

In [34]:
param_grid = [{"columns": ['jw_score','cd_count_inverse','census_count_inverse', 'occ_listed', 'age_score'], "weights":[0.6,0.1,0.1,0.1,0.1]}]
debugging = confidence_score_tuning(param_grid, match_sample, census, latlng)

Running
Creating dictionary of sub dfs (1/4)...


Unnamed: 0,OBJECTID.x,CENSUS_NAMEFRSTB,CENSUS_NAMELASTB,CENSUS_AGE,CENSUS_OCCLABELB,CENSUS_MATCH_ADDR,CENSUS_SEGMENT_ID,WARD_NUM,CD_ED,OBJECTID,...,cd_count,census_count,confidence_score,CD_ID,CENSUS_ID,census_count_inverse,cd_count_inverse,LONG,LAT,confidence_score_0
0,862548,STEPHEN,ZELLER,40,IRON MOULDER,"504 55TH ST W, NYC-Manhattan, NY",3789,22,513.0,3,...,1,1,0.95,CD_3,CENSUS_862548,1.0,1.0,-73.989856,40.767868,0.9424
1,795510,OSCAR,ZOLLIKOFFER,70,"PRESIDENT, METROPOLITAN GAS CO.","210 46TH ST W, NYC-Manhattan, NY",3357,22,469.0,4,...,1,1,1.0,CD_4,CENSUS_795510,1.0,1.0,-73.985856,40.758819,1.0
6,330206,HERMAN,ZOPFS,32,"WAITER, RESTAURANT","51 ALLEN ST, NYC-Manhattan, NY",976,10,199.0,15,...,1,1,1.0,CD_15,CENSUS_330206,1.0,1.0,-73.991892,40.716711,1.0
7,855850,MICHAEL,ZOPPE,23,ENGINEER,"442 53RD ST W, NYC-Manhattan, NY",3681,22,508.0,16,...,1,1,1.0,CD_16,CENSUS_855850,1.0,1.0,-73.989711,40.766168,1.0
8,796365,RICKA,ZORKOWSKIE,42,KEEPING HOUSE,"253 44TH ST W, NYC-Manhattan, NY",3293,22,470.0,18,...,1,1,0.97,CD_18,CENSUS_796365,1.0,1.0,-73.988171,40.758155,0.9604


Applying algorithms iteratively (2/4)...
Number of Subgraphs: 0
Cleaning output (3/4)...


ValueError: No objects to concatenate

Here the issue is that there are no subgraphs; apparently we don't have any anchor points with this weighting scheme. So let's try simply using the entire graph when there are no subgraphs.
- I altered the disambiguation file to test this

In [None]:
param_grid = [{"columns": ['jw_score','cd_count_inverse','census_count_inverse', 'occ_listed', 'age_score'], "weights":[0.6,0.1,0.1,0.1,0.1]}]
debugging = confidence_score_tuning(param_grid, match_sample, census, latlng)

Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 0
Reached: 0


The kernel died -- I think this approach is simply too computationally expensive, even with the sample -- but it may not be an issue over the whole dataset?

In [12]:
param_grid = [{"columns": ['jw_score','cd_count_inverse','census_count_inverse', 'occ_listed', 'age_score'], "weights":[0.6,0.1,0.1,0.1,0.1]}]
debugging = confidence_score_tuning(param_grid, match, census, latlng)

Running
Creating dictionary of sub dfs (1/4)...
Applying algorithms iteratively (2/4)...
Number of Subgraphs: 0
Cleaning output (3/4)...


ValueError: No objects to concatenate

This issue occurs over the entire dataset -- so changing the sample won't help. I think at the moment the best approach would simply be not using weighting schemes that cause this error.