In [16]:
import pandas as pd
from pyjarowinkler import distance

### The code below demonstrates how to obtain a confidence score for the results of the Elasticsearch-based matching process (Amogh Mishar's implementation)

This demonstration uses a dummy file, `test.csv`, which has the same column names, in the same order, as the actual results file.

***To run on real results, just replace 'test.csv' with the correct name of the results file, and make sure that file is in the same directory as this notebook***

In [17]:
# Read the match results into a DataFrame; fields in the file are tab-separated.
df_results = pd.read_csv("test.csv", delimiter="\t")

In [18]:
# Preview the first five rows of the loaded results.
df_results.head(n=5)

Unnamed: 0,OBJECTID.x,CENSUS_NAMEFRSTB,CENSUS_NAMELASTB,CENSUS_AGE,CENSUS_OCCLABELB,CENSUS_MATCH_ADDR,CENSUS_SEGMENT_ID,WARD_NUM,CD_ED,OBJECTID,MATCH_ADDR,CD_FIRST_NAME,CD_LAST_NAME,CD_OCCUPATION,CD_FINAL_HOUSENUM
0,522583,LEOPOLD,ZOLLMANN,30,SHOE MAKER,"342 HOUSTON ST E, NYC-Manhattan, NY",1283,11,305.0,13,"344 E HOUSTON ST, New York, NY",Leopold,Zollmann,shoes,344
1,330206,HERMAN,ZOPFS,32,"WAITER, RESTAURANT","51 ALLEN ST, NYC-Manhattan, NY",976,10,199.0,15,"51 ALLEN ST, New York, NY",Herman,Zopfs,waiter,51
2,313477,JACOB,ZORN,12,APPRENTICE SHOE MAKER,"57 FORSYTH ST, NYC-Manhattan, NY",915,10,193.0,21,"57 FORSYTH ST, New York, NY",Jacob,Zorn,barber,57
3,313482,JACOB,ZORN,45,BARBER,"57 FORSYTH ST, NYC-Manhattan, NY",915,10,193.0,21,"57 FORSYTH ST, New York, NY",Jacob,Zorn,barber,57
4,369292,JOHN,ZORN,21,TRUSSES,"217 BOWERY ST, NYC-Manhattan, NY",1384,17,216.0,24,"217 BOWERY ST, New York, NY",John,Zorn,trussmkr,217


In [19]:
def _first_name_similarity(row):
    """Return the Jaro-Winkler score of a row's CD_FIRST_NAME vs. CENSUS_NAMEFRSTB."""
    return distance.get_jaro_distance(
        row["CD_FIRST_NAME"],
        row["CENSUS_NAMEFRSTB"],
        winkler=True,
        scaling=0.1,
    )


# Score every candidate pair row by row and store the result as a new column.
df_results["jaro_distance_firstname"] = df_results.apply(_first_name_similarity, axis=1)

In [20]:
def _last_name_similarity(row):
    """Return the Jaro-Winkler score of a row's CD_LAST_NAME vs. CENSUS_NAMELASTB."""
    return distance.get_jaro_distance(
        row["CD_LAST_NAME"],
        row["CENSUS_NAMELASTB"],
        winkler=True,
        scaling=0.1,
    )


# Score every candidate pair row by row and store the result as a new column.
df_results["jaro_distance_lastname"] = df_results.apply(_last_name_similarity, axis=1)

In [21]:
# Count the potential matches attached to each CD record (keyed by OBJECTID).
# count() tallies the non-null OBJECTID.x values per group; the total is
# exposed as the `num_matches` column.
counts = (
    df_results
    .groupby("OBJECTID")["OBJECTID.x"]
    .count()
    .rename("num_matches")
    .reset_index()
)

In [22]:
# Attach the per-record match count to every candidate row via the shared OBJECTID key.
df_results_ext = pd.merge(df_results, counts, how="inner", on="OBJECTID")

In [23]:
# Flag whether the census row lists an occupation: 0 when the value is missing
# (NaN/None), an empty string, or the '*' placeholder, and 1 for any other entry,
# since any other entry means the occupation was not left blank (as Dan had suggested).
# Missing values have to be detected with isna(): NaN is truthy in Python, so a
# plain `not x` check would count a missing occupation as listed.
census_occupation = df_results_ext["CENSUS_OCCLABELB"]
occupation_missing = census_occupation.isna() | census_occupation.isin(["", "*"])
df_results_ext["census_occupation_listed"] = (~occupation_missing).astype("int64")

In [24]:
# Combine the two name scores into one, weighting the last name (0.6)
# more heavily than the first name (0.4).
first_name_part = 0.4 * df_results_ext["jaro_distance_firstname"]
last_name_part = 0.6 * df_results_ext["jaro_distance_lastname"]
df_results_ext["jaro_winkler_aggr_score"] = first_name_part + last_name_part

In [25]:
# Preview the first five rows, now including the derived score columns.
df_results_ext.head(n=5)

Unnamed: 0,OBJECTID.x,CENSUS_NAMEFRSTB,CENSUS_NAMELASTB,CENSUS_AGE,CENSUS_OCCLABELB,CENSUS_MATCH_ADDR,CENSUS_SEGMENT_ID,WARD_NUM,CD_ED,OBJECTID,MATCH_ADDR,CD_FIRST_NAME,CD_LAST_NAME,CD_OCCUPATION,CD_FINAL_HOUSENUM,jaro_distance_firstname,jaro_distance_lastname,num_matches,census_occupation_listed,jaro_winkler_aggr_score
0,522583,LEOPOLD,ZOLLMANN,30,SHOE MAKER,"342 HOUSTON ST E, NYC-Manhattan, NY",1283,11,305.0,13,"344 E HOUSTON ST, New York, NY",Leopold,Zollmann,shoes,344,1.0,1.0,1,1,1.0
1,330206,HERMAN,ZOPFS,32,"WAITER, RESTAURANT","51 ALLEN ST, NYC-Manhattan, NY",976,10,199.0,15,"51 ALLEN ST, New York, NY",Herman,Zopfs,waiter,51,1.0,1.0,1,1,1.0
2,313477,JACOB,ZORN,12,APPRENTICE SHOE MAKER,"57 FORSYTH ST, NYC-Manhattan, NY",915,10,193.0,21,"57 FORSYTH ST, New York, NY",Jacob,Zorn,barber,57,1.0,1.0,2,1,1.0
3,313482,JACOB,ZORN,45,BARBER,"57 FORSYTH ST, NYC-Manhattan, NY",915,10,193.0,21,"57 FORSYTH ST, New York, NY",Jacob,Zorn,barber,57,1.0,1.0,2,1,1.0
4,369292,JOHN,ZORN,21,TRUSSES,"217 BOWERY ST, NYC-Manhattan, NY",1384,17,216.0,24,"217 BOWERY ST, New York, NY",John,Zorn,trussmkr,217,1.0,1.0,1,1,1.0


## Calculating the confidence score
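#### The confidence score is a weighted sum of the following 3 components
1. **50%** - Jaro-Winkler name score (0.4 × first name + 0.6 × last name)
2. **35%** - No. of matches (conflicts), applied as `1 / num_matches`, so more competing matches lower the score
3. **15%** - Occupation listed in the census: full weight when an occupation is given, none when it is absent (`*`)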

In [26]:
# Blend the three signals: aggregated name score (50%), the reciprocal of the
# number of candidate matches (35%), and whether a census occupation is listed (15%).
name_component = 0.5 * df_results_ext["jaro_winkler_aggr_score"]
conflict_component = 0.35 * (1 / df_results_ext["num_matches"])
occupation_component = 0.15 * df_results_ext["census_occupation_listed"]
df_results_ext["confidence_score"] = name_component + conflict_component + occupation_component

In [27]:
# Keep two decimal places in the reported confidence score.
df_results_ext["confidence_score"] = df_results_ext["confidence_score"].round(2)

In [None]:
# UNCOMMENT AND RUN TO WRITE THE RESULTS TO A FILE

# output_file_name = "final_results.csv" # FEEL FREE TO CHANGE THE NAME
# df_results_ext.to_csv(output_file_name, index=False)