## Reproduce results of Scheme A

Paper: "Statistical supervised meta-ensemble algorithm for data linkage"

Kha Vo, Jitendra Jonnagaddala, Siaw-Teng Liaw

February 2019

Journal of Biomedical Informatics

Paper: "Statistical supervised meta-ensemble algorithm for data linkage"

Kha Vo, Jitendra Jonnagaddala, Siaw-Teng Liaw

February 2019

Journal of Biomedical Informatics


In [1]:
import recordlinkage as rl, pandas as pd, numpy as np
from sklearn.model_selection import KFold
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.utils import shuffle
from recordlinkage.preprocessing import phonetic
from numpy.random import choice
import collections, numpy
from IPython.display import clear_output
from sklearn.model_selection import train_test_split, KFold

In [2]:
from utils import (
    generate_true_links,
    generate_false_links,
    swap_fields_flag,
    join_names_space,
    join_names_dash,
    abb_surname,
    reset_day,
    set_random_seed
)
from training_utils import train_model, classify, evaluation, blocking_performance

In [None]:
# Seed the random number generators through the project helper imported from utils.
# NOTE(review): which generators are seeded, and with what value, is decided inside
# utils.set_random_seed — confirm there if exact reproducibility matters.
set_random_seed()

In [3]:
# Base names of the train and test data files; ".csv" is appended when they are read.
trainset = 'febrl_UNSW_train'
testset = 'febrl_UNSW_test'

In [4]:
## I did not touch these yet b/c there are differences
## - Andrew
def extract_features(df, links):
    """Build the comparison feature vectors for a set of candidate record pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Records to compare. The same frame is used for both sides of every
        pair, and it must already contain the phonetic columns
        (``*_soundex`` / ``*_nysiis``) referenced below.
    links : index of record pairs
        The candidate pairs to compare, passed straight to ``Compare.compute``.

    Returns
    -------
    The object returned by ``rl.Compare.compute`` for the thirteen
    comparisons registered below.
    """
    jaro_winkler = {'method': 'jarowinkler'}
    levenshtein_07 = {'method': 'levenshtein', 'threshold': 0.7}

    # (comparison kind, column, feature label, extra options).
    # The registration order is kept as-is because callers turn the result
    # into positional arrays via ``.values``.
    comparison_specs = [
        ('string', 'given_name', 'y_name', jaro_winkler),
        ('string', 'given_name_soundex', 'y_name_soundex', jaro_winkler),
        ('string', 'given_name_nysiis', 'y_name_nysiis', jaro_winkler),
        ('string', 'surname', 'y_surname', jaro_winkler),
        ('string', 'surname_soundex', 'y_surname_soundex', jaro_winkler),
        ('string', 'surname_nysiis', 'y_surname_nysiis', jaro_winkler),
        ('exact', 'street_number', 'y_street_number', {}),
        ('string', 'address_1', 'y_address1', levenshtein_07),
        ('string', 'address_2', 'y_address2', levenshtein_07),
        ('exact', 'postcode', 'y_postcode', {}),
        ('exact', 'day', 'y_day', {}),
        ('exact', 'month', 'y_month', {}),
        ('exact', 'year', 'y_year', {}),
    ]

    comparer = rl.Compare()
    for kind, column, label, options in comparison_specs:
        register = getattr(comparer, kind)  # comparer.string or comparer.exact
        register(column, column, label=label, **options)

    # Compare each linked pair, using df as both the left and the right source.
    return comparer.compute(links, df, df)


def generate_train_X_y(df,train_true_links):
    """Assemble a class-balanced, shuffled training set.

    Positive samples are the feature vectors of ``train_true_links``;
    negative samples are the feature vectors of an equally sized set of
    pairs obtained from ``generate_false_links``.

    Parameters
    ----------
    df : pandas.DataFrame
        Preprocessed training records (see ``extract_features``).
    train_true_links : index of record pairs
        The known matching pairs within ``df``.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        Feature matrix and the corresponding labels (1 = match, 0 = non-match),
        shuffled together with a fixed ``random_state`` of 0.
    """
    positives = extract_features(df, train_true_links)
    # Request as many false pairs as there are true pairs to keep classes balanced.
    false_links = generate_false_links(df, len(train_true_links))
    negatives = extract_features(df, false_links)

    samples = positives.values.tolist()
    samples.extend(negatives.values.tolist())
    targets = [1] * len(positives) + [0] * len(negatives)

    samples, targets = shuffle(samples, targets, random_state=0)
    return np.array(samples), np.array(targets)


In [5]:
## TRAIN SET CONSTRUCTION

# Import: read the training records (indexed by rec_id) and derive the true matching pairs.
print("Import train set...")
df_train = pd.read_csv(trainset+".csv", index_col = "rec_id")
train_true_links = generate_true_links(df_train)
print("Train set size:", len(df_train), ", number of matched pairs: ", str(len(train_true_links)))

# Preprocess train set: postcode is compared as a string, and the phonetic
# encodings of both name fields are added as the extra columns that
# extract_features compares.
df_train['postcode'] = df_train['postcode'].astype(str)
df_train['given_name_soundex'] = phonetic(df_train['given_name'], method='soundex')
df_train['given_name_nysiis'] = phonetic(df_train['given_name'], method='nysiis')
df_train['surname_soundex'] = phonetic(df_train['surname'], method='soundex')
df_train['surname_nysiis'] = phonetic(df_train['surname'], method='nysiis')

# Final train feature vectors and labels.
# generate_train_X_y(df, train_true_links) needs the true links as its second
# argument; calling it with the frame alone raises a TypeError on a fresh kernel.
X_train, y_train = generate_train_X_y(df_train, train_true_links)
print("Finished building X_train, y_train")

Import train set...


100%|██████████| 3001/3001 [00:00<00:00, 3462.50it/s]
  s = s.str.replace(r"[\-\_\s]", "")


Train set size: 5000 , number of matched pairs:  1165


  s = s.str.replace(r"[\-\_\s]", "")
  s = s.str.replace(r"[\-\_\s]", "")
  s = s.str.replace(r"[\-\_\s]", "")
100%|██████████| 1165/1165 [00:00<00:00, 1291.04it/s]


Finished building X_train, y_train


In [6]:
# Quick sanity check: show the first few given names of the train set.
df_train['given_name'].head()

rec_id
rec-1496-org      mitchell
rec-552-dup-3       harley
rec-988-dup-1     madeline
rec-1716-dup-1    isabelle
rec-1213-org        taylor
Name: given_name, dtype: object

In [10]:
# Blocking criteria: a pair is declared a non-match if all of the fields below disagree,
# i.e. a pair stays a candidate when it agrees on at least one blocking field.

# Import the test records (indexed by rec_id) and derive the true matching pairs.
print("Import test set...")
df_test = pd.read_csv(testset + ".csv", index_col="rec_id")
test_true_links = generate_true_links(df_test)
leng_test_true_links = len(test_true_links)
print("Test set size:", len(df_test), ", number of matched pairs: ", str(leng_test_true_links))

print("BLOCKING PERFORMANCE:")
blocking_fields = ["given_name", "surname", "postcode"]
all_candidate_pairs = []
for field in blocking_fields:
    # Candidate pairs are the record pairs that agree exactly on this field.
    candidates = rl.BlockIndex(on=field).index(df_test)
    # Comment(alecmori): blocking_performance takes two arguments (pairs, frame);
    # the earlier three-argument call also passed test_true_links.
    detects = blocking_performance(candidates, df_test)
    # Accumulate the union of the candidates found by every blocking field.
    all_candidate_pairs = candidates.union(all_candidate_pairs)
    print(f"Number of pairs of matched {field}: {len(candidates)}", ", detected ",
          detects,
          f"/{leng_test_true_links} true matched pairs, missed {leng_test_true_links - detects}")

# Same report for the combined candidate set (pairs agreeing on at least one field).
detects = blocking_performance(all_candidate_pairs, df_test)
print(f"Number of pairs of at least 1 field matched: {len(all_candidate_pairs)}", ", detected ",
      detects,
      f"/{leng_test_true_links} true matched pairs, missed {leng_test_true_links - detects}")

Import test set...


100%|██████████| 5000/5000 [00:02<00:00, 2168.40it/s]


Test set size: 10000 , number of matched pairs:  5000
BLOCKING PERFORMANCE:
Number of pairs of matched given_name: 154898 , detected  3287 /5000 true matched pairs, missed 1713
Number of pairs of matched surname: 170843 , detected  3325 /5000 true matched pairs, missed 1675
Number of pairs of matched postcode: 53197 , detected  4219 /5000 true matched pairs, missed 781
Number of pairs of at least 1 field matched: 372073 , detected  4894 /5000 true matched pairs, missed 106


In [11]:
## TEST SET CONSTRUCTION

# Preprocess test set: same preparation as the train set (string postcode plus
# phonetic encodings of both name fields).
print("Processing test set...")
print("Preprocess...")
df_test['postcode'] = df_test['postcode'].astype(str)
df_test['given_name_soundex'] = phonetic(df_test['given_name'], method='soundex')
df_test['given_name_nysiis'] = phonetic(df_test['given_name'], method='nysiis')
df_test['surname_soundex'] = phonetic(df_test['surname'], method='soundex')
df_test['surname_nysiis'] = phonetic(df_test['surname'], method='nysiis')

# Test feature vectors and labels construction
print("Extract feature vectors...")
df_X_test = extract_features(df_test, all_candidate_pairs)
vectors = df_X_test.values.tolist()
feature_index = df_X_test.index

# A candidate pair is a true match (label 1) when both records carry the same
# match_id. Both sides of every pair are looked up in one vectorised step
# rather than with two row lookups per pair in a Python loop, which is far
# slower for hundreds of thousands of candidate pairs.
# NOTE(review): the match_id column is not created in this notebook; it is
# presumably added to df_test by generate_true_links — confirm.
match_id = df_test["match_id"]
left_match_id = match_id.loc[feature_index.get_level_values(0)].values
right_match_id = match_id.loc[feature_index.get_level_values(1)].values
labels = (left_match_id == right_match_id).astype(int)

# Shuffle features and labels together with a fixed seed.
X_test, y_test = shuffle(vectors, labels, random_state=0)
X_test = np.array(X_test)
y_test = np.array(y_test)
print("Count labels of y_test:",collections.Counter(y_test))
print("Finished building X_test, y_test")

Processing test set...
Preprocess...
Extract feature vectors...


  s = s.str.replace(r"[\-\_\s]", "")
  s = s.str.replace(r"[\-\_\s]", "")
  s = s.str.replace(r"[\-\_\s]", "")
  s = s.str.replace(r"[\-\_\s]", "")


Count labels of y_test: Counter({0: 367179, 1: 4894})
Finished building X_test, y_test


In [12]:
## BASE LEARNERS CLASSIFICATION AND EVALUATION
print("BASE LEARNERS CLASSIFICATION PERFORMANCE:")

# Model choice:
#   modeltype        - 'svm', 'lg' or 'nn'
#   modeltype_2      - 'linear' or 'rbf' for svm, 'l1' or 'l2' for lg, 'relu' or 'logistic' for nn
#   modelparam_range - values swept for C (svm), C (lg) or alpha (nn)
modeltype = 'svm'
modeltype_2 = 'rbf'
modelparam_range = [
    .001, .002, .005, .01, .02, .05, .1, .2, .5, 1,
    5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000,
]
print("Model:",modeltype,", Param_1:",modeltype_2, ", tuning range:", modelparam_range)

# One entry per swept parameter value, in sweep order.
precision, sensitivity, Fscore, nb_false = [], [], [], []

for modelparam in modelparam_range:
    # Train on the balanced train set, then score the blocked test pairs.
    md = train_model(modeltype, modelparam, X_train, y_train, modeltype_2)
    final_result = classify(md, X_test)
    final_eval = evaluation(y_test, final_result)
    precision.append(final_eval['precision'])
    sensitivity.append(final_eval['sensitivity'])
    Fscore.append(final_eval['F-score'])
    nb_false.append(final_eval['no_false'])

# Report each metric series followed by a blank line.
for metric_title, metric_values in (
    ("No_false:", nb_false),
    ("Precision:", precision),
    ("Sensitivity:", sensitivity),
    ("F-score:", Fscore),
):
    print(metric_title, metric_values, "\n")
print("")

BASE LEARNERS CLASSIFICATION PERFORMANCE:
Model: svm , Param_1: rbf , tuning range: [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000]
No_false: [149, 688, 2813, 3841, 6361, 5923, 5974, 3730, 5302, 5653, 11689, 11072, 12022, 11997, 11997, 11997, 11997, 11996, 11997, 11997] 

Precision: [0.974025974025974, 0.8789189189189189, 0.635217673814165, 0.5603576751117735, 0.43478260869565216, 0.4524095828323004, 0.45027624309392267, 0.5675644299976782, 0.4799725112900059, 0.46399089097637347, 0.29502262443438915, 0.30645363408521303, 0.28923713778829097, 0.28966538347645837, 0.28966538347645837, 0.28966538347645837, 0.28966538347645837, 0.2896825396825397, 0.28966538347645837, 0.28966538347645837] 

Sensitivity: [0.9961176951369023, 0.9967306906416019, 0.9987740089906008, 0.9987740089906008, 0.9991826726604005, 0.9993870044953004, 0.9991826726604005, 0.9989783408255006, 0.9989783408255006, 0.9991826726604005, 0.9991826726604005, 0.999387004

In [14]:
## ENSEMBLE CLASSIFICATION AND EVALUATION

print("BAGGING PERFORMANCE:\n")
# Parallel lists: the i-th entries together describe one base learner
# (model family, kernel/penalty/activation, regularisation parameter).
modeltypes = ['svm', 'nn', 'lg']
modeltypes_2 = ['linear', 'relu', 'l2']
modelparams = [0.005, 100, 0.2]
nFold = 10
kf = KFold(n_splits=nFold)
# Per-learner bagged scores; these are read again by the stacking cell.
model_raw_score = [0]*3
model_binary_score = [0]*3
for model_i, (modeltype, modeltype_2, modelparam) in enumerate(
        zip(modeltypes, modeltypes_2, modelparams)):
    print(modeltype, "per fold:")
    result_fold = []
    final_eval_fold = []
    # Only the training part of each split is used: every fold's model is
    # trained on it and evaluated on the full test set.
    for iFold, (train_index, valid_index) in enumerate(kf.split(X_train)):
        X_train_fold, y_train_fold = X_train[train_index], y_train[train_index]
        md = train_model(modeltype, modelparam, X_train_fold, y_train_fold, modeltype_2)
        result_fold.append(classify(md, X_test))
        final_eval_fold.append(evaluation(y_test, result_fold[iFold]))
        print("Fold", str(iFold), final_eval_fold[iFold])
    # Bagging: average the per-fold predictions, then threshold at 0.5.
    bagging_raw_score = np.average(result_fold, axis=0)
    voted_link = bagging_raw_score > 0.5
    voted_nonlink = bagging_raw_score <= 0.5
    bagging_binary_score = np.copy(bagging_raw_score)
    bagging_binary_score[voted_link] = 1
    bagging_binary_score[voted_nonlink] = 0
    bagging_eval = evaluation(y_test, bagging_binary_score)
    print(modeltype, "bagging:", bagging_eval)
    print('')
    model_raw_score[model_i] = bagging_raw_score
    model_binary_score[model_i] = bagging_binary_score

BAGGING PERFORMANCE:

svm per fold:
Fold 0 {'no_false': 194, 'confusion_matrix': [4877, 177, 17, 367002], 'precision': 0.9649782350613375, 'sensitivity': 0.996526358806702, 'no_links': 5054, 'F-score': 0.9804985926819461}
Fold 1 {'no_false': 245, 'confusion_matrix': [4878, 229, 16, 366950], 'precision': 0.9551595848834933, 'sensitivity': 0.9967306906416019, 'no_links': 5107, 'F-score': 0.9755024497550244}
Fold 2 {'no_false': 269, 'confusion_matrix': [4877, 252, 17, 366927], 'precision': 0.9508676155195944, 'sensitivity': 0.996526358806702, 'no_links': 5129, 'F-score': 0.9731617280255411}
Fold 3 {'no_false': 206, 'confusion_matrix': [4877, 189, 17, 366990], 'precision': 0.9626924595341493, 'sensitivity': 0.996526358806702, 'no_links': 5066, 'F-score': 0.9793172690763051}
Fold 4 {'no_false': 192, 'confusion_matrix': [4877, 175, 17, 367004], 'precision': 0.965360253365004, 'sensitivity': 0.996526358806702, 'no_links': 5052, 'F-score': 0.9806957570882766}
Fold 5 {'no_false': 227, 'confusio

In [15]:
# Decision threshold applied to the averaged bagged scores of the base learners.
thres = .99

print("STACKING PERFORMANCE:\n")
# Stacking: average the bagged raw scores across the base learners, then
# declare a link only where that average exceeds the threshold.
stack_raw_score = np.average(model_raw_score, axis=0)
above_threshold = stack_raw_score > thres
at_or_below_threshold = stack_raw_score <= thres
stack_binary_score = np.copy(stack_raw_score)
stack_binary_score[above_threshold] = 1
stack_binary_score[at_or_below_threshold] = 0
stacking_eval = evaluation(y_test, stack_binary_score)
print(stacking_eval)

STACKING PERFORMANCE:

{'no_false': 120, 'confusion_matrix': [4875, 101, 19, 367078], 'precision': 0.9797025723472669, 'sensitivity': 0.9961176951369023, 'no_links': 4976, 'F-score': 0.9878419452887538}
