### Imports

In [145]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import joblib
import re
from sklearn.metrics import f1_score
import recordlinkage as rl
from numpy.random import choice

# Models to use
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

### Only run once to download to drive initially

In [146]:
#nltk.download('punkt')

### Load the patient record dataset

In [147]:
data = pd.read_csv("../test/test5.csv")

In [148]:
#print(data['rec_id'].apply(lambda x: 1 if '-dup-' in x else 0))
#print(data['rec_id'].apply(lambda x: x.split('-')[0] if '-dup-' in x else np.nan))

### Clean up record IDs

In [149]:
# Cleanup records
data['rec_id'] = data['rec_id'].str.replace("rec-", "")
data['rec_id'] = data['rec_id'].str.replace("-org", "")

data.head()

Unnamed: 0,rec_id,culture,sex,given_name,surname,street_number,address_1,date_of_birth,phone_number,national_identifier,blocking_number,state,address_2
0,0,eng,f,audri,hambledon,5.0,burford place,19390120,2511120932,18597484,4,,
1,1,eng,f,jody,macdougall,80.0,carstensz street,19110201,7703368,73022768,3,sa,
2,2,eng,f,deandrea,jeffers,7.0,deloraine street,19330308,613990563,10639456,9,qld,
3,3,eng,f,tommie,traves,24.0,wilshire street,19940531,2513050578,27730848,5,,ferndale
4,4,eng,f,jeri,edwardson,17.0,plant road,19350506,7645660,24358245,0,sa,


In [150]:
data.tail()

Unnamed: 0,rec_id,culture,sex,given_name,surname,street_number,address_1,date_of_birth,phone_number,national_identifier,blocking_number,state,address_2
1795,523-dup-0,eng,m,det a,garde nar,7.0,victoria street,19510 211,2775392.0,17858385,7,nsw,knowsley park
1796,849-dup-0,eng,f,caitrona,keorgejon,46.0,currong street,19990422,2977950.0,54672668,7,nsw,
1797,75-dup-0,eng,f,emil a,mckell ar,47.0,crowder circuit,1904067,2513104854.0,63767305,8,vic,
1798,131-dup-0,eng,f,atilha,mcnahb,2.0,dobell circuit,,,36030263,3,nsw,mount sandiman
1799,755-dup-0,eng,,sezttsica,bramjon,86.0,western hill street,19580316,8227367.0,30562800,3,vic,rowethorpe


### Handle Missing Values and Clean Up

In [151]:
# Combine 'street_number', 'address_1', and 'address_2' into one field and clean it
data['address'] = data['street_number'].astype(str) + ' ' + data['address_1'].astype(str) + ' ' + data['address_2'].astype(str)
data['address'] = data['address'].apply(lambda x: re.sub('[^A-Za-z0-9]+', ' ', x))

data['date_of_birth']  = pd.to_datetime(data['date_of_birth'], errors = 'coerce')
data['day'] = data['date_of_birth'].dt.strftime('%d')
data['month'] = data['date_of_birth'].dt.strftime('%m')
data['year'] = data['date_of_birth'].dt.strftime('%Y')

# Drop the 'blocking_number' field
data = data.drop(columns=['blocking_number', 'street_number', 'address_1', 'address_2', 'date_of_birth', 'culture', 'state'])

for col in ["surname", "given_name", "address"]:
    data[col] = data[col].fillna('')
    data[col] = data[col].astype(str)

### Deal with Duplicates

In [152]:
# Identify known duplicates based on the rec_id column
duplicates = data[data["rec_id"].str.contains("-dup-")]

# Create a dictionary mapping each duplicate record to its corresponding original record
originals = {}
for i, row in duplicates.iterrows():
    original_id = row["rec_id"].replace("-dup-0", "")
    if original_id in originals:
        originals[original_id].append(i)
    else:
        originals[original_id] = [i]

# Create a new column called "match" to indicate whether a record is a duplicate or not
data["match"] = 0
data["match_id"] = data["rec_id"]
for original_id, duplicates in originals.items():
    data.loc[duplicates, "match"] = 1
    data.loc[duplicates, "match_id"] = original_id
    # this ensures that both the duplicate and the match are in the same training set
    data.loc[data['rec_id'] == original_id, 'match'] = 1
    data.loc[data['rec_id'] == original_id, 'match_id'] = original_id

data["match_id"] = data["match_id"].astype(int)

# get all fields
all_fields = data.columns.values.tolist()
print("All fields:", all_fields)

All fields: ['rec_id', 'sex', 'given_name', 'surname', 'phone_number', 'national_identifier', 'address', 'day', 'month', 'year', 'match', 'match_id']


In [153]:
# Test true links
def generate_true_links(df): 
    # although the match_id column is included in the original df to imply the true links,
    # this function will create the true_link object identical to the true_links properties
    # of recordlinkage toolkit, in order to exploit "Compare.compute()" from that toolkit
    # in extract_function() for extracting features quicker.
    # This process should be deprecated in the future release of the UNSW toolkit.
    df["rec_id"] = df.index.values.tolist()
    indices_1 = []
    indices_2 = []
    processed = 0
    for match_id in df["match_id"].unique():
        if match_id != -1:    
            processed = processed + 1
            # print("In routine generate_true_links(), count =", processed)
            # clear_output(wait=True)
            linkages = df.loc[df['match_id'] == match_id]
            for j in range(len(linkages)-1):
                for k in range(j+1, len(linkages)):
                    indices_1 = indices_1 + [linkages.iloc[j]["rec_id"]]
                    indices_2 = indices_2 + [linkages.iloc[k]["rec_id"]]    
    links = pd.MultiIndex.from_arrays([indices_1,indices_2])
    return links

def generate_false_links(df, size):
    # A counterpart of generate_true_links(), with the purpose to generate random false pairs
    # for training. The number of false pairs in specified as "size".
    df["rec_id"] = df.index.values.tolist()
    indices_1 = []
    indices_2 = []
    unique_match_id = df["match_id"].unique()
    unique_match_id = unique_match_id[~np.isnan(unique_match_id)] # remove nan values
    for j in range(size):
            false_pair_ids = choice(unique_match_id, 2)
            candidate_1_cluster = df.loc[df['match_id'] == false_pair_ids[0]]
            candidate_1 = candidate_1_cluster.iloc[choice(range(len(candidate_1_cluster)))]
            candidate_2_cluster = df.loc[df['match_id'] == false_pair_ids[1]]
            candidate_2 = candidate_2_cluster.iloc[choice(range(len(candidate_2_cluster)))]    
            indices_1 = indices_1 + [candidate_1["rec_id"]]
            indices_2 = indices_2 + [candidate_2["rec_id"]]  
    links = pd.MultiIndex.from_arrays([indices_1,indices_2])
    return links

def swap_fields_flag(f11, f12, f21, f22):
    return ((f11 == f22) & (f12 == f21)).astype(float)

def join_names_space(f11, f12, f21, f22):
    return ((f11+" "+f12 == f21) | (f11+" "+f12 == f22)| (f21+" "+f22 == f11)| (f21+" "+f22 == f12)).astype(float)

def join_names_dash(f11, f12, f21, f22):
    return ((f11+"-"+f12 == f21) | (f11+"-"+f12 == f22)| (f21+"-"+f22 == f11)| (f21+"-"+f22 == f12)).astype(float)

def abb_surname(f1, f2):
    return ((f1[0]==f2) | (f1==f2[0])).astype(float)

def reset_day(f11, f12, f21, f22):
    return (((f11 == 1) & (f12 == 1))|((f21 == 1) & (f22 == 1))).astype(float)

def extract_features(df, links):
    c = rl.Compare()
    c.string('given_name', 'given_name', method='levenshtein', label='y_name_leven')
    c.string('surname', 'surname', method='levenshtein', label='y_surname_leven')  
    c.string('given_name', 'given_name', method='jarowinkler', label='y_name_jaro')
    c.string('surname', 'surname', method='jarowinkler', label='y_surname_jaro')  
    # c.string('postcode', 'postcode', method='jarowinkler', label='y_postcode')      
    # exact_fields = ['postcode', 'address_1', 'address_2', 'street_number']
    # for field in exact_fields:
    #     c.exact(field, field, label='y_'+field+'_exact')
    #c.compare_vectorized(reset_day,('day', 'month'), ('day', 'month'),label='reset_day_flag')    
    # c.compare_vectorized(swap_fields_flag,('day', 'month'), ('day', 'month'),label='swap_day_month')    
    # c.compare_vectorized(swap_fields_flag,('surname', 'given_name'), ('surname', 'given_name'),label='swap_names')    
    # c.compare_vectorized(join_names_space,('surname', 'given_name'), ('surname', 'given_name'),label='join_names_space')
    # c.compare_vectorized(join_names_dash,('surname', 'given_name'), ('surname', 'given_name'),label='join_names_dash')
    # c.compare_vectorized(abb_surname,'surname', 'surname',label='abb_surname')
    # Build features
    feature_vectors = c.compute(links, df, df)
    return feature_vectors

# see example of true links
true_links_data = generate_true_links(data)
#print(true_links_data)

false_links_data = generate_false_links(data, len(true_links_data))
#print(false_links_data)

# get negative and positive features
pos = extract_features(data, true_links_data)
print('Pos\n')
print(pos)

neg = extract_features(data, false_links_data)
print('Neg\n')
print(neg)

Pos

          y_name_leven  y_surname_leven  y_name_jaro  y_surname_jaro
1   1493      0.750000             0.80     0.883333        0.893333
2   1791      0.875000             1.00     0.950000        1.000000
4   1684      1.000000             0.75     1.000000        0.950000
5   1326      0.857143             0.40     0.942857        0.600000
6   1562      0.500000             1.00     0.875556        1.000000
...                ...              ...          ...             ...
995 1351      0.750000             0.60     0.925000        0.805000
996 1245      0.666667             0.00     0.805556        0.000000
997 1184      1.000000             1.00     1.000000        1.000000
998 1247      0.571429             0.75     0.771429        0.908333
999 1360      1.000000             0.75     1.000000        0.883333

[800 rows x 4 columns]
Neg

           y_name_leven  y_surname_leven  y_name_jaro  y_surname_jaro
555  612       0.333333         0.000000     0.555556        0.41203

In [154]:
print('Head')
data.head()

Head


Unnamed: 0,rec_id,sex,given_name,surname,phone_number,national_identifier,address,day,month,year,match,match_id
0,0,f,audri,hambledon,2511120932,18597484,5 0 burford place nan,20,1,1939,0,0
1,1,f,jody,macdougall,7703368,73022768,80 0 carstensz street nan,1,2,1911,1,1
2,2,f,deandrea,jeffers,613990563,10639456,7 0 deloraine street nan,8,3,1933,1,2
3,3,f,tommie,traves,2513050578,27730848,24 0 wilshire street ferndale,31,5,1994,0,3
4,4,f,jeri,edwardson,7645660,24358245,17 0 plant road nan,6,5,1935,1,4


In [155]:
print('Tail')
data.tail()

Tail


Unnamed: 0,rec_id,sex,given_name,surname,phone_number,national_identifier,address,day,month,year,match,match_id
1795,1795,m,det a,garde nar,2775392.0,17858385,7 0 victoria street knowsley park,,,,1,523
1796,1796,f,caitrona,keorgejon,2977950.0,54672668,46 0 currong street nan,22.0,4.0,1999.0,1,849
1797,1797,f,emil a,mckell ar,2513104854.0,63767305,47 0 crowder circuit nan,7.0,6.0,1904.0,1,75
1798,1798,f,atilha,mcnahb,,36030263,2 0 dobell circuit mount sandiman,,,,1,131
1799,1799,,sezttsica,bramjon,8227367.0,30562800,86 0 western hill street rowethorpe,16.0,3.0,1958.0,1,755


### Encode data for machine learning

In [156]:
# Function to get the mean embedding for a list of tokens
def get_mean_embedding(tokens):
    embeddings = [model_w2v.wv[token] for token in tokens if token in model_w2v.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model_w2v.vector_size)  # Return a zero vector if no valid tokens

# Remove extra spaces from string columns
string_columns = ['given_name', 'surname', 'address']
for col in string_columns:
    data[col] = data[col].astype(str).str.strip().str.lower()

    # Label encoding for 'sex'
label_encoder_columns = ['sex']
for col in label_encoder_columns:
    label_encoder = LabelEncoder()
    data[col] = label_encoder.fit_transform(data[col].astype(str).str.lower())

# Tokenize the string columns and store them in a new DataFrame
tokenized_data = pd.DataFrame()
for col in string_columns:
    tokenized_data[col] = data[col].apply(word_tokenize)

# Generate the Word2Vec embeddings for the tokenized string columns
vector_size = 200
model_w2v = Word2Vec(tokenized_data.values.flatten(), vector_size=vector_size, min_count=1, workers=4)

# Save the trained model
model_w2v.save("word2vec_properties.model")

# Define the weights for the string columns
weights = {
    'given_name': 1, 
    'surname': 1, 
    'address': 1, 
    'phone_number': 1, 
    'national_identifier': 1,
    'sex': 1,
    'date_of_birth': 1
}

# Apply the Word2Vec embeddings to the tokenized string columns
# New way gives issues
# for col in string_columns:
#     data[col] = tokenized_data[col].apply(get_mean_embedding).values

# Old Way
embedded_data = pd.DataFrame()
for col in string_columns:
    embeddings = np.vstack(tokenized_data[col].apply(get_mean_embedding).values)
    temp_df = pd.DataFrame(embeddings, columns=[f"{col}_embed_{i}" for i in range(embeddings.shape[1])])
    temp_df *= weights[col]  # Apply the weights
    embedded_data = pd.concat([embedded_data, temp_df], axis=1)

# Replace the original string columns with the embedded columns
data = pd.concat([data.drop(columns=string_columns), embedded_data], axis=1)

# Impute missing values for numerical columns with the mean value
numerical_columns = ['phone_number', 'national_identifier']
for col in numerical_columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')
    mean_value = data[col].mean() 
    data[col].fillna(value=mean_value, inplace=True) 

# imputer = SimpleImputer(strategy='mean')
# data[numerical_columns] = imputer.fit_transform(data[numerical_columns])

# Standardize numerical columns
# scaler = StandardScaler()
# data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

### Check Vector Columns & Sample Data

In [157]:
data.head()

Unnamed: 0,rec_id,sex,phone_number,national_identifier,day,month,year,match,match_id,given_name_embed_0,...,address_embed_190,address_embed_191,address_embed_192,address_embed_193,address_embed_194,address_embed_195,address_embed_196,address_embed_197,address_embed_198,address_embed_199
0,0,0,2511121000.0,18597484.0,20,1,1939,0,0,0.001556,...,0.028127,0.006696,-0.004842,-0.021773,0.02725,0.001877,0.005991,-0.036931,-0.016302,-0.019821
1,1,0,7703368.0,73022768.0,1,2,1911,1,1,-0.004509,...,0.028563,0.006779,-0.005574,-0.023863,0.029619,0.002801,0.00611,-0.037842,-0.016008,-0.019694
2,2,0,613990600.0,10639456.0,8,3,1933,1,2,0.00199,...,0.031771,0.006614,-0.004118,-0.024944,0.031909,0.002218,0.00634,-0.040786,-0.018715,-0.02081
3,3,0,2513051000.0,27730848.0,31,5,1994,0,3,-0.001892,...,0.024389,0.004425,-0.002529,-0.019023,0.023348,0.002352,0.004817,-0.030894,-0.011959,-0.016415
4,4,0,7645660.0,24358245.0,6,5,1935,1,4,-0.001987,...,0.023014,0.00707,-0.004024,-0.018493,0.023192,0.003244,0.004636,-0.031213,-0.014972,-0.014459


In [158]:
# print('surname\n')
# print(data['surname'][0])
# print('given_name\n')
# print(data['given_name'][0])
# print('address\n')
# print(data['address'][0])


### Initialize machine learning models

In [159]:
def train_model(modeltype, modelparam, train_vectors, train_labels, modeltype_2):
    if modeltype == 'rf': # Random Forest
        model = RandomForestClassifier(n_estimators=100, random_state=42, criterion=modeltype_2, max_depth=modelparam)
        model.fit(train_vectors, train_labels)
    elif modeltype == 'gbm': # Gradient Boosted Trees
        model = GradientBoostingClassifier(n_estimators=100, random_state=42, loss=modeltype_2, learning_rate=modelparam)
        model.fit(train_vectors, train_labels) 
    elif modeltype == 'sc': # StackingClassifier
        if modelparam < 2:
            modelparam = 2
        if modelparam > 1000:
            modelparam = 1000
        estimators = [
            ('svr', make_pipeline(StandardScaler(),
                                LinearSVC(random_state=42)))
        ]
        #model = StackingClassifier(estimators=estimators, stack_method=modeltype_2, cv=modelparam, n_jobs=-1)
        model = StackingClassifier(estimators=estimators, cv=modelparam, n_jobs=-1, final_estimator=LogisticRegression())
        model.fit(train_vectors, train_labels) 
    elif modeltype == 'lg': # Logistic Regression
        model = LogisticRegression(C=modelparam, penalty = modeltype_2,class_weight=None, dual=False, fit_intercept=True, 
                                   intercept_scaling=1, max_iter=5000, multi_class='ovr', 
                                   n_jobs=1, random_state=42)
        model.fit(train_vectors, train_labels)
    elif modeltype == 'nn': # Neural Network
        model = MLPClassifier(solver='lbfgs', alpha=modelparam, hidden_layer_sizes=(256, ), 
                              activation = modeltype_2,random_state=42, batch_size='auto', 
                              learning_rate='constant',  learning_rate_init=0.001, 
                              power_t=0.5, max_iter=30000, shuffle=True, 
                              tol=0.0001, verbose=False, warm_start=False, momentum=0.9, 
                              nesterovs_momentum=True, early_stopping=False, 
                              validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        model.fit(train_vectors, train_labels)
    return model

def classify(model, test_vectors):
    result = model.predict(test_vectors)
    return result

def evaluation(test_labels, result):
    a = accuracy_score(test_labels, result)
    c = confusion_matrix(test_labels, result)
    p = precision_score(test_labels, result)
    f = f1_score(test_labels, result)
    r = recall_score(test_labels, result)
    
    metrics_result = {
        'confusion_matrix': c,
        'precision': p,
        'F-score': f, 
        'accuracy_score': a,
        'recall_score': r,
        'sensitivity': 'not set'
    }
    return metrics_result

In [160]:
# Split the data into features (X) and labels (y)
X = data.drop(columns=['rec_id', 'match'])
y = data['match']

In [161]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Use models
modeltype = ['sc', 'nn', 'lg', 'rf', 'gbm'] # choose between 'rf', 'gbm', 'lg', 'sc', 'nn'
modeltype_2 = ['predict', 'relu', 'l2', 'gini', 'log_loss']  # 'l2' or 'none' for lg, 'relu' or 'logistic' for nn, 'log_loss', 'deviance', or 'exponential' for gbm, 'gini', 'entropy', or 'log_loss' for rf, "auto", 'predict_proba', 'decision_function', 'predict' for sc
#modelparam_range = [0.001, 2000, 0.005]
modelparam_range = [100,200,500,1000] # C for svm, C for lg, alpha for NN

for i in range(len(modeltype)):
    print("Model:",modeltype[i],", Param_1:",modeltype_2[i], ", tuning range:", modelparam_range)
    precision = []
    sensitivity = []
    Fscore = []
    confusionMatrix = []
    accuracyScore = []
    recallScore = []
    scores = []

    for train_index, test_index in kfold.split(X, y):
        #print(f"Current Test Index: {test_index}\n")
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        for modelparam in modelparam_range:
            #print(f"Training {modeltype[i]} with model param {modelparam}\nu")
            md = train_model(modeltype[i], modelparam, X_train, y_train, modeltype_2[i])
            final_result = classify(md, X_test)
            final_eval = evaluation(y_test, final_result)
            precision += [final_eval['precision']]
            sensitivity += [final_eval['sensitivity']]
            Fscore += [final_eval['F-score']]
            confusionMatrix  += [final_eval['confusion_matrix']]
            accuracyScore  += [final_eval['accuracy_score']]
            recallScore  += [final_eval['recall_score']]
            scores += md.score(X_test, y_test)
            #print(f"Prediction Score {accuracy_score(y_test, final_result)}\n")

    print("Precision:",precision,"\n")
    print("Sensitivity:",sensitivity,"\n")
    print("F-score:", Fscore,"\n")
    print("Accuracy Score:", accuracyScore,"\n")
    print("Confusion Matrix:", confusionMatrix,"\n")
    print("Recall Score:", recallScore,"\n")
    print("Model Scores:", scores,"\n")
    print("")

Model: sc , Param_1: predict , tuning range: [100, 200, 500, 1000]


ValueError: Input X contains NaN.
LinearSVC does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# # Initialize the classifiers
# rf = RandomForestClassifier(n_estimators=100, random_state=42)
# svm = LinearSVC(random_state=42)
# gbm = GradientBoostingClassifier(n_estimators=100, random_state=42)

# # Perform Stratified K-Fold Cross-Validation
# # when using the boolean of match 0 or 1
# #kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# # when using match id numberic values
# kfold = KFold(n_splits=2, shuffle=True, random_state=42)
# rf_accuracies = []
# svm_accuracies = []
# gbm_accuracies = []

# for train_index, test_index in kfold.split(X, y):
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]

#     # Train and evaluate the classifiers
#     for clf, acc_list in [(rf, rf_accuracies), (svm, svm_accuracies), (gbm, gbm_accuracies)]:
#         clf.fit(X_train, y_train)
#         y_pred = clf.predict(X_test)
#         accuracy = accuracy_score(y_test, y_pred)
#         acc_list.append(accuracy)

# # Calculate the average accuracy for each classifier
# rf_avg_accuracy = np.mean(rf_accuracies)
# svm_avg_accuracy = np.mean(svm_accuracies)
# gbm_avg_accuracy = np.mean(gbm_accuracies)

# print("Random Forest Average Accuracy: {:.2f}".format(rf_avg_accuracy))
# print("Support Vector Machine Average Accuracy: {:.2f}".format(svm_avg_accuracy))
# print("Gradient Boosting Machine Average Accuracy: {:.2f}".format(gbm_avg_accuracy))

# # Select the best model
# best_model = None
# best_accuracy = 0

# for model, accuracy in [(rf, rf_avg_accuracy), (svm, svm_avg_accuracy), (gbm, gbm_avg_accuracy)]:
#     if accuracy > best_accuracy:
#         best_model = model
#         best_accuracy = accuracy

# print("Best Model: {}".format(type(best_model).__name__))
# print("Best Model Average Accuracy: {:.2f}".format(best_accuracy))

# # Save the best model
# joblib.dump(rf, 'best_rf_model.pkl')
# joblib.dump(svm, 'best_svm_model.pkl')
# joblib.dump(gbm, 'best_gbm_model.pkl')

# # Save the fitted SimpleImputer and StandardScaler instances
# #joblib.dump(imputer, 'imputer.pkl')
# #joblib.dump(scaler, 'scaler.pkl')