# HMNI Model Building

In [None]:
import sys
import os

In [None]:
# Make the local ``hmni`` package directory (one level above the notebook)
# importable.  The membership test and the appended entry must be the same
# path, and the separator must come from os.path.join so this also works
# outside Windows.
hmni_path = os.path.abspath(os.path.join('..'))
hmni_package_dir = os.path.join(hmni_path, 'hmni')
if hmni_package_dir not in sys.path:
    sys.path.append(hmni_package_dir)

In [None]:
import pandas as pd
# Widen pandas' display limits so wide/long frames are shown untruncated.
for option_name, option_value in (
    ('display.max_columns', 500),
    ('display.max_rows', 1000),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

In [None]:
# Load the name pairs from a headerless, comma-separated text file.
# NOTE(review): read_csv's default NA handling turns literal values such as
# "NA" or "null" into NaN -- confirm the file contains no such names.
alt_names = pd.read_csv('name_pairs.txt', sep=",", header=None)

In [None]:
# Label the two columns of each pair.
alt_names.columns=['name_a', 'name_b']

In [None]:
# Show 10 randomly chosen pairs as a sanity check.
alt_names.sample(10)

In [None]:
import unidecode
from fuzzywuzzy import fuzz
from syllable_tokenizer import SyllableTokenizer
# Shared tokenizer instance; featurize() calls ST.tokenize on each cleaned name.
ST = SyllableTokenizer()
from abydos.distance import (IterativeSubString, BISIM, DiscountedLevenshtein, Prefix, LCSstr, MLIPNS, Strcmp95,
MRA, Editex, SAPS, FlexMetric, JaroWinkler, HigueraMico, Sift4, Eudex, ALINE, Covington, PhoneticEditDistance)
from abydos.phonetic import PSHPSoundexFirst, Ainsworth
# Shared encoder instance; featurize() compares the codes of the two names.
pshp_soundex_first = PSHPSoundexFirst()

In [None]:
# One abydos comparator instance per similarity measure.  featurize() calls
# .sim(name_a, name_b) on each of them to produce one feature column apiece.
iss = IterativeSubString()
bisim = BISIM()
dlev = DiscountedLevenshtein()
prefix = Prefix()
lcs = LCSstr()
mlipns = MLIPNS()
strcmp95 = Strcmp95()
mra = MRA()
editex = Editex()
saps = SAPS()
flexmetric = FlexMetric()
# NOTE(review): mode='Jaro' presumably selects plain Jaro rather than
# Jaro-Winkler -- confirm against the abydos documentation.
jaro = JaroWinkler(mode='Jaro')
higuera_mico = HigueraMico()
sift4 = Sift4()
eudex = Eudex()
aline = ALINE()
covington = Covington()
phonetic_edit = PhoneticEditDistance()

In [None]:
# Feature-column name paired with the comparator that produces it.  Both
# public lists are derived from this single table so they cannot drift apart.
_named_algos = [
    ('iterativesubstring', iss),
    ('bisim', bisim),
    ('discountedlevenshtein', dlev),
    ('prefix', prefix),
    ('lcsstr', lcs),
    ('mlipns', mlipns),
    ('strcmp95', strcmp95),
    ('mra', mra),
    ('editex', editex),
    ('saps', saps),
    ('flexmetric', flexmetric),
    ('jaro', jaro),
    ('higueramico', higuera_mico),
    ('sift4', sift4),
    ('eudex', eudex),
    ('aline', aline),
    ('covington', covington),
    ('phoneticeditdistance', phonetic_edit),
]

algos = [comparator for _, comparator in _named_algos]

algo_names = [column_name for column_name, _ in _named_algos]

In [None]:
from abydos.phones import *

In [None]:
pe = Ainsworth()
def sum_ipa(name_a, name_b):
    """Return a position-wise phonetic feature similarity of two names.

    Both names are encoded with the Ainsworth encoder, converted to feature
    values with ``ipa_to_features`` and compared position by position with
    ``cmp_features``.  Only positions present in both names are compared
    (``zip`` stops at the shorter one) and the sum is divided by the number
    of features of ``name_a``, so the score is not symmetric in its
    arguments.

    Args:
        name_a: First name; its feature count is the normalizer.
        name_b: Second name.

    Returns:
        The normalized sum, or ``0.0`` when ``name_a`` yields no features
        (for example an empty string), which previously raised
        ``ZeroDivisionError``.
    """
    feat1 = ipa_to_features(pe.encode(name_a))
    feat2 = ipa_to_features(pe.encode(name_b))
    if not feat1:
        # Nothing to compare and nothing to normalize by.
        return 0.0
    return sum(cmp_features(f1, f2) for f1, f2 in zip(feat1, feat2)) / len(feat1)

In [None]:
import re

In [None]:
def featurize(df):
    """Return a copy of ``df`` extended with name-similarity feature columns.

    The first two columns are treated as the raw names and renamed to ``a``
    and ``b``; a third column, when the frame has exactly three, becomes
    ``target``.  The caller's frame is never modified (previously the 2- and
    3-column paths renamed and extended it in place while the other path
    worked on a copy).

    Added columns:
        name_a, name_b: transliterated, lower-cased names with everything
            but ASCII letters removed.
        partial, tkn_sort, tkn_set: fuzzywuzzy ratios of the syllable
            tokenizations of the cleaned names.
        sum_ipa: result of ``sum_ipa`` on the cleaned names.
        pshp_soundex_first: 1 when both names share a PSHP Soundex code,
            else 0.
        one column per entry of ``algo_names`` holding ``algo.sim`` of the
            cleaned names.

    Args:
        df: DataFrame whose first two columns hold the raw name strings.

    Returns:
        A new DataFrame with the columns described above appended.
    """
    column_count = len(df.columns)
    if column_count == 3:
        df = df.copy()
        df.columns = ['a', 'b', 'target']
    elif column_count == 2:
        df = df.copy()
        df.columns = ['a', 'b']
    else:
        df = df.rename(columns={df.columns[0]: 'a', df.columns[1]: 'b'})

    def clean(raw_name):
        # Transliterate to ASCII, lower-case, then keep letters only.
        return re.sub('[^a-zA-Z]+', '', unidecode.unidecode(raw_name).lower().strip())

    df['name_a'] = df['a'].map(clean)
    df['name_b'] = df['b'].map(clean)

    # Materialize the pairs once; every feature below iterates over them.
    name_pairs = list(zip(df['name_a'], df['name_b']))
    syllable_pairs = [(ST.tokenize(first), ST.tokenize(second)) for first, second in name_pairs]

    df['partial'] = [fuzz.partial_ratio(sa, sb) for sa, sb in syllable_pairs]
    df['tkn_sort'] = [fuzz.token_sort_ratio(sa, sb) for sa, sb in syllable_pairs]
    df['tkn_set'] = [fuzz.token_set_ratio(sa, sb) for sa, sb in syllable_pairs]
    df['sum_ipa'] = [sum_ipa(first, second) for first, second in name_pairs]
    df['pshp_soundex_first'] = [
        1 if pshp_soundex_first.encode(first) == pshp_soundex_first.encode(second) else 0
        for first, second in name_pairs
    ]
    for algo_name, algo in zip(algo_names, algos):
        df[algo_name] = [algo.sim(first, second) for first, second in name_pairs]

    return df

In [None]:
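# Unused featurize code, kept for reference only (the cell is a bare string literal and defines nothing).
# NOTE(review): 'oshka''oslav' in the suffix list below lacks a comma, so the two entries concatenate into one.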
'''
def removesuffix(name):
    # high-frequency suffixes
    suffixes =['acska','ander','andra','annah','annes','anuel','arina','athan','cilla','cillo','delia','echka',
               'ecita','ecito','eczek','eczka','ediah','elina','eline','eniek','erick','erina','essie','ester',
               'henka','ienka','illie','islav','lbert','lenka','linda','linha','lphia','nchik','nette','ninha',
               'ochka','olina','oncho','onnie','oshka''oslav','rance','rence','rilla','rinda','risse','slava',
               'stina','stine','tenka','ushka','ustin','yinka','yusha','zuela','zuelo','akun','alle','anda',
               'ande','anka','anna','anne','bert','chan','chen','chka','chuk','ciek','cita','citp','czek','czka',
               'elia','ella','elle','ence','enia','enka','enki','enne','erre','erry','etta','ette','iana','illa',
               'ille','illo','illy','imir','inda','inha','inho','inka','inyu','isse','lava','lein','lina','line',
               'llie','ncho','ndra','nnie','oche','omir','onne','rick','rina','shka','slav','ssie','ster','tina',
               'tine','uela','uelo','uina','usha','usia','yika','ale','ana','ane','ari','chk','cho','cia','csi',
               'czo','ela','ele','ell','ena','ene','ert','eta','ete','han','hka','iah','ica','ick','ico','ika',
               'ina','ine','ino','isa','ita','ito','lav','lia','lie','lka','lki','lla','lle','lly','mir','nce',
               'nda','nha','nia','nie','njo','nka','nna','nne','nny','onk','ora','rre','rry','sha','shk','sia',
               'sie','ska','ski','sku','son','sse','tta','tte','ush','yok','zek','zka','ah','am','an','as','av',
               'ca','ce','ck','da','de','do','dy','ek','el','en','er','es','et','ey','ge','ha','ho','ia','ic',
               'ie','ig','ik','in','is','ja','ke','ko','la','le','ll','lo','na','ne','ni','on','or','ot','ra',
               're','ri','ry','sa','se','si','ta','te','to','un','us','ya','a','e','i','l','o','y']   
    for suffix in suffixes:
        if name.endswith(suffix):
            if len(name)>len(suffix)+1:
                return name[:-(len(suffix))]
    return name
'''

In [None]:
# Disabled experiment kept as a string literal: stemmed and vowel-stripped
# variants of the similarity features.
# NOTE(review): it references dist_metrics/dist_names, which are not defined
# in the visible notebook, and both loops score row.name_a/row.name_b rather
# than the stemmed_*/consonant_* columns their suffixes suggest.
'''
# Remove frequent prefix/suffix
df['stemmed_a'] = df.apply(lambda row: removesuffix(row.name_a),axis=1)
df['stemmed_b'] = df.apply(lambda row: removesuffix(row.name_b),axis=1)

# Remove Vowels
df['consonant_a'] = df.apply(lambda row: re.sub("[aeiouAEIOU]","",row.name_a),axis=1)  
df['consonant_b'] = df.apply(lambda row: re.sub("[aeiouAEIOU]","",row.name_b),axis=1)

for i, metric in enumerate(dist_metrics):
    print(type(metric))
    try:
        df[dist_names[i]+'_stem'] = df.apply(lambda row: metric.sim(row.name_a, row.name_b),axis=1)
    except:
         continue

for i, metric in enumerate(dist_metrics):
    print(type(metric))
    try:
        df[dist_names[i]+'_cons'] = df.apply(lambda row: metric.sim(row.name_a, row.name_b),axis=1)
    except:
         continue
'''

In [None]:
# Positive class: every pair loaded into alt_names is labelled target=1.
alt_names['target']=1

In [None]:
from itertools import combinations
import random
# Seed Python's global RNG, which drives the negative-pair sampling and the
# random input swap inside the siamese training steps.
random.seed(30)

In [None]:
# Use combinatorics to generate negative class: every unordered pair of
# distinct names that is not a known alternate pair (in either order).
all_names = alt_names.loc[:, 'name_a':'name_b'].values.tolist()
# dict.fromkeys de-duplicates while keeping first-seen order.  Going through
# a set here made the candidate order depend on string hashing, so the seeded
# sample below was not repeatable between interpreter sessions.
unique_names = list(dict.fromkeys(item for items in all_names for item in items))
alt_pairs = list(zip(alt_names.name_a, alt_names.name_b))+ list(zip(alt_names.name_b, alt_names.name_a))
known_pairs = set(alt_pairs)
comb = list(combinations(unique_names, 2))
non_alt = [pair for pair in comb if pair not in known_pairs]
# Undersample the negative class for 1:4 class imbalance instead of 1:1000 extreme class imbalance.
# random.sample draws without replacement, so no negative pair is duplicated
# (random.choices could return the same pair several times); the size is
# capped at the number of available candidates.
n_negative = 70040
non_alt = pd.DataFrame(random.sample(non_alt, k=min(n_negative, len(non_alt))), columns=['name_a', 'name_b'])

In [None]:
# Report how many sampled negative pairs there are per positive pair (truncated to an integer).
print('positive class ratio 1:{}'.format(int(len(non_alt)/len(alt_names))))

In [None]:
# Disabled earlier variant kept as a string literal: it balanced the classes
# 1:1 by taking the first len(alt_names) non-alternate pairs instead of
# sampling them.
'''
# Use combinatorics to generate negative class
all_names = alt_names.loc[:, 'name_a':'name_b'].values.tolist()
unique_names = list(set([item for items in all_names for item in items]))
alt_pairs = list(zip(alt_names.name_a, alt_names.name_b))+ list(zip(alt_names.name_b, alt_names.name_a))
#comb = list(islice(combinations(unique_names, 2), len(df)+len(alt_pairs)))
comb = list(combinations(unique_names, 2))
non_alt = list(set(comb) - set(alt_pairs))
# Undersample the negative class for equal class balance
non_alt = pd.DataFrame(non_alt[:len(alt_names)])
non_alt.columns=['name_a', 'name_b']
'''

In [None]:
# Show 10 randomly chosen negative pairs as a sanity check.
non_alt.sample(10)

In [None]:
# Negative class: label the sampled non-alternate pairs with target=0 and
# stack them under the positive pairs.
non_alt['target']=0
# ignore_index gives the combined frame a fresh 0..n-1 index; without it the
# labels of both inputs are kept and therefore repeat.
df = pd.concat([alt_names, non_alt], ignore_index=True)
# Drop the references to the per-class frames; only df is used from here on.
non_alt = None
alt_names = None

In [None]:
# Add the cleaned names and all similarity feature columns to the labelled pairs.
df = featurize(df)

In [None]:
# Show 5 random featurized rows as a sanity check.
df.sample(5)

In [None]:
# Separate the label from the feature frame.  `columns=` replaces the
# positional axis argument (`drop('target', 1)`), which pandas 2.0 no longer
# accepts.
y = df.target
X = df.drop(columns='target')

## AutoML Model Building

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows with a fixed seed.
# NOTE(review): the cross-validation cell further down rebinds
# X_train/X_test/y_train/y_test, so this split only feeds the disabled TPOT
# search that follows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1)

In [None]:
#from tpot import TPOTClassifier # conda install -c conda-forge tpot xgboost dask dask-ml scikit-mdr skrebate

In [None]:
# Disabled TPOT pipeline search, kept as a string literal for reference.
# NOTE(review): if re-enabled, `drop([...], 1)` passes the axis positionally,
# which pandas 2.0 no longer accepts -- use `drop(columns=[...])`.
'''
pipeline_optimizer = TPOTClassifier(
        scoring = 'f1', # Harmonic Mean of Precision and Recall
        generations=200,
        verbosity=2,
        n_jobs=-1) # Utilizes all available CPU cores
pipeline_optimizer.fit(X_train.drop(['a', 'b', 'name_a', 'name_b'],1), y_train)
'''

In [None]:
#print(pipeline_optimizer.score(X_test.drop(['a', 'b', 'name_a', 'name_b'],1), y_test))

In [None]:
#pipeline_optimizer.export('tpot_exported_pipeline_final.py')

## Submodel 1: Exported TPOT Pipeline

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler

In [None]:
def base_model_1(X_train, y_train, X_test, export=False):
    """Fit the scaler + random-forest pipeline and export it or score ``X_test``.

    Args:
        X_train: Numeric feature matrix used for fitting.
        y_train: Labels aligned with ``X_train``.
        X_test: Feature matrix to score; not used when ``export`` is true.
        export: When true, return the fitted pipeline instead of predictions.

    Returns:
        The fitted pipeline when ``export`` is true; otherwise a list with
        the second ``predict_proba`` column for each row of ``X_test``.
    """
    exported_pipeline = make_pipeline(
        MaxAbsScaler(),
        MinMaxScaler(),
        RandomForestClassifier(
            bootstrap=False,
            criterion="gini",
            max_features=0.25,
            min_samples_leaf=1,
            min_samples_split=4,
            n_estimators=100)
    )
    exported_pipeline.fit(X_train, y_train)
    if export:
        return exported_pipeline
    # Take column 1 of the probability matrix in one slice instead of a
    # Python loop over its rows.
    return exported_pipeline.predict_proba(X_test)[:, 1].tolist()

## Submodel 2: Deep LSTM siamese network
### Modified from repo: https://github.com/dhwajraj/deep-siamese-text-similarity

In [None]:
import tensorflow as tf # ==1.13.1
import numpy as np
import os
import random
from input_helpers import InputHelper
from siamese_network import SiameseLSTM

In [None]:
def base_model_2(X_train, y_train, X_test, export=False):
    """Train the siamese LSTM on name pairs and export it or score ``X_test``.

    Args:
        X_train: DataFrame with ``name_a`` and ``name_b`` columns.
        y_train: Labels for ``X_train``, handed to ``InputHelper.get_datasets``.
        X_test: DataFrame with ``name_a``/``name_b`` columns to score; not
            used when ``export`` is true.
        export: When true, write the vocabulary to the current working
            directory, save a checkpoint and ``siamese_network.pb`` whenever
            the summed dev accuracy improves, and return ``None``.

    Returns:
        ``None`` when ``export`` is true; otherwise a list holding
        ``1 - distance`` for every row of ``X_test``.

    The TensorFlow session is closed on every exit path, so repeated calls
    (one per cross-validation fold) no longer accumulate open sessions.
    """

    # Train Model
    embedding_dim = 300  # Dimensionality of character embedding
    dropout_keep_prob = 0.8  # Dropout keep probability
    hidden_units = 50
    batch_size = 64
    num_epochs = 300  # Number of training epochs
    evaluate_every = 1000  # Evaluate model on dev set after this many steps
    max_document_length = 15
    # Where to save exported models.  Joining with '' appends the platform's
    # own separator instead of a hard-coded backslash.
    out_dir = os.path.join(os.getcwd(), '')

    inpH = InputHelper()
    train_set, dev_set, vocab_processor, sum_no_of_batches = inpH.get_datasets(
        X_train[['name_a', 'name_b']],
        y_train,
        max_document_length=max_document_length,
        percent_dev=10,
        batch_size=batch_size)

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        try:
            with sess.as_default():
                siameseModel = SiameseLSTM(
                    sequence_length=max_document_length,
                    vocab_size=len(vocab_processor.vocabulary_),
                    embedding_size=embedding_dim,
                    hidden_units=hidden_units,
                    batch_size=batch_size,
                )

                # Define Training procedure
                global_step = tf.Variable(0, name='global_step', trainable=False)
                optimizer = tf.train.AdamOptimizer(1e-3)

            grads_and_vars = optimizer.compute_gradients(siameseModel.loss)
            tr_op_set = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            if export:
                saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)
                # Write vocabulary
                vocab_processor.save(os.path.join(out_dir, 'vocab'))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            def build_feed(x1_batch, x2_batch, y_batch, keep_prob):
                # Feed the pair in its given order half of the time and
                # swapped otherwise (one random draw per step).
                if random.random() <= 0.5:
                    x1_batch, x2_batch = x2_batch, x1_batch
                return {
                    siameseModel.input_x1: x1_batch,
                    siameseModel.input_x2: x2_batch,
                    siameseModel.input_y: y_batch,
                    siameseModel.dropout_keep_prob: keep_prob,
                }

            def train_step(x1_batch, x2_batch, y_batch):
                # A single training step (dropout active)
                sess.run([tr_op_set, global_step, siameseModel.loss, siameseModel.accuracy,
                          siameseModel.distance, siameseModel.temp_sim],
                         build_feed(x1_batch, x2_batch, y_batch, dropout_keep_prob))

            def dev_step(x1_batch, x2_batch, y_batch):
                # A single evaluation step (dropout disabled, no weight update)
                _, _, accuracy, _ = sess.run(
                    [global_step, siameseModel.loss, siameseModel.accuracy,
                     siameseModel.temp_sim],
                    build_feed(x1_batch, x2_batch, y_batch, 1.0))
                return accuracy

            # Generate batches
            batches = inpH.batch_iter(list(zip(train_set[0], train_set[1],
                                               train_set[2])), batch_size, num_epochs)
            max_validation_acc = 0.0
            for _ in range(sum_no_of_batches * num_epochs):
                batch = next(batches)
                if len(batch) < 1:
                    continue
                (x1_batch, x2_batch, y_batch) = zip(*batch)
                if len(y_batch) < 1:
                    continue
                train_step(x1_batch, x2_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)
                # Stays 0.0 on steps without evaluation, so the check below
                # can only fire right after a dev pass.
                sum_acc = 0.0
                if current_step % evaluate_every == 0:
                    dev_batches = inpH.batch_iter(list(zip(dev_set[0], dev_set[1], dev_set[2])), batch_size, 1)
                    for db in dev_batches:
                        if len(db) < 1:
                            continue
                        (x1_dev_b, x2_dev_b, y_dev_b) = zip(*db)
                        if len(y_dev_b) < 1:
                            continue
                        sum_acc = sum_acc + dev_step(x1_dev_b, x2_dev_b, y_dev_b)
                if sum_acc > max_validation_acc:
                    max_validation_acc = sum_acc

                    if export:
                        # save model
                        saver.save(sess, out_dir, global_step=current_step)
                        tf.train.write_graph(sess.graph.as_graph_def(), out_dir, 'siamese_network.pb', as_text=False)

            if export:
                return

            # RUN OOF INFERENCE
            x1_temp = np.asarray(X_test['name_a'].tolist())
            x2_temp = np.asarray(X_test['name_b'].tolist())

            x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
            x2 = np.asarray(list(vocab_processor.transform(x2_temp)))

            predictions, _ = sess.run([siameseModel.distance, siameseModel.temp_sim], {
                siameseModel.input_x1: x1,
                siameseModel.input_x2: x2,
                siameseModel.dropout_keep_prob: 1.0,
            })

            # Turn the distance into a similarity score.
            return [1 - x for x in predictions.tolist()]
        finally:
            sess.close()

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# Stratified K-Folds cross-validator: collect out-of-fold predictions from
# both base models; they become the training data of the meta-model.
n_splits = 10
non_feature_cols = ['a', 'b', 'name_a', 'name_b']

stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1)
oof_frames = []
for fold, (train_index, test_index) in enumerate(stratified_kfold.split(X, y), start=1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Work on a copy so the new columns are not assigned onto a slice of X.
    oof_pred = X_test[['name_a', 'name_b']].copy()

    # `columns=` replaces the positional axis argument removed in pandas 2.0.
    oof_pred['predict_proba'] = base_model_1(X_train.drop(columns=non_feature_cols),
                                             y_train,
                                             X_test.drop(columns=non_feature_cols))

    oof_pred['siamese_sim'] = base_model_2(X_train[['name_a', 'name_b']],
                                           y_train,
                                           X_test[['name_a', 'name_b']])

    oof_pred['target'] = y_test.tolist()
    print('completed fold {} of {}'.format(fold, n_splits))

    oof_frames.append(oof_pred)

# DataFrame.append was removed in pandas 2.0; concatenate all folds once.
meta_training = pd.concat(oof_frames)

In [None]:
# Show 100 random out-of-fold rows as a sanity check.
meta_training.sample(100)

## Meta-model: Logistic Regression

In [None]:
# Recompute the similarity features for the out-of-fold rows.  meta_training
# has five columns, so featurize renames its first two (name_a, name_b) to
# a/b and derives fresh cleaned name columns from them.
df = featurize(meta_training)

In [None]:
# Show the first rows of the meta-model training frame.
df.head()

## Grid Search for precision

In [None]:
# Candidate extra features: every column that is neither an identifier, the
# label, nor a base-model output.  The grid holds all subsets of size 1 to 3.
excluded = ['a', 'b', 'name_a', 'name_b', 'target', 'predict_proba', 'siamese_sim']
cols = [name for name in df.columns if name not in excluded]
comb2 = list(combinations(cols, 2))
comb3 = list(combinations(cols, 3))
colgrid = list(combinations(cols, 1)) + comb2 + comb3

In [None]:
# Grid search cross validation
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [None]:
# 60/20/20 train/validation/test split: 20% is held out as the test set, then
# 25% of the remaining 80% becomes the validation set.
X_train, X_test, y_train, y_test = train_test_split(df, df.target, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

In [None]:
# For every candidate feature subset, tune C for a logistic regression on the
# two base-model outputs plus that subset and record the validation
# confusion-matrix counts.
clf = LogisticRegression()
grid_clf = GridSearchCV(clf, param_grid = {'C':np.logspace(-4, 4, 20)}, scoring = 'precision', verbose=0)

base_cols = ['predict_proba', 'siamese_sim']
scores=[]
# The loop variable is deliberately not called `cols`, which would overwrite
# the list of candidate columns.
for extra_cols in colgrid:
    feature_cols = base_cols + list(extra_cols)
    grid_clf.fit(X_train[feature_cols], y_train)
    y_pred = grid_clf.predict(X_val[feature_cols])
    # labels=[0, 1] keeps the matrix 2x2 even if one class is absent, so the
    # four-way unpacking cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_val, y_pred, labels=[0, 1]).ravel()
    scores.append([str(extra_cols), tn, fp, fn, tp])

In [None]:
# One row per evaluated feature subset.
scores_df = pd.DataFrame(scores)

In [None]:
# Column order matches the lists appended to `scores`.
scores_df.columns = ['features', 'tn', 'fp', 'fn', 'tp']

In [None]:
# Show the first rows of the search results.
scores_df.head()

In [None]:
# Total misclassifications per feature subset: false positives plus false negatives.
scores_df['error'] = scores_df['fp']+(scores_df['fn']).astype(int)

In [None]:
# Rank by fewest total errors, breaking ties by fewest false positives.
scores_df = scores_df.sort_values(['error', 'fp'])

In [None]:
# Show the 20 best-ranked feature subsets.
scores_df.head(20)

In [None]:
# Redo the 80/20 split with the same seed, this time without carving out a
# validation set, so the final model trains on the full 80%.
X_train, X_test, y_train, y_test = train_test_split(df, df.target, test_size=0.2, random_state=1)

In [None]:
# Inputs of the final meta-model: the two base-model outputs plus three
# similarity features (presumably chosen from the ranking above -- confirm).
selected_cols = ['predict_proba', 'siamese_sim','tkn_set', 'iterativesubstring', 'strcmp95']

In [None]:
# Tune C of the final logistic-regression meta-model for precision on the
# selected columns, then predict the held-out test rows.
clf = LogisticRegression()
grid_clf = GridSearchCV(clf, param_grid = {'C':np.logspace(-4, 4, 20)}, scoring = 'precision')
grid_clf.fit(X_train[selected_cols], y_train)
y_pred = grid_clf.predict(X_test[selected_cols])

In [None]:
# Print the parameter setting selected by the grid search.
print(grid_clf.best_params_)

## Evaluation

In [None]:
from sklearn.metrics import classification_report

In [None]:
def evaluate(y_test, y_pred):
    """Print a classification report, a labelled confusion matrix and its counts.

    Args:
        y_test: True binary labels.
        y_pred: Predicted binary labels aligned with ``y_test``.

    Returns:
        None; everything is written to standard output.
    """
    cm = confusion_matrix(y_test, y_pred)
    conf_matrix = pd.DataFrame(data = cm,columns = ['Predicted: 0', 'Predicted: 1'], index = ['Actual: 0', 'Actual: 1'])
    print(classification_report(y_test, y_pred))
    print(conf_matrix)
    # Reuse the matrix computed above; for binary labels ravel() yields
    # tn, fp, fn, tp in that order.
    tn, fp, fn, tp = cm.ravel()
    # The first label used to read 'tp' although the value printed is tn.
    print('\ntn: {} fp: {} fn: {} tp: {}'.format(tn, fp, fn, tp))

In [None]:
# Report the final meta-model's performance on the held-out test set.
evaluate(y_test, y_pred)

In [None]:
# roc curve and auc
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

In [None]:
# Plot the meta-model's ROC curve against a constant-score baseline.
fig, ax = plt.subplots(1, 1, figsize=(7, 7))
# Scoring every sample 0 gives the baseline curve.
baseline_probs = [0] * len(y_test)
# Column 1 of predict_proba: probability of the positive outcome only.
probs = grid_clf.predict_proba(X_test[selected_cols])[:, 1]
# AUC scores; the model's score is formatted as the label drawn on the plot.
baseline_auc = roc_auc_score(y_test, baseline_probs)
auc = 'AUC=%.3f' % roc_auc_score(y_test, probs)
# ROC curve coordinates for baseline and model.
baseline_fpr, baseline_tpr, _ = roc_curve(y_test, baseline_probs)
fpr, tpr, _ = roc_curve(y_test, probs)
# Baseline in gray, model in black with point markers.
ax.plot(baseline_fpr, baseline_tpr, color='gray')
ax.plot(fpr, tpr, marker='.', color='black')
# Axis labels and title.
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('Receiver Operating Characteristic', fontsize=16)
plt.text(.6, .3, auc, fontsize=13)

## Export Models and Pipeline

In [None]:
import joblib
import json

In [None]:
# Refit the random-forest pipeline on all rows and persist it.  `columns=`
# replaces the positional axis argument that pandas 2.0 no longer accepts.
base_1 = base_model_1(X.drop(columns=['a', 'b', 'name_a', 'name_b']), y, X_test=None, export=True)
joblib.dump(base_1, filename='base.pkl')

In [None]:
# Retrain the siamese network on all rows in export mode; the function writes
# its vocabulary, checkpoints and graph to the working directory and returns None.
base_model_2(X[['name_a', 'name_b']], y, X_test=None, export=True)

In [None]:
# Persist the best logistic-regression meta-model found by the grid search.
joblib.dump(grid_clf.best_estimator_, filename='meta.pkl')