In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [3]:
temp_filename = 'data/block_'
filename = temp_filename + str(1) + '.csv'
df = pd.read_csv(filename)
for i in range(2, 11):
    filename = temp_filename + str(i) + '.csv'
    df_c = pd.read_csv(filename)
    df = pd.concat([df, df_c])

In [4]:
df.shape

(5749132, 12)

In [5]:
df.head()

Unnamed: 0,id_1,id_2,cmp_fname_c1,cmp_fname_c2,cmp_lname_c1,cmp_lname_c2,cmp_sex,cmp_bd,cmp_bm,cmp_by,cmp_plz,is_match
0,37291,53113,0.833333333333333,?,1.0,?,1,1,1,1,0,True
1,39086,47614,1.0,?,1.0,?,1,1,1,1,1,True
2,70031,70237,1.0,?,1.0,?,1,1,1,1,1,True
3,84795,97439,1.0,?,1.0,?,1,1,1,1,1,True
4,36950,42116,1.0,?,1.0,1,1,1,1,1,1,True


In [6]:
df = df.replace({'?': np.nan})

In [7]:
(df.is_match.sum() / df.shape[0]) * 100     # only 0.364% of the data is considered matching

0.3640723504000256

In [13]:
# use all columns
# X = df[['cmp_fname_c1', 'cmp_lname_c1', 'cmp_sex', 'cmp_bd', 'cmp_bm', 'cmp_by', 'cmp_plz']].fillna(0)

In [44]:
# use name columns
# X = df[['cmp_fname_c1', 'cmp_lname_c1']].fillna(0)

In [37]:
# use birthday columns
X = df[['cmp_bd', 'cmp_bm', 'cmp_by', 'cmp_plz']].fillna(0)

In [38]:
y = df.is_match

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [40]:
clf = GaussianNB()

In [41]:
clf.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [42]:
clf.score(X_train, y_train)

0.9996940842148977

In [43]:
clf.score(X_test, y_test)

0.9997034336469747

In [44]:
y_pred = clf.predict(X_test)

In [45]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1145678
        True       0.96      0.96      0.96      4149

    accuracy                           1.00   1149827
   macro avg       0.98      0.98      0.98   1149827
weighted avg       1.00      1.00      1.00   1149827



In [42]:
p_list = []   # precision
r_list = []   # recall
f_list = []   # f-1 score
for _ in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict = True)
    p_list.append(report['True']['precision'])
    r_list.append(report['True']['recall'])
    f_list.append(report['True']['f1-score'])

In [43]:
print('Average precision score:', sum(p_list) / len(p_list))
print('Average recall score:', sum(r_list) / len(r_list))
print('Average f1-score score:', sum(f_list) / len(f_list))

Average precision score: 0.9982209980417773
Average recall score: 0.9543757800002991
Average f1-score score: 0.9758014123456565
