In [1]:
from Annotation_helper import *
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

Sets correct values for generated figures:

In [2]:
#Set all settings for the resulting figures

import seaborn as sns
sns.set()

SMALL_SIZE = 15
MEDIUM_SIZE = 17
BIGGER_SIZE = 19

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

plt.tight_layout()

<Figure size 432x288 with 0 Axes>

# Helper functions

In [3]:
def make_meshgrid(x, y, h=.02):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    return xx, yy

def plot_contours(ax, clf, xx, yy, **params):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out

def avg_dict_values(d):
    res = dict()
    for key in d:
        res[key] = np.mean(d[key])
    return res

def append_dict(d1, d2):
    res = dict()
    if d1 == {}:
        for key in d2:
            res[key] = [d2[key]]
        return d2
    else:
        for key in d1:
            if type(d1[key]) != list:
                res[key] = [d1[key]] + [d2[key]]
            else:
                res[key] = d1[key] + [d2[key]]
        return res

def evaluate_rule_based_methods(model, kf, X, y, thresh):
    all_scores = dict()
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        pred = model.predict(X_test.to_numpy(), thresh)
        scores = evaluate_prediction(pred, y_test.to_numpy(), model_name = "Exact Match", figure_folder=figure_folder)
        all_scores = append_dict(all_scores, scores)
    return all_scores

def evaluate_stat_based_methods(model, kf, X, y):
    all_scores = dict()
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        scores = evaluate_prediction(pred, y_test, model_name = "Exact Match", figure_folder=figure_folder)
        all_scores = append_dict(all_scores, scores)
    return all_scores

In [4]:
n_folds = 5
scoring = ['precision', 'recall', 'f1']

# Loading data
Set paths to files:

In [5]:
data_folder = "../../Data"
structured_data_folder = data_folder + "/structured_data"
text_path = data_folder + "/text"
data_path = structured_data_folder + "/test.csv"
dist_folder = structured_data_folder + "/Distances"
figure_folder = "../Figures/Results/Embeddings"
model_folder = "../Models/Embeddings"
matches = pd.read_csv(structured_data_folder + "/Matches/Bas.csv", index_col=0)
figure_folder = "../Figures/Results/Classic/"
error_folder = structured_data_folder + "/Errors/"

Load transaction and permission datasets:

In [6]:
transactions = load_transactions(structured_data_folder + "\VOC Cochin Slave Transactions 1706-1801 - IISH - Database - 2018 - v1.csv")
permissions = load_permissions(structured_data_folder + "\VOC Cochin Slave Transport Permissions 1770-1795 - IISH - Database - 2018 - v1.csv")

  col = col.str.replace('[^a-zA-Z ]', '')


Load distances between datasets from multiple files and merge them into one dataframe:

In [7]:
distances = pd.read_csv(dist_folder + "/distances_0.csv", dtype={'permission_indx': 'uint16', 'transaction_indx': 'uint16', 'SlaafGender': 'bool', 'SlaafNaamNieuw': 'float16', 'BezitterGender': 'bool', 'BezitterVoornaam': 'float16', 'BezitterBeroep': 'float16', 'BezitterAchternaam': 'float16'})
for i in range(1, 99):
    df2 = pd.read_csv(f"{dist_folder}/distances_{i}.csv", dtype={'permission_indx': 'uint16', 'transaction_indx': 'uint16', 'SlaafGender': 'bool', 'SlaafNaamNieuw': 'float16', 'BezitterGender': 'bool', 'BezitterVoornaam': 'float16', 'BezitterBeroep': 'float16', 'BezitterAchternaam': 'float16'})
    distances = pd.concat([distances, df2])
distances.reset_index(inplace=True)

Split the data into a test and training set:

In [8]:
all_data = distances[distances['permission_indx'].isin(matches["permission_indx"].unique())]
all_data["Match"] = 0
for i, match in matches.iterrows():
    if match["transaction_indx"] != "None":
        indx = distances.query(f"permission_indx == {match['permission_indx']} & transaction_indx == {match['transaction_indx']}").index
        all_data.loc[indx, "Match"] = 1
data = all_data.iloc[:, :-1]
y = all_data.iloc[:, -1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_data["Match"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [9]:
Classifiers = dict()
Scores = dict()

# Blocking
Perform blocking to reduce the size of the data

In [10]:
print(f"{data.shape[0]} rows before blocking")
mask = (data["SlaafNaamNieuw"] > 0.6)
X = data[mask]
y = y[mask]
print(f"{X.shape[0]} rows after blocking")

4981284 rows before blocking
79964 rows after blocking


In [11]:
# X = X.iloc[:,3:].to_numpy().astype(float)
# y = y.to_numpy().astype(float)

In [12]:
kf = KFold(n_splits=n_folds, random_state=42, shuffle=True)

# Rule based

In [13]:
class Rule_based_classifier():
    def __init__(self, alg):
        self.alg = alg
        
    def predict_proba(self, X_test):
        pred = self.alg(X_test)
        return pred
    
    def predict(self, X_test, thresh):
        pred = self.alg(X_test)
        return (pred >= thresh).astype(float)

## Exact Match

In [14]:
def exact_match(X_test):
    pred = []
    for x in X_test:
        pred.append((x == 1.).sum())
    return np.array(pred)/X_test.shape[1]

In [15]:
Classifiers["Exact Match"] = Rule_based_classifier(exact_match)
s = evaluate_rule_based_methods(Classifiers["Exact Match"], kf, X, y, 0.56)
Scores["Exact Match"] = avg_dict_values(s)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Fuzzy Match

In [16]:
def fuzzy_match(X_test):
    pred = []
    for x in X_test:
        pred.append((x > 0.8).sum())
    return np.array(pred)/X_test.shape[1]

In [17]:
Classifiers["Fuzzy Match"] = Rule_based_classifier(fuzzy_match)
s = evaluate_rule_based_methods(Classifiers["Fuzzy Match"], kf, X, y, 0.6)
Scores["Fuzzy Match"] = avg_dict_values(s)

# Statisitcal methods

# K nearest neighbors

In [18]:
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=2))
s = evaluate_stat_based_methods(model, kf, X, y)
Scores["KNeigbors classifier"] = avg_dict_values(s)

## Logistic Regression

In [19]:
model = make_pipeline(StandardScaler(), LogisticRegressionCV(scoring='f1', penalty='l2', max_iter=200))
s = evaluate_stat_based_methods(model, kf, X, y)
Scores["Logistic Regression"] = avg_dict_values(s)

## ADAboost

In [20]:
model = make_pipeline(StandardScaler(),AdaBoostClassifier(n_estimators=30, random_state=0))
s = evaluate_stat_based_methods(model, kf, X, y)
Scores["ADAboost"] = avg_dict_values(s)

## Naive Bayes

In [21]:
model = make_pipeline(Normalizer(), MultinomialNB())
s = evaluate_stat_based_methods(model, kf, X, y)
Scores["Naive Bayes"] = avg_dict_values(s)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Linear Support Vector Classifier

In [22]:
model = make_pipeline(StandardScaler(), LinearSVC(random_state=0))
s = evaluate_stat_based_methods(model, kf, X, y)
Scores["Linear SVC"] = avg_dict_values(s)



## Neural Network

In [23]:
model = make_pipeline(StandardScaler(), MLPClassifier(activation='relu', solver='adam', alpha=1e-5, hidden_layer_sizes=(30, 30), learning_rate='adaptive', random_state=1, verbose=False))
s = evaluate_stat_based_methods(model, kf, X, y)
Scores["Neural network"] = avg_dict_values(s)

# Results table

In [24]:
s = []
for c in Scores:
    s.append(pd.Series(Scores[c]))
results = pd.concat(s, axis=1)
results.columns = Scores.keys()
results.transpose()

Unnamed: 0,recall score:,precision score:,f1 score:
Exact Match,0.007692,0.2,0.014815
Fuzzy Match,0.845242,0.20468,0.327414
KNeigbors classifier,0.836439,0.968711,0.895255
Logistic Regression,0.844872,0.870557,0.856069
ADAboost,0.824957,0.911391,0.864964
Naive Bayes,0.0,0.0,0.0
Linear SVC,0.827464,0.876929,0.850153
Neural network,0.770228,0.960714,0.854118


# Error Analysis

In [25]:
def evaluate_stat_based_methods(model, kf, X, y):
    all_scores = dict()
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        scores = evaluate_prediction(pred, y_test, model_name = "Exact Match", figure_folder=figure_folder)
        all_scores = append_dict(all_scores, scores)
    return all_scores

In [26]:
def add_prefix_to_columns(df, prefix):
    # Adds a given prefix to all column names
    cols = df.columns
    df.columns = [prefix + c for c in cols]
    return df

def get_mistakes(y_pred, y_true, X_df, t, p):
    # Filters out errors in predictions and saves them as csv files
    p.set_index("Entry-ID")
    mist_indx = np.invert(y_true == y_pred)
    mist_dist = X_df[mist_indx]
    print(mist_dist.shape)
    mist_p = p.loc[mist_dist['permission_indx'], :]
    mist_t = t.loc[mist_dist['transaction_indx'], :]
    vt = add_prefix_to_columns(mist_t, "transaction_")
    vp = add_prefix_to_columns(mist_p, 'permission_')
    vals = np.concatenate((vt, vp), axis=1)
    mist_data = pd.DataFrame(data=vals, columns=list(vt) + list(vp), index=X_df.index[mist_indx])
    print(mist_data.shape)
    print(y_test.shape)
    print(y_test[mist_data.index])
    mist_data["Match"] = y_test
    return mist_data

In [27]:
model = make_pipeline(StandardScaler(), LogisticRegressionCV(scoring='f1', penalty='l2', max_iter=200))

In [28]:
mistakes = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    mistakes.append(get_mistakes(pred, y_test, X_test, transactions, permissions))

(4, 12)
(4, 55)
(15993,)
320467     0
723052     1
2517684    0
2532984    0
Name: Match, dtype: int64
(8, 12)
(8, 55)
(15993,)
392371     1
834141     0
1415494    0
1848857    1
2469869    1
3325011    1
3696594    0
3800735    1
Name: Match, dtype: int64
(5, 12)
(5, 55)
(15993,)
483629     0
1843465    1
1954594    0
2347586    1
3368334    1
Name: Match, dtype: int64
(7, 12)
(7, 55)
(15993,)
264372     0
613615     1
715861     1
730239     1
1864048    1
2503088    0
4793270    1
Name: Match, dtype: int64
(6, 12)
(6, 55)
(15992,)
485025     0
2608334    0
3402552    1
3801093    1
4015334    1
4072281    0
Name: Match, dtype: int64


In [29]:
X_test["Match"] = y_test
X_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test["Match"] = y_test


Unnamed: 0,index,permission_indx,transaction_indx,SlaafGender,SlaafNaamNieuw,BezitterGender,BezitterVoornaam,BezitterTussen,BezitterAchternaam,BezitterBeroep,BezitterStatus,BezitterEtniciteit,Match
678,678,0,678,True,0.666504,False,0.000000,0.0,0.000,0.000000,0.0,0.0,0
1073,1073,0,1073,True,0.666504,False,0.000000,0.0,0.000,0.000000,0.0,0.0,0
15812,15812,2,1436,True,0.833496,False,0.000000,0.0,0.000,0.000000,0.0,0.0,0
19206,19206,2,4833,False,1.000000,False,0.000000,0.0,0.000,0.000000,0.0,0.0,0
19755,19755,2,5383,True,0.666504,False,0.000000,0.0,0.000,0.000000,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4974014,208370,692,7115,True,1.000000,True,0.449951,0.0,0.375,0.235352,1.0,0.0,0
4975384,209740,693,1288,True,0.833496,True,0.150024,0.0,0.125,0.000000,0.0,0.0,0
4976665,211021,693,2571,True,1.000000,True,0.199951,0.0,0.125,0.083313,1.0,0.0,0
4978069,212425,693,3975,True,0.833496,True,0.199951,0.0,0.125,0.083313,1.0,0.0,0


In [30]:
X_test["Match"].unique()

array([0, 1], dtype=int64)

In [31]:
mist_data = pd.concat(mistakes)
mist_data[mist_data["Match"] == 0].to_csv(error_folder + "False_Positives.csv")
mist_data[mist_data["Match"] == 1].to_csv(error_folder + "False_Negatives.csv")

In [32]:
mist_data[mist_data["Match"] == 0]

Unnamed: 0,transaction_Unnamed: 0,transaction_ID,transaction_Datum,transaction_VerkoperVoornaam,transaction_VerkoperPatroniem,transaction_VerkoperTussen,transaction_VerkoperAchternaam,transaction_VerkoperGender,transaction_VerkoperHerkomst,transaction_VerkoperBeroep,...,permission_BezitterStatus,permission_SlaafNaamNieuw,permission_SlaafNaamOrigineel,permission_SlaafKaste,permission_SlaafGender,permission_Opmerking,permission_Betaling,permission_Door[Kantlijn],permission_Entry-ID,Match
320467,4198,,,a,,,coettij,m,,,...,,fortuijn,,,m,,,hem zelfs,45,0
2517684,1886,,,abraham,,van,ros,m,,,...,,september,,,m,,enz,,351,0
2532984,2810,,,paijlo,,,,m,,lascorijn,...,,junij,,,m,,enz,,353,0
834141,333,,,coenje marcaar,,,migdim,m,,,...,voc,augustus,,,m,,enz,den toepas gerwasio pieres,117,0
1415494,6650,,,moekorra senapadie,,,arejen,o,,,...,voc,regina,,,v,,enz,door kluijn do obers,197,0
3696594,1964,,,abraham,,van,ros,m,,,...,,kalistra,,,v,,enz,hem selfs,515,0
483629,2035,,,bastiaan,,,chiko,m,,lascorijn,...,vrijman,junij,,,m,,enz,hem selfs,68,0
1954594,6650,,,moekorra senapadie,,,arejen,o,,,...,vrijman,regina,,,v,,mits betalende de geregtigheijt aan den pagter,den soldt a klijn,272,0
264372,5608,,,bastiaan,,de,rozario,m,,,...,,julij,,,,,enz,van hem selfs,37,0
2503088,1664,,,abraham,,van,ros,m,,,...,,september,,,m,,enz,,349,0
