In [1]:
from Annotation_helper import *

import os
import re  # For preprocessing
from gensim.models import Word2Vec
from gensim.models import FastText
from gensim.utils import simple_preprocess
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

In [2]:
# roc curve and auc on an imbalanced dataset
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot

In [3]:
import time

In [4]:
# Load the structured source tables and the annotation files.
# load_transactions / load_permissions are not defined in this notebook
# (presumably they come from the `Annotation_helper` star import — confirm).
# The separator after "structured_data" is a forward slash: the previous
# "structured_data\VOC" contained the invalid escape sequence "\V" and only
# resolved on Windows, while the other paths in this cell already use "/".
transactions = load_transactions("../../Data/structured_data/VOC Cochin Slave Transactions 1706-1801 - IISH - Database - 2018 - v1.csv")
permissions = load_permissions("../../Data/structured_data/VOC Cochin Slave Transport Permissions 1770-1795 - IISH - Database - 2018 - v1.csv")
matches = pd.read_csv("Matches/Bas3.csv", index_col=0)
possible_matches = pd.read_csv("../../Data/structured_data/blocked_possible_matches.csv", index_col=0)
# Composite "<transaction_indx>_<permission_indx>" key for each candidate pair.
possible_matches["total"] = possible_matches["transaction_indx"].astype(str) + "_" + possible_matches["permission_indx"].astype(str)
# Positional column selection: drop the first 4 / first 2 columns and the last one.
perms = permissions.iloc[:, 4:-1]
trans = transactions.iloc[:, 2:-1]
text_path = "tests/Data"
data_path = "test.csv"
# Every file found (recursively) below text_path.
allFiles = [
    os.path.join(path, name)
    for path, _subdirs, files in os.walk(text_path)
    for name in files
]
# Registry of fitted classifiers, filled by the classifier cells further down.
Classifiers = dict()

In [5]:
# Re-reads the same annotation file that the previous cell already loaded into `matches`.
matches = pd.read_csv("Matches/Bas3.csv", index_col=0)
# dist_folder = "Distances_levenshtein"
# NOTE(review): dist_folder is not referenced anywhere else in the visible notebook.
dist_folder = "filtered_Distances"
# Passed as `figure_folder` to fit_and_test_classifier in the classifier cells.
figure_folder = "../Figures/Results/Embeddings"

In [6]:
class text_loader:
    """Restartable iterator that streams tokenised lines from a list of text files.

    Each call to ``iter()`` resets the position to the first line of the first
    file, so the same object can be iterated several times.

    Every line is tokenised with ``simple_preprocess``.  Lines that yield more
    than four tokens are returned as a token list; shorter lines are returned
    as the placeholder ``['']`` (kept from the original implementation so the
    sequence of yielded items stays one-per-line).

    Parameters
    ----------
    file_names : sequence of str
        Paths of the files to read, in order.
    max_files : int or None
        If truthy, at most this many files (from the start of ``file_names``)
        are read.  ``None``/``0`` means "read every file".
    """

    def __init__(self, file_names, max_files=None):
        self.file_names = file_names
        self.max_files = max_files

    def _file_limit(self):
        """Number of files that may be read: len(file_names), capped by max_files."""
        total = len(self.file_names)
        if self.max_files:
            return min(total, self.max_files)
        return total

    @staticmethod
    def _read_lines(path):
        """Read one file completely and return its lines; the handle is closed on exit."""
        with open(path, "r") as handle:
            return handle.read().split('\n')

    def __iter__(self):
        self.file_n = 0
        self.row_n = 0
        # With nothing to read, start with no lines so __next__ stops immediately
        # instead of failing on file_names[0].
        self.lines = self._read_lines(self.file_names[0]) if self._file_limit() > 0 else []
        return self

    def __next__(self):
        limit = self._file_limit()
        # Current file used up -> advance to the next one, but never open a file
        # at or beyond the limit (previously this raised IndexError once the
        # list was exhausted and leaked one line past max_files).
        while self.file_n < limit and self.row_n >= len(self.lines):
            self.file_n += 1
            self.row_n = 0
            if self.file_n < limit:
                self.lines = self._read_lines(self.file_names[self.file_n])
        if self.file_n >= limit:
            raise StopIteration

        line = self.lines[self.row_n]
        self.row_n += 1
        tokens = simple_preprocess(str(line))
        # Short lines are replaced by a single empty token rather than skipped.
        return tokens if len(tokens) > 4 else ['']

def embed_dataframe(df, model):
    """Embed every row of ``df`` through ``model.wv``.

    Each row's cells are cast to ``str`` and joined with single spaces; the
    resulting strings are looked up with ``model.wv[...]`` and whatever that
    returns is wrapped in a DataFrame that reuses ``df``'s index.
    """
    row_text = df.astype(str).agg(' '.join, axis=1)
    vectors = model.wv[row_text]
    embedded = pd.DataFrame(data=vectors, index=df.index)
    return embedded
            
def evaluate_prediction(pred, y_true):
    """Plot the confusion matrix of ``pred`` against ``y_true`` and print recall, precision and F1."""
    ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_true, pred)).plot()
    plt.show()
    # (label, scorer) pairs, printed in this order
    reports = (
        ("recall score: ", recall_score),
        ("precision score: ", precision_score),
        ("f1 score: ", f1_score),
    )
    for label, scorer in reports:
        print(label, scorer(y_true, pred))

In [7]:
# When True, load_or_train_fasttext does not load a saved model and trains a new one instead.
force_new_model = False

In [8]:
def load_or_train_fasttext(loc):
    """Return the FastText model stored at ``loc``, training and saving one if needed.

    Reads the notebook-level names ``force_new_model`` (skip loading when true)
    and ``text_path`` (root directory of the training corpus).

    Parameters
    ----------
    loc : str
        Path the model is loaded from and, after training, saved to.

    Returns
    -------
    The loaded or freshly trained FastText model.
    """
    if not force_new_model:
        try:
            model = FastText.load(loc)
        except Exception as err:
            # Best effort, as before: any load failure falls through to training,
            # but the reason is reported and KeyboardInterrupt/SystemExit are no
            # longer swallowed (the old code used a bare `except:`).
            print(f"Could not load model from {loc!r} ({err!r})")
        else:
            print("Loaded model")
            return model

    print("Generating model")
    # Every file found (recursively) below text_path is part of the corpus.
    corpus_files = [
        os.path.join(path, name)
        for path, _subdirs, files in os.walk(text_path)
        for name in files
    ]
    sentences = text_loader(corpus_files, max_files=10000)
    # NOTE(review): `size=` is the gensim < 4 keyword (renamed `vector_size` in 4.x) — confirm pinned version.
    model = FastText(sentences=sentences, size=10, window=5, min_count=1, workers=32)
    model.save(loc)
    return model

In [9]:
# Load the saved FastText model from "fasttext4.model", or train and save it if that fails.
model = load_or_train_fasttext("fasttext4.model")
# Number of entries in the model's vocabulary.
# NOTE(review): `wv.vocab` looks like the gensim < 4 attribute — confirm the pinned version.
print(len(model.wv.vocab))

Loaded model
134210


In [10]:
# Embed the rows of both tables and store the vectors as float16.
et, ep = (
    embed_dataframe(frame, model).astype('float16')
    for frame in (trans, perms)
)
# Keep each frame's row labels in an explicit "indx" column.
et["indx"], ep["indx"] = et.index, ep.index

In [11]:
# Drop the references to the raw inputs; only the embedded frames are used from here on.
allFiles = transactions = permissions = trans = perms = None

In [12]:
# Per-column memory footprint of the embedded transactions frame.
et.memory_usage()

Index    57504
0        14376
1        14376
2        14376
3        14376
4        14376
5        14376
6        14376
7        14376
8        14376
9        14376
indx     57504
dtype: int64

In [13]:
# Sanity check of the embedding: the entries most similar to 'slaaf'.
print(model.wv.most_similar('slaaf'))

[('slaff', 0.9964933395385742), ('slaaft', 0.9945201873779297), ('slaafje', 0.9941996335983276), ('slaaffje', 0.9937145709991455), ('slaafsje', 0.9927575588226318), ('slaaffnae', 0.9893152713775635), ('slaav', 0.989169716835022), ('slaa', 0.9886020421981812), ('slaaff', 0.9885046482086182), ('slaax', 0.9879209995269775)]


In [14]:
# When True, the next cell ignores the CSV at data_path and rebuilds the pairwise table.
force_new_dist = False

In [15]:
# Load the cached pairwise table unless a rebuild is forced or the cache cannot be read.
distances = None
if not force_new_dist:
    try:
        distances = pd.read_csv(data_path, index_col=0)
        print("Loaded dataset")
    except (OSError, ValueError) as err:
        # A missing file is an OSError; pandas' empty-data / parser errors are ValueErrors.
        print(f"Could not load {data_path!r} ({err!r})")

if distances is None:
    print("Generating dataset")
    n_batches = 20
    # n_batches + 1 boundaries give exactly n_batches half-open slices of `ep`.
    ep_bounds = np.linspace(0, ep.shape[0], n_batches + 1).astype(int)

    batch_results = []
    for i in range(1, n_batches + 1):
        clear_output(wait=True)
        print(f"[{i}/{n_batches}]")
        # Half-open slice: the old `+ 1` on the upper bound put each boundary row
        # into two consecutive batches and therefore duplicated its pairs.
        ep_slice = ep.iloc[ep_bounds[i-1]:ep_bounds[i]]
        res = ep_slice.merge(et, how='cross', suffixes=('_permission', '_transaction'))
        # NOTE(review): every batch of the full cross product is kept in memory and
        # a recorded run of this cell ended in MemoryError.  Filtering each batch to
        # the blocked candidate pairs (disabled alternative below) would shrink it:
        #         comb_index = res['indx_transaction'].astype(str) + "_" + res['indx_permission'].astype(str)
        #         batch_results.append(res[comb_index.isin(possible_matches["total"])])
        batch_results.append(res)

    distances = pd.concat(batch_results)
    batch_results = None
    distances.reset_index(inplace=True, drop=True)
    distances.to_csv(data_path)

[5/10]


MemoryError: Unable to allocate 114. MiB for an array with shape (20, 2997396) and data type float16

In [None]:
# Keep only the candidate pairs whose permission appears in the annotated `matches`.
# .copy() makes X an independent frame, so the "Match" assignments below modify X
# itself instead of a slice of `distances` (pandas' SettingWithCopy hazard).
X = distances[distances['indx_permission'].isin(matches["permission_indx"].unique())].copy()
X["Match"] = 0
for _, match in matches.iterrows():
    # The string "None" marks a permission without a matching transaction.
    if match["transaction_indx"] != "None":
        indx = distances.query(f"indx_permission == {match['permission_indx']} & indx_transaction == {match['transaction_indx']}").index
        X.loc[indx, "Match"] = 1
# Features: every column except the first three and the trailing "Match" label.
# NOTE(review): with the column layout written by the cross-merge cell this positional
# slice still contains the indx_permission / indx_transaction identifier columns —
# confirm they are meant to be model inputs.
data = X.iloc[:, 3:-1]
y = X.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3, random_state=42)

In [None]:
# Size of the training split; the blocking filter in the following cells is commented out.
print(f"{X_train.shape[0]} rows before blocking")

In [None]:
# Display the feature frame (X without its first three columns and without "Match").
data

In [None]:
# mask = (X_train["SlaafNaamNieuw"] > 0.3) & (X_train["SlaafGender"] > 0.3)

In [None]:
# X_train = X_train[mask]
# y_train = y_train[mask]

In [None]:
# print(f"{X_train.shape[0]} rows after blocking")

In [None]:
# Convert all four splits to float NumPy arrays.
X_train, X_test, y_train, y_test = (
    split.to_numpy().astype(float)
    for split in (X_train, X_test, y_train, y_test)
)

In [None]:
from sklearn.linear_model import LogisticRegressionCV

# Standardise the features, then fit a cross-validated logistic regression scored on F1.
est = make_pipeline(StandardScaler(), LogisticRegressionCV(scoring='f1'))
# fit_and_test_classifier is not defined in this notebook (presumably it comes from
# the Annotation_helper star import — confirm); its return value is what gets registered.
LGClf = fit_and_test_classifier(est, X_train, X_test, y_train, y_test, model_name="Log_reg2", figure_folder=figure_folder)

# Register under the label used in the ROC comparison.
Classifiers["Logistic Regression"] = LGClf

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# 1-nearest-neighbour classifier on the unscaled features (no StandardScaler step here).
# fit_and_test_classifier is not defined in this notebook (presumably from Annotation_helper — confirm).
KNNClf = fit_and_test_classifier(KNeighborsClassifier(n_neighbors=1), X_train, X_test, y_train, y_test, model_name="KNN", figure_folder=figure_folder)

# Register under the label used in the ROC comparison.
Classifiers["KNeigbors classifier"] = KNNClf

## 

In [None]:
from sklearn.neural_network import MLPClassifier

# Standardise the features, then fit an MLP with two hidden layers of 30 units (fixed random_state).
est = make_pipeline(StandardScaler(), MLPClassifier(activation='relu', solver='adam', alpha=1e-5, hidden_layer_sizes=(30, 30), learning_rate='adaptive', random_state=1, verbose=False))

# fit_and_test_classifier is not defined in this notebook (presumably from Annotation_helper — confirm).
NNClf = fit_and_test_classifier(est, X_train, X_test, y_train, y_test, model_name="MLP", figure_folder=figure_folder)
# Register under the label used in the ROC comparison.
Classifiers["Neural network"] = NNClf

In [None]:
# One ROC curve and AUC value per registered classifier.
# NOTE: the loop rebinds the notebook-level name `model` (previously the FastText
# model); the cell after this one calls `model.predict_proba` and relies on that.
for name, model in Classifiers.items():
    # probability estimates on the test set
    lr_probs = model.predict_proba(X_test)
    # if each row is itself an array, keep only its second entry (positive class)
    if type(lr_probs[0]) is np.ndarray:
        lr_probs = lr_probs[:, 1]
    lr_auc = roc_auc_score(y_test, lr_probs)
    print(name + ': ROC AUC=%.3f' % (lr_auc))
    lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
    pyplot.plot(lr_fpr, lr_tpr, marker='.', label=name)

# Baseline curve: a constant score of 0 for every test sample.
ns_probs = [0] * len(y_test)
ns_auc = roc_auc_score(y_test, ns_probs)
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
print('No Skill: ROC AUC=%.3f' % (ns_auc))

# Label the axes, add the legend and render the figure.
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

In [None]:
# `model` is whatever the ROC loop bound last (the final entry of Classifiers), not the FastText model.
model.predict_proba(X_test)

In [None]:
# Probability estimates on the test set from NNClf (the object fit_and_test_classifier returned for the MLP pipeline).
pred = NNClf.predict_proba(X_test)
pred