In [1]:
from Annotation_helper import *

import os
import re  # For preprocessing
import time
from gensim.models import Word2Vec
from gensim.models import FastText
from gensim.utils import simple_preprocess
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [2]:
#Set all settings for the resulting figures

import seaborn as sns
sns.set()

SMALL_SIZE = 15
MEDIUM_SIZE = 17
BIGGER_SIZE = 19

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

plt.tight_layout()

<Figure size 432x288 with 0 Axes>

In [3]:
data_folder = "../../Data"
structured_data_folder = data_folder + "/structured_data"
text_path = data_folder + "/text"
data_path = structured_data_folder + "/test.csv"
dist_folder = structured_data_folder + "/filtered_Distances"
figure_folder = "../Figures/Results/Embeddings/"
model_folder = "../Models/Embeddings"
matches = pd.read_csv(structured_data_folder + "/Matches/Bas.csv", index_col=0)

In [4]:
transactions = load_transactions(structured_data_folder + "\VOC Cochin Slave Transactions 1706-1801 - IISH - Database - 2018 - v1.csv")
permissions = load_permissions(structured_data_folder + "\VOC Cochin Slave Transport Permissions 1770-1795 - IISH - Database - 2018 - v1.csv")
matches = pd.read_csv(structured_data_folder + "/Matches/Bas.csv", index_col=0)
possible_matches = pd.read_csv(structured_data_folder + "/blocked_possible_matches.csv", index_col=0)
possible_matches["total"] = possible_matches["transaction_indx"].astype(str) + "_" + possible_matches["permission_indx"].astype(str)
perms = permissions.iloc[:, 4:-1]
trans = transactions.iloc[:, 2:-1]

  col = col.str.replace('[^a-zA-Z ]', '')


In [5]:
kf = KFold(n_splits=5, random_state=42, shuffle=True)

In [6]:
def make_meshgrid(x, y, h=.02):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    return xx, yy

def plot_contours(ax, clf, xx, yy, **params):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out

def avg_dict_values(d):
    res = dict()
    for key in d:
        res[key] = np.mean(d[key])
    return res

def append_dict(d1, d2):
    res = dict()
    if d1 == {}:
        for key in d2:
            res[key] = [d2[key]]
        return d2
    else:
        for key in d1:
            if type(d1[key]) != list:
                res[key] = [d1[key]] + [d2[key]]
            else:
                res[key] = d1[key] + [d2[key]]
        return res

def evaluate_rule_based_methods(model, kf, X, y, thresh):
    all_scores = dict()
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        pred = model.predict(X_test.to_numpy(), thresh)
        scores = evaluate_prediction(pred, y_test.to_numpy(), model_name = "Exact Match", figure_folder=figure_folder)
        all_scores = append_dict(all_scores, scores)
    return all_scores

def evaluate_stat_based_methods(model, kf, X, y):
    all_scores = dict()
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        scores = evaluate_prediction(pred, y_test, model_name = "Exact Match", figure_folder=figure_folder)
        all_scores = append_dict(all_scores, scores)
    return all_scores

In [7]:
class text_loader:
    # Iterator that loads text from nested folders containing txt files
    def __init__(self, file_names, max_files=None):
        self.file_names = file_names
        self.max_files = max_files
    
    def __iter__(self):
        self.file_n = 0
        self.row_n = 0
        self.lines = open(self.file_names[self.file_n], "r", errors="ignore").read().split('\n')
        return self

    def __next__(self):
        if self.max_files:
            if self.file_n >= self.max_files:
                raise StopIteration
        if self.row_n >= len(self.lines):
            self.file_n += 1
            self.row_n = 0
            self.lines = open(self.file_names[self.file_n], "r", errors="ignore").read().split('\n')
            if self.file_n % 100 == 0:
                clear_output(wait=True)
                print(f"file: [{self.file_n}/{self.max_files}]")
        if self.file_n <= len(self.file_names):
            if self.lines != "":
                self.row_n += 1
                res = simple_preprocess(str(self.lines[self.row_n - 1]))
                if len(res) > 4:
                    return res
                else:
                    return ['']
                    return next(self)
            else:
                self.row_n += 1
                return ['']
                return next(self)

def embed_dataframe(df, model):
    # Turn text in dataframes to word vectors
    text = df.astype(str).agg(' '.join, axis=1)
    return pd.DataFrame(data = model.wv[text], index=df.index)
            
# def evaluate_prediction(pred, y_true):
#     # Perform different evaluation functions and returns them as images and text
#     cm = confusion_matrix(y_true, pred)
#     disp = ConfusionMatrixDisplay(confusion_matrix=cm)
#     disp.plot()
#     plt.show()
#     print("recall score: ", recall_score(y_true, pred))
#     print("precision score: ", precision_score(y_true, pred))
#     print("f1 score: ", f1_score(y_true, pred))

In [8]:
def load_or_train_fasttext(loc, force_new_model=False):
    # Loads an existing model if availible, else generate and train a new model.

    # loc: Location to save the model.
    # force_new_model: If true: generate a new model, even if one if availible.
    try:
        if force_new_model:
            raise None
        model = FastText.load(loc)
        print("Loaded model")
    except:
        print("Generating model")
        allFiles = []
        for path, subdirs, files in os.walk(text_path):
            for name in files:
                allFiles.append(os.path.join(path, name))
        tl = text_loader(allFiles, max_files = 50000)
        t = iter(tl)
        model = FastText(sentences=t, size=300, window=5, min_count=3, workers=32)
        model.save(loc)
    return model

In [9]:
model = load_or_train_fasttext(model_folder + "/fasttext4.model", force_new_model=False)
print(len(model.wv.vocab))

Loaded model
134216


In [10]:
et = embed_dataframe(trans, model).astype('float16')
ep = embed_dataframe(perms, model).astype('float16')
et["indx"] = et.index
ep["indx"] = ep.index

In [11]:
# Reset some variables to save memory.
allFiles = None
transactions = None
permissions = None
trans = None
perms = None
model = None

In [12]:
def generate_distances(data_path, force_new_dist=False):
    try:
        if force_new_dist:
            raise None
        distances = pd.read_csv(data_path, index_col=0)
        print("Loaded model")
    except:
        print("Generating dataset")
        n_batches = 20
        et_indexes = np.linspace(0, et.shape[0], n_batches).astype(int)
        ep_indexes = np.linspace(0, ep.shape[0], n_batches).astype(int)

        batch_results = []
        for i in range(1, n_batches):
            clear_output(wait=True)
            print(f"[{i}/{n_batches}]")
            ep_slice = ep.iloc[ep_indexes[i-1]:ep_indexes[i] + 1]
            res = ep_slice.merge(et, how='cross',suffixes=('_permission', '_transaction'))
            comb_index = res['indx_transaction'].astype(str) + "_" + res['indx_permission'].astype(str)
            batch_results.append(res[comb_index.isin(possible_matches["total"])])

        distances = pd.concat(batch_results)
        batch_results = None
        distances.reset_index(inplace=True, drop=True)
        distances.to_csv(data_path)
    return distances

In [13]:
distances = generate_distances(data_path, force_new_dist=False)

Loaded model


In [14]:
Classifiers = dict()
Scores = dict()

In [15]:
X = distances[distances['indx_permission'].isin(matches["permission_indx"].unique())]
X["Match"] = 0
for i, match in matches.iterrows():
    if match["transaction_indx"] != "None":
        indx = distances.query(f"indx_permission == {match['permission_indx']} & indx_transaction == {match['transaction_indx']}").index
        X.loc[indx, "Match"] = 1
data = X.iloc[:, 3:-1]
y = X.iloc[:, -1]
X = data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Match"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


# Neural network

In [16]:
model = make_pipeline(StandardScaler(), MLPClassifier(activation='relu', solver='adam', alpha=1e-5, hidden_layer_sizes=(300, 300), learning_rate='adaptive', random_state=1, verbose=False))
s = evaluate_stat_based_methods(model, kf, X, y)
Scores["Neural network"] = avg_dict_values(s)
Scores["Neural network"]



KeyboardInterrupt: 

## 

In [None]:
s = []
for c in Scores:
    s.append(pd.Series(Scores[c]))
results = pd.concat(s, axis=1)
results.columns = Scores.keys()
results.transpose()