In [80]:
# import the required libraries
import os
import re
import csv
import pandas as pd
import numpy as np
import pyodbc


In [81]:
# To link the DRC with the boekzaallijst we decided to follow 6 strategies. 

# Strategy 1 first we generated a link based on: 
# 1. the first letter of the name, the full surname and the year of the first time someone acted as minister.
# 2. the first letter of the name, the full surname and the year of the first time someone acted as minister +1, since the boekzaallijst contains information about when someone graduated there can be one year in between.
# 3. The first 3 of the surname and the year of the first time someone acted as minister.   
# 4. The first 3 of the surname and the year of the first time someone acted as minister +1 (see 2).
# 5. Try to match the two options form 1 and 2 based on Levenshtein distances

# Before we start we load the "boekzaalijst" data from a csv file.

In [82]:
# Set variables for the project (i.e. the input location of the file to be processed and the output location) )

folderlink = '..//data//'
input_folder = 'input//'
input_file = os.path.join(folderlink+input_folder, 'boekzaallijst_27072023.csv')
folder_output = 'output//'
output_csv = folderlink+folder_output+'clerus_boekzaal.csv'
drc_database = 'DRC_05102023_merged.accdb'


# Panda settings for showing data
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [83]:
# Load the boekzaallijst dataset from a csv
years_to_integers = {'Jaar (Begin) Rol': pd.Int64Dtype(),'Jaar Beroepen': pd.Int64Dtype(), }
boekzaallijst = pd.read_csv(input_file, sep=';', dtype=years_to_integers, encoding='utf-8')

In [84]:
def get_first_letter(row, name_column, initial_column):
    name_letter = row[name_column][0] if pd.notnull(row[name_column]) else None
    initial_letter = row[initial_column][0] if pd.notnull(row[initial_column]) else None
    return name_letter or initial_letter

In [85]:
# Create the new field containing the 'First_Letter'
boekzaallijst['first_letter'] = boekzaallijst.apply(lambda row: get_first_letter(row, 'Voornaam_BZ', 'Voorletter_BZ'), axis=1)

In [86]:
fil_boekzaallijst = boekzaallijst.dropna(subset=['Jaar Beroepen'])

In [87]:
# Create the link to formulate the connection using strategy 1
fil_boekzaallijst['strat1_boekzaallink'] = fil_boekzaallijst['first_letter'].astype(str) + '_' + fil_boekzaallijst['Achternaam_BZ'].astype(str) + '_' + fil_boekzaallijst['Jaar Beroepen'].astype(str).str.replace(' ', '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fil_boekzaallijst['strat1_boekzaallink'] = fil_boekzaallijst['first_letter'].astype(str) + '_' + fil_boekzaallijst['Achternaam_BZ'].astype(str) + '_' + fil_boekzaallijst['Jaar Beroepen'].astype(str).str.replace(' ', '')


In [88]:
# Create the link to formulate the connection using strategy 2
def lower_one_to_integer(num):
    return num - 1

In [89]:
fil_boekzaallijst['year_min1'] = fil_boekzaallijst['Jaar Beroepen'].apply(lower_one_to_integer)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fil_boekzaallijst['year_min1'] = fil_boekzaallijst['Jaar Beroepen'].apply(lower_one_to_integer)


In [90]:
fil_boekzaallijst['strat2_boekzaallink'] = fil_boekzaallijst['first_letter'].astype(str) + '_' + fil_boekzaallijst['Achternaam_BZ'].astype(str) + '_' + fil_boekzaallijst['year_min1'].astype(str).str.replace(' ', '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fil_boekzaallijst['strat2_boekzaallink'] = fil_boekzaallijst['first_letter'].astype(str) + '_' + fil_boekzaallijst['Achternaam_BZ'].astype(str) + '_' + fil_boekzaallijst['year_min1'].astype(str).str.replace(' ', '')


In [91]:
# Strategy 3 and 4 
fil_boekzaallijst['strat3_boekzaallink'] =  fil_boekzaallijst['Achternaam_BZ'].str[:3]+ '_' + fil_boekzaallijst['Jaar Beroepen'].astype(str).str.replace(' ', '')
fil_boekzaallijst['strat4_boekzaallink'] =  fil_boekzaallijst['Achternaam_BZ'].str[:3]+ '_' + fil_boekzaallijst['year_min1'].astype(str).str.replace(' ', '')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fil_boekzaallijst['strat3_boekzaallink'] =  fil_boekzaallijst['Achternaam_BZ'].str[:3]+ '_' + fil_boekzaallijst['Jaar Beroepen'].astype(str).str.replace(' ', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fil_boekzaallijst['strat4_boekzaallink'] =  fil_boekzaallijst['Achternaam_BZ'].str[:3]+ '_' + fil_boekzaallijst['year_min1'].astype(str).str.replace(' ', '')


In [92]:
fil_boekzaallijst.head()

Unnamed: 0,Nr_BZ,Achternaam_BZ,Tussenvoegsel_BZ,Voornaam_BZ,Voorletter_BZ,Variate_Achternaam_BZ,Variate_Voornaam_BZ,Variatie_Voorletter_BZ,Family name,Toevoeging,Classis,Rol,Rol plaats,Jaar (Begin) Rol,Periode (eind) Rol,Jaar 2 en verder rol,Periode 2 (en verder) rol,Jaar (eind) rol,Periode (eind) rol,Jaar Beroepen,Periode (Beroepen),Rol.1,Bestemming,Toevoeging plaats bestemming,Aantekening bij bestemming,Overleden,Jaar overlijden,Extra functie,Locatie xtra functie,Jaar begin xtra functie,Periode xtra functie,Unnamed: 31,Periode (einde),Aantekening xtra functie,first_letter,strat1_boekzaallink,year_min1,strat2_boekzaallink,strat3_boekzaallink,strat4_boekzaallink
0,1,Aaken,van,,J.,,,J.H.,,,13.0,Proponent,,1815.0,AB,,,,,1816,A,Predikant,St. Michiels Gestel en Gemunde,,,,,,,,,,,,J,J_Aaken_1816,1815,J_Aaken_1815,Aak_1816,Aak_1815
1,2,Aalburg,van,Joannes,,,,,,,,,,,,,,,,1743,B,Predikant,Oudkarspel,,,,,,,,,,,,J,J_Aalburg_1743,1742,J_Aalburg_1742,Aal_1743,Aal_1742
4,5,Aartsen,,Albertus,,,,,,,40.0,Proponent,,1766.0,AB,1767.0,AB,,,1776,B,Predikant,Randwijk,,,,,,,,,,,,A,A_Aartsen_1776,1775,A_Aartsen_1775,Aar_1776,Aar_1775
5,6,Aartsen,,Antonius,,Aartzen,,,,,41.0,Proponent,,1742.0,A,,,,,1742,B,Predikant,Zunderdorp,,,,,,,,,,,,A,A_Aartsen_1742,1741,A_Aartsen_1741,Aar_1742,Aar_1741
6,7,Aartze,,Lambertus,,,,,Ant. Fil.,,26.0,Proponent,,1773.0,B,1774.0,A,,,1774,B,Predikant,Voorhout,,,,,,,,,,,,L,L_Aartze_1774,1773,L_Aartze_1773,Aar_1774,Aar_1773


In [93]:
conn_str = (
    r'DRIVER={Microsoft Access Driver (*.mdb, *.accdb)};'
    r'DBQ='+folderlink+input_folder+drc_database+';'
)

In [94]:
# Establish the connection
conn = pyodbc.connect(conn_str)

# Read the table into a pandas DataFrame
# Replace 'your_table_name' with the name of the table you want to read.
drc_bio = pd.read_sql('SELECT * FROM 01_DRC_BIO', conn)
drc_role = pd.read_sql('SELECT * FROM 12_DRC_roles', conn)

# Close the connection
conn.close()



In [95]:
def double_to_integer(dataframe, field):
    dataframe[field] = dataframe[field].astype('Int64')  


In [96]:
double_to_integer(drc_role, 'role_start_year')
double_to_integer(drc_role, 'role_end_year')
double_to_integer(drc_bio, 'birth_year')
double_to_integer(drc_bio, 'death_year')
double_to_integer(drc_bio, 'baptized_year')
double_to_integer(drc_bio, 'burried_year')

In [97]:
drc_joined = pd.merge(drc_bio, drc_role, left_on='drc_id', right_on='drc_id', how = 'right')


In [98]:
drc_subset = drc_joined[drc_joined['role_type'] == 'predikant']

In [99]:
drc_subset.dropna(subset=['role_start_year'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drc_subset.dropna(subset=['role_start_year'], inplace=True)


In [100]:
first_minister_subset = drc_subset.loc[drc_subset.groupby('drc_id')['role_start_year'].idxmin()]

In [101]:
first_minister_subset.head()

Unnamed: 0,drc_id,first_name,infix,surname,first_letter,place_birth,place_death,place_baptized,place_burried,original_input,name_info_family,birth_year,death_year,baptized_year,burried_year,birth_year_accuracy,death_year_accuracy,baptized_year_accuracy,burried_year_accuracy,join_name,DRC_checked,DRC_modifications,DRC_checking_remarks,role_place,role_start_year,role_start_year_accuracy,role_end_year,role_end_year_accuracy,role_remarks,role_type
2288,1,Johannes,van,Aalburg,J,Zierikzee,,,,"Aalburg, van Johannes Geb. Zierikzee ca. 1717 pred. Oudkarspel 30 juni 1743, overl. 14 maart 1777.<1>",,1717.0,1777,,,circa,,,,Aalburg Johannes van,True,False,,Oudkarspel,1743,,,,Oudkarspel 30 juni 1743,predikant
29636,2,Cornelius,van,Aalst,C,Castricum,Amsterdam,,,"Aalst, van Cornelius Geb. Castricum ca. 1686 ambassadepred. in Parijs maart tot dec. 1715 pred. Kalslagen ber. 21 febr. 1717, emer. 1751 overl. Amsterdam 27 aug. 1756.<2>",,1686.0,1756,,,circa,,,,Aalst Cornelius van,True,True,,Kalslagen,1717,,,,Kalslagen ber. 21 febr. 1717,predikant
29638,3,Gerardus,van,Aalst,G,,,,,"Aalst, van Gerardus Geb. xxx sept. 1678 pred. Vuren en Dalem 10 aug. 1704, Sommelsdijk 13 juni 1706, West-Zaandam 4 aug. 1715, emer. 1755 overl. 29 juni 1759.<3>",,1678.0,1759,,,,,,,Aalst Gerardus van,True,False,,Vuren en Dalem,1704,,,,Vuren en Dalem 10 aug. 1704,predikant
29641,4,Wilhelmus,,Aalst,W,,,Biggekerke,,"Aalst Wilhelmus Gedoopt Biggekerke 5 jan. 1664 pred. Aardenburg 22 mei 1695, overl. 19 dec. 1700.<4>",,,1700,1664.0,,,,,,Aalst Wilhelmus,True,False,,Aardenburg,1695,,,,Aardenburg 22 mei 1695,predikant
29643,5,Henricus,,Aalstius,H,'s-Hertogenbosch (?),Limmen,,,"Aalstius Henricus Geb. 's-Hertogenbosch (?) yyy pred. Castricum en Heemskerk nov. 1700, emer. sept. 1733 overl. Limmen 15 maart 1736.<5>",,,1736,,,,,,,Aalstius Henricus,True,False,,Castricum en Heemskerk,1700,,,,Castricum en Heemskerk nov. 1700,predikant


In [107]:
# Creating the linking field for strategy 1 and strategy 2

first_minister_subset['strat12_drc_link'] = first_minister_subset['first_letter'].astype(str) + '_' +first_minister_subset['surname'].astype(str) + '_' + first_minister_subset['role_start_year'].astype(str).str.replace(' ', '')

In [108]:
# Creating the linking field for strategy 3 and strategy 4
first_minister_subset['strat34_drc_link'] = first_minister_subset['surname'].str[:3] + '_' + first_minister_subset['role_start_year'].astype(str).str.replace(' ', '')


In [106]:
first_minister_subset.head()

Unnamed: 0,drc_id,first_name,infix,surname,first_letter,place_birth,place_death,place_baptized,place_burried,original_input,name_info_family,birth_year,death_year,baptized_year,burried_year,birth_year_accuracy,death_year_accuracy,baptized_year_accuracy,burried_year_accuracy,join_name,DRC_checked,DRC_modifications,DRC_checking_remarks,role_place,role_start_year,role_start_year_accuracy,role_end_year,role_end_year_accuracy,role_remarks,role_type,strat1_drc_link,strat34_drc_link
2288,1,Johannes,van,Aalburg,J,Zierikzee,,,,"Aalburg, van Johannes Geb. Zierikzee ca. 1717 pred. Oudkarspel 30 juni 1743, overl. 14 maart 1777.<1>",,1717.0,1777,,,circa,,,,Aalburg Johannes van,True,False,,Oudkarspel,1743,,,,Oudkarspel 30 juni 1743,predikant,J_Aalburg_1743,Aal_1743
29636,2,Cornelius,van,Aalst,C,Castricum,Amsterdam,,,"Aalst, van Cornelius Geb. Castricum ca. 1686 ambassadepred. in Parijs maart tot dec. 1715 pred. Kalslagen ber. 21 febr. 1717, emer. 1751 overl. Amsterdam 27 aug. 1756.<2>",,1686.0,1756,,,circa,,,,Aalst Cornelius van,True,True,,Kalslagen,1717,,,,Kalslagen ber. 21 febr. 1717,predikant,C_Aalst_1717,Aal_1717
29638,3,Gerardus,van,Aalst,G,,,,,"Aalst, van Gerardus Geb. xxx sept. 1678 pred. Vuren en Dalem 10 aug. 1704, Sommelsdijk 13 juni 1706, West-Zaandam 4 aug. 1715, emer. 1755 overl. 29 juni 1759.<3>",,1678.0,1759,,,,,,,Aalst Gerardus van,True,False,,Vuren en Dalem,1704,,,,Vuren en Dalem 10 aug. 1704,predikant,G_Aalst_1704,Aal_1704
29641,4,Wilhelmus,,Aalst,W,,,Biggekerke,,"Aalst Wilhelmus Gedoopt Biggekerke 5 jan. 1664 pred. Aardenburg 22 mei 1695, overl. 19 dec. 1700.<4>",,,1700,1664.0,,,,,,Aalst Wilhelmus,True,False,,Aardenburg,1695,,,,Aardenburg 22 mei 1695,predikant,W_Aalst_1695,Aal_1695
29643,5,Henricus,,Aalstius,H,'s-Hertogenbosch (?),Limmen,,,"Aalstius Henricus Geb. 's-Hertogenbosch (?) yyy pred. Castricum en Heemskerk nov. 1700, emer. sept. 1733 overl. Limmen 15 maart 1736.<5>",,,1736,,,,,,,Aalstius Henricus,True,False,,Castricum en Heemskerk,1700,,,,Castricum en Heemskerk nov. 1700,predikant,H_Aalstius_1700,Aal_1700


In [109]:
ligth_drc = first_minister_subset[['drc_id','original_input','strat34_drc_link','strat12_drc_link']]


In [110]:
ligth_bz = fil_boekzaallijst[['Nr_BZ','strat1_boekzaallink','strat2_boekzaallink','strat3_boekzaallink','strat4_boekzaallink']]

In [112]:
ligth_drc.head()

Unnamed: 0,drc_id,original_input,strat34_drc_link,strat12_drc_link
2288,1,"Aalburg, van Johannes Geb. Zierikzee ca. 1717 pred. Oudkarspel 30 juni 1743, overl. 14 maart 1777.<1>",Aal_1743,J_Aalburg_1743
29636,2,"Aalst, van Cornelius Geb. Castricum ca. 1686 ambassadepred. in Parijs maart tot dec. 1715 pred. Kalslagen ber. 21 febr. 1717, emer. 1751 overl. Amsterdam 27 aug. 1756.<2>",Aal_1717,C_Aalst_1717
29638,3,"Aalst, van Gerardus Geb. xxx sept. 1678 pred. Vuren en Dalem 10 aug. 1704, Sommelsdijk 13 juni 1706, West-Zaandam 4 aug. 1715, emer. 1755 overl. 29 juni 1759.<3>",Aal_1704,G_Aalst_1704
29641,4,"Aalst Wilhelmus Gedoopt Biggekerke 5 jan. 1664 pred. Aardenburg 22 mei 1695, overl. 19 dec. 1700.<4>",Aal_1695,W_Aalst_1695
29643,5,"Aalstius Henricus Geb. 's-Hertogenbosch (?) yyy pred. Castricum en Heemskerk nov. 1700, emer. sept. 1733 overl. Limmen 15 maart 1736.<5>",Aal_1700,H_Aalstius_1700


In [117]:
strategy1 = pd.merge(ligth_bz, ligth_drc, left_on='strat1_boekzaallink', right_on='strat12_drc_link', how='inner')
strategy1['strategy'] = 1
strategy2 = pd.merge(ligth_bz, ligth_drc, left_on='strat2_boekzaallink', right_on='strat12_drc_link', how='inner')
strategy2['strategy'] = 2
strategy3 = pd.merge(ligth_bz, ligth_drc, left_on='strat3_boekzaallink', right_on='strat34_drc_link', how='inner')
strategy3['strategy'] = 3
strategy4 = pd.merge(ligth_bz, ligth_drc, left_on='strat4_boekzaallink', right_on='strat34_drc_link', how='inner')
strategy4['strategy'] = 4

In [118]:
appended_strategies = pd.concat([strategy1, strategy2, strategy3, strategy4], ignore_index=True)


In [115]:
appended_strategies.describe()

Unnamed: 0,Nr_BZ,drc_id
count,5683.0,5683.0
mean,2427.205877,6083.981524
std,1358.299408,2988.358155
min,2.0,1.0
25%,1268.0,3860.0
50%,2474.0,6487.0
75%,3657.5,8615.0
max,4696.0,11236.0


In [None]:
boekzaal_minister_left = pd.merge(fil_boekzaallijst, minister, left_on='boekzaallink', right_on='drc_link', how='left')
boekzaal_minister_left.to_csv(folderlink+folder_output+'boekzaal_minister_year_left.csv', sep=';', encoding='utf-8', index=False)