In [22]:
import pyodbc
import pandas as pd
import shutil
import re

def copy_database(original_path, temp_path):
    shutil.copyfile(original_path, temp_path)

def export_access_to_dataframes(database_path):
    # Connection string for Access database
    conn_str = (
        r'DRIVER={Microsoft Access Driver (*.mdb, *.accdb)};'
        r'DBQ=' + database_path + ';'
    )

    # Establish a connection to the Access database
    conn = pyodbc.connect(conn_str)
    cursor = conn.cursor()

    # Get a list of all tables in the database
    tables = [row.table_name for row in cursor.tables(tableType='TABLE')]

    # Loop through the tables and load each into a DataFrame
    for table in tables:
        query = f'SELECT * FROM [{table}]'
        df = pd.read_sql(query, conn)
        globals()[f'tbl_{table}'] = df  # Create a global variable with the table name

    # Close the connection
    conn.close()

def update_access_table(database_path, table_name, df):
    # Connection string for Access database
    conn_str = (
        r'DRIVER={Microsoft Access Driver (*.mdb, *.accdb)};'
        r'DBQ=' + database_path + ';'
    )

    # Establish a connection to the Access database
    conn = pyodbc.connect(conn_str)
    cursor = conn.cursor()

    # Clear the existing table
    cursor.execute(f'DELETE FROM [{table_name}]')
    conn.commit()

    # Insert the updated data
    for index, row in df.iterrows():
        columns = ', '.join(row.index)
        placeholders = ', '.join(['?' for _ in row])
        values = tuple(row)
        sql = f'INSERT INTO [{table_name}] ({columns}) VALUES ({placeholders})'
        cursor.execute(sql, values)
    conn.commit()

    # Close the connection
    conn.close()

# Paths
original_database_path = 'E:\\digidure\\CLERUS_v1_24072024.accdb'
temp_database_path = 'E:\\digidure\\CLERUS_temp.accdb'

# Copy the database
copy_database(original_database_path, temp_database_path)

# Export data from the copied database
export_access_to_dataframes(temp_database_path)

  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)


In [23]:
# Panda settings for showing data (this is foremost done to more easily explore the data while processing it)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [24]:
# All the individuals that could be matched with DRC are already curated and processed. The next step is to extract individuals from DM that are not in DRC. 
# When a link between DM and DRC was found, a value for clerus_id or new_clerus_id was produced. Thus to isolate those that have not been matched we create 
# a subselection existing of rows where these fields are empty.
filtered_only_DM = tbl_999_Dm_all_drc_match[tbl_999_Dm_all_drc_match['clerus_id'].isna() & tbl_999_Dm_all_drc_match['new_clerus_id'].isna()]


In [25]:
# Since we want to link the individuals to clerus we want to use the individual_id lateron as a new clerus_id. To avoid overlap in numbers we added 9000000 to every id. The individual ids have been created through 2_1_check_links_DRC_DM.ipynb
filtered_only_DM["individual_id"] = filtered_only_DM["individual_id"] + 9000000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_only_DM["individual_id"] = filtered_only_DM["individual_id"] + 9000000


In [26]:
filtered_only_DM.head(15)

Unnamed: 0,pid,ind_id,provincie,classis,gemeente,wijk,predikant,Herkomst,dag intrede,maand intrede,jaar intrede,vertrek naar of vanwege,dag vertrek,maand vertrek,jaar vertrek,Bijzonderheden,individual_id,clerus_id,new_clerus_id
0,35651,1.0,ZH,,Oudewater,,(Hekendorp) Brunner; L.,hulppredikant Oirschot,,,1867,Lobith,,,1868.0,,9013955,,
1,45546,2.0,GE,,Velp,2. Oude Jan,(Hilten van-)Matthijsen;mevrouw L.,kandidaat,2.0,maart,1986,Apeldoorn,23.0,oktober,1994.0,,9013956,,
2,7179,3.0,NB,,Breda,,Aa; dr Pieter Jan Baptist Karel Simon van der,Winterswijk,26.0,februari,1893,emeritaat,1.0,mei,1904.0,,9013962,,
3,21505,3.0,GR,,Hornhuizen,,Aa; dr. P.J.B.K. Simon van der,kandidaat,,,1862,Goutum,,,1866.0,,9013959,,
4,36887,3.0,NH,,Purmerend,,Aa; dr. P.J.B.K. Simon van der,Goutum,,,1886,Winterswijk,,,1892.0,,9013960,,
5,50146,3.0,GE,,Winterswijk,,Aa; dr. P.J.B.K. Simon van der,Purmerend,,,1892,Breda,,,1893.0,,9013960,,
6,15179,3.0,FR,,Goutum,,Aa; dr. P.J.B.K. van der,Kloosterburen,,,1866,Purmerend,,,1886.0,,9013961,,
7,7204,3.0,NB,,Breda,4. Wijk Noord,Aa; dr. P.J.B.K.S. van der,Winterswijk,26.0,februari,1893,emeritaat,1.0,mei,1904.0,,9013958,,
8,19779,4.0,GE,,Herwijnen,,Aa; W.P. van der,kandidaat,24.0,december,1989,,,,,,9013957,,
9,35941,5.0,OV,,Overdinkel,,Aaij; J.,Hoogkarspel-De Drieslag,25.0,september,1983,emeritaat,1.0,september,1986.0,,9013963,,


In [None]:
#From this dataset we will be extracting all kind of information to align with the Clerus database structure.

In [50]:
#First we extract information for every individual which we use to fill in the table 01_clerus_bio. These are the names of the individual, the year of death and remarks (Bijzonderheden).
# since we are not interested in all the other fields we make a subselection

DM_tbl_bio_1 = filtered_only_DM[['pid', 'predikant', "vertrek naar of vanwege", "jaar vertrek", "Bijzonderheden", "individual_id"]]

In [51]:
# Extract Year of Death 
# After analysing the data in DM it appears that information about someone's year of death can be found in the field "Bijzonderheden", but also in the field "vertrek naar of vanwege". 
# To extract this information and to save it in a new field is done in two steps.

# first we extract year from Bijzonderheden where it contains the string "overl." or "overleden"
def extract_year(bijzonderheden):
    if isinstance(bijzonderheden, str):
        match = re.search(r'(overleden|overl\.)\s*(\d{4})', bijzonderheden, re.IGNORECASE)
        if match:
            return int(match.group(2))
    return None

DM_tbl_bio_1['year_death'] = DM_tbl_bio_1['Bijzonderheden'].apply(extract_year)

# Second we update 'year_death' where it is None, using 'jaar_vertrek' if 'vertrek_naar_of_vanwege' contains 'overleden' or 'overl.'
DM_tbl_bio_1['year_death'] = DM_tbl_bio_1.apply(
    lambda row: row['jaar vertrek'] 
    if pd.isnull(row['year_death']) and 
       isinstance(row['vertrek naar of vanwege'], str) and 
       ('overleden' in row['vertrek naar of vanwege'].lower() or 'overl.' in row['vertrek naar of vanwege'].lower())
    else row['year_death'], 
    axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DM_tbl_bio_1['year_death'] = DM_tbl_bio_1['Bijzonderheden'].apply(extract_year)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DM_tbl_bio_1['year_death'] = DM_tbl_bio_1.apply(


In [52]:
# Next we need to make another subselection of the data to get only the name, clerus_id, year of death and remarks. 
DM_tbl_bio_2 = DM_tbl_bio_1[['predikant', "Bijzonderheden", "individual_id", "year_death"]]

In [68]:
DM_tbl_bio_2['year_death'] = DM_tbl_bio_2['year_death'].fillna(0).astype(int)
print(DM_tbl_bio_2.dtypes)

predikant         object
Bijzonderheden    object
individual_id      int64
year_death         int32
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DM_tbl_bio_2['year_death'] = DM_tbl_bio_2['year_death'].fillna(0).astype(int)


In [69]:
# Now we remove all duplicates that remain based on the inidivual_id and prioritize on whether "year_death" has a value. 

DM_tbl_bio_2 = DM_tbl_bio_2.sort_values(by='year_death', ascending=False, na_position='last')



In [70]:
DM_tbl_bio_2.describe()

Unnamed: 0,individual_id,year_death
count,29463.0,29463.0
mean,9022581.0,168.277399
std,5430.808,539.533276
min,9000025.0,0.0
25%,9018184.0,0.0
50%,9022633.0,0.0
75%,9027134.0,0.0
max,9031653.0,2005.0


In [71]:
# Drop duplicates based on 'individual_id', keeping the first occurrence
DM_tbl_bio_unique = DM_tbl_bio_2.drop_duplicates(subset='individual_id', keep='first')

In [72]:
DM_tbl_bio_unique.describe()

Unnamed: 0,individual_id,year_death
count,17460.0,17460.0
mean,9022504.0,279.286025
std,5558.771,672.406587
min,9000025.0,0.0
25%,9018143.0,0.0
50%,9022632.0,0.0
75%,9027147.0,0.0
max,9031653.0,2005.0


In [11]:
# Update the table in the copied database
update_access_table(temp_database_path, '12_clerus_role', df_joined)