In [96]:
import pyodbc
import pandas as pd
import shutil
import re
import numpy as np


def copy_database(original_path, temp_path):
    shutil.copyfile(original_path, temp_path)

def export_access_to_dataframes(database_path):
    # Connection string for Access database
    conn_str = (
        r'DRIVER={Microsoft Access Driver (*.mdb, *.accdb)};'
        r'DBQ=' + database_path + ';'
    )

    # Establish a connection to the Access database
    conn = pyodbc.connect(conn_str)
    cursor = conn.cursor()

    # Get a list of all tables in the database
    tables = [row.table_name for row in cursor.tables(tableType='TABLE')]

    # Loop through the tables and load each into a DataFrame
    for table in tables:
        query = f'SELECT * FROM [{table}]'
        df = pd.read_sql(query, conn)
        globals()[f'tbl_{table}'] = df  # Create a global variable with the table name

    # Close the connection
    conn.close()

def update_access_table(database_path, table_name, df):
    # Connection string for Access database
    conn_str = (
        r'DRIVER={Microsoft Access Driver (*.mdb, *.accdb)};'
        r'DBQ=' + database_path + ';'
    )

    # Establish a connection to the Access database
    conn = pyodbc.connect(conn_str)
    cursor = conn.cursor()

    # Clear the existing table
    cursor.execute(f'DELETE FROM [{table_name}]')
    conn.commit()

    # Insert the updated data
    for index, row in df.iterrows():
        columns = ', '.join(row.index)
        placeholders = ', '.join(['?' for _ in row])
        values = tuple(row)
        sql = f'INSERT INTO [{table_name}] ({columns}) VALUES ({placeholders})'
        cursor.execute(sql, values)
    conn.commit()

    # Close the connection
    conn.close()

# Paths
original_database_path = 'E:\\digidure\\CLERUS_v2_06082024.accdb'
temp_database_path = 'E:\\digidure\\CLERUS_v3_06082024.accdb'

# Copy the database
copy_database(original_database_path, temp_database_path)

# Export data from the copied database
export_access_to_dataframes(temp_database_path)

  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)


In [97]:
# Function to append the data to access database
def append_to_access_table(df, database_path, table_name):
    """
    Appends a DataFrame to a table in an Access database.

    Parameters:
    df (pandas.DataFrame): The DataFrame to append.
    database_path (str): The path to the Access database file.
    table_name (str): The name of the table to append the data to.
    """
    # Connection string for Access database
    connection_string = (
        r'DRIVER={Microsoft Access Driver (*.mdb, *.accdb)};'
        r'DBQ=' + database_path + ';'
    )

    # Connect to the Access database
    conn = pyodbc.connect(connection_string)
    cursor = conn.cursor()

    # Append DataFrame to the Access table
    for index, row in df.iterrows():
        columns = ', '.join(row.index)
        values = ', '.join(['?' for _ in row])
        sql = f'INSERT INTO {table_name} ({columns}) VALUES ({values})'
        cursor.execute(sql, tuple(row))

    conn.commit()
    cursor.close()
    conn.close()

In [98]:
# Panda settings for showing data (this is foremost done to more easily explore the data while processing it)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [99]:
# All the individuals that could be matched with DRC are already curated and processed. The next step is to extract individuals from DM that are not in DRC.
# When a link between DM and DRC was found, a value for clerus_id or new_clerus_id was produced. Thus to isolate those that have not been matched we create
# a subselection existing of rows where these fields are empty.
filtered_only_DM = tbl_999_Dm_all_drc_match[tbl_999_Dm_all_drc_match['clerus_id'].isna() & tbl_999_Dm_all_drc_match['new_clerus_id'].isna()]


In [100]:
# Since we want to link the individuals to clerus we want to use the individual_id lateron as a new clerus_id. To avoid overlap in numbers we added 9000000 to every id. The individual ids have been created through 2_1_check_links_DRC_DM.ipynb
filtered_only_DM["individual_id"] = filtered_only_DM["individual_id"] + 9000000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_only_DM["individual_id"] = filtered_only_DM["individual_id"] + 9000000


In [101]:
#From this dataset we will be extracting all kind of information to align with the Clerus database structure.

In [102]:
#First we extract information for every individual which we use to fill in the table 01_clerus_bio. These are the names of the individual, the year of death and remarks (Bijzonderheden).
# since we are not interested in all the other fields we make a subselection

DM_tbl_bio_1 = filtered_only_DM[['pid', 'predikant', "vertrek naar of vanwege", "jaar vertrek", "Bijzonderheden", "individual_id"]]

In [103]:
# Extract Year of Death
# After analysing the data in DM it appears that information about someone's year of death can be found in the field "Bijzonderheden", but also in the field "vertrek naar of vanwege".
# To extract this information and to save it in a new field is done in two steps.

# first we extract year from Bijzonderheden where it contains the string "overl." or "overleden"
def extract_year(bijzonderheden):
    if isinstance(bijzonderheden, str):
        match = re.search(r'(overleden|overl\.)\s*(\d{4})', bijzonderheden, re.IGNORECASE)
        if match:
            return int(match.group(2))
    return None

DM_tbl_bio_1['death_year'] = DM_tbl_bio_1['Bijzonderheden'].apply(extract_year)

# Second we update 'death_year' where it is None, using 'jaar_vertrek' if 'vertrek_naar_of_vanwege' contains 'overleden' or 'overl.'
DM_tbl_bio_1['death_year'] = DM_tbl_bio_1.apply(
    lambda row: row['jaar vertrek']
    if pd.isnull(row['death_year']) and
       isinstance(row['vertrek naar of vanwege'], str) and
       ('overleden' in row['vertrek naar of vanwege'].lower() or 'overl.' in row['vertrek naar of vanwege'].lower())
    else row['death_year'],
    axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DM_tbl_bio_1['death_year'] = DM_tbl_bio_1['Bijzonderheden'].apply(extract_year)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DM_tbl_bio_1['death_year'] = DM_tbl_bio_1.apply(


In [104]:
# Next we need to make another subselection of the data to get only the name, clerus_id, year of death and remarks.
DM_tbl_bio_2 = DM_tbl_bio_1[['predikant', "Bijzonderheden", "individual_id", "death_year"]]

In [105]:
DM_tbl_bio_2['death_year'] = DM_tbl_bio_2['death_year'].fillna(0).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DM_tbl_bio_2['death_year'] = DM_tbl_bio_2['death_year'].fillna(0).astype(int)


In [106]:
# Now we remove all duplicates that remain based on the inidivual_id and prioritize on whether "year_death" has a value.
DM_tbl_bio_2 = DM_tbl_bio_2.sort_values(by='death_year', ascending=False, na_position='last')

In [107]:
# Drop duplicates based on 'individual_id', keeping the first occurrence
DM_tbl_bio_unique = DM_tbl_bio_2.drop_duplicates(subset='individual_id', keep='first')

In [108]:
DM_tbl_bio_unique[['surname', 'first_name']] = DM_tbl_bio_unique['predikant'].str.split(';', expand=True)
DM_tbl_bio_unique.drop(columns=['predikant'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DM_tbl_bio_unique[['surname', 'first_name']] = DM_tbl_bio_unique['predikant'].str.split(';', expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DM_tbl_bio_unique[['surname', 'first_name']] = DM_tbl_bio_unique['predikant'].str.split(';', expand=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DM_tbl_bio_unique.drop(columns=['predikant'], inpl

In [109]:
DM_tbl_bio_unique.rename(columns={'Bijzonderheden': 'remarks', 'individual_id': 'clerus_id'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DM_tbl_bio_unique.rename(columns={'Bijzonderheden': 'remarks', 'individual_id': 'clerus_id'}, inplace=True)


In [110]:
DM_tbl_bio_unique.head(10)

Unnamed: 0,remarks,clerus_id,death_year,surname,first_name
785,,9014158,2005,Alma,G.S.
49910,,9030309,2000,Vrijlandt,M.A.
44667,Parttime,9028633,1999,Storm,G. van der
29816,,9023861,1998,Magedans,F.C.
13378,,9018125,1996,Emmen,L.E.
26048,,9022453,1995,Korporaaj,G.K.
48862,,9029961,1994,Vlastuin,J.
49838,(overleden1994),9030294,1994,Vries,W. de
48851,,9029960,1994,Vlasblom,W.A.
22683,,9021135,1994,Jansen,M.


In [111]:
#Add information about the sex to the individual in DM. All are male except when the name contains "mevr" or "mevrouw"

def determine_sex(first_name):
    if first_name and 'mevr' in first_name.lower():
        return 'female'
    else:
        return 'male'

In [112]:
DM_tbl_bio_unique['sex'] = DM_tbl_bio_unique['first_name'].apply(determine_sex)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DM_tbl_bio_unique['sex'] = DM_tbl_bio_unique['first_name'].apply(determine_sex)


In [113]:
DM_tbl_bio_unique.tail()


Unnamed: 0,remarks,clerus_id,death_year,surname,first_name,sex
19567,,9020100,0,Hertog,M.M. den,male
19560,,9020096,0,Herson,dr. A. van,male
19559,,9020093,0,Herpen,W. van,male
19557,,9020094,0,Herpen,W. van,male
19556,,9020095,0,Herpen,W. van,male


In [114]:
# Append data to database
table_name = '01_clerus_bio'
append_to_access_table(DM_tbl_bio_unique, temp_database_path, table_name)

In [115]:
# Get roles seperate out of the dataframe where roles from individuals that are only present in DM are used. (i.e. filtered_only_DM)
# Provincie [role_province], Classis [role_classis], Gemeente [role_place], clerus_id, pid(as backup), jaar intrede [year_start], jaar vertrek [year_end], dag intrede - maand intrede - jaar intrede [role_start_accuracy], dag vertrek - maand vertrek - jaar vertrek [role_end_accuracy]

filtered_only_DM.head()

Unnamed: 0,pid,ind_id,provincie,classis,gemeente,wijk,predikant,Herkomst,dag intrede,maand intrede,jaar intrede,vertrek naar of vanwege,dag vertrek,maand vertrek,jaar vertrek,Bijzonderheden,individual_id,clerus_id,new_clerus_id
0,35651,1,ZH,,Oudewater,,(Hekendorp) Brunner; L.,hulppredikant Oirschot,,,1867,Lobith,,,1868,,9013955,,
1,45546,2,GE,,Velp,2. Oude Jan,(Hilten van-)Matthijsen;mevrouw L.,kandidaat,2.0,maart,1986,Apeldoorn,23.0,oktober,1994,,9013956,,
2,7179,3,NB,,Breda,,Aa; dr Pieter Jan Baptist Karel Simon van der,Winterswijk,26.0,februari,1893,emeritaat,1.0,mei,1904,,9013962,,
3,21505,3,GR,,Hornhuizen,,Aa; dr. P.J.B.K. Simon van der,kandidaat,,,1862,Goutum,,,1866,,9013959,,
4,36887,3,NH,,Purmerend,,Aa; dr. P.J.B.K. Simon van der,Goutum,,,1886,Winterswijk,,,1892,,9013960,,


In [116]:

role_dm = filtered_only_DM

In [117]:
# from the day, month and year create a date field which we will fill in to start_date_accuracy and end_date_accuracy
role_dm['role_start_date_exact'] = role_dm['dag intrede'].astype(str) + " " + role_dm['maand intrede'] + " " + role_dm['jaar intrede'].astype(str)
role_dm['role_end_date_exact'] = role_dm['dag vertrek'].astype(str) + " " + role_dm['maand vertrek'] + " " + role_dm['jaar vertrek'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  role_dm['role_start_date_exact'] = role_dm['dag intrede'].astype(str) + " " + role_dm['maand intrede'] + " " + role_dm['jaar intrede'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  role_dm['role_end_date_exact'] = role_dm['dag vertrek'].astype(str) + " " + role_dm['maand vertrek'] + " " + role_dm['jaar vertrek'].astype(str)


In [118]:
role_dm.drop(columns=['ind_id','clerus_id', 'new_clerus_id', 'dag vertrek', 'maand vertrek', 'dag intrede', 'maand intrede', 'vertrek naar of vanwege', 'Herkomst', 'predikant'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  role_dm.drop(columns=['ind_id','clerus_id', 'new_clerus_id', 'dag vertrek', 'maand vertrek', 'dag intrede', 'maand intrede', 'vertrek naar of vanwege', 'Herkomst', 'predikant'], inplace=True)


In [119]:
role_dm.rename(columns={'Bijzonderheden': 'role_remarks',
                        'individual_id': 'clerus_id',
                        'pid':'role_source_id',
                        'provincie': 'role_province',
                        'classis': 'role_classis',
                        'gemeente': 'role_place',
                        'wijk': 'role_parish',
                        'jaar intrede': 'role_start_year',
                        'jaar vertrek': 'role_end_year'	}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  role_dm.rename(columns={'Bijzonderheden': 'role_remarks',


In [120]:
role_dm['role_type'] = "predikant"
role_dm['role_remarks_source'] = "DM"



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  role_dm['role_type'] = "predikant"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  role_dm['role_remarks_source'] = "DM"


In [121]:
role_dm['role_end_year'] = role_dm['role_end_year'].fillna(0).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  role_dm['role_end_year'] = role_dm['role_end_year'].fillna(0).astype(int)


In [122]:
role_dm = role_dm.replace({np.nan: ''})

In [124]:
table_name = '12_clerus_role'
append_to_access_table(role_dm, temp_database_path, table_name)