In [1]:
import pyodbc
import pandas as pd
import shutil
import re
import numpy as np

import time
from datetime import datetime
current_date = datetime.now()
formatted_date = current_date.strftime('%d%m%Y')

def copy_database(original_path, temp_path):
    shutil.copyfile(original_path, temp_path)

def export_access_to_dataframes(database_path):
    # Connection string for Access database
    conn_str = (
        r'DRIVER={Microsoft Access Driver (*.mdb, *.accdb)};'
        r'DBQ=' + database_path + ';'
    )

    # Establish a connection to the Access database
    conn = pyodbc.connect(conn_str)
    cursor = conn.cursor()

    # Get a list of all tables in the database
    tables = [row.table_name for row in cursor.tables(tableType='TABLE')]

    # Loop through the tables and load each into a DataFrame
    for table in tables:
        query = f'SELECT * FROM [{table}]'
        df = pd.read_sql(query, conn)
        globals()[f'tbl_{table}'] = df  # Create a global variable with the table name

    # Close the connection
    conn.close()

def update_access_table(database_path, table_name, df):
    # Connection string for Access database
    conn_str = (
        r'DRIVER={Microsoft Access Driver (*.mdb, *.accdb)};'
        r'DBQ=' + database_path + ';'
    )

    # Establish a connection to the Access database
    conn = pyodbc.connect(conn_str)
    cursor = conn.cursor()

    # Clear the existing table
    cursor.execute(f'DELETE FROM [{table_name}]')
    conn.commit()

    # Insert the updated data
    for index, row in df.iterrows():
        columns = ', '.join(row.index)
        placeholders = ', '.join(['?' for _ in row])
        values = tuple(row)
        sql = f'INSERT INTO [{table_name}] ({columns}) VALUES ({placeholders})'
        cursor.execute(sql, values)
    conn.commit()

    # Close the connection
    conn.close()

# Paths
original_database_path = 'E:\\digidure\\CLERUS_v1_DRC_07082024.accdb'
temp_database_path = 'E:\\digidure\\CLERUS_v2_DRC_DM_'+formatted_date+'.accdb'

# Copy the database
copy_database(original_database_path, temp_database_path)

# Export data from the copied database
export_access_to_dataframes(temp_database_path)

  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)
  df = pd.read_sql(query, conn)


In [2]:
# Function to append the data to access database
def append_to_access_table(df, database_path, table_name):
    """
    Appends a DataFrame to a table in an Access database.

    Parameters:
    df (pandas.DataFrame): The DataFrame to append.
    database_path (str): The path to the Access database file.
    table_name (str): The name of the table to append the data to.
    """
    # Connection string for Access database
    connection_string = (
        r'DRIVER={Microsoft Access Driver (*.mdb, *.accdb)};'
        r'DBQ=' + database_path + ';'
    )

    # Connect to the Access database
    conn = pyodbc.connect(connection_string)
    cursor = conn.cursor()

    # Append DataFrame to the Access table
    for index, row in df.iterrows():
        columns = ', '.join(row.index)
        values = ', '.join(['?' for _ in row])
        sql = f'INSERT INTO {table_name} ({columns}) VALUES ({values})'
        cursor.execute(sql, tuple(row))

    conn.commit()
    cursor.close()
    conn.close()

In [3]:
# Panda settings for showing data (this is foremost done to more easily explore the data while processing it)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [4]:
# All the individuals that could be matched with DRC are already curated and processed. The next step is to extract individuals from DM that are not in DRC.
# When a link between DM and DRC was found, a value for clerus_id or new_clerus_id was produced. Thus to isolate those that have not been matched we create
# a subselection existing of rows where these fields are empty.
filtered_only_DM = tbl_999_Dm_all_drc_match[tbl_999_Dm_all_drc_match['clerus_id'].isna() & tbl_999_Dm_all_drc_match['new_clerus_id'].isna()]


In [5]:
# Since we want to link the individuals to clerus we want to use the individual_id lateron as a new clerus_id. To avoid overlap in numbers we added 9000000 to every id. The individual ids have been created through 2_1_check_links_DRC_DM.ipynb
filtered_only_DM["individual_id"] = filtered_only_DM["individual_id"] + 9000000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_only_DM["individual_id"] = filtered_only_DM["individual_id"] + 9000000


In [7]:
DM_tbl_bio_1 = filtered_only_DM

In [8]:
# Extract Year of Death
# After analysing the data in DM it appears that information about someone's year of death can be found in the field "Bijzonderheden", but also in the field "vertrek naar of vanwege".
# To extract this information and to save it in a new field is done in two steps.

# first we extract year from Bijzonderheden where it contains the string "overl." or "overleden"
def extract_year(bijzonderheden):
    if isinstance(bijzonderheden, str):
        # Search for "overleden" or "overl." followed by any characters, then find all 4-digit numbers
        match = re.search(r'(overleden|overl\.).*?(\d{4})(?!.*\d{4})', bijzonderheden, re.IGNORECASE)
        if match:
            return int(match.group(2))  # Return the 4-digit year found
    return None

DM_tbl_bio_1['death_year'] = DM_tbl_bio_1['Bijzonderheden'].apply(extract_year)

# Second we update 'death_year' where it is None, using 'jaar_vertrek' if 'vertrek_naar_of_vanwege' contains 'overleden' or 'overl.'
DM_tbl_bio_1['death_year'] = DM_tbl_bio_1.apply(
    lambda row: row['jaar vertrek']
    if pd.isnull(row['death_year']) and
       isinstance(row['vertrek naar of vanwege'], str) and
       ('overleden' in row['vertrek naar of vanwege'].lower() or 'overl.' in row['vertrek naar of vanwege'].lower())
    else row['death_year'],
    axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DM_tbl_bio_1['death_year'] = DM_tbl_bio_1['Bijzonderheden'].apply(extract_year)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DM_tbl_bio_1['death_year'] = DM_tbl_bio_1.apply(


In [14]:
DM_tbl_bio_1.head(1000)

Unnamed: 0,pid,provincie,classis,gemeente,wijk,predikant,Herkomst,dag intrede,maand intrede,jaar intrede,vertrek naar of vanwege,dag vertrek,maand vertrek,jaar vertrek,Bijzonderheden,individual_id,death_year
0,35651,ZH,,Oudewater,,(Hekendorp) Brunner; L.,hulppredikant Oirschot,,,1867,Lobith,,,1868.0,,9013955,
1,45546,GE,,Velp,2. Oude Jan,(Hilten van-)Matthijsen;mevrouw L.,kandidaat,2.0,maart,1986,Apeldoorn,23.0,oktober,1994.0,,9013956,
2,7179,NB,,Breda,,Aa; dr Pieter Jan Baptist Karel Simon van der,Winterswijk,26.0,februari,1893,emeritaat,1.0,mei,1904.0,,9013962,
3,21505,GR,,Hornhuizen,,Aa; dr. P.J.B.K. Simon van der,kandidaat,,,1862,Goutum,,,1866.0,,9013959,
4,36887,NH,,Purmerend,,Aa; dr. P.J.B.K. Simon van der,Goutum,,,1886,Winterswijk,,,1892.0,,9013960,
5,50146,GE,,Winterswijk,,Aa; dr. P.J.B.K. Simon van der,Purmerend,,,1892,Breda,,,1893.0,,9013960,
6,15179,FR,,Goutum,,Aa; dr. P.J.B.K. van der,Kloosterburen,,,1866,Purmerend,,,1886.0,,9013961,
7,7204,NB,,Breda,4. Wijk Noord,Aa; dr. P.J.B.K.S. van der,Winterswijk,26.0,februari,1893,emeritaat,1.0,mei,1904.0,,9013958,
8,19779,GE,,Herwijnen,,Aa; W.P. van der,kandidaat,24.0,december,1989,,,,,,9013957,
9,35941,OV,,Overdinkel,,Aaij; J.,Hoogkarspel-De Drieslag,25.0,september,1983,emeritaat,1.0,september,1986.0,,9013963,


In [13]:
columns_to_drop = ['clerus_id','ind_id','new_clerus_id']
DM_tbl_bio_1 = DM_tbl_bio_1.drop(columns=columns_to_drop)

In [15]:
DM_tbl_bio_1 = DM_tbl_bio_1.rename(columns={'individual_id': 'clerus_id'})

In [17]:
file_path = 'E:\\digidure\\DM_curating_21082024.xlsx'

# Save the DataFrame to an Excel file with UTF-8 encoding
DM_tbl_bio_1.to_excel(file_path, index=False)