This notebook contains additional data-parsing steps that are applied after all the processing done by glp500 and JJ.

As input we use the Excel sheets:
- Events.xlsx

[Note: these files should be placed in the folder of parsing_data_june_2025, but they are currently listed in .gitignore.]

In [None]:
import re
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Read the events sheet; the path is relative to the directory the notebook runs in.
events_path = './input/Events.xlsx'
events = pd.read_excel(events_path)

In [None]:
# Preview the first five rows of the loaded sheet.
events.head(n=5)

In [None]:
# pandas display settings: lift the limits on the number of columns and rows shown and
# on the cell width, so the data can be inspected in full while it is being processed.
for option_name in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option_name, None)

In [None]:
# Keep an untouched copy ("<field>_org") of the two fields that are modified the most,
# so every later change can be traced back to the original text.
# The copy is only made when it does not exist yet: re-running this cell after the
# cleaning cells have run would otherwise overwrite the backup with already-cleaned values.
for field in ('werkgebied en -soort', 'bijzonderheden'):
    backup_field = f'{field}_org'
    if backup_field not in events.columns:
        events[backup_field] = events[field]

In [None]:
# Besides a year of death, 'werkgebied en -soort' and 'bijzonderheden' quite often hold
# free-text information about the death, introduced by a '†'. Everything from the '†' to
# the end of the line is moved to a field of its own, so that the two source fields keep
# only the information that belongs to them.

# capture the death information per source field (expand=False yields one Series each)
info_sterven1 = events['werkgebied en -soort'].str.extract(r'(†.*)', expand=False)
info_sterven2 = events['bijzonderheden'].str.extract(r'(†.*)', expand=False)

# new field: the value from 'werkgebied en -soort' is used, 'bijzonderheden' only fills the gaps
# NOTE(review): when both fields hold a '†' part only the first is kept here, while both are
# stripped below; the second then survives only in 'bijzonderheden_org' - confirm this is intended.
events['informatie_sterven'] = info_sterven1.combine_first(info_sterven2)

# remove the death information from the source fields and trim the leftover whitespace
for source_field in ('werkgebied en -soort', 'bijzonderheden'):
    without_death = events[source_field].str.replace(r'†.*', '', regex=True)
    events[source_field] = without_death.str.strip()

In [None]:
# Information about follow-up roles is introduced by a '>' in either field. It is stored
# in the new field "vervolgrol(len)"; the value from 'bijzonderheden' is used first and
# 'werkgebied en -soort' only fills the gaps.
rollen1 = events['bijzonderheden'].str.extract(r'(>.*)', expand=False)
rollen2 = events['werkgebied en -soort'].str.extract(r'(>.*)', expand=False)
events['vervolgrol(len)'] = rollen1.combine_first(rollen2)

# remove the follow-up role information from the source fields and trim the remainder
for source_field in ('werkgebied en -soort', 'bijzonderheden'):
    events[source_field] = (
        events[source_field]
        .str.replace(r'>.*', '', regex=True)
        .str.strip()
    )

In [None]:
# Text from a '~' onwards in 'bijzonderheden' is moved to the new field 'info_spouse';
# what is left in 'bijzonderheden' is trimmed of surrounding whitespace.
remarks = events['bijzonderheden']
events['info_spouse'] = remarks.str.extract(r'(~.*)', expand=False)
events['bijzonderheden'] = remarks.str.replace(r'~.*', '', regex=True).str.strip()

In [None]:
# Split 'werkperiode' around a dash: the text directly before the dash (no whitespace)
# becomes periode_start, everything after it becomes periode_einde,
# e.g. '1650-1660' -> periode_start '1650', periode_einde '1660'.
# The end part may be empty, so an open-ended period such as '1650-' still yields its
# start; requiring at least one character there made the whole match fail and the
# start was lost as well.
split_periods = events['werkperiode'].str.extract(
    r'(?P<periode_start>[^-\s]*)\s*-\s*(?P<periode_einde>.*)'
)

# Empty captures (nothing before or nothing after the dash) become NaN
split_periods = split_periods.replace('', np.nan)

# Drop period columns left behind by an earlier run of this cell before attaching the
# new ones; without this, re-running the cell adds duplicate columns to `events`.
events = pd.concat(
    [events.drop(columns=['periode_start', 'periode_einde'], errors='ignore'), split_periods],
    axis=1,
)

In [None]:

# Rows whose 'werkperiode' is only a single year (3 or 4 digits, no dash) have no
# periode_start from the dash-based split; use the year itself as their periode_start.
werkperiode_text = events['werkperiode'].astype(str)
no_dash = ~werkperiode_text.str.contains('-')
only_year = werkperiode_text.str.fullmatch(r'\d{3,4}')
missing_start = split_periods['periode_start'].isna()

# Copy the original 'werkperiode' value into periode_start for exactly those rows
single_year_rows = no_dash & only_year & missing_start
split_periods.loc[single_year_rows, 'periode_start'] = events.loc[single_year_rows, 'werkperiode']

# Swap the period columns already present on `events` for the completed ones
events = pd.concat(
    [events.drop(columns=['periode_start', 'periode_einde'], errors='ignore'), split_periods],
    axis=1,
)

In [None]:
# For both period boundaries, take the first run of four digits in the text and store it
# as a nullable integer (Int64); rows without such a run end up as <NA>.
# NOTE(review): three-digit years, which the single-year step accepts for periode_start,
# do not match the four-digit pattern used here - confirm this is intended.
int_columns = {
    'periode_start': 'periode_start_int',
    'periode_einde': 'periode_einde_int',
}
for text_column, int_column in int_columns.items():
    year_text = events[text_column].astype(str).str.extract(r'(\d{4})', expand=False)
    # digit strings -> float -> nullable Int64
    events[int_column] = year_text.astype(float).astype('Int64')


In [None]:
# Preview the first five rows after the parsing steps.
events.head(n=5)

In [None]:
# List the column names currently present on the frame.
list(events.columns)

In [None]:
# Add the empty placeholder fields and put the columns in their final order.
# Columns that are not listed in `column_order` are dropped by the selection.
# NOTE(review): 'bio_info' is created but not listed, so it is dropped again, and
# 'info_spouse' is kept although the earlier comment spoke of dropping "spouse_info" -
# confirm which of the two is intended.
column_order = [
    'id',
    # untouched copies of the two most-edited fields
    'werkgebied en -soort_org',
    'bijzonderheden_org',
    'organ.',
    'werkgebied en -soort',
    # period as given, split into start/end, and as integer years
    'werkperiode',
    'periode_start',
    'periode_einde',
    'periode_start_int',
    'periode_einde_int',
    'bijzonderheden',
    'event',
    'bron',
    # fields extracted from the free text
    'informatie_sterven',
    'vervolgrol(len)',
    'info_spouse',
    'info toegevoegd',
    'welke info',
]

events = events.assign(event=np.nan, bio_info=np.nan)[column_order]

In [None]:
# Name the export after today's date (month_day_year); the path is kept exactly as
# before, including the doubled separator after "output".
date_str = datetime.now().strftime('%m_%d_%Y')
filename = f"output//events_{date_str}.xlsx"

In [None]:
# Write the result to the dated Excel file, without the DataFrame index.
# The target folder is created first when it is missing: pandas refuses to write
# into a directory that does not exist.
Path(filename).parent.mkdir(parents=True, exist_ok=True)
events.to_excel(filename, index=False)