In [None]:
# import the required libraries
import os
import re
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Project paths: the data root, its input/output sub-folders, and the two
# CSV files this notebook reads (both are read from the output folder).
folderlink = '..//data//'
folder_input = 'input//'
folder_output = 'output//'

input_file_parent = ''.join([folderlink, folder_output, "parent_data.csv"])
input_stcn = ''.join([folderlink, folder_output, "stcn_q8.csv"])

In [None]:
# pandas display settings: lift every truncation limit so previews show
# all columns, all rows and the full content of each cell.
for _display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

In [None]:
# Load the STCN author export (semicolon-separated, UTF-8 encoded).
stcn_authors = pd.read_csv(
    input_stcn,
    encoding='utf-8',
    sep=';',
)

In [None]:
# Preview the first rows of the freshly loaded STCN author table.
stcn_authors.head()

In [None]:
# Birth year = the first four consecutive digits found in the birth-date text;
# rows where no such digits occur get NaN.
stcn_authors['birth_year'] = (
    stcn_authors['author_birthDate']
    .str.extract(r'(\d{4})', expand=False)
)

In [None]:
# Initial of the author: first character of the given name, used in the matching key.
# Missing given names stay NaN. Casting them to str first (as was done before)
# turns them into the text 'nan', which yields the bogus initial 'n' and lets
# nameless authors match people whose name really starts with "n".
# `.str[0]` also returns NaN for an empty string instead of raising IndexError.
_given_names = stcn_authors['author_givenName']
stcn_authors['first_letter'] = (
    _given_names.astype(str).str[0].where(_given_names.notna())
)

In [None]:
# Preview the table with the derived birth_year and first_letter columns.
stcn_authors.head()

In [None]:
# Collapse the family name into a single token by removing every space.
stcn_authors['surname'] = (
    stcn_authors['author_familyName']
    .str.replace(' ', '', regex=False)
)

In [None]:
# Matching key "<initial>_<surname>_<birth year>".
# A missing birth year leaves the last part empty, so the key ends in "_".
stcn_authors['stcn_name_surname_year'] = (
    stcn_authors['first_letter'].astype(str)
    + '_'
    + stcn_authors['surname'].astype(str)
    + '_'
    + stcn_authors['birth_year'].where(stcn_authors['birth_year'].notna(), '').astype(str)
)

In [None]:
# Keep only the authors for which a birth year could be extracted.
stcn_authors = stcn_authors[stcn_authors['birth_year'].notna()]

In [None]:
# Preview the STCN authors that remain after dropping rows without a birth year.
stcn_authors.head()

In [None]:
# Column -> dtype mapping passed to read_csv for the parent file: every listed
# year column is read with pandas' nullable integer dtype (Int64).
# NOTE(review): 'year_legerpredikant' is dropped further down in this notebook
# but is not listed here — confirm whether that omission is intentional.
year_int = {
    year_column: pd.Int64Dtype()
    for year_column in (
        'year_birth',
        'year_death',
        'year_baptized',
        'year_pastoor',
        'year_garnizoenspredikant',
        'year_emeritus_status',
        'year_burried',
        'year_conrector',
        'year_rector',
        'year_monnik',
        'year_schoolmeester',
        'year_hoogleraar',
        'year_chirurgijn',
        'year_praeceptor',
        'year_ziekentrooster',
        'year_vlootpredikant',
        'year_ambassadepredikant',
    )
}


In [None]:
# Load the parent (clerus) table; the year columns named in year_int are read
# as nullable integers.
parent = pd.read_csv(
    input_file_parent,
    dtype=year_int,
    encoding='utf-8',
    sep=';',
)

In [None]:
# Initial used in the matching key.
# NOTE(review): index 1 selects the SECOND character of `name`, whereas the STCN
# side takes index 0 of the given name — presumably the names in this file start
# with a character that has to be skipped; confirm against the data.
# A missing name becomes the text 'nan' after the cast, so its "initial" is 'a';
# a name shorter than two characters raises IndexError.
parent['first_letter'] = parent['name'].astype(str).map(lambda full_name: full_name[1])

In [None]:
# Keep only the persons with a known birth year.
# The explicit .copy() makes `parent` an independent frame: the following cells
# assign columns on it, and doing that on a filtered slice of the original frame
# triggers pandas' SettingWithCopyWarning.
parent = parent[parent['year_birth'].notna()].copy()

In [None]:

# Turn the integer birth year into text so it can be regex-matched and
# concatenated into the matching key.
parent['year_birth'] = (
    parent['year_birth']
    .astype(str)
)

In [None]:
# Keep only the first four consecutive digits of the birth-year text;
# values without four digits in a row become NaN.
parent['year_birth'] = (
    parent['year_birth']
    .str.extract(r'(\d{4})', expand=False)
)

In [None]:
# Preview the parent table after filtering on birth year and normalising it to text.
parent.head()

In [None]:
# Remove every space from the name infix so it can be glued onto the surname.
parent['infix'] = (
    parent['infix']
    .str.replace(' ', '', regex=False)
)

In [None]:
# Matching key "<initial>_<infix><surname>_<birth year>".
# A missing infix contributes nothing, so the surname follows the first "_" directly.
parent['par_name_surname_year'] = (
    parent['first_letter'].astype(str)
    + '_'
    + parent['infix'].where(parent['infix'].notna(), '').astype(str)
    + parent['surname'].astype(str)
    + '_'
    + parent['year_birth'].astype(str)
)

In [None]:
# Preview the parent table including the cleaned infix and the matching key.
parent.head()

In [None]:
# Inner join: keep only the persons whose key also occurs among the STCN authors.
# Both frames carry 'first_letter' and 'surname' columns, so pandas adds its
# default _x / _y suffixes to those in the result.
clerus_stcn = parent.merge(
    stcn_authors,
    how='inner',
    left_on='par_name_surname_year',
    right_on='stcn_name_surname_year',
)

In [None]:
# Preview the first rows of the merged clerus/STCN table.
clerus_stcn.head()

In [None]:
# Summary statistics of the merged table, as computed by DataFrame.describe().
clerus_stcn.describe()

In [None]:
# Remove the columns that are not needed in the exported match table: the
# per-function flags, their year columns, the accu_* columns and the join helper.
# Every listed column must exist, otherwise drop() raises KeyError.
clerus_stcn = clerus_stcn.drop(
    columns=[
        # function / life-event columns
        'baptized', 'legerpredikant', 'pastoor', 'garnizoenspredikant',
        'emeritus_status', 'burried', 'conrector', 'rector', 'monnik',
        'schoolmeester', 'hoogleraar', 'chirurgijn', 'praeceptor',
        'ziekentrooster', 'vlootpredikant', 'ambassadepredikant',
        # year columns
        'year_baptized', 'year_pastoor', 'year_garnizoenspredikant',
        'year_emeritus_status', 'year_burried', 'year_conrector',
        'year_rector', 'year_monnik', 'year_schoolmeester',
        'year_hoogleraar', 'year_chirurgijn', 'year_praeceptor',
        'year_ziekentrooster', 'year_vlootpredikant',
        'year_ambassadepredikant', 'year_legerpredikant',
        # accu_* columns
        'accu_year_birth', 'accu_year_death', 'accu_year_baptized',
        'accu_year_legerpredikant', 'accu_year_pastoor',
        'accu_year_garnizoenspredikant', 'accu_year_emeritus_status',
        'accu_year_burried', 'accu_year_conrector', 'accu_year_rector',
        'accu_year_monnik', 'accu_year_schoolmeester',
        'accu_year_hoogleraar', 'accu_year_chirurgijn',
        'accu_year_praeceptor', 'accu_year_ziekentrooster',
        'accu_year_vlootpredikant', 'accu_year_ambassadepredikant',
        # helper column
        'join_name',
    ]
)

In [None]:
# Write the matched records to the output folder (semicolon-separated, UTF-8,
# without the DataFrame index).
clerus_stcn.to_csv(
    folderlink + folder_output + 'clerus_stcn_inner.csv',
    index=False,
    sep=';',
    encoding='utf-8',
)

In [None]:
# Number of distinct (non-missing) clerus_id values in the merged table.
unique_ids = clerus_stcn['clerus_id'].nunique(dropna=True)
print("Number of unique IDs:", unique_ids)

In [None]:
# Preview the merged table after the unneeded columns were dropped.
clerus_stcn.head()

In [None]:
# Birth years were stored as text for building the matching key; cast them
# back to integers so they can be binned.
clerus_stcn['year_birth'] = clerus_stcn['year_birth'].astype(int)

# One bin per calendar year (change bin_width for coarser bins).
min_year = clerus_stcn['year_birth'].min()
max_year = clerus_stcn['year_birth'].max()
bin_width = 1
# The histogram's right-most bin is closed on both sides. If the last edge were
# max_year itself, the final bar would count max_year together with the
# preceding bin's years. Extending the edges past max_year gives the last
# year(s) a bin of their own.
bins = range(min_year, max_year + bin_width + 1, bin_width)

# Plotting the histogram
plt.hist(clerus_stcn['year_birth'], bins=bins)

# Customize the plot
plt.title("Quantities per Year")
plt.xlabel("Year of birth")
plt.ylabel("Quantity")

# Show the plot
plt.show()