In [6]:
# import the required libraries
import docx2txt
import os
import re
import csv
import pandas as pd
import numpy as np

In [10]:
# Set variables for the project (i.e. the input location of the file to be processed and the output location) )

folderlink = 'data//'
input_file = os.path.join(folderlink, 'Repertoriummetoudepersoonsnummers.docx')
output_txt = folderlink+'output.txt'
output_csv = folderlink+'output_file.csv'
folder_output = 'output//'


In [8]:
# Panda settings for showing data
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [11]:
# Converting the docx to a text file and remove all unecessary rows.

# Use docx2txt library to extract text from .docx file
text = docx2txt.process(input_file)

# Remove excessive whitespaces
data = ' '.join(text.split())

 "N.N. "de oude vicarius">pred. Lichtenvoorde 1602 tot 1615.<20871>" has been updated to 
 "N.N. "de oude vicarius" pred. Lichtenvoorde 1602 tot 1615.<20871>" manually, since it was the only record that had a > symbol in the line.
 
"Bosch; Cornelius
Geb. Utrecht 1634; pred. Renswoude 16 dec. 1656, Maasland 15 april 1663, Brielle 30 jan. 1667, Alkmaar 1667, 's Gravenhage 5 juli 1676, emer. 1713,;overl. 28 maart 1715.<1185>"

Was updates to:

Bosch; Cornelius
Geb. Utrecht 1634; pred. Renswoude 16 dec. 1656, Maasland 15 april 1663, Brielle 30 jan. 1667, Alkmaar 1667, 's Gravenhage 5 juli 1676, emer. 1713, overl. 28 maart 1715.<1185>

Since it splitted the string based on ",;overl"
 

In [12]:
data = data.replace('overl. 28 maart 1715.<1185>', ' overl. 28 maart 1715.<1185>').replace('>pred.', ' pred.')

In [13]:
# Replace semicolons with newlines and add semicolons around < and > since these identify the IDs
data = data.replace(';', ';\n').replace(';\n ', '; ').replace('>', '>;\n ').replace('<', ';<')
lines = data.split('\n')

lines = [line for line in lines if not line.startswith('; ;') and not line.startswith('; ')]
data = '\n'.join(lines)
lines = data.strip().split('\n')
data = '\n'.join([line.lstrip() for line in lines])

In [14]:
with open(output_txt, 'w', encoding='utf-8') as f:
    f.write(data)

In [15]:
columns = ("Geb.", # deze is cruciaal
 " pred.",
 "overl.", # deze is cruciaal
 "Gedoopt", # deze is cruciaal
 "legerpred.",
 "pastoor",
 "garnizoenspred.",
 "emer.",
 "begraven",
 "conrector",
 " rector",
 "monnik",           
 "schoolmeester",
 "hoogleraar",
 "chirurgijn",
 "praeceptor",
 "ziekentrooster",
 "vlootpred.",
 "legerpred.",
 "ambassadepred."
           
)          
    

In [16]:
for column in columns:
    with open(output_txt, 'r', encoding='utf-8') as f:
        lines = f.readlines()        
    with open(output_txt,'w', encoding='utf-8') as f:
        for line in lines:
            if "; "+column in line:
                f.write(line)
            elif column in line:
                line = line.replace(column, ";"+column)
                f.write(line)
            else:
                f.write(line)

In [17]:
# Define the headers for the output file
headers = ['surname', 'name', 'Field1', 'Field2', 'Field3', 'Field4', 'Field5', 'Field6', 'Field7', 'Field8', 'Field9', 'Field10', 'Field11','Field12','Field13','Field14','Field15','Field16','Field17']

with open(output_txt, 'r', encoding='utf-8') as infile, open(output_csv, 'w', newline='', encoding='utf-8' ) as outfile:
    reader = csv.reader(infile, delimiter=';')
    writer = csv.writer(outfile, delimiter=';')
    
    # Write the headers to the output file
    writer.writerow(headers)

    # Loop through each row in the input file and write it to the output file with 12 fields
    for row in reader:
        # Create a new row with 12 fields by extending the current row with empty values
        new_row = row + [''] * (12 - len(row))
        writer.writerow(new_row)

In [18]:
df = pd.read_csv(output_csv, sep=';', encoding='utf-8')

In [19]:
#In the file all IDs are stored between < and > therefore 
for column in df.columns:
    # Check if any value in the column contains '<'
    if df[column].astype(str).str.contains('<').any():
        # Copy the values containing '<' to the 'test' column
        df.loc[df[column].astype(str).str.contains('<'), 'id'] = df[column]


In [20]:
df['id'] = df['id'].str.replace('>', '').str.replace('<','')

In [21]:
for column in columns:
    df[column] = df.apply(lambda row: row[row.astype(str).str.contains(column)].iloc[0] if any(row.astype(str).str.contains(column)) else '', axis=1)

In [22]:
df['original_input'] = df['surname'].fillna('') + df['name'].fillna('') + df['Field1'].fillna('') + df['Field2'].fillna('')+ df['Field3'].fillna('')+ df['Field4'].fillna('')+ df['Field5'].fillna('')+ df['Field6'].fillna('')+ df['Field7'].fillna('')+ df['Field8'].fillna('')+ df['Field9'].fillna('')+ df['Field10'].fillna('')+ df['Field11'].fillna('')+ df['Field12'].fillna('')+ df['Field13'].fillna('')+ df['Field14'].fillna('')+ df['Field15'].fillna('')+ df['Field16'].fillna('')+ df['Field17'].fillna('')

In [23]:
for column in columns:
    df[column] = df[column].str.replace(column, '')

  df[column] = df[column].str.replace(column, '')


In [24]:
df['name_info_father'] =df['name'] .str.extract(r'\[(.*?)\]')

In [25]:
columns_rename = {
    'Geb.': 'birth', 
    ' pred.': 'minister', 
    'overl.': 'death', 
    'Gedoopt':'baptized', 
    'legerpred.':'legerpredikant',
    'pastoor':'pastoor',
    'garnizoenspred.':'garnizoenspredikant',
    "emer.":'emeritus_status',
    "begraven":'burried',
    "conrector":'conrector',
    " rector":'rector',
    "monnik":'monnik',           
    "schoolmeester":'schoolmeester',
    "hoogleraar":'hoogleraar',
    "chirurgijn":'chirurgijn',
    "praeceptor":'praeceptor',
    "ziekentrooster":'ziekentrooster',
    "vlootpred.":'vlootpredikant',
    "legerpred.":'legerpredikant',
    "ambassadepred.":'ambassadepredikant'}

In [26]:
# Rename the columns
df = df.rename(columns=columns_rename)
new_columns = list(columns_rename.values())


In [27]:
df.head()

Unnamed: 0,surname,name,Field1,Field2,Field3,Field4,Field5,Field6,Field7,Field8,Field9,Field10,Field11,Field12,Field13,Field14,Field15,Field16,Field17,id,birth,minister,death,baptized,legerpredikant,pastoor,garnizoenspredikant,emeritus_status,burried,conrector,rector,monnik,schoolmeester,hoogleraar,chirurgijn,praeceptor,ziekentrooster,vlootpredikant,ambassadepredikant,original_input,name_info_father
0,"Aalburg, van",Johannes,Geb. Zierikzee ca. 1717,,"pred. Oudkarspel 30 juni 1743,",overl. 14 maart 1777.,<1>,,,,,,,,,,,,,1,Zierikzee ca. 1717,"Oudkarspel 30 juni 1743,",14 maart 1777.,,,,,,,,,,,,,,,,,"Aalburg, van Johannes Geb. Zierikzee ca. 1717 pred. Oudkarspel 30 juni 1743, overl. 14 maart 1777.<1>",
1,Aalst,Wilhelmus,Gedoopt Biggekerke 5 jan. 1664,,"pred. Aardenburg 22 mei 1695,",overl. 19 dec. 1700.,<4>,,,,,,,,,,,,,4,,"Aardenburg 22 mei 1695,",19 dec. 1700.,Biggekerke 5 jan. 1664,,,,,,,,,,,,,,,,"Aalst Wilhelmus Gedoopt Biggekerke 5 jan. 1664 pred. Aardenburg 22 mei 1695, overl. 19 dec. 1700.<4>",
2,"Aalst, van",Cornelius,Geb. Castricum ca. 1686,ambassadepred. in Parijs maart tot dec. 1715,,"pred. Kalslagen ber. 21 febr. 1717,",emer. 1751,overl. Amsterdam 27 aug. 1756.,<2>,,,,,,,,,,,2,Castricum ca. 1686,"Kalslagen ber. 21 febr. 1717,",Amsterdam 27 aug. 1756.,,,,,1751,,,,,,,,,,,in Parijs maart tot dec. 1715,"Aalst, van Cornelius Geb. Castricum ca. 1686 ambassadepred. in Parijs maart tot dec. 1715 pred. Kalslagen ber. 21 febr. 1717, emer. 1751 overl. Amsterdam 27 aug. 1756.<2>",
3,"Aalst, van",Gerardus,Geb. xxx sept. 1678,,"pred. Vuren en Dalem 10 aug. 1704, Sommelsdijk 13 juni 1706, WestZaandam 4 aug. 1715,",emer. 1755,overl. 29 juni 1759.,<3>,,,,,,,,,,,,3,xxx sept. 1678,"Vuren en Dalem 10 aug. 1704, Sommelsdijk 13 juni 1706, WestZaandam 4 aug. 1715,",29 juni 1759.,,,,,1755,,,,,,,,,,,,"Aalst, van Gerardus Geb. xxx sept. 1678 pred. Vuren en Dalem 10 aug. 1704, Sommelsdijk 13 juni 1706, WestZaandam 4 aug. 1715, emer. 1755 overl. 29 juni 1759.<3>",
4,Aalstius,Henricus,Geb. 's-Hertogenbosch (?) yyy,,"pred. Castricum en Heemskerk nov. 1700,",emer. sept. 1733,overl. Limmen 15 maart 1736.,<5>,,,,,,,,,,,,5,'s-Hertogenbosch (?) yyy,"Castricum en Heemskerk nov. 1700,",Limmen 15 maart 1736.,,,,,sept. 1733,,,,,,,,,,,,"Aalstius Henricus Geb. 's-Hertogenbosch (?) yyy pred. Castricum en Heemskerk nov. 1700, emer. sept. 1733 overl. Limmen 15 maart 1736.<5>",


In [28]:
array_drop = [i for i in range(1, 18)]
for dropid in array_drop:
    column_dropid = 'Field'+str(dropid)
    df = df.drop(column_dropid, axis=1)

In [29]:
def extract_year(text):
    match = re.search(r'\d{4}', text)
    if match:
        return match.group(0)
    else:
        return None

In [30]:
function_year = [word for word in new_columns if word != 'minister']

print(function_year)

['birth', 'death', 'baptized', 'legerpredikant', 'pastoor', 'garnizoenspredikant', 'emeritus_status', 'burried', 'conrector', 'rector', 'monnik', 'schoolmeester', 'hoogleraar', 'chirurgijn', 'praeceptor', 'ziekentrooster', 'vlootpredikant', 'ambassadepredikant']


In [31]:
df.head()

Unnamed: 0,surname,name,id,birth,minister,death,baptized,legerpredikant,pastoor,garnizoenspredikant,emeritus_status,burried,conrector,rector,monnik,schoolmeester,hoogleraar,chirurgijn,praeceptor,ziekentrooster,vlootpredikant,ambassadepredikant,original_input,name_info_father
0,"Aalburg, van",Johannes,1,Zierikzee ca. 1717,"Oudkarspel 30 juni 1743,",14 maart 1777.,,,,,,,,,,,,,,,,,"Aalburg, van Johannes Geb. Zierikzee ca. 1717 pred. Oudkarspel 30 juni 1743, overl. 14 maart 1777.<1>",
1,Aalst,Wilhelmus,4,,"Aardenburg 22 mei 1695,",19 dec. 1700.,Biggekerke 5 jan. 1664,,,,,,,,,,,,,,,,"Aalst Wilhelmus Gedoopt Biggekerke 5 jan. 1664 pred. Aardenburg 22 mei 1695, overl. 19 dec. 1700.<4>",
2,"Aalst, van",Cornelius,2,Castricum ca. 1686,"Kalslagen ber. 21 febr. 1717,",Amsterdam 27 aug. 1756.,,,,,1751,,,,,,,,,,,in Parijs maart tot dec. 1715,"Aalst, van Cornelius Geb. Castricum ca. 1686 ambassadepred. in Parijs maart tot dec. 1715 pred. Kalslagen ber. 21 febr. 1717, emer. 1751 overl. Amsterdam 27 aug. 1756.<2>",
3,"Aalst, van",Gerardus,3,xxx sept. 1678,"Vuren en Dalem 10 aug. 1704, Sommelsdijk 13 juni 1706, WestZaandam 4 aug. 1715,",29 juni 1759.,,,,,1755,,,,,,,,,,,,"Aalst, van Gerardus Geb. xxx sept. 1678 pred. Vuren en Dalem 10 aug. 1704, Sommelsdijk 13 juni 1706, WestZaandam 4 aug. 1715, emer. 1755 overl. 29 juni 1759.<3>",
4,Aalstius,Henricus,5,'s-Hertogenbosch (?) yyy,"Castricum en Heemskerk nov. 1700,",Limmen 15 maart 1736.,,,,,sept. 1733,,,,,,,,,,,,"Aalstius Henricus Geb. 's-Hertogenbosch (?) yyy pred. Castricum en Heemskerk nov. 1700, emer. sept. 1733 overl. Limmen 15 maart 1736.<5>",


In [32]:
for year in function_year:
    fld_year = 'year_' +year
    df[fld_year] = df[year].apply(lambda x: extract_year(x))    

In [33]:
for year_accu in function_year:
    accu_fld_year = 'accu_year_' +year_accu
    df[accu_fld_year] = ''
    # Iterate over each row in the DataFrame
    for index, row in df.iterrows():
        # Check if the string contains "ca." (case-insensitive)
        if 'ca.' in row[year_accu].lower():
            # If found, set the value of the "accuracy" column to "circa"
            df.at[index, accu_fld_year] = 'circa'
            
    

In [34]:
df.head()


Unnamed: 0,surname,name,id,birth,minister,death,baptized,legerpredikant,pastoor,garnizoenspredikant,emeritus_status,burried,conrector,rector,monnik,schoolmeester,hoogleraar,chirurgijn,praeceptor,ziekentrooster,vlootpredikant,ambassadepredikant,original_input,name_info_father,year_birth,year_death,year_baptized,year_legerpredikant,year_pastoor,year_garnizoenspredikant,year_emeritus_status,year_burried,year_conrector,year_rector,year_monnik,year_schoolmeester,year_hoogleraar,year_chirurgijn,year_praeceptor,year_ziekentrooster,year_vlootpredikant,year_ambassadepredikant,accu_year_birth,accu_year_death,accu_year_baptized,accu_year_legerpredikant,accu_year_pastoor,accu_year_garnizoenspredikant,accu_year_emeritus_status,accu_year_burried,accu_year_conrector,accu_year_rector,accu_year_monnik,accu_year_schoolmeester,accu_year_hoogleraar,accu_year_chirurgijn,accu_year_praeceptor,accu_year_ziekentrooster,accu_year_vlootpredikant,accu_year_ambassadepredikant
0,"Aalburg, van",Johannes,1,Zierikzee ca. 1717,"Oudkarspel 30 juni 1743,",14 maart 1777.,,,,,,,,,,,,,,,,,"Aalburg, van Johannes Geb. Zierikzee ca. 1717 pred. Oudkarspel 30 juni 1743, overl. 14 maart 1777.<1>",,1717.0,1777,,,,,,,,,,,,,,,,,circa,,,,,,,,,,,,,,,,,
1,Aalst,Wilhelmus,4,,"Aardenburg 22 mei 1695,",19 dec. 1700.,Biggekerke 5 jan. 1664,,,,,,,,,,,,,,,,"Aalst Wilhelmus Gedoopt Biggekerke 5 jan. 1664 pred. Aardenburg 22 mei 1695, overl. 19 dec. 1700.<4>",,,1700,1664.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,"Aalst, van",Cornelius,2,Castricum ca. 1686,"Kalslagen ber. 21 febr. 1717,",Amsterdam 27 aug. 1756.,,,,,1751,,,,,,,,,,,in Parijs maart tot dec. 1715,"Aalst, van Cornelius Geb. Castricum ca. 1686 ambassadepred. in Parijs maart tot dec. 1715 pred. Kalslagen ber. 21 febr. 1717, emer. 1751 overl. Amsterdam 27 aug. 1756.<2>",,1686.0,1756,,,,,1751.0,,,,,,,,,,,1715.0,circa,,,,,,,,,,,,,,,,,
3,"Aalst, van",Gerardus,3,xxx sept. 1678,"Vuren en Dalem 10 aug. 1704, Sommelsdijk 13 juni 1706, WestZaandam 4 aug. 1715,",29 juni 1759.,,,,,1755,,,,,,,,,,,,"Aalst, van Gerardus Geb. xxx sept. 1678 pred. Vuren en Dalem 10 aug. 1704, Sommelsdijk 13 juni 1706, WestZaandam 4 aug. 1715, emer. 1755 overl. 29 juni 1759.<3>",,1678.0,1759,,,,,1755.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Aalstius,Henricus,5,'s-Hertogenbosch (?) yyy,"Castricum en Heemskerk nov. 1700,",Limmen 15 maart 1736.,,,,,sept. 1733,,,,,,,,,,,,"Aalstius Henricus Geb. 's-Hertogenbosch (?) yyy pred. Castricum en Heemskerk nov. 1700, emer. sept. 1733 overl. Limmen 15 maart 1736.<5>",,,1736,,,,,1733.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [35]:
months =(" januari ", 
 " februari ",
 " maart ",
 " april ",
 " mei ",
 " juni ",
 " juli ",
 " augustus ",
 " september ",
 " oktober ",
 " november ",           
 " december ",
 "jan. ",
 "feb. ",
 "mrt. ",
 "apr. ",
 "jun. ",
 "jul. ",
 "aug. ",
 "sept. ",
 "sep. ",
 "okt. ",
 "nov. ",
 "dec. ",
 "yyy",
 "xxx",
 "ca.",
 "febr."        
) 


In [36]:
for column_strip in function_year:   
    for month in months:
        df[column_strip] = df[column_strip].str.replace(month, '')


  df[column_strip] = df[column_strip].str.replace(month, '')


In [37]:
for column_strip in function_year:
    df[column_strip] = df[column_strip].apply(lambda x: re.sub(r'[\d\.]', '', x))

In [38]:
df.head(15)

Unnamed: 0,surname,name,id,birth,minister,death,baptized,legerpredikant,pastoor,garnizoenspredikant,emeritus_status,burried,conrector,rector,monnik,schoolmeester,hoogleraar,chirurgijn,praeceptor,ziekentrooster,vlootpredikant,ambassadepredikant,original_input,name_info_father,year_birth,year_death,year_baptized,year_legerpredikant,year_pastoor,year_garnizoenspredikant,year_emeritus_status,year_burried,year_conrector,year_rector,year_monnik,year_schoolmeester,year_hoogleraar,year_chirurgijn,year_praeceptor,year_ziekentrooster,year_vlootpredikant,year_ambassadepredikant,accu_year_birth,accu_year_death,accu_year_baptized,accu_year_legerpredikant,accu_year_pastoor,accu_year_garnizoenspredikant,accu_year_emeritus_status,accu_year_burried,accu_year_conrector,accu_year_rector,accu_year_monnik,accu_year_schoolmeester,accu_year_hoogleraar,accu_year_chirurgijn,accu_year_praeceptor,accu_year_ziekentrooster,accu_year_vlootpredikant,accu_year_ambassadepredikant
0,"Aalburg, van",Johannes,1,Zierikzee,"Oudkarspel 30 juni 1743,",,,,,,,,,,,,,,,,,,"Aalburg, van Johannes Geb. Zierikzee ca. 1717 pred. Oudkarspel 30 juni 1743, overl. 14 maart 1777.<1>",,1717.0,1777.0,,,,,,,,,,,,,,,,,circa,,,,,,,,,,,,,,,,,
1,Aalst,Wilhelmus,4,,"Aardenburg 22 mei 1695,",,Biggekerke,,,,,,,,,,,,,,,,"Aalst Wilhelmus Gedoopt Biggekerke 5 jan. 1664 pred. Aardenburg 22 mei 1695, overl. 19 dec. 1700.<4>",,,1700.0,1664.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,"Aalst, van",Cornelius,2,Castricum,"Kalslagen ber. 21 febr. 1717,",Amsterdam,,,,,,,,,,,,,,,,in Parijstot,"Aalst, van Cornelius Geb. Castricum ca. 1686 ambassadepred. in Parijs maart tot dec. 1715 pred. Kalslagen ber. 21 febr. 1717, emer. 1751 overl. Amsterdam 27 aug. 1756.<2>",,1686.0,1756.0,,,,,1751.0,,,,,,,,,,,1715.0,circa,,,,,,,,,,,,,,,,,
3,"Aalst, van",Gerardus,3,,"Vuren en Dalem 10 aug. 1704, Sommelsdijk 13 juni 1706, WestZaandam 4 aug. 1715,",,,,,,,,,,,,,,,,,,"Aalst, van Gerardus Geb. xxx sept. 1678 pred. Vuren en Dalem 10 aug. 1704, Sommelsdijk 13 juni 1706, WestZaandam 4 aug. 1715, emer. 1755 overl. 29 juni 1759.<3>",,1678.0,1759.0,,,,,1755.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Aalstius,Henricus,5,'s-Hertogenbosch (?),"Castricum en Heemskerk nov. 1700,",Limmen,,,,,,,,,,,,,,,,,"Aalstius Henricus Geb. 's-Hertogenbosch (?) yyy pred. Castricum en Heemskerk nov. 1700, emer. sept. 1733 overl. Limmen 15 maart 1736.<5>",,,1736.0,,,,,1733.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,Aalstius,Johannes,7,,"Isabella en Anthonyschans 1642, Oirschot en Best 13 sept. 1648,",,,,,,,,,,,,,,,,,,"Aalstius Johannes Geb. xxx yyy pred. Isabella en Anthonyschans 1642, Oirschot en Best 13 sept. 1648, overl. 11 dec. 1657.<7>",,,1657.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,Aalstius,Johannes [z.v. Johannes Leonardusz.],6,,"Obdam 1644, Boxtel 13 sept. 1648, Bergen op Zoom 9 nov. 1652 (tevens Waals",,,,,,,,,,,,,,,,,,"Aalstius Johannes [z.v. Johannes Leonardusz.] Geb. xxx 1620 pred. Obdam 1644, Boxtel 13 sept. 1648, Bergen op Zoom 9 nov. 1652 (tevens Waals pred.), emer. 1685 begraven 17 nov. 1687.<6>",z.v. Johannes Leonardusz.,1620.0,,,,,,1685.0,1687.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,Aalstius,Johannes,8,'s-Hertogenbosch,"Hoornaar 1687, Beverwijk april 1698, Middelburg 22 nov. 1705 (tevens",,,,,,,,,,,,"tot ),",,,,,,"Aalstius Johannes Geb. 's-Hertogenbosch ca. 1660 pred. Hoornaar 1687, Beverwijk april 1698, Middelburg 22 nov. 1705 (tevens hoogleraar 1705 tot 1709), overl. 10 juni 1712.<8>",,1660.0,1712.0,,,,,,,,,,,1705.0,,,,,,circa,,,,,,,,,,,,,,,,,
8,Aalstius,Leonardus,104,Haarlem,"Berghem + Nistelrode ber. 4 maart 1649, Lith en Lithoyen 1653,",,,,,,,,,,,,,,,,,,"Aalstius Leonardus Geb. Haarlem ca. 1628 pred. Berghem + Nistelrode ber. 4 maart 1649, Lith en Lithoyen 1653, overl. 1666.<104>",,1628.0,1666.0,,,,,,,,,,,,,,,,,circa,,,,,,,,,,,,,,,,,
9,Aalstius,Petrus [z.v. Johannes Leonardusz.],9,'s-Hertogenbosch,"Bakel 1655, Beers (Oost, Middel en Wester) (Fr) 1658, SintOedenrode 19 okt. 1670,",,,,,,,,,,,,,,,,,,"Aalstius Petrus [z.v. Johannes Leonardusz.] Geb. 's-Hertogenbosch ca. 1632 pred. Bakel 1655, Beers (Oost, Middel en Wester) (Fr) 1658, SintOedenrode 19 okt. 1670, overl. juli 1686.<9>",z.v. Johannes Leonardusz.,1632.0,1686.0,,,,,,,,,,,,,,,,,circa,,,,,,,,,,,,,,,,,


In [40]:

birth_org = df[['id','birth','year_birth','accu_year_birth']]
birth_org.to_csv(folderlink+folder_output+'birth_info.csv', sep=';', encoding='utf-8', index=False)


In [41]:
death_org = df[['id','death','year_death','accu_year_death']]
death_org.to_csv(folderlink+folder_output+'death_info.csv', sep=';', encoding='utf-8', index=False)

In [42]:
bapt_org = df[['id','baptized','year_baptized','accu_year_baptized']]
bapt_org.to_csv(folderlink+folder_output+'bapt_info.csv', sep=';', encoding='utf-8', index=False)

In [43]:
df.to_csv(folderlink+folder_output+'parent_data.csv', sep=';', encoding='utf-8', index=False)

From here we are going to make the parent child relationship with predikanten Ministers

Here the minister data is set as a child. 

In [44]:
subset_pred = df[['id','minister']]

In [45]:
subset_pred.head(6)

Unnamed: 0,id,minister
0,1,"Oudkarspel 30 juni 1743,"
1,4,"Aardenburg 22 mei 1695,"
2,2,"Kalslagen ber. 21 febr. 1717,"
3,3,"Vuren en Dalem 10 aug. 1704, Sommelsdijk 13 juni 1706, WestZaandam 4 aug. 1715,"
4,5,"Castricum en Heemskerk nov. 1700,"
5,7,"Isabella en Anthonyschans 1642, Oirschot en Best 13 sept. 1648,"


In [46]:
df_expanded = subset_pred.assign(minister=subset_pred['minister'].str.split(','))

# Explode the 'pred.' column to create separate rows for each item
df_expanded = df_expanded.explode('minister')

In [47]:
df_expanded.head(6)

Unnamed: 0,id,minister
0,1,Oudkarspel 30 juni 1743
0,1,
1,4,Aardenburg 22 mei 1695
1,4,
2,2,Kalslagen ber. 21 febr. 1717
2,2,


In [48]:
df_filtered = df_expanded[['id','minister']]

In [49]:
childs = df_filtered[df_filtered["minister"] !=" "]

In [50]:
childs.head()

Unnamed: 0,id,minister
0,1,Oudkarspel 30 juni 1743
1,4,Aardenburg 22 mei 1695
2,2,Kalslagen ber. 21 febr. 1717
3,3,Vuren en Dalem 10 aug. 1704
3,3,Sommelsdijk 13 juni 1706


In [51]:
childs['minister_year'] = childs['minister'].apply(lambda x: extract_year(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  childs['minister_year'] = childs['minister'].apply(lambda x: extract_year(x))


In [52]:
accu_fld_year = 'accu_year_' +'minster'
childs[accu_fld_year] = ''
# Iterate over each row in the DataFrame
for index, row in df.iterrows():
    # Check if the string contains "ca." (case-insensitive)
    if 'ca.' in row[year_accu].lower():
        # If found, set the value of the "accuracy" column to "circa"
        df.at[index, accu_fld_year] = 'circa'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  childs[accu_fld_year] = ''


In [53]:
childs.head()

Unnamed: 0,id,minister,minister_year,accu_year_minster
0,1,Oudkarspel 30 juni 1743,1743,
1,4,Aardenburg 22 mei 1695,1695,
2,2,Kalslagen ber. 21 febr. 1717,1717,
3,3,Vuren en Dalem 10 aug. 1704,1704,
3,3,Sommelsdijk 13 juni 1706,1706,


In [54]:
for month in months:
        childs['minister'] = childs['minister'].str.replace(month, '')

childs['minister'] = childs['minister'].apply(lambda x: re.sub(r'[\d\.]', '', x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  childs['minister'] = childs['minister'].str.replace(month, '')
  childs['minister'] = childs['minister'].str.replace(month, '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  childs['minister'] = childs['minister'].apply(lambda x: re.sub(r'[\d\.]', '', x))


In [55]:
childs.head()

Unnamed: 0,id,minister,minister_year,accu_year_minster
0,1,Oudkarspel,1743,
1,4,Aardenburg,1695,
2,2,Kalslagen ber,1717,
3,3,Vuren en Dalem,1704,
3,3,Sommelsdijk,1706,


In [56]:
childs.to_csv(folderlink+folder_output+'minister_info.csv', sep=';', encoding='utf-8', index=False)

In [None]:
os.remove(output_txt)
os.remove(output_csv)