In [1]:
import pandas as pd
import re


In [5]:
spouses_file = '/Volumes/Extreme SSD/Python_Projects/NIAA Project/FINAL/spouse_unsplit.csv'
df_spouses = pd.read_csv(spouses_file)

# Inspect the first few rows.
print("Spouses Dataset Head:")
print(df_spouses.head())

Spouses Dataset Head:
                                  spouse   id
0            ~ 1935 Akke Reidinga † 1990  1.0
1          ~ 1918 Regina Horstman † 1988  4.0
2                        ~ Emma Reinmuth  5.0
3         ~ 1888 J.M. de Buisonjé † 1940  7.0
4  ~ 1894 Maria Lambertha Gunning † 1939  8.0


In [6]:

def parse_spouse_info(text):
    """
    Parse the input text from the "spouse" column and return a tuple:
       (spouse_name, birth_date, death_date, year_married)
    """
    # Ensure we are dealing with a string.
    if not isinstance(text, str) or not text.strip():
        return None, None, None, None

    s = text.strip()

    # Remove a leading tilde and any extra spaces.
    if s.startswith("~"):
        s = s[1:].strip()

    # Remove any leading ordering indicator like "(1)", "(2)", etc.
    s = re.sub(r'^\(\s*\d+\s*\)\s*', '', s)

    # Initialize our output values.
    spouse_name = None
    birth_date = None
    death_date = None
    year_married = None

    # --- 1. Extract a parenthesized birth-death pattern, e.g. "(1900-1992)"
    paren_match = re.search(r'\(\s*(\d{3,4})\s*[-–]\s*(\d{3,4})\s*\)', s)
    if paren_match:
        birth_date = paren_match.group(1)
        death_date = paren_match.group(2)
        # Remove the parenthesized portion from the text.
        s = s.replace(paren_match.group(0), '').strip()

    # --- 2. Look for a dagger symbol (†) if death_date still not set.
    dagger_match = re.search(r'†\s*([^()\[\]]+)', s)
    if dagger_match and not death_date:
        death_date_candidate = dagger_match.group(1).strip()
        # Remove any trailing punctuation.
        death_date = death_date_candidate.strip(" ,;:")
        s = s.replace(dagger_match.group(0), '').strip()

    # --- 3. Look for a leading year (year married)
    # Optionally, the year may begin with a ± or other symbol.
    ym_match = re.match(r'^([±]?\d{3,4})\b', s)
    if ym_match:
        year_married = ym_match.group(1)
        s = s[ym_match.end():].strip()

    # --- 4. If text starts with a month name and a year, treat that as a birth date (if not set)
    # List of month names (case-insensitive).
    month_pattern = r'^(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}'
    month_match = re.match(month_pattern, s, flags=re.IGNORECASE)
    if month_match and not birth_date:
        birth_date = month_match.group(0).strip()
        s = s[month_match.end():].strip()

    # --- 5. The remainder is taken as the spouse name.
    spouse_name = s.strip(" ,:;")

    return spouse_name if spouse_name != "" else None, birth_date, death_date, year_married

# (Optional) Test the helper function with a few sample strings.
sample_values = [
    "~ 1935 Akke Reidinga † 1990",
    "~ 1918 Regina Horstman † 1988",
    "~ Emma Reinmuth",
    "~ 1888 J.M. de Buisonjé † 1940",
    "~ 1948 J.M. Schouten (1900-1992)",
    "~ (1) 1920 Joh. Maria van Leeuwen",
    "~ (2) Clem. Sophie Barkley",
    "~ (1) April 1835 Mary Maxwell † Des.1835",
    "~ 1926 Gerritje Alida [sic] Scheurer (1896-1992)",
    "~ 1936 W.C.A. Jansz (Pati, 1894-1962)",
    "~ (1) S. Fahrni † 9-2-1930"
]

for val in sample_values:
    parsed = parse_spouse_info(val)
    print(f"Original: {val}\nParsed: {parsed}\n")

Original: ~ 1935 Akke Reidinga † 1990
Parsed: ('Akke Reidinga', None, '1990', '1935')

Original: ~ 1918 Regina Horstman † 1988
Parsed: ('Regina Horstman', None, '1988', '1918')

Original: ~ Emma Reinmuth
Parsed: ('Emma Reinmuth', None, None, None)

Original: ~ 1888 J.M. de Buisonjé † 1940
Parsed: ('J.M. de Buisonjé', None, '1940', '1888')

Original: ~ 1948 J.M. Schouten (1900-1992)
Parsed: ('J.M. Schouten', '1900', '1992', '1948')

Original: ~ (1) 1920 Joh. Maria van Leeuwen
Parsed: ('Joh. Maria van Leeuwen', None, None, '1920')

Original: ~ (2) Clem. Sophie Barkley
Parsed: ('Clem. Sophie Barkley', None, None, None)

Original: ~ (1) April 1835 Mary Maxwell † Des.1835
Parsed: ('Mary Maxwell', 'April 1835', 'Des.1835', None)

Original: ~ 1926 Gerritje Alida [sic] Scheurer (1896-1992)
Parsed: ('Gerritje Alida [sic] Scheurer', '1896', '1992', '1926')

Original: ~ 1936 W.C.A. Jansz (Pati, 1894-1962)
Parsed: ('W.C.A. Jansz (Pati, 1894-1962)', None, None, '1936')

Original: ~ (1) S. Fahrni † 

In [7]:

df_spouses[['spouse_name', 'birth_date', 'death_date', 'year_married']] = df_spouses['spouse'].apply(
    lambda x: pd.Series(parse_spouse_info(x))
)

# Show the first several rows to verify the extraction.
print("Updated Spouses Dataset Head:")
print(df_spouses.head(20))

Updated Spouses Dataset Head:
                                              spouse    id  \
0                        ~ 1935 Akke Reidinga † 1990   1.0   
1                      ~ 1918 Regina Horstman † 1988   4.0   
2                                    ~ Emma Reinmuth   5.0   
3                     ~ 1888 J.M. de Buisonjé † 1940   7.0   
4              ~ 1894 Maria Lambertha Gunning † 1939   8.0   
5                           ~ M.E. Hufschmied † 1982   9.0   
6                         ~ 1920 H.C. Dijkman † 1945  10.0   
7                                           ~ [n.a.]  11.0   
8                        ~ ± 1933 A. Ristjouw † 1990  13.0   
9                     ~ 1864 Chr.C. vd Linden † 1907  14.0   
10                      ~1907 M.M. Ver­hoe­ven †1961  16.0   
11                       ~ 1893 Jetske Ringma † 1948  18.0   
12                 ~ (1) 1920 Joh. Maria van Leeuwen  19.0   
13                        ~ (2) Clem. Sophie Barkley  19.0   
14                  ~ 1948 J.M. Schouten

In [8]:
output_spouses_file = '/Volumes/Extreme SSD/Python_Projects/NIAA Project/FINAL/spouses_FINAL.csv'
df_spouses.to_csv(output_spouses_file, index=False)
print(f"Updated spouses dataset saved as '{output_spouses_file}'.")

Updated spouses dataset saved as '/Volumes/Extreme SSD/Python_Projects/NIAA Project/FINAL/spouses_FINAL.csv'.
