In [8]:
import pandas as pd
import matplotlib.pyplot as plt

In [9]:
# %% [code]
import pandas as pd
import re

# Optionally use fuzzy matching if available.
try:
    from fuzzywuzzy import fuzz
    FUZZY_AVAILABLE = True
except ImportError:
    import difflib
    FUZZY_AVAILABLE = False

def is_ongehuwd(text, target="ongehuwd gebleven", threshold=70):
    """
    Determine whether a text indicates an 'ongehuwd gebleven' note,
    even if there are extra characters or minor misspellings.

    Strategy:
      1. Lowercase the text and check if both "ongehuwd" and "gebleven" appear.
      2. If not found exactly, use fuzzy matching (if available) with a lower threshold.
    
    Parameters:
      text (str): The text from the 'nama orang' field.
      target (str): The target phrase.
      threshold (int): For fuzzy matching, the minimum score required (0-100 scale).
    
    Returns:
      bool: True if the text is considered to match, False otherwise.
    """
    if not isinstance(text, str):
        return False
    text_lower = text.lower()
    # Basic substring check that allows extra characters and spaces.
    if "ongehuwd" in text_lower and "gebleven" in text_lower:
        return True
    
    # Fallback: fuzzy matching if the exact substrings are not found.
    if FUZZY_AVAILABLE:
        score = fuzz.partial_ratio(text_lower, target.lower())
        return score >= threshold
    else:
        # Use difflib as a fallback. difflib returns a ratio between 0 and 1.
        ratio = difflib.SequenceMatcher(None, text_lower, target.lower()).ratio()
        return ratio >= (threshold / 100.0)

# (Optional) Debug test cases for the helper function.
test_texts = [
    "Ongehuwd gebleven",
    "ongehuwd   gebleven",
    "ongehuwd- gebleven",
    "ongehuwd gebleeven",  # slight misspelling
    "nog iets anders",
    "Test: ongehuwd gebleven!",
]
print("Debug: is_ongehuwd() results for sample texts:")
for txt in test_texts:
    print(f"'{txt}':", is_ongehuwd(txt))

Debug: is_ongehuwd() results for sample texts:
'Ongehuwd gebleven': True
'ongehuwd   gebleven': True
'ongehuwd- gebleven': True
'ongehuwd gebleeven': True
'nog iets anders': False
'Test: ongehuwd gebleven!': True


In [11]:
# %% [code]
# Set the file paths for your CSV files.
bio_file = '/Volumes/Extreme SSD/Python_Projects/NIAA Project/FINAL Data (with annotations)/bio_data - bio_data.csv'
spouse_file = '/Volumes/Extreme SSD/Python_Projects/NIAA Project/FINAL Data (with annotations)/spouse_data - spouse_data.csv'

# Load the datasets.
bio_df = pd.read_csv(bio_file)
spouse_df = pd.read_csv(spouse_file)

# Display the first few rows of each to confirm the structure.
print("Bio Dataset:")
print(bio_df.head(), "\n")
print("Spouse Dataset:")
print(spouse_df.head())

Bio Dataset:
                            nama orang   geb/overl.  id  actual id Unnamed: 4
0              Aa, zr. W. van der (N)   1908-1942/5   0        NaN        NaN
1  Aalbers, dr. Joh. Godefr., arts (N)    1910-1992   1        NaN        NaN
2              Aalders, zr. Jacoba (N)    1910-1999   2        NaN        NaN
3                    Abeel, David (VS)    1804-1846   3        NaN        NaN
4          Abkoude, ds. F.N.M. van (N)    1895-1988   4        NaN        NaN 

Spouse Dataset:
                                  spouse  id
0            ~ 1935 Akke Reidinga † 1990   1
1          ~ 1918 Regina Horstman † 1988   4
2                        ~ Emma Reinmuth   5
3         ~ 1888 J.M. de Buisonjé † 1940   7
4  ~ 1894 Maria Lambertha Gunning † 1939   8


In [12]:
# %% [code]
# Define column names per the provided headers.
name_col = "nama orang"
id_col = "id"
actual_id_col = "actual id"

# Create a Boolean mask for rows with a non-null 'actual id'.
mask_actual_id = bio_df[actual_id_col].notnull()

# Process each mistaken row individually.
# (A debug print is included so you can verify each processed row. Remove if not needed.)
for idx, row in bio_df[mask_actual_id].iterrows():
    mistaken_text = row[name_col]
    target_id = row[actual_id_col]
    
    # Debug output to verify processing.
    # print(f"Processing row {idx}: mistaken_text='{mistaken_text}', target_id='{target_id}'")
    
    if isinstance(mistaken_text, str) and is_ongehuwd(mistaken_text):
        # For rows indicating "ongehuwd gebleven", add a new spouse row.
        new_spouse_row = {"id": target_id, "spouse": "ongehuwd gebleven"}
        spouse_df = pd.concat([spouse_df, pd.DataFrame([new_spouse_row])], ignore_index=True)
        # Debug statement:
        print(f"Added spouse row for actual id {target_id} with 'ongehuwd gebleven'")
    else:
        # Otherwise, append the mistaken text to the correct bio record.
        target_mask = bio_df[id_col] == target_id
        if target_mask.any():
            bio_df.loc[target_mask, name_col] = (
                bio_df.loc[target_mask, name_col].astype(str) + " " + mistaken_text
            )
            print(f"Appended text to bio row with id {target_id}")
        else:
            print(f"Warning: No matching bio row found for actual id {target_id}")

# Remove mistaken rows (those with non-null 'actual id') from the bio dataset.
bio_df_cleaned = bio_df[~mask_actual_id].reset_index(drop=True)

print("\nAfter processing:")
print("Cleaned Bio Dataset Head:")
print(bio_df_cleaned.head(), "\n")
print("Updated Spouse Dataset Head:")
print(spouse_df.head())

Appended text to bio row with id 39.0
Appended text to bio row with id 126.0
Appended text to bio row with id 153.0
Added spouse row for actual id 316.0 with 'ongehuwd gebleven'
Appended text to bio row with id 454.0
Appended text to bio row with id 459.0
Added spouse row for actual id 488.0 with 'ongehuwd gebleven'
Added spouse row for actual id 521.0 with 'ongehuwd gebleven'
Appended text to bio row with id 552.0
Added spouse row for actual id 559.0 with 'ongehuwd gebleven'
Added spouse row for actual id 567.0 with 'ongehuwd gebleven'
Added spouse row for actual id 642.0 with 'ongehuwd gebleven'
Appended text to bio row with id 651.0
Added spouse row for actual id 712.0 with 'ongehuwd gebleven'
Added spouse row for actual id 755.0 with 'ongehuwd gebleven'
Appended text to bio row with id 790.0
Added spouse row for actual id 828.0 with 'ongehuwd gebleven'
Added spouse row for actual id 851.0 with 'ongehuwd gebleven'
Added spouse row for actual id 864.0 with 'ongehuwd gebleven'
Added s

In [13]:
bio_df_cleaned.to_csv('/Volumes/Extreme SSD/Python_Projects/NIAA Project/FINAL/bio_data_cleaned.csv', index=False)
spouse_df.to_csv('/Volumes/Extreme SSD/Python_Projects/NIAA Project/FINAL/spouse_data_cleaned.csv', index=False)
print("Updated files have been saved as 'bio_updated.csv' and 'spouse_updated.csv'.")

Updated files have been saved as 'bio_updated.csv' and 'spouse_updated.csv'.


In [14]:
# %% [code]
# Load the dataset from CSV.
data_file = '/Volumes/Extreme SSD/Python_Projects/NIAA Project/FINAL Data (with annotations)/event_data_cleaned_with_gaps - Sheet1.csv'
df = pd.read_csv(data_file)

# Display the first few rows to verify the structure.
print("Original Dataset Head:")
print(df.head())

Original Dataset Head:
  organ.                               werkgebied en -soort  werkperiode  \
0    GZB  Rantepao (Sulsel), verpleegster † in Japanse i...    1932-1933   
1    NaN                                                NaN          NaN   
2    NZG                                               Deli    1937-1946   
3     SZ                    Bojonegoro, verpleegster in Zzh  1939-1942/6   
4  ABCFM                              Batavia, bij Medhurst         1831   

                                      bijzonderheden  id  
0  ~ 1933 F.R.O. Steller, contr. BB, z.v. hp E.T....   0  
1                                                NaN   0  
2                        1942-45 Japanse internering   1  
3  > dir. Centraal Zh in Hollan­dia (Dake 64) > S...   2  
4                                                NaN   3  


In [15]:
# %% [code]
# Replace empty strings (or strings with only whitespace) with NaN.
df_clean = df.replace(r'^\s*$', pd.NA, regex=True)

# Optional: display a summary to check that empty strings have been replaced.
print("Dataset after replacing empty strings:")
print(df_clean.head())

Dataset after replacing empty strings:
  organ.                               werkgebied en -soort  werkperiode  \
0    GZB  Rantepao (Sulsel), verpleegster † in Japanse i...    1932-1933   
1    NaN                                                NaN          NaN   
2    NZG                                               Deli    1937-1946   
3     SZ                    Bojonegoro, verpleegster in Zzh  1939-1942/6   
4  ABCFM                              Batavia, bij Medhurst         1831   

                                      bijzonderheden  id  
0  ~ 1933 F.R.O. Steller, contr. BB, z.v. hp E.T....   0  
1                                                NaN   0  
2                        1942-45 Japanse internering   1  
3  > dir. Centraal Zh in Hollan­dia (Dake 64) > S...   2  
4                                                NaN   3  


In [16]:
# %% [code]
# Create a boolean mask identifying rows where columns other than "id" are all missing.
# First, drop the "id" column and check for missing values across each row.
mask_empty_others = df_clean.drop(columns=['id']).isna().all(axis=1)

# Also ensure that the "id" column itself is not missing (meaning that the row contains an id).
mask_only_id = mask_empty_others & df_clean['id'].notna()

# Output number of rows that will be removed.
rows_to_remove = mask_only_id.sum()
print(f"Rows to remove (only id present): {rows_to_remove}")

# Keep only rows that do NOT match the "only id" condition.
df_filtered = df_clean[~mask_only_id].reset_index(drop=True)

# Show the cleaned dataset.
print("\nCleaned Dataset Head:")
print(df_filtered.head())

Rows to remove (only id present): 104

Cleaned Dataset Head:
    organ.                               werkgebied en -soort  werkperiode  \
0      GZB  Rantepao (Sulsel), verpleegster † in Japanse i...    1932-1933   
1      NZG                                               Deli    1937-1946   
2       SZ                    Bojonegoro, verpleegster in Zzh  1939-1942/6   
3    ABCFM                              Batavia, bij Medhurst         1831   
4  mandiri                       pred. gem. Temanggung, MJava    1925-1942   

                                      bijzonderheden  id  
0  ~ 1933 F.R.O. Steller, contr. BB, z.v. hp E.T....   0  
1                        1942-45 Japanse internering   1  
2  > dir. Centraal Zh in Hollan­dia (Dake 64) > S...   2  
3                                                NaN   3  
4                                       EndSumba 839   4  


In [17]:
# %% [code]
# Save the filtered dataframe to CSV.
df_filtered.to_csv('/Volumes/Extreme SSD/Python_Projects/NIAA Project/FINAL/event_data_cleaned.csv', index=False)
print("Cleaned dataset saved as 'event_data_cleaned.csv'.")

Cleaned dataset saved as 'event_data_cleaned.csv'.
