## Import necessary libraries

In [1]:
import os
from pathlib import Path

import docx2txt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandocfilters as pf
import seaborn as sns

## Read in the data

Load the source Excel workbook into a pandas DataFrame.

In [2]:

# Directory that holds the source workbook. Set the NIAA_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original location on the external drive is used, so behaviour is unchanged.
DATA_DIR = Path(
    os.environ.get(
        "NIAA_DATA_DIR",
        "/Volumes/Extreme SSD/Python_Projects/NIAA Project",
    )
)
RAW_WORKBOOK = DATA_DIR / "maart_2023_ALLE_ZENDINGSARBEIDERS_NEDERLANDS_individuals.xlsx"

# Read the first sheet of the workbook into the working DataFrame.
df = pd.read_excel(RAW_WORKBOOK)


In [3]:
# Preview the first five rows to check how the workbook was parsed.
df.head(5)

Unnamed: 0,nama orang,geb/overl.,organ.,werkgebied en -soort,werkperiode,bijzonderheden,Unnamed: 6
0,A,,,,,,
1,"Aa, zr. W. van der (N)",1908-1942/5,GZB,"Rantepao (Sulsel), verpleegster",1932-1933,"~ 1933 F.R.O. Steller, contr. BB, z.v. hp E.T....",
2,,,,† in Japanse internering?,,,
3,"Aalbers, dr. Joh. Godefr., arts (N)",1910-1992,NZG,Deli,1937-1946,1942-45 Japanse internering,
4,~ 1935 Akke Reidinga † 1990,,,,,,


## Convert the data in the first column to a string datatype


In [4]:
# Cast the name column to text so the .str accessor used in the following
# cells works on every row. Note that empty cells come out of this cast as the
# literal string 'nan' (row 2 in the preview has length 3 further down).
df['nama orang'] = df['nama orang'].astype(str)

# Report how many rows were loaded (the cast above does not change the count).
print(len(df))

3358


## Remove single-letter rows
Remove rows whose `nama orang` value is only a single capital letter, such as the `A` in row 0 above. These rows are not useful for the analysis.

In [5]:
# Matches a value that is exactly one capital letter, optionally surrounded
# by whitespace (e.g. the 'A' in row 0 of the preview).
pattern = r'^\s*[A-Z]\s*$'

# Boolean mask of the single-letter rows, then keep everything else.
single_letter_mask = df['nama orang'].str.match(pattern)
df = df[~single_letter_mask]

In [6]:
# Row count after dropping the single-letter rows.
print(len(df))

3334


In [7]:
# Print the character length of every entry in 'nama orang'
# (only the lengths are shown, not the values themselves).
name_lengths = df['nama orang'].str.len()
print(name_lengths)


1       23
2        3
3       35
4       27
5       23
        ..
3353    30
3354    22
3355    33
3356     3
3357     3
Name: nama orang, Length: 3334, dtype: int64


In [8]:
# Preview the first five rows again now that the single-letter rows are gone.
df.head(5)

Unnamed: 0,nama orang,geb/overl.,organ.,werkgebied en -soort,werkperiode,bijzonderheden,Unnamed: 6
1,"Aa, zr. W. van der (N)",1908-1942/5,GZB,"Rantepao (Sulsel), verpleegster",1932-1933,"~ 1933 F.R.O. Steller, contr. BB, z.v. hp E.T....",
2,,,,† in Japanse internering?,,,
3,"Aalbers, dr. Joh. Godefr., arts (N)",1910-1992,NZG,Deli,1937-1946,1942-45 Japanse internering,
4,~ 1935 Akke Reidinga † 1990,,,,,,
5,"Aalders, zr. Jacoba (N)",1910-1999,SZ,"Bojonegoro, verpleegster in Zzh",1939-1942/6,> dir. Centraal Zh in Hollan­dia (Dake 64) > S...,


## Merging Rows That Start with `~` or Are `NaN`

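The following code walks through the DataFrame row by row and merges a row into the preceding one when its first-column entry is either `NaN` or starts with `~`. Such rows are treated as a continuation of the previous record: their values are appended to the end of that record, so a merged record becomes wider instead of overwriting existing cells. Shorter records are padded with empty values afterwards so that all records have the same number of columns.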



In [9]:
def _is_continuation(first_val):
    """Return True when a row continues the previous record.

    A row is a continuation when its first-column value is

    * a real missing value (``pd.isna``),
    * a string that starts with ``~`` after stripping whitespace, or
    * the placeholder string ``'nan'`` (any casing), which is what a missing
      cell becomes once the column has been cast with ``astype(str)``.

    The placeholder is compared for equality, not as a prefix, so that a real
    name that merely begins with "Nan" is kept as its own record instead of
    being merged into the previous one.
    """
    if pd.isna(first_val):
        return True
    if not isinstance(first_val, str):
        return False
    text = first_val.strip()
    return text.startswith("~") or text.lower() == "nan"


# Completed records; each is a flat list of cell values.
new_rows = []

# Keep track of the "in-progress" record
current_row = []

for _, row in df.iterrows():
    if _is_continuation(row.iloc[0]):
        # Append this row's cells to the right of the record in progress.
        current_row.extend(row.values)
    else:
        # A new record starts here: store the one in progress (if any) ...
        if current_row:
            new_rows.append(current_row)
        # ... and begin a fresh record with this row's cells.
        current_row = list(row.values)

# After the loop, don't forget to add the last accumulated record if it exists
if current_row:
    new_rows.append(current_row)

# 'new_rows' is a list of lists of potentially different lengths. Pad every
# record to the longest one so they fit a rectangular DataFrame. default=0
# keeps an empty input from raising ValueError in max().
max_len = max((len(r) for r in new_rows), default=0)
padded_rows = [r + [None] * (max_len - len(r)) for r in new_rows]

# Create a new DataFrame from the padded records (columns are numbered 0..n-1)
df_combined = pd.DataFrame(padded_rows)


In [11]:
# Save the merged table as an Excel workbook (path is relative to the current
# working directory), keeping the row index as the first column.
df_combined.to_excel(
    'ALLE_ZENDINGSARBEIDERS_CLEANED_data_FINAL.xlsx',
    index=True,
)