In [1]:
# This script finds the path to the 'data' folder in the parent directory of the current notebook.
import os

# Get the directory of the current notebook
notebook_directory = os.getcwd()
print(f"Current notebook directory: {notebook_directory}")

# Go up one level to the main project directory
parent_directory = os.path.dirname(notebook_directory)
print(f"Parent directory: {parent_directory}")

# Specify the path to the 'data' folder from the parent directory
data_folder_path = os.path.join(parent_directory, 'data')
print(f"Data directory: {data_folder_path}")

Current notebook directory: c:\08_AHFID\gbv-predictive-tool\notebooks
Parent directory: c:\08_AHFID\gbv-predictive-tool
Data directory: c:\08_AHFID\gbv-predictive-tool\data


In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np

 # Set pd.set_option to show all columns in the DataFrame
pd.set_option('display.max_columns', None) 

In [3]:
# Load the dataset from the 'data' folder
file_path = os.path.join(data_folder_path, 'NGBV Dashboard dataset.xlsx')
df = pd.read_excel(file_path, header=2)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44360 entries, 0 to 44359
Data columns (total 78 columns):
 #   Column                                                       Non-Null Count  Dtype  
---  ------                                                       --------------  -----  
 0   Time Stamp Date                                              44350 non-null  object 
 1   Time Stamp Time                                              44350 non-null  object 
 2   Type_of_Organisation                                         44350 non-null  object 
 3   Location of Organisation State                               44350 non-null  object 
 4   Organisation LGA                                             38944 non-null  object 
 5   Contact Channel                                              41821 non-null  object 
 6   Contact Channel Other                                        2515 non-null   object 
 7   Was the Violence Fatal                                       44350 non-null 

In [4]:
# Display the original shape and columns of the DataFrame
print("The original shape of the DataFrame:", df.shape)
print("The original columns of the DataFrame:", df.columns.tolist())

The original shape of the DataFrame: (44360, 78)
The original columns of the DataFrame: ['Time Stamp Date', 'Time Stamp Time', 'Type_of_Organisation', 'Location of Organisation State', 'Organisation LGA', 'Contact Channel', 'Contact Channel Other', 'Was the Violence Fatal', 'Who Reported the Incident', 'WHO REPORTED THE INCIDENT_OTHER', 'Sex of survivor', 'Age of survivor', 'MARITAL STATUS', 'EMPLOYMENT STATUS OF PARENT/GUARDIAN', 'EMPLOYMENT STATUS OF PARENT/GUARDIAN_OTHERS', 'EMPLOYMENT STATUS OF SURVIVOR/VICTIM', 'EMPLOYMENT STATUS OF SURVIVOR/VICTIM_OTHERS', 'VULNERABLE POPULATION_Person living with disability', 'VULNERABLE POPULATION_PLHIV', 'VULNERABLE POPULATION_Female sex worker', 'VULNERABLE POPULATION_IDP', 'VULNERABLE POPULATION_DRUG USER', 'VULNERABLE POPULATION_WIDOW', 'VULNERABLE POPULATION_OUT OF SCHOOL CHILD', 'VULNERABLE POPULATION_MINOR', 'VULNERABLE POPULATION_House maids/domestic staff', 'VULNERABLE POPULATION_CHILD APPRENTICE', 'VULNERABLE POPULATION_ORPHANS', 'VUL

In [5]:
# Create a copy of the dataframe to avoid SettingWithCopyWarning
df_copy = df.copy()

## Data Cleaning and Preprocessing

### **Merge several related columns into a single, consolidated column:**

**General Strategy:**
*   Identify Groups: Group the columns that need to be merged.
*   Define New Column Name: Decide on the name for the consolidated column.
*   Combine Values: For each row, iterate through the columns in a group
*   Create New Column: Assign the combined values to the new consolidated column.
*   Drop Old Columns: Remove the original, now redundant, columns.

In [6]:
def combine_columns(df, new_col_name, cols_to_combine, separator=','):
    """
    Combine multiple columns into a new column with a separator, ignoring NaN.

    Args:
        df: The DataFrame to modify
        new_col_name: The new name of the combined column
        cols_to_combine (list): The list of the column names to combine
        separator: The string to use to join the multiple non-NaN values

    Returns:
        The modified DataFrame with the new column and old ones dropped
    """
    missing_cols = [col for col in cols_to_combine if col not in df.columns]
    if missing_cols:
        raise ValueError(f"The following columns do not exist in the DataFrame: {missing_cols}")

    cols_to_combine = [col for col in cols_to_combine if col in df.columns]
    if not cols_to_combine:
        raise ValueError("No valid columns to combine")

    df[new_col_name] = df[cols_to_combine].apply(
        lambda row: separator.join(row.dropna().astype(str)),
        axis=1
    )

    df[new_col_name] = df[new_col_name].replace('', pd.NA)

    df = df.drop(columns=cols_to_combine)

    return df

In [7]:
# 1. Location of Organisation
df = combine_columns(df, 'Location of Organisation', ['Location of Organisation State', 'Organisation LGA'])

# 2. Who reported the incident
df = combine_columns(df, 'WHO REPORTED THE INCIDENT', ['Who Reported the Incident', 'WHO REPORTED THE INCIDENT_OTHER'])

# 3. Contact Channel
df = combine_columns(df, 'Contact Channel', ['Contact Channel', 'Contact Channel Other'])

# 4. Parent/Guardian Employment Status
df = combine_columns(df, 'PARENT/GUARDIAN EMPLOYMENT STATUS', ['EMPLOYMENT STATUS OF PARENT/GUARDIAN', 'EMPLOYMENT STATUS OF PARENT/GUARDIAN_OTHERS'])

# 5. Survivor/Victim Employment Status
df = combine_columns(df, 'SURVIVOR/VICTIM EMPLOYMENT STATUS', ['EMPLOYMENT STATUS OF SURVIVOR/VICTIM', 'EMPLOYMENT STATUS OF SURVIVOR/VICTIM_OTHERS'])

# 6. Educational Status
df = combine_columns(df, 'EDUCATIONAL STATUS', ['EDUCATIONAL STATUS', 'EDUCATIONAL STATUS_OTHER'])

# 7. Who does the survivor/victim live with
df = combine_columns(df, 'WHO SURVIVOR/VICTIM LIVE WITH', ['WHO DOES THE SURVIVOR/VICTIM LIVE WITH', 'WHO DOES THE SURVIVOR/VICTIM LIVE WITH_OTHER'])

# 8. Perpetrator's Sex
df = combine_columns(df, 'SEX OF PERPETRATOR', ['SEX OF PERPETRATOR_1', 'SEX OF PERPETRATOR_2', 'SEX OF PERPETRATOR_3', 'SEX OF PERPETRATOR_OTHER'])

# 9. Perpetrator's Age
df = combine_columns(df, 'AGE OF PERPETRATOR', ['AGE OF PERPETRATOR_1', 'AGE OF PERPETRATOR_2', 'AGE OF PERPETRATOR_3', 'AGE OF PERPETRATOR_OTHER'])

# 10 Survivor/Victim's Relationship with Perpetrator
df = combine_columns(
    df,
    "RELATIONSHIP WITH PERPETRATOR",
    [
        "SURVIVOR/VICTIM'S RELATIONSHIP WITH PERPETRATOR_1",
        "SURVIVOR/VICTIM'S RELATIONSHIP WITH PERPETRATOR_2",
        "SURVIVOR/VICTIM'S RELATIONSHIP WITH PERPETRATOR_3",
        "SURVIVOR/VICTIM'S RELATIONSHIP WITH PERPETRATOR_OTHER"
    ]
)

# # 11. Location of Violence
# df = combine_columns(df, 'LOCATION OF VIOLENCE', ['LOCATION OF VIOLENCE (STATE)', 'LOCATION OF VIOLENCE (L.G.A)', 'LOCATION OF VIOLENCE (WARD)'])

# 12. Place of Incident
df = combine_columns(df, 'PLACE OF INCIDENT', ['PLACE OF INCIDENT', 'PLACE OF INCIDENT_OTHER'])


print("\nProcessed DataFrame shape:", df.shape)
print("\nNew DataFrame Columns:")
print(df.columns.tolist())


Processed DataFrame shape: (44360, 58)

New DataFrame Columns:
['Time Stamp Date', 'Time Stamp Time', 'Type_of_Organisation', 'Was the Violence Fatal', 'Sex of survivor', 'Age of survivor', 'MARITAL STATUS', 'VULNERABLE POPULATION_Person living with disability', 'VULNERABLE POPULATION_PLHIV', 'VULNERABLE POPULATION_Female sex worker', 'VULNERABLE POPULATION_IDP', 'VULNERABLE POPULATION_DRUG USER', 'VULNERABLE POPULATION_WIDOW', 'VULNERABLE POPULATION_OUT OF SCHOOL CHILD', 'VULNERABLE POPULATION_MINOR', 'VULNERABLE POPULATION_House maids/domestic staff', 'VULNERABLE POPULATION_CHILD APPRENTICE', 'VULNERABLE POPULATION_ORPHANS', 'VULNERABLE POPULATION_NOT APPLICABLE', 'VULNERABLE POPULATION_OTHER', 'DOES THE SURVIVOR/VICTIM LIVE ALONE', 'ESTIMATED AVERAGE MONTHLY INCOME', 'DATE OF INCIDENT', 'DATE REPORTED', 'LOCATION OF VIOLENCE (STATE)', 'LOCATION OF VIOLENCE (L.G.A)', 'LOCATION OF VIOLENCE (WARD)', 'TIME OF THE DAY THAT INCIDENT TOOK PLACE', 'TYPE OF VIOLENCE_SEXUAL ASSAULT', 'TYPE O

#### **Collapse multiple binary columns into a single column "VULNERABLE POPULATION" and "TYPE OF VIOLENCE"**
**What should be done:**
* Loop through the binary columns.
* For each row, collect labels of columns marked “Yes”.
* Store them as a comma-separated list in a new column.


In [8]:
#@title Collapse multiple "VULNERABLE POPULATION_" columns into a single "VULNERABLE POPULATION"
def combine_vulnerable_population_columns(df):
    # Step 1: Select relevant columns
    vuln_cols = [col for col in df.columns if col.startswith("VULNERABLE POPULATION_")]

    # Step 2: Strip the prefix to get clean category names
    category_map = {col: col.replace("VULNERABLE POPULATION_", "").strip() for col in vuln_cols}

    # Step 3: Apply row-wise logic
    def get_vuln_categories(row):
        categories = [category_map[col] for col in vuln_cols if str(row[col]).strip().upper() == "YES"]
        return ", ".join(categories) if categories else pd.NA

    # Step 4: Create the new column
    df["VULNERABLE POPULATION"] = df.apply(get_vuln_categories, axis=1)

    # Optional: Drop the original binary columns
    df = df.drop(columns=vuln_cols)

    return df

df = combine_vulnerable_population_columns(df)

print("\nProcessed DataFrame shape:", df.shape)
print("\nNew DataFrame Columns:")
print(df.columns.tolist())


Processed DataFrame shape: (44360, 46)

New DataFrame Columns:
['Time Stamp Date', 'Time Stamp Time', 'Type_of_Organisation', 'Was the Violence Fatal', 'Sex of survivor', 'Age of survivor', 'MARITAL STATUS', 'DOES THE SURVIVOR/VICTIM LIVE ALONE', 'ESTIMATED AVERAGE MONTHLY INCOME', 'DATE OF INCIDENT', 'DATE REPORTED', 'LOCATION OF VIOLENCE (STATE)', 'LOCATION OF VIOLENCE (L.G.A)', 'LOCATION OF VIOLENCE (WARD)', 'TIME OF THE DAY THAT INCIDENT TOOK PLACE', 'TYPE OF VIOLENCE_SEXUAL ASSAULT', 'TYPE OF VIOLENCE_PHYSICAL ASSAULT', 'TYPE OF VIOLENCE_FINANCIAL/ECONOMIC', 'TYPE OF VIOLENCE_ONLINE/CYBER', 'TYPE OF VIOLENCE_RAPE', 'TYPE OF VIOLENCE_DEFILEMENT', 'TYPE OF VIOLENCE_FORCED MARRIAGE', 'TYPE OF VIOLENCE_DENIAL OF RESOURCES', 'TYPE OF VIOLENCE_PSYCHOLOGICAL/EMOTIONAL ABUSE', 'TYPE OF VIOLENCE_FEMALE GENITAL MUTILATION', 'TYPE OF VIOLENCE_VIOLATION OF PROPERTY & INHERITANCE RIGHTS', 'TYPE OF VIOLENCE_CHILD ABUSE AND NEGLECT', 'TYPE OF VIOLENCE_OTHER', 'DOES THE SURVIVOR WANT ACCESS TO J

In [9]:
#@title Collapse multiple "TYPE OF VIOLENCE_" columns into a single "TYPE OF VIOLENCE"
def combine_violence_types(df):
    # Step 1: Get all relevant columns
    violence_cols = [col for col in df.columns if col.startswith("TYPE OF VIOLENCE_")]

    # Step 2: Map column names to clean labels
    type_map = {col: col.replace("TYPE OF VIOLENCE_", "").strip() for col in violence_cols}

    # Step 3: Apply row-wise logic
    def get_violence_types(row):
        types = [type_map[col] for col in violence_cols if str(row[col]).strip().upper() == "YES"]
        return ", ".join(types) if types else pd.NA

    # Step 4: Add new column
    df["TYPE OF VIOLENCE"] = df.apply(get_violence_types, axis=1)

    # Step 5: Drop old flag columns
    df = df.drop(columns=violence_cols)

    return df

df = combine_violence_types(df)

print("\nProcessed DataFrame shape:", df.shape)
print("\nNew DataFrame Columns:")
print(df.columns.tolist())


Processed DataFrame shape: (44360, 34)

New DataFrame Columns:
['Time Stamp Date', 'Time Stamp Time', 'Type_of_Organisation', 'Was the Violence Fatal', 'Sex of survivor', 'Age of survivor', 'MARITAL STATUS', 'DOES THE SURVIVOR/VICTIM LIVE ALONE', 'ESTIMATED AVERAGE MONTHLY INCOME', 'DATE OF INCIDENT', 'DATE REPORTED', 'LOCATION OF VIOLENCE (STATE)', 'LOCATION OF VIOLENCE (L.G.A)', 'LOCATION OF VIOLENCE (WARD)', 'TIME OF THE DAY THAT INCIDENT TOOK PLACE', 'DOES THE SURVIVOR WANT ACCESS TO JUSTICE', 'OUTCOME OF PROSECUTION', 'DATE JUSTICE WAS RECEIVED', 'HAS THE CASE BEEN CLOSED', 'WHO CLOSED THE CASE?', 'DATE CASE WAS CLOSED', 'APPROVED BY ORG. SUPERVISOR_DATE', 'APPROVED BY LGA SUPERVISOR_DATE', 'APPROVED BY STATE SUPERVISOR_DATE', 'Location of Organisation', 'WHO REPORTED THE INCIDENT', 'PARENT/GUARDIAN EMPLOYMENT STATUS', 'SURVIVOR/VICTIM EMPLOYMENT STATUS', 'WHO SURVIVOR/VICTIM LIVE WITH', 'SEX OF PERPETRATOR', 'AGE OF PERPETRATOR', 'RELATIONSHIP WITH PERPETRATOR', 'VULNERABLE POPU

#### **Drop 'Time Stamp Date' and 'Time Stamp Time'**

* 'Time Stamp Date' and 'Time Stamp Time' were the date and time the data were inputed. It is irrelevant to the incident data

In [10]:
#@title Drop 'Time Stamp Date' and 'Time Stamp Time'
df = df.drop(columns=['Time Stamp Date', 'Time Stamp Time'])
df.columns.tolist()

['Type_of_Organisation',
 'Was the Violence Fatal',
 'Sex of survivor',
 'Age of survivor',
 'MARITAL STATUS',
 'DOES THE SURVIVOR/VICTIM LIVE ALONE',
 'ESTIMATED AVERAGE MONTHLY INCOME',
 'DATE OF INCIDENT',
 'DATE REPORTED',
 'LOCATION OF VIOLENCE (STATE)',
 'LOCATION OF VIOLENCE (L.G.A)',
 'LOCATION OF VIOLENCE (WARD)',
 'TIME OF THE DAY THAT INCIDENT TOOK PLACE',
 'DOES THE SURVIVOR WANT ACCESS TO JUSTICE',
 'OUTCOME OF PROSECUTION',
 'DATE JUSTICE WAS RECEIVED',
 'HAS THE CASE BEEN CLOSED',
 'WHO CLOSED THE CASE?',
 'DATE CASE WAS CLOSED',
 'APPROVED BY ORG. SUPERVISOR_DATE',
 'APPROVED BY LGA SUPERVISOR_DATE',
 'APPROVED BY STATE SUPERVISOR_DATE',
 'Location of Organisation',
 'WHO REPORTED THE INCIDENT',
 'PARENT/GUARDIAN EMPLOYMENT STATUS',
 'SURVIVOR/VICTIM EMPLOYMENT STATUS',
 'WHO SURVIVOR/VICTIM LIVE WITH',
 'SEX OF PERPETRATOR',
 'AGE OF PERPETRATOR',
 'RELATIONSHIP WITH PERPETRATOR',
 'VULNERABLE POPULATION',
 'TYPE OF VIOLENCE']

In [11]:
# df.to_csv(os.path.join(data_folder_path, 'processed_data.csv'), index=False)
# print("Data saved to 'processed_data.csv' in the data folder.")

#### 'Was the Violence Fatal': For more descriptive and contextually meaningful than 'Yes' and 'No', and standardised terms that are consistent

* Map 'Yes' to 'Fatal', 'No' to 'Non-fatal'
* Combine 'NotApplicable' and NaN into a new category, 'Unknown' 

In [12]:
# Map 'Yes' and 'No' to more descriptive terms in 'Was the Violence Fatal' and handle 'NotApplicable' and NaN

df['Was the Violence Fatal'] = df['Was the Violence Fatal'].replace({
    'Yes': 'Fatal',
    'No': 'Non-fatal',
    'NotApplicable': 'Unknown'
})
df['Was the Violence Fatal'] = df['Was the Violence Fatal'].fillna('Unknown')

#  Check the new value counts
df['Was the Violence Fatal'].value_counts(dropna=False)

Was the Violence Fatal
Non-fatal    42179
Fatal         2132
Unknown         49
Name: count, dtype: int64

#### 'Age of survivor'
* Correct Unrealistic Values: Values like 3434 were assumed to be typos, and only the first two digits were kept to fix the data while preserving the intended information.
* Handle Missing Ages: The value 0.0 was converted to NaN because it is not a valid age.
* Impute NaN Values: All missing NaN values were filled with the median age, which is a statistically sound method to handle missing data without skewing the distribution.

In [13]:
# Correct the typo values by keeping only the first two digits
df['Age of survivor'] = df['Age of survivor'].astype(str).str[:2]

# Use pd.to_numeric to convert the column to a number, coercing errors to NaN
df['Age of survivor'] = pd.to_numeric(df['Age of survivor'], errors='coerce')

# Replace "nan" strings with NumPy's NaN
df['Age of survivor'] = df['Age of survivor'].replace("nan", np.nan)

# Set an upper age limit to handle any other potential errors
upper_age_limit = 100
df.loc[df['Age of survivor'] > upper_age_limit, 'Age of survivor'] = np.nan

# Handle the 0.0 values by replacing them with NaN
df['Age of survivor'] = df['Age of survivor'].replace(0.0, np.nan)

# Fill the remaining NaN values with the median age
median_age = df['Age of survivor'].median()
df['Age of survivor'] = df['Age of survivor'].fillna(median_age)

#### 'MARITAL STATUS'
1. Consolidated 'Never married' and 'Single': The categories were merged into a single 'Single' label for simplicity and to combine similar statuses, which is good for analysis.

2. Consolidated 'Other' and NaN: Both were merged into a single 'Unknown' category to handle unclassified data consistently without losing any information.

In [14]:
# Merge 'Never married' and 'Single' to 'Single'
df['MARITAL STATUS'] = df['MARITAL STATUS'].replace({'Never married': 'Single'})

# Merge 'Other' and NaN to 'Unknown'
df['MARITAL STATUS'] = df['MARITAL STATUS'].replace('Other', 'Unknown')
df['MARITAL STATUS'] = df['MARITAL STATUS'].fillna('Unknown')

# Check the new value counts
df['MARITAL STATUS'].value_counts(dropna=False)

MARITAL STATUS
Married/cohabiting    18136
Single                12337
Unknown               10396
Divorced/separated     2582
Widowed                 909
Name: count, dtype: int64

#### 'WHO SURVIVOR/VICTIM LIVE WITH' and 'DOES THE SURVIVOR/VICTIM LIVE ALONE'
1. Conditional Imputation: We filled missing 'WHO SURVIVOR/VICTIM LIVE WITH' values based on 'DOES THE SURVIVOR/VICTIM LIVE ALONE' to infer if the survivor lived 'Alone' or with 'Parent/guardian', leveraging existing information.
2. Fill Remaining Missing: Any remaining NaN values in 'WHO SURVIVOR/VICTIM LIVE WITH' were labeled 'Unknown' to ensure no missing data and provide a clear category for unstated living situations.
3. Standardize and Consolidate Categories: Various spellings and similar categories (e.g., 'Mother', 'MOTHER', 'Grandmother', 'Aunt') were mapped to a unified set of broader categories (e.g., 'PARENT/GUARDIAN', 'SPOUSE/PARTNER', 'CHILDREN') to reduce noise and improve consistency for analysis.
4. Drop Redundant Column: 'DOES THE SURVIVOR/VICTIM LIVE ALONE' was dropped as its information was successfully transferred and consolidated into 'WHO SURVIVOR/VICTIM LIVE WITH', simplifying the dataset.

In [15]:
# a. Compare 'WHO SURVIVOR/VICTIM LIVE WITH' and 'DOES THE SURVIVOR/VICTIM LIVE ALONE'
# If 'DOES THE SURVIVOR/VICTIM LIVE ALONE' is 'Yes' and 'WHO SURVIVOR/VICTIM LIVE WITH' is NaN, replace NaN with 'Alone'
df.loc[(df['DOES THE SURVIVOR/VICTIM LIVE ALONE'] == 'Yes') & (df['WHO SURVIVOR/VICTIM LIVE WITH'].isna()),
       'WHO SURVIVOR/VICTIM LIVE WITH'] = 'Alone'

# If 'DOES THE SURVIVOR/VICTIM LIVE ALONE' is 'No' and 'WHO SURVIVOR/VICTIM LIVE WITH' is NaN, replace NaN with 'Parent/guardian'
df.loc[(df['DOES THE SURVIVOR/VICTIM LIVE ALONE'] == 'No') & (df['WHO SURVIVOR/VICTIM LIVE WITH'].isna()),
       'WHO SURVIVOR/VICTIM LIVE WITH'] = 'Parent/guardian'

# b. Replace remaining NaN in 'WHO SURVIVOR/VICTIM LIVE WITH' with 'Unknown'
df['WHO SURVIVOR/VICTIM LIVE WITH'] = df['WHO SURVIVOR/VICTIM LIVE WITH'].fillna('Unknown')

# c. Properly correct those values with different spellings to make them a single value
# Convert to string and uppercase for consistent matching, then map
df['WHO SURVIVOR/VICTIM LIVE WITH'] = df['WHO SURVIVOR/VICTIM LIVE WITH'].astype(str).str.upper().str.strip()

# Create a mapping for common misspellings and variations
mapping = {
    'PARENT/GUARDIAN': 'PARENT/GUARDIAN',
    'PARENTS': 'PARENT/GUARDIAN',
    'MOTHER': 'PARENT/GUARDIAN',
    'M0THER': 'PARENT/GUARDIAN',
    'FATHER': 'PARENT/GUARDIAN',
    'PARENT': 'PARENT/GUARDIAN',
    'BIOLOGICAL/STEP MOTHER': 'PARENT/GUARDIAN',
    'MOTHER AND GRAND MOTHER': 'PARENT/GUARDIAN',
    'MOTHER AND STEP FATHER': 'PARENT/GUARDIAN',
    'GRANDMOTHER': 'PARENT/GUARDIAN',
    'GRAND MOTHER': 'PARENT/GUARDIAN',
    'GRANDMA': 'PARENT/GUARDIAN',
    'GRAD MOTHER': 'PARENT/GUARDIAN',
    'GRANDPARENTS': 'PARENT/GUARDIAN',
    'GREAT GRANDMOTHER': 'PARENT/GUARDIAN',
    'GRANDPA': 'PARENT/GUARDIAN',
    'AUNT': 'PARENT/GUARDIAN',
    'AUNTY': 'PARENT/GUARDIAN',
    'AUNTY LANDA\'S HOME': 'PARENT/GUARDIAN',
    'AUNT ': 'PARENT/GUARDIAN',
    'AUNT AND UNCLE': 'PARENT/GUARDIAN',
    'UNCLE': 'PARENT/GUARDIAN',
    'GUARDIAN': 'PARENT/GUARDIAN',
    'BOSS ACTING AS GUARDIAN': 'PARENT/GUARDIAN',
    'BOSS ACTING AS A GUARDIAN': 'PARENT/GUARDIAN',
    'GUARDIAN/BOSS': 'PARENT/GUARDIAN',
    'STEP MOTHER': 'PARENT/GUARDIAN',
    'STEP SISTER': 'PARENT/GUARDIAN',
    'STEP SISTER ': 'PARENT/GUARDIAN',
    'SISTER': 'PARENT/GUARDIAN',
    'SIBLING': 'PARENT/GUARDIAN',
    'BROTHER': 'PARENT/GUARDIAN',
    'HALF BROTHER': 'PARENT/GUARDIAN',
    'ELDER SISTER': 'PARENT/GUARDIAN',
    'MOTHERS FRIEND': 'PARENT/GUARDIAN',
    'PATERNAL STEP GRANDMA': 'PARENT/GUARDIAN',
    'IMMEDIATE FAMILY': 'PARENT/GUARDIAN',
    'FOSTER FAMILY': 'PARENT/GUARDIAN',
    'TRAINER': 'PARENT/GUARDIAN', # Could be a guardian/mentor
    'GUILDANCE': 'PARENT/GUARDIAN',
    'MATERNAL GRANDPARENT': 'PARENT/GUARDIAN', 
    'GRANDPARENT': 'PARENT/GUARDIAN',
    'FATHER(MOTHER IS LATE)': 'PARENT/GUARDIAN',
    'GROUNDMOTHER': 'PARENT/GUARDIAN',
    'PATERNAL GRANDMOTHER': 'PARENT/GUARDIAN',
    'GREAT AUNT': 'PARENT/GUARDIAN',
    'HER MOTHER': 'PARENT/GUARDIAN',
    'FOSTER MOTHER': 'PARENT/GUARDIAN',
    'PARENT AND SIBLINGS': 'PARENT/GUARDIAN',
    'HER AUNT': 'PARENT/GUARDIAN',
    'PATERNAL GRANDMA': 'PARENT/GUARDIAN',
    'WITH THEIR MOTHER': 'PARENT/GUARDIAN',
    'MOTHER AND SIBLING': 'PARENT/GUARDIAN',
    'MOTHER AND SIBLINGS': 'PARENT/GUARDIAN',
    'WITH HER MOTHER' : 'PARENT/GUARDIAN',
    'MOTHER, GRANDMOTHER AND TWO YOUNG UNCLES': 'PARENT/GUARDIAN',

    'SPOUSE/COHABITING': 'SPOUSE/PARTNER',
    'SPOUSE': 'SPOUSE/PARTNER',
    'HUSBAND': 'SPOUSE/PARTNER',
    'HUSBAND AND CHILDREN': 'SPOUSE/PARTNER',
    'HUSBAND ': 'SPOUSE/PARTNER',
    'OTHER SPOUSE': 'SPOUSE/PARTNER',
    'PARTNER': 'SPOUSE/PARTNER',
    'BOYFRIEND': 'SPOUSE/PARTNER',
    'SEXUAL PARTNER': 'SPOUSE/PARTNER',
    'COHABITING': 'SPOUSE/PARTNER',
    'FORCED MARRIAGE': 'SPOUSE/PARTNER',
    'CO-HABITING': 'SPOUSE/PARTNER',
    'HUSBAND HOUSE': 'SPOUSE/PARTNER',
    'OLD HUSBAND HOUSE': 'SPOUSE/PARTNER',
    'HUSBAND FRIEND': 'SPOUSE/PARTNER',
    'INTIMATE PARTNER RELATIVE': 'SPOUSE/PARTNER',
    'INTIMATE PARTNER': 'SPOUSE/PARTNER',
    'FORMER INTIMATE PARTNER ': 'SPOUSE/PARTNER',
    'FORMER INTIMATE PARTNER': 'SPOUSE/PARTNER',
    'CURRENT INTIMATE PARTNERS ': 'SPOUSE/PARTNER',
    'EX-HUSBAND': 'SPOUSE/PARTNER',
    'NEW PARTNER': 'SPOUSE/PARTNER',
    'WIFE & CHILDREN (FAMILY)': 'SPOUSE/PARTNER',
    'SPOUSE AND CHILD': 'SPOUSE/PARTNER',
    'SPOUSE/CHILDREN': 'SPOUSE/PARTNER',
    'SPOUSE /HIS FAMILY': 'SPOUSE/PARTNER',
    'GOT MARRIED TO ANOTHER HUSBAND ': 'SPOUSE/PARTNER',
    'BABY DADDY': 'SPOUSE/PARTNER', # Could be partner, but context of 'WHO LIVE WITH' might make it unknown if not primary.
    'SPOUS': 'SPOUSE/PARTNER',
    'GIRLFRIEND': 'SPOUSE/PARTNER',
    'A GIRL FRIEND': 'SPOUSE/PARTNER',
    'HUSBAND  HOUSE': 'SPOUSE/PARTNER',
    'CURRENT INTIMATE PARTNERS': 'SPOUSE/PARTNER',
    'COHIBITING': 'SPOUSE/PARTNER', # If not spouse/partner, then unknown

    'CHILDREN': 'CHILDREN',
    'WITH HER TWO YOUNGER BROTHERS': 'CHILDREN',
    'WITH HER CHILDREN': 'CHILDREN',
    'HER CHILDREN': 'CHILDREN',
    'SON': 'CHILDREN',
    'CHILD\'S MOTHER': 'CHILDREN',
    'HER DAUGHTER': 'CHILDREN',
    'CHILDREN AND RELATIVES': 'CHILDREN',
    'WITH FOUR CHILDREN': 'CHILDREN',
    'THREE CHILDREN': 'CHILDREN',
    'WITH THEIR SIX CHILDREN': 'CHILDREN',
    'WITH THEIR THREE CHILDREN': 'CHILDREN',
    'WITH SURVIVORS (CHILDREN)': 'CHILDREN',
    'WITH CHILD': 'CHILDREN',
    'CHILDREEN': 'CHILDREN',
    'WITH CHILDREN': 'CHILDREN',
    'ALONE WITH CHILD': 'CHILDREN',
    'CHILD': 'CHILDREN',
    'CHILDREN WITH MOTHER': 'CHILDREN',
    'WITH THREE CHILDREN': 'CHILDREN',
    'FOUR OF THEIR CHILDREN': 'CHILDREN',
    'LIVES WITH HER CHILDREN ': 'CHILDREN',
    'DAUGHTER ': 'CHILDREN',
    'THE CHILDREN ': 'CHILDREN',
    'WITH HER CHILD': 'CHILDREN',
    'KIDS AND IN-LAWS': 'CHILDREN',
    'GRAND CHILDREN': 'CHILDREN',
    'LIVING WITH HER 3 CHILDREN': 'CHILDREN',
    'WITH THE CHILDREN': 'CHILDREN',
    'SPOUSE AND CHILDREN': 'CHILDREN',
    'CHILDREN AND IN-LAW ': 'CHILDREN',
    'CHILDREN ': 'CHILDREN',
    'CHILDREN AND RELATIVES CHILDREN': 'CHILDREN',
    'DAUGHTER': 'CHILDREN',
    'THE CHILDREN':'CHILDREN',
    'SINGLE MOTHER': 'CHILDREN',
    'LIVES WITH HER CHILDREN': 'CHILDREN',
    'CHILDREN AND IN-LAW': 'CHILDREN',
    'LIVING WITH CHILDREN': 'CHILDREN', # Already covered, but good to double check
    'WITH CHILDREN': 'CHILDREN', # Already covered
    'SON ': 'CHILDREN',
    
    
    
    'SISTER AND HER HUSBAND': 'RELATIVE/FRIEND', # Assuming this refers to living with sister's family including HUSBAND
    'RELATIVE': 'RELATIVE/FRIEND',
    'FRIENDS': 'RELATIVE/FRIEND',
    'A FRIEND': 'RELATIVE/FRIEND',
    'FRIEND': 'RELATIVE/FRIEND',
    'FRIENDS ': 'RELATIVE/FRIEND',
    'FREIND': 'RELATIVE/FRIEND',
    'FREINDS': 'RELATIVE/FRIEND',
    'FAMILY FRIEND': 'RELATIVE/FRIEND',
    'OTHER FRIEND': 'RELATIVE/FRIEND',
    'RELATIVES ': 'RELATIVE/FRIEND',
    'RELATIVES': 'RELATIVE/FRIEND',
    'COUSIN': 'RELATIVE/FRIEND',
    'ROOMMATES': 'RELATIVE/FRIEND',
    'ROOM MATE': 'RELATIVE/FRIEND',
    'ROOMMATE': 'RELATIVE/FRIEND',
    'ROOM-MATE': 'RELATIVE/FRIEND',
    'STUDENTS AT YABATECH': 'RELATIVE/FRIEND',
    'SCHOOL MATE': 'RELATIVE/FRIEND',
    'WITH A FRIEND': 'RELATIVE/FRIEND', # Generic, could be relative/friend but if not specified, unknown
    'BROTHER IN-LAW': 'RELATIVE/FRIEND',
    'FAMILY': 'RELATIVE/FRIEND',
    'HEAD OF HOUSEHOLD': 'RELATIVE/FRIEND',
    'MATERNAL COUSIN': 'RELATIVE/FRIEND',
    'SISTER AND  HER HUSBAND': 'RELATIVE/FRIEND',
    'FAMILY MEMBERS': 'RELATIVE/FRIEND',
    'IN-LAWS': 'RELATIVE/FRIEND',
    'SIBLINGS': 'RELATIVE/FRIEND',
    'MOTHER INLAW': 'RELATIVE/FRIEND',
    'HUSBAND\'S FAMILY': 'RELATIVE/FRIEND',
    'SUSPECT\'S FAMILY ': 'RELATIVE/FRIEND',
    'DAUGHTER IN-LAW': 'RELATIVE/FRIEND', # More specific than unknown
    'INLAW': 'RELATIVE/FRIEND',
    'FRIENDS/FAMILY': 'RELATIVE/FRIEND',
    'BESTY': 'RELATIVE/FRIEND',
    'LATE HUSBAND\'S HOUSE': 'RELATIVE/FRIEND', # Status, not who they live with
    'SUSPECT\'S FAMILY': 'RELATIVE/FRIEND', # Assuming this refers to living with suspect's family


    'CAREGIVER': 'CAREGIVER/EMPLOYER',
    'BOSS': 'CAREGIVER/EMPLOYER',
    'EMPLOYER': 'CAREGIVER/EMPLOYER',
    'HER MADAM': 'CAREGIVER/EMPLOYER',
    'HER EMPLOYER': 'CAREGIVER/EMPLOYER',
    'BOSS ': 'CAREGIVER/EMPLOYER',
    'BOSS AND MADAM': 'CAREGIVER/EMPLOYER',
    'MADAM': 'CAREGIVER/EMPLOYER',
    'EMPLOYEE': 'CAREGIVER/EMPLOYER',
    'BOSS HOUSE': 'CAREGIVER/EMPLOYER',
    'A LADY TO WHOM SHE WAS HOUSE MAID': 'CAREGIVER/EMPLOYER',
    'LIVES WHERE SHE WORKS AS A MAID': 'CAREGIVER/EMPLOYER',
    'LIVES WITH HER EMPOLYER': 'CAREGIVER/EMPLOYER',
    'HOUSE GIRL': 'CAREGIVER/EMPLOYER',
    'WITH HIS BOSS': 'CAREGIVER/EMPLOYER',
    'BOSS/ GUARDIAN': 'CAREGIVER/EMPLOYER', # Overlap with guardian, but context suggests employer
    'GUARDIAN/BOSS': 'CAREGIVER/EMPLOYER',
    'THE FAMILY OF THE BOSS': 'CAREGIVER/EMPLOYER',
    'MADAM FAMILY': 'CAREGIVER/EMPLOYER',
    'CARE GIVER': 'CAREGIVER/EMPLOYER',
    'CARE  GIVER': 'CAREGIVER/EMPLOYER',
    'BOSS\'S HOUSE':'CAREGIVER/EMPLOYER',


    'ALONE': 'ALONE',
    'SELF': 'ALONE',
    'ALONE ': 'ALONE',
    'LIVE ALONE': 'ALONE',
    'STAY ALONE': 'ALONE',
    'LEAVE ALONE': 'ALONE',
    'ALONE WITH CHILD': 'ALONE',
    'LIVING IN A SEPARATE APARTMENT': 'ALONE',


    'PERPETRATOR': 'PERPETRATOR/TRAFFICKER',
    'THE PERPETRATOR': 'PERPETRATOR/TRAFFICKER',
    'PERPETRATOR (PARTERNAL UNCLE)': 'PERPETRATOR/TRAFFICKER',
    'TRAFFICKER': 'PERPETRATOR/TRAFFICKER',
    'TRAFFICKERS': 'PERPETRATOR/TRAFFICKER',
    'CHILD TRAFFICKER ': 'PERPETRATOR/TRAFFICKER',
    'TRAFFICKED': 'PERPETRATOR/TRAFFICKER',
    'ABDUCTORS ': 'PERPETRATOR/TRAFFICKER',
    'BABY FACTORY APARTMENT': 'PERPETRATOR/TRAFFICKER',
    'CHILD TRAFFICKER': 'PERPETRATOR/TRAFFICKER',
    'ABDUCTORS': 'PERPETRATOR/TRAFFICKER',


    'IDP CAMPS': 'IDP CAMP/SHELTER',
    'IDP CAMP': 'IDP CAMP/SHELTER',
    'IDPS': 'IDP CAMP/SHELTER',
    'CAMP WITH OTHER IDP\'S': 'IDP CAMP/SHELTER',
    'POLICE STATION': 'IDP CAMP/SHELTER',
    'POLICE SHELTER': 'IDP CAMP/SHELTER',
    'POLICE ': 'IDP CAMP/SHELTER',
    'NAPTIP SHELTER': 'IDP CAMP/SHELTER',
    'JUVENILE HOME': 'IDP CAMP/SHELTER',
    'REHABILITATION CENTER': 'IDP CAMP/SHELTER',
    'BOARDING HOUSE': 'IDP CAMP/SHELTER',
    'SCHOOL HOSTEL': 'IDP CAMP/SHELTER',
    'SCHOOL HOSTEL': 'IDP CAMP/SHELTER',
    'ISLAMIC SCHOOL': 'IDP CAMP/SHELTER',
    'SHELTER': 'IDP CAMP/SHELTER',
    'EMERGENCY SHELTER': 'IDP CAMP/SHELTER',
    'EMERGENCY ACCOMMODATION ': 'IDP CAMP/SHELTER',
    'EMERGENCY ACCOMMODATION.': 'IDP CAMP/SHELTER',
    'HOSPITAL WARD': 'IDP CAMP/SHELTER',
    'CBO [NACO]': 'IDP CAMP/SHELTER',
    'ORPHANAGE HOME': 'IDP CAMP/SHELTER',
    'ORPHANAGE': 'IDP CAMP/SHELTER',
    'ORPHANAGE ': 'IDP CAMP/SHELTER',
    'ORPHANEGE': 'IDP CAMP/SHELTER',
    'SOUGHT AFTER ORPHANAGE ': 'IDP CAMP/SHELTER',
    'BRCI': 'IDP CAMP/SHELTER',
    'COMMUNITY CASE WORKER': 'IDP CAMP/SHELTER',
    'EDO STATE MINISTRY OF EDUCATION': 'IDP CAMP/SHELTER',
    'CASEWORKER': 'IDP CAMP/SHELTER',
    'VOLUNTEER ': 'IDP CAMP/SHELTER',
    'CHURCH': 'IDP CAMP/SHELTER', # Could be a shelter
    'POLICE SHELTER': 'IDP CAMP/SHELTER',
    'POLICE STATION': 'IDP CAMP/SHELTER',
    'LIVE IN A CHURCH': 'IDP CAMP/SHELTER',
    'BABY FACTORY APARTMENT ': 'IDP CAMP/SHELTER',
    'BABY FACTORY HOME': 'IDP CAMP/SHELTER',
    'BROTHEL': 'IDP CAMP/SHELTER', # Could be a place of forced living
    'HUSBAND HOUSE': 'IDP CAMP/SHELTER', # If it's a forced situation
    'UNCOMPLETED BUILDING ': 'IDP CAMP/SHELTER',
    'HOSTEL': 'IDP CAMP/SHELTER',
    'HOME': 'IDP CAMP/SHELTER',
    'INMATE': 'IDP CAMP/SHELTER',
    'FELLOW SEX WORKERS': 'IDP CAMP/SHELTER',
    'EMERGENCY ACCOMMODATION': 'IDP CAMP/SHELTER',
    'ALARAMMA AT TSANGAYA SCHOOL': 'IDP CAMP/SHELTER',
    'LEAVE IN THE MARKET': 'IDP CAMP/SHELTER',
    'FOSTER HOME': 'IDP CAMP/SHELTER',
    'CECE-YARA HOME':  'IDP CAMP/SHELTER',
    'CECE-YARA': 'IDP CAMP/SHELTER',
    'AT THE BAR WHERE SHE WORKS': 'IDP CAMP/SHELTER',
    'SOUGHT AFTER ORPHANAG': 'IDP CAMP/SHELTER',
    'WITH OTHER SALESGIRLS FOR AUNT': 'IDP CAMP/SHELTER',
    'UNCOMPLETED BUILDING': 'IDP CAMP/SHELTER',
    'OTHER SEX WORKERS': 'IDP CAMP/SHELTER',
    'POLICE': 'IDP CAMP/SHELTER',
    'SCHOOL': 'IDP CAMP/SHELTER',
    'VOLUNTEER': 'IDP CAMP/SHELTER',
    'WAS LIVING IN A BROTEL': 'IDP CAMP/SHELTER',
    'LIVING WITH SURVIVORS': 'IDP CAMP/SHELTER',
    'STUDENT LIVING IN THE HOSTEL': 'IDP CAMP/SHELTER',
    'STUDENT LIVING IN HOSTEL': 'IDP CAMP/SHELTER',
    'HOSTEL': 'IDP CAMP/SHELTER',
    'VOLUNTEER ': 'IDP CAMP/SHELTER',
    'OTHER SEX WORKERS ': 'IDP CAMP/SHELTER',
    'FELLOW SEX WORKERS ': 'IDP CAMP/SHELTER',
    'WORK PLACE': 'IDP CAMP/SHELTER', # Too generic, could be anywhere
    'LEAVE IN THE MARKET ': 'IDP CAMP/SHELTER',
    'SHOP': 'IDP CAMP/SHELTER',
    'BAR': 'IDP CAMP/SHELTER',
    'BANK': 'IDP CAMP/SHELTER',
    'ROAD': 'IDP CAMP/SHELTER',
    'HOUSE': 'IDP CAMP/SHELTER',
    'SOUGHT AFTER ORPHANAGE': 'IDP CAMP/SHELTER',


    'COMMUNITY MEMBER': 'COMMUNITY/NEIGHBOR',
    'COMMNUITY MEMBER': 'COMMUNITY/NEIGHBOR',
    'NEIGHBOR': 'COMMUNITY/NEIGHBOR',
    'NEIGHBOUR': 'COMMUNITY/NEIGHBOR',
    'NEIGHBOR ': 'COMMUNITY/NEIGHBOR',
    'COMMUNITY LEADER (HAKIMI)': 'COMMUNITY/NEIGHBOR',
    'CHURCH MEMBER': 'COMMUNITY/NEIGHBOR',
    'CHURCH MEMBER/GUARDIAN': 'COMMUNITY/NEIGHBOR',
    'RELIGIOUS LEADER': 'COMMUNITY/NEIGHBOR',
    'BISHOP ': 'COMMUNITY/NEIGHBOR',
    'PASTOR OF HER CHURCH ': 'COMMUNITY/NEIGHBOR',
    'PASTOR': 'COMMUNITY/NEIGHBOR',
    'A GOOD SAMARITAN': 'COMMUNITY/NEIGHBOR',
    'SOMEONE FROM HER VILLAGE': 'COMMUNITY/NEIGHBOR',
    'OTHER RETURNEES FROM CAPTIVITY': 'COMMUNITY/NEIGHBOR',
    'FATHER\'S BOSS': 'COMMUNITY/NEIGHBOR',
    'BISHOP': 'COMMUNITY/NEIGHBOR',
    'CHURCH M,EMBERS': 'COMMUNITY/NEIGHBOR',
    'COLLEAGUE\'S FAMILY': 'COMMUNITY/NEIGHBOR',
    'HER FRIEND\'S KINSMAN': 'COMMUNITY/NEIGHBOR',
    'PASTOR OF HER CHURCH': 'COMMUNITY/NEIGHBOR',
    'STREET BROTHERS': 'COMMUNITY/NEIGHBOR',
    'NEIGBOUR': 'COMMUNITY/NEIGHBOR',

    'UNKNOWN': 'UNKNOWN',
    'NOT REPORTED': 'UNKNOWN',
    'NOT SPECIFIED': 'UNKNOWN',
    'OTHER': 'UNKNOWN',
    'NONE': 'UNKNOWN',
    'DOES NOT HAVE A HOME': 'UNKNOWN',
    'FOUND ON THE ROAD': 'UNKNOWN',
    'WITNESS': 'UNKNOWN',
    'STRANGER': 'UNKNOWN',
    'STRENGER': 'UNKNOWN',
    'ABANDONED': 'UNKNOWN',
    'ABANDONED CHILD': 'UNKNOWN',
    'FORCED COHABITATION': 'UNKNOWN',
    'BOREHOLE': 'UNKNOWN',
    'MOTHER AND MOTHER\'S BOYFRIEND': 'UNKNOWN', # Complex, might be best as unknown
    'FATHER GIRLFRIEND': 'UNKNOWN', # Complex, might be best as unknown
    'CHASED OUT BY HER SPOUSE AND FAMILY': 'UNKNOWN', # Outcome, not who they live with
    'HELPER': 'UNKNOWN', # Too vague
    'NOT REPORTED': 'UNKNOWN',
    'SEPARATED': 'UNKNOWN', # Status, not who they live with
    'ON THE STREET': 'UNKNOWN',
    'GOT MARRIED  TO ANOTHER HUSBAND': 'UNKNOWN'
}



df['WHO SURVIVOR/VICTIM LIVE WITH'] = df['WHO SURVIVOR/VICTIM LIVE WITH'].replace(mapping)

# d. Regroup those that can be merged together for proper handling (already done in mapping)

# Drop the redundant column
df = df.drop(columns=['DOES THE SURVIVOR/VICTIM LIVE ALONE'])

print("\nValue counts for 'WHO SURVIVOR/VICTIM LIVE WITH' after cleaning:")
print(df['WHO SURVIVOR/VICTIM LIVE WITH'].value_counts(dropna=False))


Value counts for 'WHO SURVIVOR/VICTIM LIVE WITH' after cleaning:
WHO SURVIVOR/VICTIM LIVE WITH
PARENT/GUARDIAN           14084
SPOUSE/PARTNER            11052
UNKNOWN                    7727
ALONE                      7014
RELATIVE/FRIEND            4089
CHILDREN                    169
IDP CAMP/SHELTER            111
CAREGIVER/EMPLOYER           67
COMMUNITY/NEIGHBOR           34
PERPETRATOR/TRAFFICKER       13
Name: count, dtype: int64


#### Convert all the dates to datetime
* Define a placeholder date for missing values (e.g., January 1, 1900)
* Convert the columns to datetime objects, coercing invalid formats to NaT
* Replace the NaT values with the placeholder date

In [16]:
# A list of all the date columns you want to clean
date_columns = [
    'DATE OF INCIDENT',
    'DATE REPORTED',
    'DATE JUSTICE WAS RECEIVED',
    'DATE CASE WAS CLOSED',
    'APPROVED BY ORG. SUPERVISOR_DATE',
    'APPROVED BY LGA SUPERVISOR_DATE',
    'APPROVED BY STATE SUPERVISOR_DATE'
]

# Define a placeholder date for missing values (e.g., January 1, 1900)
placeholder_date = pd.Timestamp('1900-01-01')

# Loop through each date column, convert it, and fill NaT values
for col in date_columns:
    # Convert the column to datetime objects, coercing invalid formats to NaT
    df[col] = pd.to_datetime(df[col], errors='coerce')
    
    # Fill the NaT values with the placeholder date
    df[col] = df[col].fillna(placeholder_date)

  df[col] = pd.to_datetime(df[col], errors='coerce')


#### 'TIME OF THE DAY THAT INCIDENT TOOK PLACE'
* Merge 'CannotRemember' into 'Unknown'

In [17]:


# Merge 'CannotRemember' into 'Unknown'
df['TIME OF THE DAY THAT INCIDENT TOOK PLACE'] = df['TIME OF THE DAY THAT INCIDENT TOOK PLACE'].replace(
    'CannotRemember', 'Unknown'
)

# # Fill any NaN values with 'Unknown' as well
# df['TIME OF THE DAY THAT INCIDENT TOOK PLACE'] = df['TIME OF THE DAY THAT INCIDENT TOOK PLACE'].fillna('Unknown')

# Print the new value counts to verify the changes
print(df['TIME OF THE DAY THAT INCIDENT TOOK PLACE'].value_counts(dropna=False))

TIME OF THE DAY THAT INCIDENT TOOK PLACE
Evening      16366
Afternoon    11457
Morning       9308
Unknown       7219
NaN             10
Name: count, dtype: int64


#### 'DOES THE SURVIVOR WANT ACCESS TO JUSTICE'
1. Change "Yes" and "No" to more descriptive terms can make the data easier to understand at a glance.
2. Replace 'NotApplicable' to 'Unknown'

In [18]:
# Map the values to new, more descriptive categories
mapping = {
    'Yes': 'Wants Access to Justice',
    'No': 'Does Not Want Access',
    'NotApplicable': 'Unknown'
}

# Apply the mapping
df['DOES THE SURVIVOR WANT ACCESS TO JUSTICE'] = df['DOES THE SURVIVOR WANT ACCESS TO JUSTICE'].replace(mapping)

# # Fill any remaining NaN values with 'Unknown'
# df['DOES THE SURVIVOR WANT ACCESS TO JUSTICE'] = df['DOES THE SURVIVOR WANT ACCESS TO JUSTICE'].fillna('Unknown')

print("\nNew value counts:")
print(df['DOES THE SURVIVOR WANT ACCESS TO JUSTICE'].value_counts(dropna=False))


New value counts:
DOES THE SURVIVOR WANT ACCESS TO JUSTICE
Does Not Want Access       26852
Wants Access to Justice    17466
Unknown                       32
NaN                           10
Name: count, dtype: int64


#### 'OUTCOME OF PROSECUTION'
* Replace all NaN values with 'Unknown'

In [19]:
# Replace all NaN values with 'Unknown'
df['OUTCOME OF PROSECUTION'] = df['OUTCOME OF PROSECUTION'].fillna('Unknown')

print("\nNew value counts:")
print(df['OUTCOME OF PROSECUTION'].value_counts(dropna=False))


New value counts:
OUTCOME OF PROSECUTION
Unknown                        40408
Dropped at Investigation        2243
Conviction                       642
Recantation                      511
Discontinued by Prosecution      307
Acquittal                        136
False                             93
item                              20
Name: count, dtype: int64


#### 'HAS THE CASE BEEN CLOSED'
1. Change "Yes" and "No" to more descriptive words to improves the readability and understanding of data
 * 'Yes': 'Case Closed'
 * 'No': 'Case Open'
2. Replace 'NotApplicable' with 'Unknown'
3. Create and apply the mapping to replace 'Yes' to 'Case Closed', 'No' to 'Case Open', and 'NotApplicable' to 'Unknown'

In [20]:
# Create a mapping dictionary for the changes
mapping = {
    'Yes': 'Case Closed',
    'No': 'Case Open',
    'NotApplicable': 'Unknown'
}

# Apply the mapping
df['HAS THE CASE BEEN CLOSED'] = df['HAS THE CASE BEEN CLOSED'].replace(mapping)

# # Fill any NaN values with 'Unknown'
# df['HAS THE CASE BEEN CLOSED'] = df['HAS THE CASE BEEN CLOSED'].fillna('Unknown')

print("\nNew value counts:")
print(df['HAS THE CASE BEEN CLOSED'].value_counts(dropna=False))


New value counts:
HAS THE CASE BEEN CLOSED
Unknown        29336
Case Open      12540
Case Closed     2474
NaN               10
Name: count, dtype: int64


#### 'WHO CLOSED THE CASE?'
* Replace any NaN values with 'Unknown'

In [21]:
# Fill any NaN values with 'Unknown'
df['WHO CLOSED THE CASE?'] = df['WHO CLOSED THE CASE?'].fillna('Unknown')

print("\nNew value counts:")
print(df['WHO CLOSED THE CASE?'].value_counts(dropna=False))


New value counts:
WHO CLOSED THE CASE?
Unknown                            41892
Case closed by survivor/victim       986
Case closed by family/community      919
Case closed by police                322
Case closed by court                 241
Name: count, dtype: int64


#### 'WHO REPORTED THE INCIDENT', 'PARENT/GUARDIAN EMPLOYMENT STATUS' and 'SURVIVOR/VICTIM EMPLOYMENT STATUS'
1. Define Mappings for 'WHO REPORTED THE INCIDENT'
2. Define Mappings for Employment Status for both Parent/Guardian and Survivor/Victim columns.
3. Create a Helper Function to Apply Mappings 
4. Apply to 'PARENT/GUARDIAN EMPLOYMENT STATUS' and 'SURVIVOR/VICTIM EMPLOYMENT STATUS' respectively

In [None]:
# It's good practice to clean up column names by stripping any leading/trailing whitespace.
df.columns = df.columns.str.strip()

# --- 1. Define Mappings for 'WHO REPORTED THE INCIDENT' ---
# Here, we group the many unique raw values into our standardized categories.
who_reported_mapping = {
    'Self': ['Self', 'Survivor', 'The victim', 'self reported', 'walk in', 'walk-in', 'Client', 'victim', 'Self Referral', 'I FIGURE IT OUT', 'Self (Survivor)','self after full recovery from the hospital', 'self after recovery','owner of the case', 'Self Identified', 'THE VICTIM', 'SURVIVOR', 'Self ', 'SELF REFERRAL', 'SELF REFERRED'],
    'Family': ['Mother', 'Father', 'Parent', 'Spouse', 'Brother', 'Sister', 'Relative', 'In-law', 'Guardian', 'Family member', 'Husband', 'Wife', 'son', 'daughter', 'uncle', 'aunt', 'cousin', 'grandmother', 'grandfather', 'sibling', 'inlaw', 'children', "Survivor's brother ", 'Parents and police', 'Parents and police ', 'uncle', 'Police and parents ', "Survivor's son", 'Brother', 'Elder brother ', 'Police and parent', "victim's sister", 'Wife', 'Daughters', 'She and her mother', 'brother', "Survivor's brother", 'Neighbor', 'Aunt', 'relative ', 'FRIEND ', 'Relative ', 'Sister ', 'daughter ', 'husband', 'sibling', "Survivor's daughter", 'Husband', "Victim's cousin", "Mother's friend", 'Sister', 'sister', 'The daughter', 'Daughter', 'Grandmother', 'Parent and police', 'grandmother', 'Paternal Grandfather', 'SON', 'MOTHER', 'SISTER', 'Family member ', "Child's mother", 'children mother', "children's mother", "CHILD'S MOTHER", 'FATHER TO CHILD', 'Sibling\'s Friend', 'CareGiver', 'CHILDREN MOTHER', 'CHILD MOTHER', 'Children Mother', 'Child mother', 'survivors mother', 'Survivors mother', 'mother of children', 'Spouse ', 'BROTHER', 'Cousin of spouse/survivor', 'Survivor ', 'Maternal Uncle', 'Paternal Uncle', 'Self and parents', 'AUNTY', 'BROTHER INLAW', 'HUSBAND', 'COUSIN', 'mother', 'SURVIVORS SISTER', 'sister of the survivor', 'Older Sister', 'CHILDREN', 'spouse (Mother of Children)', 'Both husband and wife ', 'Mother of children', 'RELATIVE', 'RELATIVE(aunt)', 'GRAND FATHER TO SURVIVOR', "SURVIVOR'S FRIEND ", 'GRAND MOTHER', 'GRANDPARENT ', 'UNCLE TO SURVIVOR', 'BROTHER TO SURVIVOR', 'FOSTER MOTHER TO SURVIVOR', "CHILDREN'S MOTHER", "CHILD'S MOTHER ", "CHILDREN'S  MOTHER", 'WIFE', 'mother ', 'MOTHER OF CHILDREN', 'Parents ', 'Family Member', "Child's father", 'Step father ', 'sp0use', 'Sibling of survivor', 'house wife', 'Elder brother', 'spouse  ', 'Survivors relative', 'Girlfriend', "Child's older sibling", "Children's older sibling", 'SURVIVORS DAUGHTER', 'victims child', 'child', 'Child Family', 'Brother inlaw', 'Child', 'partner', 'Intimate sexual partner', 'Foster mother', 'Grandparent', 'her  sister', "CLIENT'S SCHOOL COUNSELLING", 'Step sister ', 'Spouse and sister to the wife', "Victim's Brother", 'Stepmother', "Survivor's sister", 'Son', 'OtherSon', 'child', 'Child family'],
    'Community Member': ['Neighbour', 'Community Leader', 'Witness', 'Friend', 'Community member', 'community case worker', 'Community volunteer', 'Village head', 'Concerned person', 'Anonymous', 'Bystander', 'Whistle blower', 'Good Samaritan', 'community case manager', 'Women Leader', 'Neighbour ', 'Hisbah Commandant', 'Hisban commondan', 'mandate reporter', 'Mandate reporter', 'survival', 'CP COMMUNITY WORKER', '10Relative', 'Other(Community Member)', 'friend', 'Community gatekeeper', 'Community Worker', 'volunteer', 'neighbour', 'children', 'Social worker', 'Otherself', 'WARIF volunteer', 'Community worker', 'Community Leader', 'Community learder', 'Neighbour', 'CSOs', 'The doctor treating her', 'Springs  of life organisation for Women and girl child development initiative', 'Social welfare', 'Neighbor', 'Referred from Today for Tomorrow Foundation', 'Village head', 'concerned person. ', 'Concerned person', 'A friend ', 'Concerned neighbor  ', 'Community leader', 'An individual', 'Family friend', 'Community Member', 'National Grass root Association of Nigeria  ', 'National Grass root association of Nigeria ', 'Other - Friend', 'community member', 'concerned person. ', 'Concerned neighbor ', 'Church member(Watchman)', 'Church member', 'gender unit', 'Other a volunteer of the foundation', 'Traditional Ruler', 'Women leader', 'neighbor ', 'Community ', 'Community member', 'Human rights liberty access and peace defender foundation', 'Tangaza surveillance team', 'eye witness ', 'Concerned Adult', 'Hon Commissioner', 'Community Case worker', 'A friend to the survivor', 'the wife', 'Community Villiglantee Group', 'Community gate keepers', 'Community social welfare officer', 'Passer-by', 'Classmate', 'Neigbour', 'Not Stated', 'MINISTRY OF WOMEN  AFFAIRS AND SOCIAL DEVELOPMENT EBONY STATE', 'MIN. OF JUSTICE', 'Boos in the office', 'Civilian JTF', 'mai anguwa', 'mai auguwa cigari', 'Friend of late mum', 'Whistle blower ', 'A government Staff', 'field', 'Aunty', 'ADHOC STAFF', 'M-CCAD', 'VIRGO HOSPITAL', 'Counsel to the complainant', 'Help line ', 'Anonymous ', 'neigbour', 'Class mate', 'Whisttle Blower ', 'Whisttle Blower', 'Orphanage home', 'HHGSF', 'DSVA', 'Domestic and Sexual Violence Agency (D.S.V.A)', 'neigbours', 'Religious leader', 'Hisbah', 'Hisbah ', 'concerned person[ woman]', 'Abuja Municipal Area Council Coordinator', 'Good Samaritan', 'social medial ', 'Community clan head', 'IN-LAW TO THE PERPETRATOR', 'multi-sectoral Imam', 'Freedom for humanity ', 'Passer by', 'passer -by', 'IFIOK EKANEM', 'A member of the community', 'home', '𝐶𝑎𝑠𝑒 𝑤𝑜𝑟𝑘𝑒𝑟 ', '𝐶𝑎𝑠𝑒𝑤𝑜�𝑘𝑒𝑟', 'Neigbours', 'Intimate partner', 'concern  neigbour', 'Family friends ', 'Individual ', "Report was made by our Girls' Advocate", 'The victim ', 'community', 'Passer-by', 'neighbour (anonymous)', ' neighbour', 'Retired Principal', ' data clark ', 'Survivor Friend', 'vigilante', 'headmistress ', 'neigbours', 'Data Clark', 'cbo', 'Vigilante ', 'security', 'Surveillance Team Sokoto South LGA ', 'Surveillance Team coordinator Sokoto South LGA ', 'Boss', 'Data Clark ', 'Community Case worker ', 'Community Case worker', 'Community worker', 'community worker', 'COMMUNITY CASE WORKER', 'COMMUNITY CASE MANAGER', 'Community volunteer', 'COMMUNITY VOLUNTEER', 'Community Stakeholder', 'Community Learder', 'Community learder', 'Community gatekeeper', 'Community clan head', 'Community Villiglantee Group', 'Community gate keepers', 'COMMUNITY MEMBER', 'Community Member', 'community member', 'community', 'Community ', 'ommunity volunteer', 'village volunteer', 'IDP Camp Chairman', 'Aunt to the victim or survivor', 'Acquaintant', 'Whistle blower', 'Whistle blower ', 'Whisttle Blower', 'Anonymous', 'anonymous', 'Friend', 'A friend ', 'A FREIND', 'FRIEND', "SURVIVOR'S FRIEND ", 'Friend of late mum', 'Family friend', 'friends', 'Friend ', 'Sibling\'s Friend', "Survivor's friend", 'friend', 'FRIEND '],
    'Health Worker': ['Health provider', 'Health worker', 'Nurse', 'Doctor', 'Medical doctor', 'PHC', 'Hospital', 'Health care worker', 'Health Worker', 'Health worker ', 'Health Worker ', 'Health Worker1', 'Nurse who treated the child', 'The doctor treating her', 'Primary Health Care Worker', 'nurse', 'Nurse', "Pastor's wife", 'PHC Kofar Kade ', 'Health Providers ', 'Gidan Hamma PHC ', 'Specialist hospital ', 'Medical doctor', 'In-Charge of Seven Days Adventist Primary Health Post', 'MENTHOR MOTHER', 'MENTOR MOTHER', 'Health providers', 'health provider', 'Nurse that treated  the case', 'PHC Director', 'Health care worker', 'Health care workers', 'A nurse where the child was receiving treatment', 'Facility Index Tester', 'Index Testing Provider', 'A health Worker', 'Health Care Provider (Araromi TEENSMATA Youth Hub)', 'SPIYMH HOSPITAL', 'MSF HEALTH CARE FACILITY', 'MSF health care Facility', 'counsellor tester', 'COUNSELLOR TESTER', 'CT', 'Councilor tester ', 'ct', 'C T'],
    'Law Enforcement/Justice System': ['Police', 'NSCDC', 'Hisbah', 'Vigilante', 'Lawyer', 'Court', 'Magistrate', 'Legal Council', 'Prosecutor', 'law-enforcement', 'Police and parents ', 'Hisbah Commandant', 'Hisban commondan', 'Police and parent', 'Police officer', 'Legal coucil from OPD', 'Legal Council form OPD', 'Legal Council from FIDA', 'police', 'Police and parents', 'Prosecutor', 'NSCDC PERSONNEL', 'Para military', 'para military', 'Police and Parent', 'Parents and NSCDC', 'gender unit', 'Lawyer', 'Gender Office Asokoro Police station  ', 'Court', 'Police officer and Parent', 'NSCDC', 'NPF', 'Nigeria immigration service', 'Nigirean imegeration service', 'Naptip survilans team', 'Asokoro Police Station', 'Police Officer', 'Human Right Activist', 'Vigilante', 'Child protection service', 'Juvenile welfare police officer', 'Chief magistrate', 'Officers of Vigilante', 'OKENE COMMAND VIGILANTE', 'VIGILANTE', 'NIgerian Immigration Service (NIS)', 'POLICE OFFICER', 'POLICE', 'PARA MILITIRY', 'Police inspector', 'FIDA Ebonyi State', 'Law firm', 'Nscdc staf', 'NSCDC STAFF', 'NSCDC Staff', 'NSCDC staff', 'reported by FIDA staff', 'NSCDC STAFF ', 'Security Agent ', 'NAPTIP', 'Hisbah commission ', 'Legal counsel', 'MIN. OF JUSTICE', 'Civilian JTF', 'NAPTIP', 'Oyo State Command', 'LGEA COUNCILLOR ETI-OSA', 'LGEA COUNSELOR', 'LGEA Councilor', 'L.G.A Counsellor', 'L.G.A Counsellor ', 'L.G.A. Counsellor', 'LGA Counsellor', 'LGEA COUNSELLOR', 'LGA Counseller', 'LGA Counsellor Ajeromi', 'Magistrate', 'Police officer and Parent ', 'SECURITY', 'Police officer and parent', 'Civil  Defence officer and parent', 'police officer and parent', "Reported by Survivor's father & NSCDC officer.", 'security', 'Federal Ministry of Justice', 'Legal- Ministry of Justice', 'Legal- ministry of justice', 'Legal ministry of justice', 'legal -ministry of justice', 'legal- ministry of justice', 'Legal - Ministry of justice', 'Legal -ministry of justice', 'legal(ministry of justice)', 'legal (ministry of justice)', 'GBV FOCAL PERSON', 'Nscdc', 'Civil Defence ', 'Civil Defence officer ', 'FRSC', 'NAPTIP Surveillance Team', 'NAPTIP SURVELLIANCE TEAM', 'NAPTIP Survelliance Team', 'Rescue team', 'Naptip Surviellance team', 'Naptip Survielliance Taem', 'NAPTIP Team', 'Law Enforcement Officer', 'GENDER FOCAL PERSON (HELIN)'],
    'NGO/CSO': ['NGO', 'CSO', 'Case worker', 'Social worker', 'PLAN International', 'FIDA', 'WARIF volunteer', 'CPN', 'Human rights commission', 'community case worker', 'Referred by Adamawa concern citizen foundation', 'Parents and Adamawa Concern Citizens Forum (NGO)', 'NGO/CPN', 'Child Protection Network', 'Case manager', 'Social worker', 'Community volunteer', 'NGO', 'CSOs', 'Adamawa Concerned Citizen and parents', 'CCW (NGO)', 'UN Agency', 'gender unit', 'Social Worker', 'The barrister they referred case to her desk', 'Human rights liberty access and peace defender foundation', 'NGO (Caritas Nigeria)', 'SMWCA', 'Parents and Adamawa Concern Citizen (CBO)', 'INGO', 'Positive Care Development Foundation', 'Gender desk officer uwanse police', 'BRCI', "NGO's", 'GPI/BRCI', 'Parent and Caritas', 'Parents and Adamawa Concern Citizens', 'FIDA Ebonyi State', 'Care for Social Welfare International', "AUNTY LANDA'S FOUNDATION", 'Child Advocate', 'CSO/NGO ', 'Adamawa  Concern Citizen', 'Adamawa Concern Citizens Foundation', 'CPN REPRESENTATIVES', 'JEI STAFF', 'Adamawa Concern Citizens', 'Referral', 'Project Charilove', 'Project Charilove (hotel guardian)', 'CHILD RIGHT AGENCY', 'AHI Staff', 'M-CCAD', 'Human rights commission ', 'DSVA', 'Domestic and Sexual Violence Agency (D.S.V.A)', 'FIDA', 'KADVS ', 'NACO', 'NAC0', 'CBO', 'cbo', 'Wotclef Adamawa ', 'WRAPA Staff', 'A concerned Nigerian', 'CECDI', 'UNFPA', 'TCF staff', 'Case woker', 'case worker', 'Child protection Committee', 'WAPA', 'case workers', 'Case Worker', 'Case worker ', 'multi-sectoral Imam', 'BRCI Referral', 'program assistant', 'programmer assistant', 'REFRRAL', 'Refered', 'community child protection committee(CCPC)', 'Kwara State Sexual and Assault Referral Centre', 'MYSD Officer', 'MSWCD', 'Caseworker  ', 'SOCIAL WELFARE OFFICER', 'EMEM UDOFIA', 'HHGSF', 'Caseworker ', 'Caseworker', 'caseworker', 'SURVEILLANCE TEAM', 'Surveillance team', 'Surveillance Team ', 'NACO', 'NAC0', 'caseworker '],
    'School/Education': ['Teacher', 'School', 'School Principal', 'Headmistress', 'SUBEB', 'LGEA COUNCILLOR', 'School Counsellor', 'Teacher', 'School', 'Education providers', 'School Teacher', 'School proprietor ', 'SUBEB', 'SUBEB OFFICER, UNCLE', "CLIENT'S SCHOOL COUNSELLING", 'Edo State Universal basis Education Board (SUBEB)', 'Teacher ', 'Educational providers', 'SUBEB (State Universal Basic Education Board)', 'State universal basic education (SUBEB)', 'State Universal Basic Education Board', 'Tteacher', 'State Universal Basic Education Board ', 'School Principal & Councellor', "School's Counsellor", 'Oshodi Guidance Counsellor', 'School Consellor', 'School Counsellor', 'SCHOOL HEAD TEACHER', 'school authority', 'school teacher', 'class teacher', 'School principal', 'headmistress ', 'secondary school teacher', 'Headmistress ', 'Vice Principal '],
    'Government Agency': ['Government agency', 'SMWCA', 'WAPA', 'MOWCA', 'NAPTIP', 'Government social worker', 'Government agency', 'Government worker', 'government agency', 'Lagos state social worker', 'Lagos state social service', 'Government Agency ', 'MOWCA ', 'MINISTRY OF WOMEN  AFFAIRS AND SOCIAL DEVELOPMENT EBONY STATE', 'Women Affairs and Poverty Alleviation mihistry25', 'Ministry of Women Affair ', 'Office of the Special Adviser on Gender Matter to Her Excellency', 'Data Clark', 'Data clark ', 'Data clark', 'Wotclef Adamawa ', 'Federal Ministry of Justice'],
    'Other': ['Other', 'Any Other', 'Media', 'News Reporter', 'on_line', 'Whatapp', 'Not Stated', 'na', 'Unknown', 'Not reported', 'other', 'media', 'international Media Platform', 'Abandoned', 'TELEPHONE', 'partner', 'other'],
    'Unknown/Not Stated': ['Not Stated', 'Unknown', 'na', 'Not reported', 'not specified', 'Not Applicable']
}


# --- 2. Define Mappings for Employment Status ---
# This mapping can be reused for both Parent/Guardian and Survivor/Victim columns.
employment_mapping = {
    'Employed': ['Currently employed', 'Teacher', 'Nurse', 'Civil servant', 'media personal', 'Private Teacher', 'Clergy', 'Police officer', 'bussiness man', 'domestic staff', 'DOMESTIC STAFF', 'MECHANIC', 'CAPINTER', 'TAILOR', 'HEIR DRESSER', 'LEBOURER', 'FRUIT SALER', 'FOOD VENDOR', 'civil servant', 'tecaher', 'Clergy ', 'Walks in a Barracks', 'Business Women', 'fisher woman', 'Night worker', 'PETTY TRADER', 'FOOD VENDOR', 'CAPINTER', 'TAILOR', 'HEIR DRESSER', 'LEBOURER', 'FRUIT SALER', 'MECHANIC', 'Working ', 'Footballer', 'FARMER', 'domestic staff', 'Serving', 'Serving Corps member', 'corper', 'currently serving', 'media personal', 'Private clinic', 'Pastoring '],
    'Unemployed': ['Unemployed', 'Not employed', 'looking for job', 'No income', 'Not working', 'CURRENTLY UNEMPOLYED', 'NONE', 'Nil', 'Not working '],
    'Self-employed': ['Self employed', 'Self- employed', 'Business', 'Trader', 'Farming', 'farmer', 'trader', 'ENTREPRENEUR', 'Petty trader', 'fisher woman', 'Bike rider', 'BUSINESS', 'Petty Business', 'PETTY TRADER', 'Farming', 'farming', 'Business ', 'TRADER', 'bussiness man', 'Petty trade ', 'trading', 'Trading', 'Trading '],
    'Student': ['Student', 'student', 'STUDENT', 'Schooling', 'Pupil', 'PURPIL', 'Undergraduate', 'Serving Corps member', 'Corper', 'Apprentice', 'student trainee', 'Learning trade', 'PUPIL', 'Under age ', 'Studient', 'student ', 'pupil', 'Undergraduate ', 'Student trainee', 'apprentice', 'learning how to sew'],
    'Retired': ['Retired', 'pensioner', 'Retiree', 'RETIREE', 'Retired ', 'pensioner ', 'Retiree '],
    'Housewife': ['House wife', 'house wife', 'HOUSE WIFE', 'full house wife', 'Full Time Housewife', 'Housewife '],
    'Other': ['Other', 'other', 'Dependent', 'DEPENDENT', 'Child', 'Minor', 'Baby', 'Orphan', 'Not alive', 'Deceased', 'House help', 'sex worker', 'Orphan ', 'Minor ', 'child', 'MINOR', 'MONOR', 'CHILD LABOUR', 'not known', 'Caseworker', 'case of trafficking', 'House help', 'Sex worker ', 'Sex Worker', 'Sex Worker ', 'Sex '],
    'Unknown': ['Not Applicable', 'Not applicable', 'na', 'Not Stated', 'Not stated', 'not specified', 'not known', 'Unknown', 'Not reported', 'Dt know ', 'not applcable', 'not applicable']
}


# --- 3. Create a Helper Function to Apply Mappings ---
# This function inverts the dictionaries above so we can map each raw value to its new, clean category.
def get_category_mapper(mapping):
    """
    Inverts a mapping dictionary to create a new dictionary where each
    individual raw value maps to its standardized category.
    Args:
        mapping (dict): The original dictionary with categories as keys
                        and lists of raw values as values.
    Returns:
        dict: An inverted dictionary for use with pandas .map().
    """
    category_map = {}
    for category, values in mapping.items():
        for value in values:
            # Cleaning the value (lowercase, strip whitespace) makes the matching more robust.
            cleaned_value = str(value).strip().lower()
            category_map[cleaned_value] = category
    return category_map

# Create the specific mapping dictionaries for each column
who_reported_category_map = get_category_mapper(who_reported_mapping)
employment_category_map = get_category_mapper(employment_mapping)


# --- 4. Apply the Mappings to the DataFrame ---

# Apply to 'WHO REPORTED THE INCIDENT'
# We create a new column to keep the original data intact.
# .str.strip().str.lower() cleans the data before mapping.
# .fillna('Other') assigns 'Other' to any value that wasn't in our mapping.
df['WHO REPORTED THE INCIDENT'] = df['WHO REPORTED THE INCIDENT'].str.strip().str.lower().map(
    who_reported_category_map).fillna('Other')

# Apply to 'PARENT/GUARDIAN EMPLOYMENT STATUS'
df['PARENT/GUARDIAN EMPLOYMENT STATUS'] = df['PARENT/GUARDIAN EMPLOYMENT STATUS'].str.strip().str.lower().map(
    employment_category_map).fillna('Other')

# Apply to 'SURVIVOR/VICTIM EMPLOYMENT STATUS'
df['SURVIVOR/VICTIM EMPLOYMENT STATUS'] = df['SURVIVOR/VICTIM EMPLOYMENT STATUS'].str.strip().str.lower().map(
    employment_category_map).fillna('Other')


# Display the value counts for the new categorical columns to verify the results.
print("\n--- Value Counts for Standardized 'WHO REPORTED THE INCIDENT' ---")
print(df['WHO REPORTED THE INCIDENT'].value_counts())

print("\n--- Value Counts for Standardized 'PARENT/GUARDIAN EMPLOYMENT STATUS' ---") 
print(df['PARENT/GUARDIAN EMPLOYMENT STATUS'].value_counts())

print("\n--- Value Counts for Standardized 'SURVIVOR/VICTIM EMPLOYMENT STATUS' ---")
print(df['SURVIVOR/VICTIM EMPLOYMENT STATUS'].value_counts())


--- Value Counts for Standardized 'WHO REPORTED THE INCIDENT' ---
WHO REPORTED THE INCIDENT
Self                              29928
Family                             9580
Community Member                   3093
NGO/CSO                             808
Law Enforcement/Justice System      457
Other                               219
Health Worker                       142
School/Education                     80
Government Agency                    50
Unknown/Not Stated                    3
Name: count, dtype: int64

--- Value Counts for Standardized 'PARENT/GUARDIAN EMPLOYMENT STATUS' ---
PARENT/GUARDIAN EMPLOYMENT STATUS
Other            29347
Unknown           5244
Self-employed     5131
Unemployed        3227
Employed          1310
Student             85
Housewife           11
Retired              5
Name: count, dtype: int64

--- Value Counts for Standardized 'SURVIVOR/VICTIM EMPLOYMENT STATUS' ---
SURVIVOR/VICTIM EMPLOYMENT STATUS
Unemployed       12843
Self-employed    11857
Other  

#### 'SEX OF PERPETRATOR'
1. Standardized Values: All entries with only "Male" or "Female" (e.g., Male,Male,Male) were consolidated into a single "Male" or "Female" category.
2. Identified Mixed Genders: Any entry containing both "Male" and "Female" was grouped into a new category, "Male & Female." This includes descriptive phrases like Parents, husband and wife, and INLAW.
3. Handled Missing and Unspecified Data: All missing values (<NA>, NaN) and Other entries were consolidated into a single "Unknown" category.

In [23]:
# To handle all variations, we'll create a new column, convert to string, and make it uppercase and strip spaces for consistency.
df['SEX OF PERPETRATOR_cleaned'] = df['SEX OF PERPETRATOR'].astype(str).str.upper().str.strip()

# Define the replacement logic
def clean_sex_of_perpetrator(value):
    # Consolidate variations of 'Male' and 'Female'
    if value.replace(',', '').strip() == 'MALE':
        return 'Male'
    elif value.replace(',', '').strip() == 'FEMALE':
        return 'Female'
    
    # Check for 'Male' and 'Female' together
    if 'MALE' in value and 'FEMALE' in value:
        return 'Male & Female'
    
    # Map specific descriptive values
    mapping = {
        'PARENTS': 'Male & Female',
        'HUSBAND AND WIFE': 'Male & Female',
        'BOTH PARENTS FATHER-50YEARS. MOTHER-47YEARS': 'Male & Female',
        'MALE & FEMALE': 'Male & Female',
        'FAMILY/HUSBAND TO BE': 'Male & Female',
        'MALE AND FEMALE': 'Male & Female',
        'INLAW': 'Male & Female',
        'OTHER': 'Unknown',
        'NAN': 'Unknown',
        '<NA>': 'Unknown',
        'MALE,MALE': 'Male & Female',
        'MALE,MALE,MALE': 'Male & Female',
        'MALE,MALE,MALE,MALE': 'Male & Female',
        'MALE,MALE,MALE,MALE,MALE': 'Male & Female'
    }
    return mapping.get(value, value)

# Apply the cleaning function to the column
df['SEX OF PERPETRATOR_cleaned'] = df['SEX OF PERPETRATOR_cleaned'].apply(clean_sex_of_perpetrator)

# Replace the original column with the cleaned one
df['SEX OF PERPETRATOR'] = df['SEX OF PERPETRATOR_cleaned']

# Drop the temporary cleaning column
df.drop('SEX OF PERPETRATOR_cleaned', axis=1, inplace=True)

# Print the new value counts to verify the changes
print("Value counts after cleaning:")
print(df['SEX OF PERPETRATOR'].value_counts(dropna=False))

Value counts after cleaning:
SEX OF PERPETRATOR
Male             34701
Female            4607
Unknown           3062
Male & Female     1990
Name: count, dtype: int64


In [24]:
# --- 2. Define a Function to Extract the Correct Age ---
def get_correct_age(age_entry):
    """
    Processes a single entry from the 'AGE OF PERPETRATOR' column based on the revised logic.
    - If the first value is non-zero, it's used.
    - If the first value is zero, the highest value in that entry is used.
    - Filters out unrealistic ages (e.g., > 120).
    """
    if not isinstance(age_entry, str):
        return np.nan
    try:
        ages = [float(age) for age in age_entry.split(',')]
        if not ages:
            return np.nan
        first_age = ages[0]
        if 0 < first_age <= 120:
            return first_age
        elif first_age == 0:
            max_age = max(ages)
            return max_age if 0 < max_age <= 120 else np.nan
        else:
            return np.nan
    except (ValueError, TypeError):
        return np.nan

# --- 3. Apply the Age Function and Create a Processed Age Column ---
df['AGE OF PERPETRATOR'] = df['AGE OF PERPETRATOR'].apply(get_correct_age)

# --- 4. Calculate the Mean Age for Imputation ---
valid_ages = df['AGE OF PERPETRATOR'].dropna()
mean_age = valid_ages.mean()
print(f"\nCalculated mean age for imputation (from valid, realistic values): {mean_age:.2f}")

# --- 5. Impute Missing and Zero Age Values ---
df['AGE OF PERPETRATOR'] = df['AGE OF PERPETRATOR'].fillna(mean_age).astype(int)


# --- 6. Define Mappings for 'RELATIONSHIP WITH PERPETRATOR' ---
relationship_mapping = {
    'Spouse/Partner': ['Spouse', 'Husband', 'Wife', 'Partner', 'spouse', 'Ex-partner', 'Boyfriend', 'Girlfriend', 'Intimate Partner', 'Ex-Boyfriend', 'Ex-Husband', 'Spouse/cohabiting', 'SPOUSE', 'HUSBAND', 'WIFE', 'Ex husband', 'ex-husband', 'ex-boyfriend', 'ex-wife', 'Concubine', 'Co-habiting partner', 'partner', 'sp[ouse', 'spouse  ', 'sp0use'],
    'Family Member': ['Father', 'Mother', 'Son', 'Daughter', 'Brother', 'Sister', 'Parent', 'Sibling', 'Family member', 'family member', 'Step-father', 'Step-mother', 'Step-son', 'Step-daughter', 'Step-brother', 'Step-sister', 'FATHER', 'MOTHER', 'SON', 'DAUGHTER', 'BROTHER', 'SISTER', 'Family Member', 'Father inlaw', 'Mother inlaw', 'Brother inlaw', 'Sister inlaw', 'In-law', 'in-law', 'child', 'children'],
    'Extended Family': ['Uncle', 'Aunt', 'Cousin', 'Grandfather', 'Grandmother', 'Nephew', 'Niece', 'Relative', 'relative', 'UNCLE', 'AUNT', 'COUSIN', 'GRANDFATHER', 'GRANDMOTHER', 'Guardian', 'Foster father', 'Foster mother', 'Family relative'],
    'Acquaintance': ['Friend', 'Neighbor', 'Neighbour', 'Acquaintance', 'Co-worker', 'Colleague', 'Classmate', 'FRIEND', 'NEIGHBOUR', 'neighbor', 'Friend of the family', 'friends'],
    'Authority Figure': ['Teacher', 'Employer', 'Boss', 'Landlord', 'Religious leader', 'Clergy', 'Police', 'Security personnel', 'Doctor', 'Health worker', 'Coach', 'TEACHER', 'BOSS', 'Police Officer', 'Security guard', 'security guard'],
    'Stranger': ['Stranger', 'Unknown to survivor', 'STRANGER'],
    'Other': ['Other', 'others', 'OTHER'],
    'Unknown/Not Stated': ['Unknown', 'Not Stated', 'Not reported', 'Not Applicable', 'unknown', 'NOT APPLICABLE']
}

# --- 7. Define Mappings for 'TYPE OF VIOLENCE' ---
violence_mapping = {
    'Sexual Violence': [
        'SEXUAL ASSAULT', 'RAPE', 'SEXUAL ASSAULT, RAPE', 'SEXUAL ASSAULT, PHYSICAL ASSAULT',
        'SEXUAL ASSAULT, PHYSICAL ASSAULT, PSYCHOLOGICAL/EMOTIONAL ABUSE',
        'SEXUAL ASSAULT, PSYCHOLOGICAL/EMOTIONAL ABUSE',
        'SEXUAL ASSAULT, RAPE, PSYCHOLOGICAL/EMOTIONAL ABUSE',
        'SEXUAL ASSAULT, PHYSICAL ASSAULT, RAPE',
        'SEXUAL ASSAULT, PHYSICAL ASSAULT, RAPE, PSYCHOLOGICAL/EMOTIONAL ABUSE',
        'SEXUAL ASSAULT, DEFILEMENT', 'SEXUAL ASSAULT, RAPE, DEFILEMENT',
        'RAPE, DEFILEMENT'
    ],
    'Physical Violence': [
        'PHYSICAL ASSAULT', 'PHYSICAL ASSAULT, CHILD ABUSE AND NEGLECT',
        'PHYSICAL ASSAULT, PSYCHOLOGICAL/EMOTIONAL ABUSE, CHILD ABUSE AND NEGLECT',
        'PHYSICAL ASSAULT, DEFILEMENT', 'PHYSICAL ASSAULT, FINANCIAL/ECONOMIC',
        'PHYSICAL ASSAULT, FINANCIAL/ECONOMIC, DENIAL OF RESOURCES, PSYCHOLOGICAL/EMOTIONAL ABUSE'
    ],
    'Emotional/Psychological Abuse': [
        'PSYCHOLOGICAL/EMOTIONAL ABUSE', 'DENIAL OF RESOURCES', 'DENIAL OF RESOURCES, CHILD ABUSE AND NEGLECT',
        'DENIAL OF RESOURCES, PSYCHOLOGICAL/EMOTIONAL ABUSE, CHILD ABUSE AND NEGLECT',
        'FINANCIAL/ECONOMIC, DENIAL OF RESOURCES, PSYCHOLOGICAL/EMOTIONAL ABUSE',
        'PHYSICAL ASSAULT, DENIAL OF RESOURCES, PSYCHOLOGICAL/EMOTIONAL ABUSE, VIOLATION OF PROPERTY & INHERITANCE RIGHTS'
    ],
    'Other Forms of Violence': [
        'FORCED MARRIAGE', 'VIOLATION OF PROPERTY & INHERITANCE RIGHTS', 'FINANCIAL/ECONOMIC', 'CHILD ABUSE AND NEGLECT'
    ]
}


# --- 8. Helper Function to Create Mapper ---
def get_category_mapper(mapping):
    category_map = {}
    for category, values in mapping.items():
        for value in values:
            # Clean the value for robust matching
            cleaned_value = str(value).strip().lower().replace(" ", "")
            category_map[cleaned_value] = category
    return category_map

# Create mappers for relationship and violence
relationship_category_map = get_category_mapper(relationship_mapping)
violence_category_map = get_category_mapper(violence_mapping)

# --- 9. Apply Mappings ---
# Apply Relationship Mapping
df['RELATIONSHIP WITH PERPETRATOR'] = df['RELATIONSHIP WITH PERPETRATOR'].str.strip().str.lower(
    ).map(relationship_category_map).fillna('Other')

# Apply Violence Type Mapping
# We need a consistent cleaning approach for the violence column as well
df['TYPE OF VIOLENCE'] = df['TYPE OF VIOLENCE'].str.strip().str.lower().str.replace(" ", "").map(
    violence_category_map).fillna('Other Forms of Violence')


# --- 10. Categorize Vulnerable Population ---
# Create a function to categorize the vulnerable population based on keywords
def categorize_vulnerability(value):
    # Handle missing values first
    if pd.isna(value) or value == '<NA>':
        return 'Unknown'
    
    # Standardize the string for easier searching
    value = str(value).upper()
    
    # Handle 'NOT APPLICABLE' and its combinations
    if 'NOT APPLICABLE' in value:
        if value.strip() == 'NOT APPLICABLE':
            return 'No Vulnerability'
        # If 'NOT APPLICABLE' is combined with another vulnerability, prioritize the vulnerability
        else:
            value = value.replace('NOT APPLICABLE', '').strip()
    
    # Child/Youth Vulnerability
    if any(keyword in value for keyword in ['MINOR', 'CHILD', 'YOUTH', 'ORPHANS', 'CHILD APPRENTICE']):
        return 'Child/Youth Vulnerability'
    
    # Health-Related Vulnerability
    if any(keyword in value for keyword in ['PLHIV', 'DRUG USER', 'DISABILITY']):
        return 'Health-Related Vulnerability'
    
    # Gender/Social Vulnerability
    if any(keyword in value for keyword in ['WIDOW', 'SEX WORKER']):
        return 'Gender/Social Vulnerability'
    
    # Displacement/Labor Vulnerability
    if any(keyword in value for keyword in ['IDP', 'HOUSE MAIDS/DOMESTIC STAFF', 'DOMESTIC STAFF']):
        return 'Displacement/Labor Vulnerability'
        
    # If no category is found, assume it's unknown
    return 'Unknown'

# Apply the function to the 'VULNERABLE POPULATION' column
df['VULNERABLE POPULATION'] = df['VULNERABLE POPULATION'].apply(categorize_vulnerability)


print("\n--- Value Counts for Standardized Results---")
print(df['AGE OF PERPETRATOR'].value_counts())

print("\n--- Vulnerable Population Processing Results (Sample) ---")
print(df['VULNERABLE POPULATION'].value_counts(dropna=False))

print("\n--- Value Counts for Standardized 'RELATIONSHIP' ---")
print(df['RELATIONSHIP WITH PERPETRATOR'].value_counts())

print("\n--- Value Counts for Standardized 'VIOLENCE TYPE' ---")
print(df['TYPE OF VIOLENCE'].value_counts())


Calculated mean age for imputation (from valid, realistic values): 37.39

--- Value Counts for Standardized Results---
AGE OF PERPETRATOR
37    4366
40    3542
35    2750
30    2707
45    2440
      ... 
90       2
95       1
99       1
79       1
86       1
Name: count, Length: 89, dtype: int64

--- Vulnerable Population Processing Results (Sample) ---
VULNERABLE POPULATION
No Vulnerability                    21351
Child/Youth Vulnerability            9686
Health-Related Vulnerability         5190
Unknown                              3849
Displacement/Labor Vulnerability     3709
Gender/Social Vulnerability           575
Name: count, dtype: int64

--- Value Counts for Standardized 'RELATIONSHIP' ---
RELATIONSHIP WITH PERPETRATOR
Other                 17788
Spouse/Partner        10680
Stranger               6526
Extended Family        3775
Family Member          3013
Acquaintance           2392
Authority Figure        166
Unknown/Not Stated       20
Name: count, dtype: int64

--- Valu

In [37]:
# Print the shape of the original DataFrame
print(f"Original shape of the DataFrame: {df.shape}")

# Print the value counts of the 'Type_of_Organisation' column before dropping rows
print("\nValue counts of 'Type_of_Organisation' before dropping NaN:")
print(df['Type_of_Organisation'].value_counts(dropna=False))

# --- Drop rows with NaN values in the specified column ---
# The 'subset' argument specifies the column to check for NaN values.
# 'inplace=True' modifies the DataFrame directly, so you don't need to reassign it.
df.dropna(subset=['Type_of_Organisation'], inplace=True)

# Print the shape of the DataFrame after dropping rows
print(f"\nShape of the DataFrame after dropping rows with NaN: {df.shape}")

# Print the value counts of the 'Type_of_Organisation' column after dropping rows
print("\nValue counts of 'Type_of_Organisation' after dropping NaN:")
print(df['Type_of_Organisation'].value_counts(dropna=False))

Original shape of the DataFrame: (44360, 31)

Value counts of 'Type_of_Organisation' before dropping NaN:
Type_of_Organisation
CSO                30422
ServiceProvider    13928
NaN                   10
Name: count, dtype: int64

Shape of the DataFrame after dropping rows with NaN: (44350, 31)

Value counts of 'Type_of_Organisation' after dropping NaN:
Type_of_Organisation
CSO                30422
ServiceProvider    13928
Name: count, dtype: int64


In [39]:
# Remove 'ESTIMATED AVERAGE MONTHLY INCOME' column, it empty, does not contain any useful data
df.drop(columns=['ESTIMATED AVERAGE MONTHLY INCOME'], inplace=True)

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44350 entries, 0 to 44359
Data columns (total 30 columns):
 #   Column                                    Non-Null Count  Dtype         
---  ------                                    --------------  -----         
 0   Type_of_Organisation                      44350 non-null  object        
 1   Was the Violence Fatal                    44350 non-null  object        
 2   Sex of survivor                           44350 non-null  object        
 3   Age of survivor                           44350 non-null  float64       
 4   MARITAL STATUS                            44350 non-null  object        
 5   DATE OF INCIDENT                          44350 non-null  datetime64[ns]
 6   DATE REPORTED                             44350 non-null  datetime64[ns]
 7   LOCATION OF VIOLENCE (STATE)              44350 non-null  object        
 8   LOCATION OF VIOLENCE (L.G.A)              44350 non-null  object        
 9   LOCATION OF VIOLENCE (WARD)      

#### Save preprocessed data

In [43]:
# Define the folder path and filename separately
data_folder_path = 'c:\\08_AHFID\\gbv-predictive-tool\\data'
output_filename = 'processed_data.csv'

# Combine the path and filename into a single, valid file path
output_file_path = os.path.join(data_folder_path, output_filename)

# Save the DataFrame to the correct file path
df.to_csv(output_file_path, index=False)

print(f"\nProcessing complete. Final cleaned data saved to '{output_file_path}'")


Processing complete. Final cleaned data saved to 'c:\08_AHFID\gbv-predictive-tool\data\processed_data.csv'
