In [1]:
# Locate the project's 'data' folder, which is expected to sit one level above
# the directory the notebook kernel is running in.
import os

# Working directory of the running kernel (normally the notebook's own folder)
notebook_directory = os.getcwd()
print(f"Current notebook directory: {notebook_directory}")

# The project root is taken to be the directory that contains the notebook folder
parent_directory = os.path.split(notebook_directory)[0]
print(f"Parent directory: {parent_directory}")

# <project root>/data
data_folder_path = os.path.join(parent_directory, 'data')
print(f"Data directory: {data_folder_path}")

Current notebook directory: C:\09_AHFID\gbv-predictive-tool\notebooks
Parent directory: C:\09_AHFID\gbv-predictive-tool
Data directory: C:\09_AHFID\gbv-predictive-tool\data


In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import unicodedata
import re

 # Set pd.set_option to show all columns in the DataFrame
pd.options.display.max_columns = None  # None = no limit on the number of displayed columns

In [None]:
# Read the raw NGBV dashboard workbook from the project's 'data' folder.
# header=2 tells pandas to take the column labels from row index 2 of the sheet.
dataset_filename = 'NGBV Dashboard dataset.xlsx'
file_path = os.path.join(data_folder_path, dataset_filename)
df = pd.read_excel(file_path, header=2)

# Column names, non-null counts and dtypes of the loaded frame
df.info()

In [None]:
# Report the dimensions and the column names of the frame exactly as loaded
original_shape = df.shape
original_columns = df.columns.tolist()
print("The original shape of the DataFrame:", original_shape)
print("The original columns of the DataFrame:", original_columns)

In [None]:
# Keep an independent (deep) copy of the loaded frame.
# NOTE(review): the cells visible below keep modifying `df` itself, not
# `df_copy` - confirm which of the two is meant to be the working frame.
df_copy = df.copy(deep=True)

## **Data Cleaning and Preprocessing**

### **Handle 'Contact Channel' and 'Contact Channel Other' for proper cleaning**
* Create a comprehensive mapping dictionary
* Use the dictionary to map the free-text values in 'Contact Channel Other' to the primary categories used in 'Contact Channel'
* Handle edge cases by mapping any value not found in the dictionary to 'Other'
* Drop the now-redundant 'Contact Channel Other' column

In [None]:
# Show the distinct raw values of the main channel column and of its free-text companion
for channel_column in ('Contact Channel', 'Contact Channel Other'):
    print(df[channel_column].unique())

In [None]:
# Rows whose 'Contact Channel' is NaN, shown next to whatever was typed into
# the free-text 'Contact Channel Other' column for the same row.
channel_is_missing = df['Contact Channel'].isna()
result = df.loc[channel_is_missing, ['Contact Channel', 'Contact Channel Other']]
print(result.head(10))  # preview of the first ten such rows
#result.to_csv(os.path.join(data_folder_path, 'contact_channel_missing.csv'), index=False)

In [None]:
# --- 1) NORMALIZATION FUNCTION ---
# Typographic punctuation (as inserted by word-processor / spreadsheet
# auto-correct) has no NFKD decomposition, so plain ASCII folding would simply
# delete it ("mother’s" -> "mothers"). Translating it to the ASCII equivalent
# first keeps such values comparable with the straight-apostrophe and hyphen
# keys used in the mapping dictionaries.
_TYPOGRAPHIC_TO_ASCII = str.maketrans({
    "\u2018": "'", "\u2019": "'",                                  # single curly quotes
    "\u201c": '"', "\u201d": '"',                                  # double curly quotes
    "\u2010": "-", "\u2011": "-", "\u2013": "-", "\u2014": "-",    # hyphen / dash variants
})


def normalize_text(s: str) -> str:
    """Return a canonical lookup key for a free-text cell value.

    Steps: missing values become "", the value is converted to ``str``,
    typographic quotes/dashes are replaced by their ASCII equivalents, the
    text is stripped and lower-cased, accents are folded to ASCII (remaining
    non-ASCII characters are dropped) and runs of whitespace are collapsed
    to a single space.

    Parameters
    ----------
    s : any scalar
        Raw cell value (string, number, NaN or None).

    Returns
    -------
    str
        The normalized text; "" when the input is missing.
    """
    if pd.isna(s):
        return ""
    s = str(s).translate(_TYPOGRAPHIC_TO_ASCII).strip().lower()
    # Decompose accented characters, then keep only the ASCII part
    s = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")
    # Collapse any run of whitespace into one space
    return re.sub(r"\s+", " ", s)

# Bring every free-text entry into the same canonical form as the dictionary keys
contact_other_normalized = df['Contact Channel Other'].apply(normalize_text)
df['Contact Channel Other'] = contact_other_normalized

# --- 2) MAPPING DICTIONARY ---
# Lookup table: normalized 'Contact Channel Other' text -> contact-channel category.
# Categories produced: Hotline, Referred, Walk-in, Instagram, Email, Twitter,
# Whatsapp, Rescue, Outreach, Facebook. Anything not listed is handled by the
# caller (map_contact_channel falls back to "Other").
#
# NOTE(review): several keys appear more than once. Python keeps the LAST value
# for a repeated key, so e.g. 'community case finding' resolves to 'Outreach'
# (the earlier 'Referred' entry is overridden). Keys written with a trailing
# space can never match, because values are stripped before the lookup.
mapping_contact = {
    # Hotline
    "hotline": "Hotline", "call": "Hotline", "phone call": "Hotline", "telephone": "Hotline",
    "helpline": "Hotline", "phone": "Hotline", "call center": "Hotline", "emergency number": "Hotline",
    'call': 'Hotline', 'telephone': 'Hotline', 'tellephone call': 'Hotline', 'office phone': 'Hotline',
    'table phone': 'Hotline', 'phone contact call': 'Hotline', 'hotline and physical presence': 'Hotline',
    'cece yara child helpline- 08008008001': 'Hotline', 'tcf gbv helpline': 'Hotline', 
    'tcf gbv help line': 'Hotline', 'tcf helpline': 'Hotline', '08068700240': 'Hotline', 'called in': 'Hotline',

    # Referred
    "referred": "Referred", "referral": "Referred", "referrals": "Referred",
    "police referral": "Referred", "hospital referral": "Referred", "court referral": "Referred",
    "ngo referral": "Referred", "agency referral": "Referred", "ref by police": "Referred",
    "case referral": "Referred", "school referral": "Referred", "case manager": 'Referred',
    'mandate reporter': 'Referred', 'volunteer': 'Referred', 'friend': 'Referred', 
    'volunteer': 'Referred', 'mandate reporter': 'Referred', 'neighbour': 'Referred', 
    'community leader': 'Referred', 'government agency': 'Referred', 'ngo': 'Referred',
    'community worker': 'Referred', 'neighbour': 'Referred', 'community workers': 'Referred',
    'community worker ': 'Referred', 'community leader': 'Referred', 'case manager': 'Referred', 
    'mandate reporter': 'Referred', 'volunteer': 'Referred','friend': 'Referred', 
    'mandated reporter': 'Referred', 'volunteer reporter': 'Referred', 'community volunteer': 'Referred',
    'community leader': 'Referred','government agency': 'Referred','ngo': 'Referred', 
    'community worker': 'Referred', 'neighbour': 'Referred', 'reffered by d.n.f': 'Referred', 
    'referred by someone': 'Referred',  'reported by a volunteer': 'Referred',
    'referred by an ngo': 'Referred', 'referred by parent': 'Referred', 'referred by parents': 'Referred',
    'referred by community volunteer': 'Referred', 'referred by cv': 'Referred', 
    'referred by someone': 'Referred', 'reffered by friend': 'Referred', 'referred by friend': 'Referred', 
    'reffered by a friend': 'Referred', 'referred by a friend': 'Referred', 'reffered by something': 'Referred',
    'reffered': 'Referred', 'report': 'Referred', 'reported by the sibling': 'Referred', 'witness': 'Referred', 
    'reported by survivor\'s father & nscdc officer': 'Referred', 'was directed': 'Referred', 
    'community child protection committee': 'Referred', 'community child protection committee ': 'Referred', 
    'idp leader': 'Referred', 'ccpc support': 'Referred', 'ccpc': 'Referred', 'provider client': 'Referred',
    'providers client': 'Referred', 'provider': 'Referred', 'family member': 'Referred', 
    'family friend': 'Referred','member of the network': 'Referred', 'nscdc staff': 'Referred', 
    'the nscdc personnel met the survivor in a park and decided to offer assistance': 'Referred',
    'witnessed': 'Referred', 'e-wei': 'Referred', 'cbo': 'Referred', 'nscdc': 'Referred', 'jcmf': 'Referred',  
    'cbo ': 'Referred', 'kishimi': 'Referred', 'surveillance team sokoto south lga': 'Referred',
    'surveillance team': 'Referred', 'kadvs': 'Referred', 'civil defence officer and parent': 'Referred', 
    'police officer and parent': 'Referred','police and parent': 'Referred', 'police brought them': 'Referred', 
    'police officer': 'Referred', 'police': 'Referred', 'police gender unit': 'Referred', 
    'community': 'Referred', 'mandated reporter': 'Referred', 'on supportive supervision to hf.': 'Referred',
    'through petition to attorney general': 'Referred',     'social welfare': 'Referred',
    'counsellor tester': 'Referred', 'colleague': 'Referred', 'barnawa police division': 'Referred',
    'nasarawa police station': 'Referred', 'project alert surveillance team somolu': 'Referred',
    'fhi 360': 'Referred','gender specialist of cccrn': 'Referred', 'adhoc staff': 'Referred',
    'adhoc staff in the community': 'Referred', 'adherance': 'Referred', 'joint effort': 'Referred',
    'monthly data validation': 'Referred', 'referral letter': 'Referred',
    'social development secretarial': 'Referred', 'police 0fficer and parent': 'Referred',
    'police officer andparent': 'Referred','service provider': 'Referred', 'service provider ': 'Referred',
    'services provider': 'Referred', 'services provider ': 'Referred', 'services': 'Referred',
    'service provider (doctor)': 'Referred','services provider (doctor)': 'Referred',
    'through a friend': 'Referred', 'during school sensitisation': 'Referred', 'index session': 'Referred',
    'index testing': 'Referred', 'friend of late mum': 'Referred', 'older sister': 'Referred',
    'during the celebration/school sensitisation on the international day of the girl child': 'Referred',
    'during the international day of the girl child school sensitisation': 'Referred',
    'whistle blower': 'Referred', 'whisttle blower': 'Referred', 'court': 'Referred', 
    'flyers and stickers': 'Referred', 'flyers': 'Referred', 'police officer': 'Referred',
    'police': 'Referred', 'hisbah': 'Referred', 'community case worker': 'Referred',
    'community case worker ': 'Referred', 'communication case worker': 'Referred',
    'case worker': 'Referred', 'case worker ': 'Referred', 'case worker (ccw)': 'Referred',
    'case finding': 'Referred', 'case finding ': 'Referred', 'community case finding': 'Referred',
    'community case finding ': 'Referred', 'community service member': 'Referred',
    'community case worker(ccw)': 'Referred',     'ccw': 'Referred', 'ccw ': 'Referred',
    'cwrf volunteer': 'Referred', 'ccw (ngo)': 'Referred', 'other class teacher': 'Referred',
    'invitation from court': 'Referred', 'letter': 'Referred',     'community': 'Referred',
    'mandated reported': 'Referred', 'written petition': 'Referred', 'petition': 'Referred',
    'through petition to attorney general': 'Referred', 'anonymous': 'Referred',


    # Walk-in
    "walk in": "Walk-in", "walk-in": "Walk-in", "walkin": "Walk-in",
    "physical visit": "Walk-in", "in person": "Walk-in", "visit": "Walk-in",
    "walked in": "Walk-in", "came in": "Walk-in", 'no body direct them': 'Walk-in',
    "face to face": "Walk-in", "face to face ": "Walk-in", "face to face":  "Walk-in",
    'face to face': 'Walk-in', 'face to face ': 'Walk-in', 'face-to-face': 'Walk-in',
    'the survivor came and reported the case herself.': 'Walk-in',
    'the victim came and reported the case in our office.': 'Walk-in',
    'the victim came and reported the case her self.': 'Walk-in', 
    'one on one': 'Walk-in', 'brought in': 'Walk-in',  'in person': 'Walk-in',

    # Instagram
    "instagram": "Instagram", "ig": "Instagram", "insta": "Instagram", "instagram dm": "Instagram",

    # Email
    "email": "Email", "e-mail": "Email", "mail": "Email", "gmail": "Email", "yahoo mail": "Email",

    # Twitter
    "twitter": "Twitter", "tweet": "Twitter", "twitter dm": "Twitter", "x": "Twitter",

    # Whatsapp
    "whatsapp": "Whatsapp", "whats app": "Whatsapp", "whatsapp message": "Whatsapp",
    "whatsapp call": "Whatsapp", "whatsapp chat": "Whatsapp", "whatsapp dm": "Whatsapp",
    'whats-app': 'Whatsapp', 'whats-app ': 'Whatsapp', 'whatsapp': 'Whatsapp',

    # Rescue
    # (the first two values were previously corrupted to 'returnescue' / 'runfileescue',
    #  and the rest of this group used a lower-case label unlike every other category)
    'rescued': 'Rescue', 'rescue': 'Rescue', 'brought in unconscious': 'Rescue',
    'arrested by nscdc staff': 'Rescue', 'arrested by nscdc officer': 'Rescue',

    # Outreach
    'community outreach': 'Outreach', 'outreach': 'Outreach', 'medical outreach': 'Outreach',
    'community case finding': 'Outreach', 'school sensitization': 'Outreach',
    'school sensitisation': 'Outreach', 'health talk': 'Outreach', 'sensitization': 'Outreach',
    'school sensitization on sgbv': 'Outreach',

    # Facebook
    'facebook': 'Facebook',
    'facebook post': 'Facebook',
}

# --- 3) SAFE MAPPER FUNCTION ---
def map_contact_channel(value: str) -> str:
    """Translate one free-text contact-channel entry into its category.

    The value is normalized with ``normalize_text`` and looked up in
    ``mapping_contact``; entries without a match (including missing values,
    which normalize to "") are reported as "Other".
    """
    return mapping_contact.get(normalize_text(value), "Other")

# --- 4) FILL NaN VALUES IN MAIN COLUMN ---
# Keep the recorded 'Contact Channel' wherever it is present; only the rows
# where it is missing are derived from the free-text 'Contact Channel Other'.
# A masked, column-wise map is used instead of a row-wise apply so the step
# also works on an empty frame.
contact_channel = df['Contact Channel'].astype(object)
channel_is_missing = contact_channel.isna()
contact_channel.loc[channel_is_missing] = (
    df.loc[channel_is_missing, 'Contact Channel Other'].map(map_contact_channel)
)
df['contact_channel'] = contact_channel

# --- 5) DROP THE OTHER COLUMN ---
df = df.drop(columns=['Contact Channel Other'])

# Optional: view result counts of the cleaned column
# (previously this printed the untouched source column 'Contact Channel')
print(df['contact_channel'].value_counts(dropna=False))

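### Convert 'Was the Violence Fatal' values
* Recode `Yes` to `Fatal`, `No` to `Not Fatal`, and `NotApplicable` to `Other`; the result is stored in a new `was_the_violence_fatal` column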

In [None]:
# Recode the answers into descriptive labels; values not listed here are left unchanged
fatality_labels = {'Yes': 'Fatal', 'No': 'Not Fatal', 'NotApplicable': 'Other'}
df['was_the_violence_fatal'] = df['Was the Violence Fatal'].replace(fatality_labels)

# Spot-check the recoded column, missing values included
fatality_counts = df['was_the_violence_fatal'].value_counts(dropna=False)
print(fatality_counts)

### **Handle 'Who Reported the Incident' and 'WHO REPORTED THE INCIDENT_OTHER' for cleaning**
* Normalize 'WHO REPORTED THE INCIDENT_OTHER'
* Create an expanded mapping dictionary
* Build a mapper that falls back to 'Other'
* Use the expanded dictionary to fill only the NaNs in 'Who Reported the Incident'
* Drop the source column 'WHO REPORTED THE INCIDENT_OTHER'

In [None]:
# Show the distinct raw values of the reporter column and of its free-text companion
for reporter_column in ('Who Reported the Incident', 'WHO REPORTED THE INCIDENT_OTHER'):
    print(df[reporter_column].unique())

In [None]:
# Rows whose 'Who Reported the Incident' is NaN, shown next to the free-text
# 'WHO REPORTED THE INCIDENT_OTHER' value of the same row.
reporter_is_missing = df['Who Reported the Incident'].isna()
result = df.loc[reporter_is_missing, ['Who Reported the Incident', 'WHO REPORTED THE INCIDENT_OTHER']]
print(result.head(10))  # preview of the first ten such rows
#result.to_csv(os.path.join(data_folder_path, 'Who_Reported_the_Incident_missing.csv'), index=False)

In [None]:
# --- 1) NORMALIZE 'WHO REPORTED THE INCIDENT_OTHER' ---
# Typographic punctuation (as inserted by word-processor / spreadsheet
# auto-correct) has no NFKD decomposition, so plain ASCII folding would simply
# delete it ("mother’s" -> "mothers"). Translating it to the ASCII equivalent
# first keeps such values comparable with the straight-apostrophe keys
# ("child's mother", "survivor's son", ...) used in the mapping dictionary.
_TYPOGRAPHIC_TO_ASCII = str.maketrans({
    "\u2018": "'", "\u2019": "'",                                  # single curly quotes
    "\u201c": '"', "\u201d": '"',                                  # double curly quotes
    "\u2010": "-", "\u2011": "-", "\u2013": "-", "\u2014": "-",    # hyphen / dash variants
})


# NOTE(review): normalize_text is also defined in the Contact Channel cell;
# this later definition is the one in effect from here on. Consider keeping a
# single shared definition near the top of the notebook.
def normalize_text(s: str) -> str:
    """Return a canonical lookup key for a free-text cell value.

    Steps: missing values become "", the value is converted to ``str``,
    typographic quotes/dashes are replaced by their ASCII equivalents, the
    text is stripped and lower-cased, accents are folded to ASCII (remaining
    non-ASCII characters are dropped) and runs of whitespace are collapsed
    to a single space.

    Parameters
    ----------
    s : any scalar
        Raw cell value (string, number, NaN or None).

    Returns
    -------
    str
        The normalized text; "" when the input is missing.
    """
    if pd.isna(s):
        return ""
    s = str(s).translate(_TYPOGRAPHIC_TO_ASCII).strip().lower()
    # fold accents
    s = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")
    # collapse inner spaces
    return re.sub(r"\s+", " ", s)

# Bring every free-text reporter entry into the same canonical form as the dictionary keys
reporter_other_normalized = df['WHO REPORTED THE INCIDENT_OTHER'].apply(normalize_text)
df['WHO REPORTED THE INCIDENT_OTHER'] = reporter_other_normalized

# --- 2) MAPPING DICTIONARY ---
# Primary buckets expected in the final column
#['Guardian', 'Witness', 'Self', 'Spouse', 'Parent', 'Children', 'Relative', 'Friend(s)', 'Caregiver', 'Employer', 'Community', 'Neighbor', 'Health Worker', 'Legal/Government', 'NGO/CSO', 'School/Teacher', 'Other']

# Lookup table: normalized 'WHO REPORTED THE INCIDENT_OTHER' text -> reporter
# category. It is extended by the synonym_expansions loop and read by
# map_other_to_category, which reports any value not found here as "Other".
#
# NOTE(review): this literal repeats a number of keys, some with DIFFERENT
# categories. Python keeps the last value given for a key, so the effective
# result is decided by position in this literal:
#   - "uncle", "aunt", "auntie", "her aunt" -> "Relative" (Guardian entries overridden)
#   - "mentor mother"                        -> "Caregiver" (Guardian entry overridden)
#   - "stepmother"                           -> "Relative" (Parent entry overridden)
#   - "police and parents"                   -> "Legal/Government" (Parent entry overridden)
#   - "traditional ruler"                    -> "Legal/Government" (Community entry overridden)
# Confirm that these are the intended categories before relying on the counts.
#
# NOTE(review): lookups are done on normalize_text() output (stripped,
# ASCII-only), so keys written with a trailing space (e.g. "survivors mother ",
# 'aunt ', 'grand father ', 'community member ') or with a non-ASCII
# apostrophe ('mother’s friend') can never be matched.
mapping = {
    # ----- Parent -----
    "father": "Parent", "fathers": "Parent", "dad": "Parent", "daddy": "Parent", "papa": "Parent",
    "mother": "Parent", "mothers": "Parent", "mum": "Parent", "mummy": "Parent", "mama": "Parent",
    "parent": "Parent", "parents": "Parent", "childs father": "Parent", "childs mother": "Parent", 
    "children mother": "Parent", "child mother": "Parent", "mother of child": "Parent", 
    "mother of children": "Parent","survivors mother": "Parent", "survivors father": "Parent", 
    "survivors parents": "Parent","survivors mum": "Parent", "survivors dad": "Parent",
    "survivors mother ": "Parent", "childs mother ": "Parent", "childs father ": "Parent",
    'parents and police': 'Parent', 'police and parents': 'Parent',
    'child\'s mother': 'Parent', 'children mother': 'Parent','children\'s mother': 'Parent',
    'child\'s mother': 'Parent', 'father to child': 'Parent', 'child\'s father': 'Parent',
    'stepparent': 'Parent', 'stepparents': 'Parent', 'she and her mother': 'Parent',
    'parent and police': 'Parent', 'parents and nscdc': 'Parent','parents and self': 'Parent',
    'parent and nscdc': 'Parent', 'parent and first step action': 'Parent',
    'parent and caritas': 'Parent', 'parents and anti-retroviral therapy center': 'Parent',
    'parents and adamawa concern citizens foundation': 'Parent',  'foster mother': 'Parent',
    'parents and adamawa concern citizens': 'Parent', 'parents and adamawa concern citizen (cbo)': 'Parent',
    "foster mother": "Parent", "foster father": "Parent", 'stepmother': 'Parent',
    

    # ----- Spouse -----
    "spouse": "Spouse", "wife": "Spouse", "husband": "Spouse", "sexual partner": "Spouse", 
    "intimate partner": "Spouse", "partner": "Spouse","cohabiting": "Spouse", "forced marriage": "Spouse",
    "house wife": "Spouse", 'spouse': 'Spouse', 'wife': 'Spouse',  'the wife': 'Spouse',
    'spouse and sister to the wife': 'Spouse',

    # ----- Guardian -----
    # (the uncle/aunt/auntie/"her aunt"/"mentor mother" entries in this group
    #  are overridden by later groups - see the note above the dictionary)
    "guardian": "Guardian", 
    "grandparent": "Guardian", "grandparents": "Guardian", "grandmother": "Guardian", 
    "grandfather": "Guardian", "grandma": "Guardian", "grandpa": "Guardian",
    "paternal grandma": "Guardian", "paternal grandfather": "Guardian",
    "maternal grandma": "Guardian", "maternal grandfather": "Guardian", 'paternal grandfather': 'Guardian',
    'mother’s friend': 'Guardian', 'mentor mother': 'Guardian', 'menthor mother': 'Guardian',
    'uncle': 'Guardian', 'aunt': 'Guardian', 'aunt ': 'Guardian', 'her aunt': 'Guardian', 
    'auntie': 'Guardian', 'aunties': 'Guardian', 'grandmother': 'Guardian', 'grandparents': 'Guardian',
    'grandparent': 'Guardian', 'grand father': 'Guardian', 'grand father ': 'Guardian',
    'grand mother': 'Guardian', 'grand parent': 'Guardian', 'guardian and police': 'Guardian',

    # ----- Self -----
    "self": "Self", "alone": "Self", "survivor": "Self", "victim": "Self", "walk in": "Self", 
    "walkin": "Self", "walk-in": "Self", "self referral": "Self", "self referred": "Self", 
    "self identified": "Self", "self survivor": "Self", "self survivors": "Self",
    "the victim": "Self", "the survivor": "Self", 'seif': 'Self',
    'alone': 'Self', 'living in a separate apartment': 'Self', 'self reported': 'Self',
    'self referral': 'Self', 'self after full recovery from the hospital': 'Self',
    'self after recovery': 'Self', 'otherself': 'Self', 'the victim': 'Self', 'self and police': 'Self',
    'self and nscdc': 'Self', 'on_line': 'Self', # Assuming an online self-report
    

    # ----- Children -----
    "son": "Children", "sons": "Children", "daughter": "Children", "daughters": "Children",
    "child": "Children", "children": "Children", "survivors son": "Children", "survivors daughter": "Children",
    "victims child": "Children", "victims children": "Children","daughter of victim": "Children", 
    "son of victim": "Children", 'survivor son': 'Children', 'survivor\'s son': 'Children',
    'survivor daughter': 'Children', 'survivor\'s daughter': 'Children', 'son': 'Children',
    'survivor\'s son': 'Children',

    # ----- Relative -----
    "relative": "Relative", "relatives": "Relative", "uncle": "Relative", "aunt": "Relative", 
    "aunty": "Relative", "auntie": "Relative", "niece": "Relative", "nephew": "Relative",
    "cousin": "Relative", "cousins": "Relative", "brother": "Relative", "brothers": "Relative", 
    "elder brother": "Relative", "younger brother": "Relative", "sister": "Relative", 
    "sisters": "Relative", "elder sister": "Relative", "younger sister": "Relative",
    "sibling": "Relative", "siblings": "Relative", "step father": "Relative", "stepfather": "Relative",
    "step mother": "Relative", "stepmother": "Relative", "step brother": "Relative", "stepbrother": "Relative",
    "step sister": "Relative", "stepsister": "Relative", "maternal uncle": "Relative", 
    "paternal uncle": "Relative", "brother inlaw": "Relative", "brother in law": "Relative", 
    "brother in-law": "Relative", "sister inlaw": "Relative", "sister in law": "Relative", 
    "sister in-law": "Relative", "mother inlaw": "Relative", "mother in law": "Relative",
    "father inlaw": "Relative", "father in law": "Relative", "her aunt": "Relative", 
    "his aunt": "Relative", "their aunt": "Relative", "her brother": "Relative", 
    "his brother": "Relative", "their brother": "Relative", "her sister": "Relative", 
    "his sister": "Relative", "their sister": "Relative", "child family": "Relative",
    'cousin': 'Relative', 'relatives': 'Relative', 'brother': 'Relative', 'in-law': 'Relative',
    'survivor\'s brother': 'Relative', 'family member': 'Relative', 'survivor\'s family': 'Relative',
    'ct': 'Relative', 'c t': 'Relative', 'acm': 'Relative', 'elder brother': 'Relative',
    'sibling': 'Relative', 'victim\'s sister': 'Relative', 'step sister': 'Relative',
    'brother inlaw': 'Relative', 'her sister': 'Relative', 'victims brother': 'Relative',
    'victim\'s cousin': 'Relative',  'sister': 'Relative', 'survivor\'s sister': 'Relative', 

    # ----- Friend(s) -----
    "friend": "Friend(s)", "friends": "Friend(s)", "family friend": "Friend(s)", "family friends": "Friend(s)",
    "boyfriend": "Friend(s)", "girlfriend": "Friend(s)","roommate": "Friend(s)", "roommates": "Friend(s)",
    "classmate": "Friend(s)", "class mate": "Friend(s)", "survivors friend": "Friend(s)", 
    "victims friend": "Friend(s)", 'friend\'s elder sister': 'Friend(s)', 'family friend': 'Friend(s)',
    'a friend': 'Friend(s)', 'a friend to the survivor': 'Friend(s)', 'survivor\'s friend': 'Friend(s)',
    'mother\'s friend': 'Friend(s)', 'classmate': 'Friend(s)', 'sibling\'s friend': 'Friend(s)',
     'other-friend': 'Friend(s)',

    # ----- Caregiver -----
    "caregiver": "Caregiver", "care giver": "Caregiver", "caregivers": "Caregiver",
    "mentor mother": "Caregiver", "caretaker": "Caregiver",

    # ----- Employer -----
    "employer": "Employer", "her employer": "Employer", "boss": "Employer",
    "madam": "Employer", "her madam": "Employer",

    # ----- Community -----
    "community member": "Community", "community leader": "Community",
    "community worker": "Community", "community leaders": "Community",
    "community workers": "Community", "community volunteer": "Community",
    "community volunteer paralegal": "Community",
    "community gatekeeper": "Community", "community gatekeepers": "Community",
    "community stakeholder": "Community", "community clan head": "Community",
    "church member": "Community", "pastor of her church": "Community", "pastors wife": "Community",
    "village head": "Community", "traditional ruler": "Community", "women leader": "Community", 
    "woman leader": "Community", "a member of the community": "Community",'community member': 'Community',
    'community member ': 'Community', 'community learder': 'Community', 'community leader': 'Community',
    'community leaders': 'Community', 'community worker': 'Community',  'community workers': 'Community',
    'community volunteer': 'Community', 'community volunteer paralegal': 'Community',
    'village head': 'Community', 'hisbah commandant': 'Community', 'hisban commondan': 'Community', 
    'maiadugu': 'Community', 'mai auguwa': 'Community', 'mai auguwa cigari': 'Community', 
    'women leader': 'Community', 'church member': 'Community', 'pastor of her church': 'Community', 
    'a by stander': 'Community', 'bystander': 'Community', 'watchman': 'Community', 
    'concerned adult': 'Community', 'vigilante': 'Community', 'okene command vigilante': 'Community', 
    'officers of vigilante': 'Community', 'sumoto': 'Community', 'other townman': 'Community', 
    'civilian jtf': 'Community', 'a community case worker': 'Community',
    'other(community member)': 'Community','community gate keepers': 'Community', 
    'community gatekeeper': 'Community', 'community stakeholder': 'Community', 'community worker': 'Community',

    # ----- Neighbor -----
    "neighbor": "Neighbor", "neighbour": "Neighbor", "neigbour": "Neighbor",
    "neigbours": "Neighbor", "concerned neighbor": "Neighbor", "concerned neighbour": "Neighbor",
    "street brothers": "Neighbor", 'neighbor': 'Neighbor', 'neighbour': 'Neighbor', 'neigbour': 'Neighbor',
    'concerned person': 'Neighbor',
    
    # ----- Witness -----
    "witness": "Witness", "eye witness": "Witness", "eyewitness": "Witness",

    # ------ Health Worker -----
    'health provider': 'Health Worker', 'health worker': 'Health Worker',
    'health worker1': 'Health Worker', 'health care worker': 'Health Worker',
    'nurse': 'Health Worker', 'nurse that treated the child': 'Health Worker',
    'primary health care worker': 'Health Worker', 'phc': 'Health Worker',
    'phc kofar kade': 'Health Worker', 'phc director': 'Health Worker',
    'a nurse where the child was receiving treatment': 'Health Worker',
    'medical doctor': 'Health Worker', 'in-charge of seven days adventist primary health post': 'Health Worker',
    "community health worker": "Health Worker",
    
    # ----- 'Legal/Government' --------- 
    # ('police and parents' and 'traditional ruler' here override the earlier
    #  Parent / Community entries for the same keys)
    'police': 'Legal/Government', 'police officer': 'Legal/Government', 'police inspector': 'Legal/Government',
    'police and parents': 'Legal/Government', 'police and parent': 'Legal/Government',
    'police and nscdc': 'Legal/Government', 'police officer': 'Legal/Government', 
    'prosecutor': 'Legal/Government', 'lawyer': 'Legal/Government', 'family lawyer': 'Legal/Government',
    'chief magistrate': 'Legal/Government', 'legal coucil from opd': 'Legal/Government',
    'legal council form opd': 'Legal/Government', 'legal council from fida': 'Legal/Government',
    'fida ebonyi state': 'Legal/Government', 'government agency': 'Legal/Government',
    'government worker': 'Legal/Government', 'social worker': 'Legal/Government',
    'government social worker': 'Legal/Government', 'juvenile welfare police officer': 'Legal/Government',
    'gender officer': 'Legal/Government', 'gender unit': 'Legal/Government',
    'gender desk officer uwanse police': 'Legal/Government',
    'gender office asokoro police station': 'Legal/Government', 'naptip team': 'Legal/Government',
    'naptip': 'Legal/Government', 'naptip survilans team': 'Legal/Government', 
    'naptip team': 'Legal/Government', 'nscdc': 'Legal/Government', 'nscdc personnel': 'Legal/Government',
    'nscdc staff': 'Legal/Government', 'nscdc staf': 'Legal/Government', 'npf': 'Legal/Government',
    'para military': 'Legal/Government', 'asokoro police station': 'Legal/Government',
    'human rights commission': 'Legal/Government', 'traditional ruler': 'Legal/Government',
    'court': 'Legal/Government', 'hon commissioner': 'Legal/Government', 
    'tangaza surveillance team': 'Legal/Government', 'surveillance team': 'Legal/Government',
    'security personnel': 'Legal/Government', 'security agent': 'Legal/Government',
    'nigeria immigration service': 'Legal/Government', 'nigirean imegeration service': 'Legal/Government',
    'ministry of women affairs and social development ebony state': 'Legal/Government',
    'smwca': 'Legal/Government', 'ccpc': 'Legal/Government', 'a government staff': 'Legal/Government',
    'subeb officer, uncle': 'Legal/Government', 'subeb': 'Legal/Government', 
    'edo state universal basis education board (subeb)': 'Legal/Government',
    'state universal basic education board': 'Legal/Government', 'gender officer': 'Legal/Government',
    'state universal basic education (subeb)': 'Legal/Government', 'm-ccad': 'Legal/Government',
    'help line': 'Legal/Government', 'other a volunteer of the foundation': 'Legal/Government',
    'child advocate': 'Legal/Government', 'case manager': 'Legal/Government', 'case worker': 'Legal/Government',
    'case managers': 'Legal/Government', 'a case worker': 'Legal/Government', 
    'social welfare': 'Legal/Government', 'social welfare officer': 'Legal/Government',
    "community case worker": "Legal/Government", "community caseworker": "Legal/Government",
    "community case manager": "Legal/Government",  'community case worker': 'Legal/Government', 
    'community case manager': 'Legal/Government', 'community case worker': 'Legal/Government', 
    'community case manager': 'Legal/Government',  'community case worker': 'Legal/Government', 

    # -------'NGO/CSO'--------
    'ngo': 'NGO/CSO', 'ngo/cpn': 'NGO/CSO', 'cpn': 'NGO/CSO', 'cso': 'NGO/CSO', 'cso/ngo': 'NGO/CSO',
    'un agency': 'NGO/CSO', 'human rights liberty access and peace defender foundation': 'NGO/CSO',
    'ngo (caritas nigeria)': 'NGO/CSO', 'other ngo': 'NGO/CSO', 'plan international': 'NGO/CSO',
    'referred from today for tomorrow foundation': 'NGO/CSO', 'an organization (ster)': 'NGO/CSO',
    'an ngo supporter': 'NGO/CSO', 'positive care development foundation': 'NGO/CSO', 'brci': 'NGO/CSO',
    'gpi/brci': 'NGO/CSO', 'caritas': 'NGO/CSO', 'ahic staff': 'NGO/CSO', 'project charilove': 'NGO/CSO',
    'project charilove (hotel guardian)': 'NGO/CSO', 'child right agency': 'NGO/CSO',
    'auntie landa\'s foundation': 'NGO/CSO', 'care for social welfare international': 'NGO/CSO',
    'human right activist': 'NGO/CSO', 'first step action': 'NGO/CSO', 'warif volunteer': 'NGO/CSO',
    
    # -------'School/Teacher'----------
    'teacher': 'School/Teacher', 'school teacher': 'School/Teacher', 'headmistress': 'School/Teacher',
    'school principal': 'School/Teacher', 'school proprietor': 'School/Teacher',
    'secondary school teacher': 'School/Teacher', 'education providers': 'School/Teacher',
    'client\'s school counselling': 'School/Teacher', 'head of school': 'School/Teacher',
    'principal of school': 'School/Teacher', 'school principal': 'School/Teacher',
    'school administrator': 'School/Teacher', 'school head': 'School/Teacher',
    'school proprietor': 'School/Teacher', 'school teacher': 'School/Teacher','school staff': 'School/Teacher',
    'school authority': 'School/Teacher', 'school': 'School/Teacher',

}

# Optional helper synonyms to catch common phrases seen in noisy entries.
# Each key is a phrase as it looks AFTER normalize_text() (lower-case, single
# spaces); each value is a canonical term whose category is looked up in
# `mapping`. This lets a simple dictionary lookup work at mapping time.
synonym_expansions = {
    # Relative phrases
    "her aunt": "aunt",
    "his aunt": "aunt",
    "their aunt": "aunt",
    "her brother": "brother",
    "his brother": "brother",
    "their brother": "brother",
    "her sister": "sister",
    "his sister": "sister",
    "their sister": "sister",
    "brother to survivor": "brother",
    "sister to survivor": "sister",
    "uncle to survivor": "uncle",
    "aunt to survivor": "aunt",
    "aunt to the victim or survivor": "aunt",

    # Children phrases
    "daughter of victim": "daughter",
    "son of victim": "son",
    "survivors son": "son",
    "survivors daughter": "daughter",

    # Parent phrases
    "survivors mother": "mother",
    "survivors father": "father",
    "childs mother": "mother",
    "childs father": "father",
    "child mother": "mother",
    "children mother": "mother",
    "mother of child": "mother",
    "mother of children": "mother",

    # Self phrases
    "the victim": "survivor",
    "the survivor": "survivor",
    "walkin": "walk in",

    # Community phrases
    "a member of the community": "community member",
    "pastor's wife": "pastors wife",

    # Neighbor phrases
    # (single space: normalize_text collapses repeated whitespace, so the
    #  former double-space spelling of this key could never be matched)
    "concern neigbour": "concerned neighbour",
}

# Expand mapping to include synonyms automatically.
# setdefault never overrides a phrase that `mapping` already lists explicitly;
# a phrase is only added when it is new, with the category of its canonical term.
for phrase, canonical_term in synonym_expansions.items():
    mapping.setdefault(phrase, mapping.get(canonical_term, canonical_term))

# --- 3) BUILD A SAFE MAPPER THAT FALLS BACK TO 'Other' ---
def map_other_to_category(value: str) -> str:
    """Translate one free-text reporter entry into its category.

    The value is normalized with ``normalize_text`` and looked up in the
    module-level ``mapping`` dictionary; entries without a match (including
    missing values, which normalize to "") are reported as "Other".

    NOTE(review): ``mapping`` is resolved when the function is called, and a
    later cell assigns a different dictionary to the same name - re-running
    this function after that cell would use the wrong table.
    """
    lookup_key = normalize_text(value)
    return mapping.get(lookup_key, "Other")

# --- 4) FILL ONLY THE NaNs IN 'Who Reported the Incident' ---
def _resolve_reporter(row):
    """Return the recorded reporter, or the mapped free-text entry when none is recorded."""
    recorded = row['Who Reported the Incident']
    if pd.isna(recorded):
        return map_other_to_category(row['WHO REPORTED THE INCIDENT_OTHER'])
    return recorded


df['who_reported_the_incident'] = df.apply(_resolve_reporter, axis=1)

# --- 5) DROP THE SOURCE COLUMN ---
df = df.drop(columns=['WHO REPORTED THE INCIDENT_OTHER'])

# --- 6) QUICK COUNTS TO SPOT CHECK THE RESULT ---
reporter_counts = df['who_reported_the_incident'].value_counts(dropna=False)
print(reporter_counts.sort_index())


In [None]:
# Show the distinct raw values of the employment-status column and of its free-text companion
for employment_column in (
    'EMPLOYMENT STATUS OF PARENT/GUARDIAN',
    'EMPLOYMENT STATUS OF PARENT/GUARDIAN_OTHERS',
):
    print(df[employment_column].unique())

In [None]:
# Rows where 'EMPLOYMENT STATUS OF PARENT/GUARDIAN' is NaN while the free-text
# 'EMPLOYMENT STATUS OF PARENT/GUARDIAN_OTHERS' column does hold a value.
main_is_missing = df['EMPLOYMENT STATUS OF PARENT/GUARDIAN'].isna()
others_is_filled = df['EMPLOYMENT STATUS OF PARENT/GUARDIAN_OTHERS'].notna()
employment_columns = [
    'EMPLOYMENT STATUS OF PARENT/GUARDIAN',
    'EMPLOYMENT STATUS OF PARENT/GUARDIAN_OTHERS',
]
result = df.loc[main_is_missing & others_is_filled, employment_columns]

print(result.head(20))  # preview of the first twenty such rows
#result.to_csv(os.path.join(data_folder_path, 'employment_status_parentguard_missing.csv'), index=False)

In [None]:
if df is not None:
    # 1. Strip stray whitespace from the headers, then give the two employment
    #    columns short working names.
    df.columns = df.columns.str.strip()
    df.rename(columns={
        'EMPLOYMENT STATUS OF PARENT/GUARDIAN': 'employment_status_main',
        'EMPLOYMENT STATUS OF PARENT/GUARDIAN_OTHERS': 'employment_status_others'
    }, inplace=True)

    # 2-3. Prefer the main answer, fall back to the free-text answer, and label
    #      rows where both are missing as 'Unknown'.
    df['employment_status_main'] = (
        df['employment_status_main']
        .fillna(df['employment_status_others'])
        .fillna('Unknown')
    )

    # 4. Normalise the text and repair known typos before mapping.
    typo_corrections = {
        'walks in a barracks': 'works in a barracks',
        'capinter': 'carpenter',
        'heir dresser': 'hair dresser',
        'lebourer': 'labourer',
        'fruit saler': 'fruit seller',
        'she is 26 years pld': 'she is 26 years old'
    }
    # Cast to str first: the .str accessor returns NaN for non-string cells
    # (e.g. a number typed into the free-text column), which would bring
    # missing values back after the 'Unknown' fill above.
    df['employment_status_main'] = (
        df['employment_status_main']
        .astype(str)
        .str.lower()
        .str.strip()
        .replace(typo_corrections)
    )

    # 5. Normalised raw answers grouped by the category they collapse into.
    #    Keys are lower-case and stripped because that is how the column looks
    #    at this point; values with no entry are left as they are.
    employment_groups = {
        'Currently employed': [
            'caseworker', 'civil servant', 'teacher', 'tecaher', 'bike rider',
            'nurse', 'clergy', 'works in a barracks', 'domestic staff',
            'currently employed', 'hotel steward',
        ],
        'Self employed': [
            'farming', 'farmer', 'farmers', 'business', 'business women',
            'bussiness man', 'trader', 'trading', 'petty trader',
            'petty business', 'entrepreneur', 'hawker', 'food vendor',
            'labourer', 'fruit seller', 'self employed', 'self- employed',
            'self-employed', 'fisher woman', 'tailor', 'sex worker',
            'carpenter', 'hair dresser', 'mechanic',
        ],
        'Unemployed': [
            'unemployed', 'not working', 'house wife', 'housewife',
            'full house wife', 'married woman', 'retired', 'retiree',
            'pensioner', 'student', 'studien', 'studient', 'pupil', 'purpil',
            'schooling', 'corper', 'minor', 'monor', 'under age', 'child',
            'baby', 'dependent', 'house help', 'night worker', 'deceased',
            'not alive', 'none', 'non', 'orphan', 'no parent/guardian',
            'no parent /guardian',
        ],
        'Not reported': [
            'not reported', 'not specified', 'not stated', 'not applicable',
            'not known', 'not educated', 'she is 26 years old',
            'case of trafficking', 'how to pae', 'dt know',
        ],
        # 'Unknown' (nothing recorded at all) is kept separate from 'Not reported'.
        'Unknown': ['unknown'],
        'Other': ['other', 'child labour'],
    }
    mapping = {
        raw_value: category
        for category, raw_values in employment_groups.items()
        for raw_value in raw_values
    }

    # 6. Apply mapping
    df['employment_status_main'] = df['employment_status_main'].replace(mapping)

    # 7. Drop redundant column
    df.drop(columns=['employment_status_others'], inplace=True)

    # 8. Output results
    print("\nData cleaning and standardization complete.")
    print("----------------------------------------")
    print("\nValue counts for 'employment_status_main':")
    print(df['employment_status_main'].value_counts())


In [None]:
# Show the distinct raw values of the main column and of its free-text companion.
for employment_col in (
    'EMPLOYMENT STATUS OF SURVIVOR/VICTIM',
    'EMPLOYMENT STATUS OF SURVIVOR/VICTIM_OTHERS',
):
    print(df[employment_col].unique())

In [None]:
# Rows where 'EMPLOYMENT STATUS OF SURVIVOR/VICTIM' is NaN but the free-text
# 'EMPLOYMENT STATUS OF SURVIVOR/VICTIM_OTHERS' column holds a value.
victim_main_missing = df['EMPLOYMENT STATUS OF SURVIVOR/VICTIM'].isna()
victim_others_present = df['EMPLOYMENT STATUS OF SURVIVOR/VICTIM_OTHERS'].notna()
result = df.loc[
    victim_main_missing & victim_others_present,
    [
        'EMPLOYMENT STATUS OF SURVIVOR/VICTIM',
        'EMPLOYMENT STATUS OF SURVIVOR/VICTIM_OTHERS',
    ],
]

print(result.head(20))  # Preview at most the first 20 matching rows
# result.to_csv(os.path.join(data_folder_path, 'employment_status_victim_missing.csv'), index=False)

In [None]:
if df is not None:
    # 1. Strip stray whitespace from the headers, then give the two employment
    #    columns short working names.
    df.columns = df.columns.str.strip()
    df.rename(columns={
        'EMPLOYMENT STATUS OF SURVIVOR/VICTIM': 'employment_status_victim_main',
        'EMPLOYMENT STATUS OF SURVIVOR/VICTIM_OTHERS': 'employment_status_victim_others'
    }, inplace=True)

    # 2-3. Prefer the main answer, fall back to the free-text answer, and label
    #      rows where both are missing as 'Unknown'.
    df['employment_status_victim_main'] = (
        df['employment_status_victim_main']
        .fillna(df['employment_status_victim_others'])
        .fillna('Unknown')
    )

    # 4. Normalise the text and repair known typos before mapping.
    typo_corrections = {
        'pensionia': 'pensioner',
        'house wife': 'housewife',
        'laborer': 'labourer',
        'househelper': 'house helper',
        'sex': 'sex worker'
    }
    # Cast to str first: the .str accessor returns NaN for non-string cells
    # (the mapping below has keys such as "23" and "10000"), which would bring
    # missing values back after the 'Unknown' fill above.
    df['employment_status_victim_main'] = (
        df['employment_status_victim_main']
        .astype(str)
        .str.lower()
        .str.strip()
        .replace(typo_corrections)
    )

    # 5. Normalised raw answers grouped by the category they collapse into.
    #    Keys are lower-case and stripped because that is how the column looks
    #    at this point; values with no entry are left as they are.
    #    Series.replace applies a dict in a single pass, so every value here
    #    must already be the final label.
    employment_groups = {
        'Currently employed': [
            'private teacher', 'missionary', 'caseworker', 'clergy',
            'sales representative', 'currently employed', 'civil servant',
            'house help', 'media personal',
            'private clinic',  # previously mapped to the misspelt label "Currrently employed"
            'pastoring', 'domestic worker', 'salesgirl', 'traditional ruler',
            'domestic staff', 'working',
        ],
        'Self employed': [
            'farmer', 'farming', 'peasant farmer', 'peasant farming',
            'petty trader', 'petty trade', 'trader', 'trading', 'hawking',
            'entrepreneur', 'business', 'self employed', 'labourer',
            'house helper', 'domestic help', 'sex worker', 'panel beater',
            'hair dresser', 'hair dressing', 'food vendor', 'fruit saler',
            'footballer', 'capinter',
        ],
        'Unemployed': [
            'unemployed', 'not working', 'not working yet', 'not employed',
            'not yet employed', 'not yet  employed', 'currently unemployed',
            'currently unempolyed', 'no income', 'looking for job',
            'student', 'students', 'stdudent', 'studient', 'student-ss1',
            'ss3 student', 'student trainee', 'pupil', 'purpil', 'schooling',
            'still schooling', 'out of school', 'undergraduate',
            'under-graduate', 'minor', 'child', 'less than 18', 'dependant',
            'dependent', '23', 'housewife', 'house-wife', 'full house wife',
            'full time housewife', 'nursing mother', 'retired', 'retiree',
            'retired accountant', 'pensioner', 'corper', 'copper', 'serving',
            'currently serving', 'serving corps member',
            # 'apprentice' was listed under two categories; 'Unemployed' is the
            # one that took effect (a later duplicate key overrides an earlier one).
            'apprentice', 'apprentices', 'aprintice', 'learning trade',
            'learning how to sew', 'volunteer', 'volunteering', 'outreach',
        ],
        'Not reported': [
            'not specified', 'not applicable', 'not applcable', 'not reported',
            'not stated', 'nil', 'none', '10000',
        ],
        # 'Unknown' (nothing recorded at all) is kept separate from 'Not reported'.
        'Unknown': ['unknown'],
        'Other': ['child labour'],
    }
    mapping = {
        raw_value: category
        for category, raw_values in employment_groups.items()
        for raw_value in raw_values
    }

    # 6. Apply mapping
    df['employment_status_victim_main'] = df['employment_status_victim_main'].replace(mapping)

    # 7. Drop redundant 'others' column
    df.drop(columns=['employment_status_victim_others'], inplace=True)

    # 8. Output results
    print("\nData cleaning and standardization complete.")
    print("----------------------------------------")
    print("\nValue counts for 'employment_status_victim_main':")
    print(df['employment_status_victim_main'].value_counts())


In [None]:
# 1) Short labels for the 'VULNERABLE POPULATION_*' indicator columns
_VULN_PREFIX = 'VULNERABLE POPULATION_'
_vuln_labels = {
    'Person living with disability': 'PLWD',
    'PLHIV': 'PLHIV',
    'Female sex worker': 'female_sex_worker',
    'IDP': 'IDP',
    'DRUG USER': 'drug_user',
    'WIDOW': 'widow',
    'OUT OF SCHOOL CHILD': 'out_of_school_child',
    'MINOR': 'minor',
    'House maids/domestic staff': 'household_help',
    'CHILD APPRENTICE': 'child_apprentice',
    'ORPHANS': 'orphans',
    'NOT APPLICABLE': 'not_applicable',
    'OTHER': 'other_vulnerability',
}
rename_map = {_VULN_PREFIX + suffix: label for suffix, label in _vuln_labels.items()}
df = df.rename(columns=rename_map)

# 2) Turn only those columns into nullable booleans
vuln_cols = list(rename_map.values())

# True only where the cell is exactly 'Yes'. Every other value, including a
# missing one, becomes False: the comparison itself never yields <NA>.
# NOTE(review): if missing answers must stay <NA>, they have to be masked
# explicitly; confirm which behaviour is wanted.
is_yes = df[vuln_cols] == 'Yes'
df[vuln_cols] = is_yes.astype('boolean')

# If you prefer 1 and 0 instead
# df[vuln_cols] = df[vuln_cols].eq('Yes').astype('Int64')

#### 'Age of survivor'
* Correct Unrealistic Values: Values like 3434 were assumed to be typos, and only the first two digits were kept to fix the data while preserving the intended information.
* Handle Missing Ages: The value 0.0 was converted to NaN because it is not a valid age.
* Impute NaN Values: All missing NaN values were filled with the median age, which is a statistically sound method to handle missing data without skewing the distribution.

In [None]:
# Ages above this limit are treated as data-entry errors
upper_age_limit = 100

raw_age = df['Age of survivor']

# Parse the column as numbers; anything unparseable becomes NaN
parsed_age = pd.to_numeric(raw_age, errors='coerce')

# Typo repair: keep only the first two characters (e.g. 3434 -> 34).
# This is used only as a fallback below, so valid entries such as 100 or 0.5
# are no longer truncated.
first_two_chars = pd.to_numeric(raw_age.astype(str).str[:2], errors='coerce')

# Use the parsed value when it is a plausible age, otherwise the repaired one
plausible = parsed_age.gt(0) & parsed_age.le(upper_age_limit)
age = parsed_age.where(plausible, first_two_chars)

# Zero, negative or still out-of-range values are not valid ages -> NaN
age = age.where(age.gt(0) & age.le(upper_age_limit))

# Fill the remaining NaN values with the median age
median_age = age.median()
df['age_of_survivor'] = age.fillna(median_age)

#### 'MARITAL STATUS'
1. Consolidated 'Never married' and 'Single': The categories were merged into a single 'Single' label for simplicity and to combine similar statuses, which is good for analysis.

2. Consolidated 'Other' and NaN: Both were merged into a single 'Unknown' category to handle unclassified data consistently without losing any information.

In [None]:
# Merge 'Never married' into 'Single', and 'Other' plus missing values into 'Unknown'
marital_relabel = {'Never married': 'Single', 'Other': 'Unknown'}
marital_raw = df['MARITAL STATUS']
df['marital_status'] = marital_raw.replace(marital_relabel).fillna('Unknown')

# Distribution after relabelling
df['marital_status'].value_counts(dropna=False)

#### 'WHO SURVIVOR/VICTIM LIVE WITH' and 'DOES THE SURVIVOR/VICTIM LIVE ALONE'
1. Conditional Imputation: We filled missing 'WHO SURVIVOR/VICTIM LIVE WITH' values based on 'DOES THE SURVIVOR/VICTIM LIVE ALONE' to infer if the survivor lived 'Alone' or with 'Parent/guardian', leveraging existing information.
2. Fill Remaining Missing: Any remaining NaN values in 'WHO SURVIVOR/VICTIM LIVE WITH' were labeled 'Unknown' to ensure no missing data and provide a clear category for unstated living situations.
3. Standardize and Consolidate Categories: Various spellings and similar categories (e.g., 'Mother', 'MOTHER', 'Grandmother', 'Aunt') were mapped to a unified set of broader categories (e.g., 'PARENT/GUARDIAN', 'SPOUSE/PARTNER', 'CHILDREN') to reduce noise and improve consistency for analysis.
4. Drop Redundant Column: 'DOES THE SURVIVOR/VICTIM LIVE ALONE' was dropped as its information was successfully transferred and consolidated into 'WHO SURVIVOR/VICTIM LIVE WITH', simplifying the dataset.

In [None]:
# Raw distributions (missing values included) of the two living-arrangement columns
for living_col in (
    'WHO DOES THE SURVIVOR/VICTIM LIVE WITH',
    'DOES THE SURVIVOR/VICTIM LIVE ALONE',
):
    print(df[living_col].value_counts(dropna=False))

In [None]:
col = 'WHO DOES THE SURVIVOR/VICTIM LIVE WITH'
alone_col = 'DOES THE SURVIVOR/VICTIM LIVE ALONE'

# a) where the living arrangement is missing, infer it from the live-alone answer
lives_alone_answer = df[alone_col].astype(str).str.upper()
living_missing = df[col].isna()

df.loc[living_missing & lives_alone_answer.eq('YES'), col] = 'Alone'
# Not living alone and no text: default to Parent rather than Guardian.
# This is a pragmatic choice; switch to 'Guardian' if you prefer the opposite.
df.loc[living_missing & lives_alone_answer.eq('NO'), col] = 'Parent'

# b) + c) label what is still missing as Unknown, then upper-case and trim the
# text so it can be matched against the lookup keys
df[col] = df[col].fillna('Unknown').astype(str).str.upper().str.strip()

# d) explicit mapping for frequent variants, mapped to final categories
# final categories used: Parent, Guardian, Relative, Partner, Friend(s), Caregiver,
# Employer, Perpetrator(s), IDP Camp, Shelter, Community, Neighbor, Unknown, Other, Alone, Children

# Upper-cased raw answers grouped under the final category they collapse into.
# `mapping` (raw answer -> category) is derived from these groups below.
_LIVING_VARIANTS = {
    'Parent': [
        'PARENTS', 'MOTHER', 'M0THER', 'FATHER', 'PARENT',
        'BIOLOGICAL/STEP MOTHER', 'STEP MOTHER', 'FOSTER MOTHER',
        'FATHER(MOTHER IS LATE)', 'MOTHER AND GRAND MOTHER',
        'MOTHER AND STEP FATHER', 'PARENT AND SIBLINGS',
    ],
    # Explicit guardians plus grandparents, aunts, uncles and elder sisters
    'Guardian': [
        'GUARDIAN', 'BOSS ACTING AS GUARDIAN', 'BOSS ACTING AS A GUARDIAN',
        'GUARDIAN/BOSS', 'BOSS/ GUARDIAN', 'GRANDMOTHER', 'GRAND MOTHER',
        'GRANDMA', 'GRANDPARENTS', 'GREAT GRANDMOTHER', 'GRANDPA', 'AUNT',
        'AUNTY', "AUNTY LANDA'S HOME", 'AUNT AND UNCLE', 'UNCLE',
        'ELDER SISTER', 'PATERNAL STEP GRANDMA', 'MATERNAL GRANDPARENT',
        'GRANDPARENT', 'GREAT AUNT', 'HER AUNT',
    ],
    'Relative': [
        'STEP SISTER', 'SISTER', 'SIBLING', 'BROTHER', 'HALF BROTHER',
        'DAUGHTER IN-LAW', 'BROTHER IN-LAW', 'IN-LAWS', 'INLAW', 'SIBLINGS',
        'COUSIN', 'FAMILY MEMBERS', 'SISTER AND HER HUSBAND',
        'SISTER AND  HER HUSBAND',
    ],
    # Partner and spouse
    'Partner': [
        'SPOUSE/COHABITING', 'SPOUSE', 'HUSBAND', 'HUSBAND AND CHILDREN',
        'OTHER SPOUSE', 'PARTNER', 'SEXUAL PARTNER', 'COHABITING',
        'FORCED MARRIAGE', 'CO-HABITING', 'HUSBAND HOUSE',
        'OLD HUSBAND HOUSE', 'INTIMATE PARTNER RELATIVE', 'INTIMATE PARTNER',
        'FORMER INTIMATE PARTNER', 'FORMER INTIMATE PARTNER ',
        'CURRENT INTIMATE PARTNERS ', 'CURRENT INTIMATE PARTNERS',
        'EX-HUSBAND', 'NEW PARTNER', 'WIFE & CHILDREN (FAMILY)',
        'SPOUSE AND CHILD', 'SPOUSE/CHILDREN', "SPOUSE /HIS FAMILY",
        'GOT MARRIED TO ANOTHER HUSBAND', 'BABY DADDY', 'SPOUS',
        'HUSBAND  HOUSE', 'COHIBITING',
    ],
    'Children': [
        'CHILDREN', 'CHILD', 'SON', "CHILD'S MOTHER", 'DAUGHTER',
        'HER CHILDREN', 'WITH HER CHILDREN', 'WITH CHILDREN',
        'WITH HER CHILD', 'ALONE WITH CHILD', 'GRAND CHILDREN',
    ],
    # Friend types; boyfriend and girlfriend are counted here by rule
    'Friend(s)': [
        'FRIENDS', 'A FRIEND', 'FRIEND', 'FAMILY FRIEND', 'OTHER FRIEND',
        'FRIENDS ', 'FREIND', 'FREINDS', 'ROOMMATES', 'ROOM MATE',
        'ROOMMATE', 'STUDENTS AT YABATECH', 'SCHOOL MATE', 'WITH A FRIEND',
        'BOYFRIEND', 'GIRLFRIEND', 'A GIRL FRIEND', 'FRIENDS/FAMILY',
    ],
    # Caregiver vs Employer split
    'Caregiver': [
        'CAREGIVER', 'CARE GIVER', 'CARE  GIVER', 'CARE GIVER ',
        'HOUSE GIRL',
    ],
    'Employer': [
        'BOSS', 'EMPLOYER', 'HER MADAM', 'HER EMPLOYER', 'BOSS AND MADAM',
        'MADAM', 'EMPLOYEE', 'BOSS HOUSE',
        'LIVES WHERE SHE WORKS AS A MAID', 'WITH HIS BOSS',
        "THE FAMILY OF THE BOSS", 'MADAM FAMILY', "BOSS'S HOUSE",
        'A LADY TO WHOM SHE WAS HOUSE MAID', 'LIVES WITH HER EMPOLYER',
        'WITH HER EMPLOYER',
    ],
    'Alone': [
        'ALONE', 'SELF', 'LIVE ALONE', 'STAY ALONE', 'LEAVE ALONE',
        'LIVING IN A SEPARATE APARTMENT',
    ],
    # Perpetrators, traffickers and abductors
    'Perpetrator(s)': [
        'PERPETRATOR', 'THE PERPETRATOR', 'PERPETRATOR (PARTERNAL UNCLE)',
        'TRAFFICKER', 'TRAFFICKERS', 'CHILD TRAFFICKER ', 'TRAFFICKED',
        'ABDUCTORS ', 'ABDUCTORS', 'CHILD TRAFFICKER', 'ABDUCTOR',
        'ABDUCTED',
    ],
    # IDP Camp versus Shelter split
    'IDP Camp': [
        'IDP CAMPS', 'IDP CAMP', 'IDPS', "CAMP WITH OTHER IDP'S", 'IDP',
    ],
    'Shelter': [
        'POLICE STATION', 'POLICE SHELTER', 'NAPTIP SHELTER',
        'JUVENILE HOME', 'REHABILITATION CENTER', 'BOARDING HOUSE',
        'SCHOOL HOSTEL', 'SHELTER', 'EMERGENCY SHELTER',
        'EMERGENCY ACCOMMODATION', 'HOSPITAL WARD', 'ORPHANAGE HOME',
        'ORPHANAGE', 'FOSTER HOME', 'HOSTEL', 'BROTHEL',
        'BABY FACTORY HOME', 'BABY FACTORY APARTMENT',
        'STUDENT LIVING IN HOSTEL', 'STUDENT LIVING IN THE HOSTEL',
    ],
    # Community versus neighbor
    'Community': [
        'COMMUNITY MEMBER', 'COMMNUITY MEMBER', 'COMMUNITY LEADER (HAKIMI)',
        'CHURCH MEMBER', 'CHURCH MEMBER/GUARDIAN', 'RELIGIOUS LEADER',
        'BISHOP', 'PASTOR OF HER CHURCH', 'PASTOR', 'A GOOD SAMARITAN',
        'SOMEONE FROM HER VILLAGE', 'OTHER RETURNEES FROM CAPTIVITY',
        'COLLEAGUE\'S FAMILY', 'HER FRIEND\'S KINSMAN', 'STREET BROTHERS',
    ],
    'Neighbor': [
        'NEIGHBOR', 'NEIGHBOUR', 'NEIGHBOR ', 'NEIGBOUR',
    ],
    'Unknown': [
        'UNKNOWN', 'NOT REPORTED', 'NOT SPECIFIED', 'NONE',
    ],
    # Answers that fit none of the categories above
    'Other': [
        'HOUSE', 'HOME', 'DOES NOT HAVE A HOME', 'FOUND ON THE ROAD',
        'WITNESS', 'STRANGER', 'STRENGER', 'ABANDONED', 'ABANDONED CHILD',
        'FORCED COHABITATION', 'BOREHOLE', "MOTHER AND MOTHER'S BOYFRIEND",
        'FATHER GIRLFRIEND', 'CHASED OUT BY HER SPOUSE AND FAMILY', 'HELPER',
        'SEPARATED', 'ON THE STREET',
    ],
}

mapping = {
    variant: category
    for category, variants in _LIVING_VARIANTS.items()
    for variant in variants
}

# e) fallback function for values not explicitly listed in mapping
def _fallback_category(s):
    s = (s or '').upper().strip()
    if s in ('UNKNOWN', 'NOT REPORTED', 'NOT SPECIFIED', 'NONE'):
        return 'Unknown'
    if re.search(r'\bALONE\b', s):
        return 'Alone'
    if re.search(r'\bCHILDREN?\b|\bDAUGHTER\b|\bSON\b', s):
        return 'Children'
    # boyfriend girlfriend mapped to Friend(s) per your instruction
    if re.search(r'\bBOYFRIEND\b|\bGIRLFRIEND\b|\bA GIRL FRIEND\b', s):
        return 'Friend(s)'
    if 'FRIEND' in s or 'ROOMMATE' in s or 'ROOM MATE' in s or 'ROOMMATES' in s:
        return 'Friend(s)'
    if re.search(r'\bGUARDIAN\b|\bAUNT\b|\bUNCLE\b|\bCOUSIN\b|\bGRAND\b', s):
        return 'Guardian'
    if re.search(r'\bMOTHER\b|\bFATHER\b|\bPARENT\b|\bPARENTS\b', s):
        return 'Parent'
    if re.search(r'\bSISTER\b|\bBROTHER\b|\bSIBLING\b|\bHALF BROTHER\b|\bINLAW\b|\bIN LAW\b', s):
        return 'Relative'
    # check partner related terms, but keep boyfriend girlfriend out because they are friends
    if re.search(r'\bHUSBAND\b|\bWIFE\b|\bSPOUSE\b|\bPARTNER\b|\bINTIMATE\b|\bEX HUSBAND\b|\bBABY DADDY\b', s):
        return 'Partner'
    # employer related
    if 'BOSS' in s or 'EMPLOYER' in s or 'MADAM' in s or 'HER MADAM' in s:
        return 'Employer'
    if 'CARE' in s or 'CARER' in s or 'HOUSE GIRL' in s:
        return 'Caregiver'
    if re.search(r'TRAFFICK|PERPETRAT|ABDUCT|ABDUCTOR|ABDUCTORS|TRAFFICKER|TRAFFICKED', s):
        return 'Perpetrator(s)'
    if 'IDP' in s:
        return 'IDP Camp'
    if 'SHELTER' in s or 'HOSTEL' in s or 'ORPHAN' in s or 'JUVENILE' in s or 'REHABILITATION' in s:
        return 'Shelter'
    if 'NEIGH' in s:
        return 'Neighbor'
    if 'COMMUN' in s or 'PASTOR' in s or 'CHURCH' in s or 'BISHOP' in s or 'RELIGIOUS' in s or 'COLLEAGUE' in s:
        return 'Community'
    # conservative default
    return 'Other'

# f) exact lookup first; answers it misses go through the keyword fallback
stay_with = df[col].map(mapping)
mask_unmapped = stay_with.isna()
stay_with[mask_unmapped] = df.loc[mask_unmapped, col].apply(_fallback_category)
df['who_survivor/victim_stay_with'] = stay_with

# g) the live-alone column is no longer needed once its information is merged
df = df.drop(columns=[alone_col])

# h) final check
print("\nValue counts for 'who_survivor/victim_stay_with' after cleaning:")
stay_with_counts = df['who_survivor/victim_stay_with'].value_counts(dropna=False)
print(stay_with_counts)

#### 'TIME OF THE DAY THAT INCIDENT TOOK PLACE'
* Replace 'CannotRemember' with 'Unknown'

In [None]:
# Fold 'CannotRemember' into 'Unknown'; every other value is kept as is
time_relabel = {'CannotRemember': 'Unknown'}
time_raw = df['TIME OF THE DAY THAT INCIDENT TOOK PLACE']
df['incident_time_of_day'] = time_raw.replace(time_relabel)

# Distribution (missing values included) to verify the change
time_counts = df['incident_time_of_day'].value_counts(dropna=False)
print(time_counts)

#### 'DOES THE SURVIVOR WANT ACCESS TO JUSTICE'
1. Changing "Yes" and "No" to more descriptive terms makes the data easier to understand at a glance.
2. Replace 'NotApplicable' with 'Unknown'

In [None]:
# Descriptive labels for the raw answers; values without an entry
# (missing ones included) are left unchanged
mapping = dict([
    ('Yes', 'Wants Access to Justice'),
    ('No', 'Does Not Want Access'),
    ('NotApplicable', 'Unknown'),
])

justice_raw = df['DOES THE SURVIVOR WANT ACCESS TO JUSTICE']
df['seeks_justice'] = justice_raw.replace(mapping)

print("\nNew value counts:")
justice_counts = df['seeks_justice'].value_counts(dropna=False)
print(justice_counts)

#### 'HAS THE CASE BEEN CLOSED'
1. Change "Yes" and "No" to more descriptive words to improve the readability and understanding of the data
 * 'Yes': 'Case Closed'
 * 'No': 'Case Open'
2. Replace 'NotApplicable' with 'Unknown'
3. Create and apply the mapping to replace 'Yes' to 'Case Closed', 'No' to 'Case Open', and 'NotApplicable' to 'Unknown'

In [None]:
# Descriptive labels for the raw answers; values without an entry
# (missing ones included) are left unchanged
mapping = dict([
    ('Yes', 'Case Closed'),
    ('No', 'Case Open'),
    ('NotApplicable', 'Unknown'),
])

case_closed_raw = df['HAS THE CASE BEEN CLOSED']
df['case_closed_status'] = case_closed_raw.replace(mapping)


print("\nNew value counts:")
case_closed_counts = df['case_closed_status'].value_counts(dropna=False)
print(case_closed_counts)

#### 'OUTCOME OF PROSECUTION'
* Replace all NaN values with 'Unknown'

In [None]:
# Missing prosecution outcomes are labelled 'Unknown'
outcome_raw = df['OUTCOME OF PROSECUTION']
df['prosecution_outcome'] = outcome_raw.fillna('Unknown')

print("\nNew value counts:")
outcome_counts = df['prosecution_outcome'].value_counts(dropna=False)
print(outcome_counts)

#### 'WHO CLOSED THE CASE?'
* Replace any NaN values with 'Unknown'

In [None]:
# Missing values for who closed the case are labelled 'Unknown'
closed_by_raw = df['WHO CLOSED THE CASE?']
df['case_closed_by'] = closed_by_raw.fillna('Unknown')

print("\nNew value counts:")
closed_by_counts = df['case_closed_by'].value_counts(dropna=False)
print(closed_by_counts)

In [None]:
# Show the distinct raw values of the main column and of its free-text companion.
for education_col in ('EDUCATIONAL STATUS', 'EDUCATIONAL STATUS_OTHER'):
    print(df[education_col].unique())

#### **EDUCATIONAL STATUS**
* There are no rows where 'EDUCATIONAL STATUS' is NaN while 'EDUCATIONAL STATUS_OTHER' has a value.
* This means 'EDUCATIONAL STATUS_OTHER' only duplicates values already present in 'EDUCATIONAL STATUS'.
* Drop EDUCATIONAL STATUS_OTHER from df.

In [None]:
# Rows where 'EDUCATIONAL STATUS' is NaN but 'EDUCATIONAL STATUS_OTHER' holds a value.
education_main_missing = df['EDUCATIONAL STATUS'].isna()
education_other_present = df['EDUCATIONAL STATUS_OTHER'].notna()
result = df.loc[
    education_main_missing & education_other_present,
    ['EDUCATIONAL STATUS', 'EDUCATIONAL STATUS_OTHER'],
]

print(result.head(20))  # Preview at most the first 20 matching rows
#result.to_csv(os.path.join(data_folder_path, 'educational_missing.csv'), index=False)

In [None]:
# Clean up whitespace and convert to lowercase for consistent mapping
# (the source column itself is normalised in place)
df['EDUCATIONAL STATUS'] = df['EDUCATIONAL STATUS'].str.strip().str.lower()

# Normalised raw value -> final education level
mapping_dict = {
    'no education': 'No formal',
    'some primary': 'Some primary',
    'some primary school': 'Some primary',
    'completed primary': 'Completed primary',
    'some secondary': 'Some secondary',
    'completed secondary': 'Completed secondary',
    'diploma': 'Diploma',
    'undergraduate': 'Undergraduate',
    'graduate': 'Graduate',
    'postgraduate': 'Postgraduate',
}

# .map() yields NaN for anything missing from the dictionary (original NaNs,
# empty strings, text such as 'not captured' or 'not reported'); those become
# 'Unknown'. The result is assigned back explicitly: calling
# fillna(inplace=True) on df['educational_status'] is chained assignment and
# does not update df under pandas Copy-on-Write.
df['educational_status'] = df['EDUCATIONAL STATUS'].map(mapping_dict).fillna('Unknown')

print("Value counts for the new 'educational_status' column:")
print(df['educational_status'].value_counts())

#### 'WHO DOES THE SURVIVOR/VICTIM LIVE WITH' and 'WHO DOES THE SURVIVOR/VICTIM LIVE WITH_OTHER'

In [None]:
# Inspect the distinct raw values of the main column and of its free-text companion
for _lives_with_col in (
    'WHO DOES THE SURVIVOR/VICTIM LIVE WITH',
    'WHO DOES THE SURVIVOR/VICTIM LIVE WITH_OTHER',
):
    print(df[_lives_with_col].unique())

In [None]:
# Take the main answer where present; fall back to the free-text "_OTHER" answer
# only for rows whose main answer is missing.
_lives_with_main = df['WHO DOES THE SURVIVOR/VICTIM LIVE WITH']
_lives_with_other = df['WHO DOES THE SURVIVOR/VICTIM LIVE WITH_OTHER']
df['victim_lives_with'] = _lives_with_main.fillna(_lives_with_other)

# Distribution of the combined column, NaN included
df['victim_lives_with'].value_counts(dropna=False)

In [None]:
# Raw date column -> snake_case name of the parsed datetime column
date_columns_map = {
    'DATE OF INCIDENT': 'incident_date',
    'DATE REPORTED': 'reported_date'
}

# Parse every raw column into its new datetime column.
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
for old_col, new_col in date_columns_map.items():
    _parsed_dates = pd.to_datetime(df[old_col], errors='coerce')
    df[new_col] = _parsed_dates

# The raw columns are redundant once the parsed versions exist
df.drop(list(date_columns_map), axis=1, inplace=True)

In [None]:
# Raw location column -> concise snake_case name
location_map = {
    'LOCATION OF VIOLENCE (STATE)': 'location_state',
    'LOCATION OF VIOLENCE (L.G.A)': 'location_lga',
    'LOCATION OF VIOLENCE (WARD)': 'location_ward'
}

# Apply the new names in place
df.rename(columns=location_map, inplace=True)

# Missing locations are labelled 'Unknown' in all three renamed columns
_location_cols = list(location_map.values())
df[_location_cols] = df[_location_cols].fillna('Unknown')

In [None]:
# Distribution of wards (NaN would be listed too, although it was replaced by 'Unknown').
# Swap in 'location_state' or 'location_lga' to inspect the other location levels.
df.loc[:, 'location_ward'].value_counts(dropna=False)

In [None]:
# Inspect the distinct raw values of the main column and of its free-text companion
for _place_col in ('PLACE OF INCIDENT', 'PLACE OF INCIDENT_OTHER'):
    print(df[_place_col].unique())

In [None]:
# Main answer where available, otherwise the free-text "_OTHER" answer
_place_main = df['PLACE OF INCIDENT']
_place_other = df['PLACE OF INCIDENT_OTHER']
df['incident_location'] = _place_main.fillna(_place_other)

# Distribution of the combined column, NaN included
print(df['incident_location'].value_counts(dropna=False))

In [None]:
# Remember which rows have no location at all BEFORE any mapping happens.
# Previously the 'Unknown' step ran after fillna('Other'), when no NaN was left,
# so it never fired and missing locations were silently reported as 'Other'.
_raw_location = df['incident_location']
_location_missing = _raw_location.isna()

# Normalise: trim, lower-case and turn the typographic apostrophe (’) into a plain
# one so that spelling variants of the same label share one mapping key.
_normalised_location = (
    _raw_location
    .str.strip()
    .str.lower()
    .str.replace('’', "'", regex=False)  # literal replacement, no regex involved
)
# Entries that are only whitespace carry no information either
_location_blank = _normalised_location.eq('')

# Normalised label -> reporting category.
# 'bush/forest' is listed in case those values exist in the data.
# Both the "survivor/victim's home" and the "survivor's/victim's home" spellings are
# covered: the apostrophe normalisation alone does not make them identical.
mapping_dict = {
    "survivor/victim's home": "Survivor/victim's home",
    "survivor's/victim's home": "Survivor/victim's home",
    "perpetrator's house": "Perpetrator's house",
    "road": "Road",
    "bush/forest": "Bush/forest",
}

# Final categories:
#   * label found in the mapping      -> its category
#   * label present but not mapped    -> 'Other'
#   * originally missing / blank      -> 'Unknown'
df['incident_location'] = (
    _normalised_location
    .map(mapping_dict)
    .fillna('Other')
    .mask(_location_missing | _location_blank, 'Unknown')
)

# Verify the resulting distribution
print("Value counts for the cleaned 'incident_location' column:")
print(df['incident_location'].value_counts())

In [None]:
# Raw "TYPE OF VIOLENCE_*" column -> concise snake_case name
violence_cols_map = {
    'TYPE OF VIOLENCE_SEXUAL ASSAULT': 'sexual_assault',
    'TYPE OF VIOLENCE_PHYSICAL ASSAULT': 'physical_assault',
    'TYPE OF VIOLENCE_FINANCIAL/ECONOMIC': 'financial_economic',
    'TYPE OF VIOLENCE_ONLINE/CYBER': 'online_cyber',
    'TYPE OF VIOLENCE_RAPE': 'rape',
    'TYPE OF VIOLENCE_DEFILEMENT': 'defilement',
    'TYPE OF VIOLENCE_FORCED MARRIAGE': 'forced_marriage',
    'TYPE OF VIOLENCE_DENIAL OF RESOURCES': 'denial_of_resources',
    'TYPE OF VIOLENCE_PSYCHOLOGICAL/EMOTIONAL ABUSE': 'psychological_emotional_abuse',
    'TYPE OF VIOLENCE_FEMALE GENITAL MUTILATION': 'female_genital_mutilation',
    'TYPE OF VIOLENCE_VIOLATION OF PROPERTY & INHERITANCE RIGHTS': 'violation_property_inheritance_rights',
    'TYPE OF VIOLENCE_CHILD ABUSE AND NEGLECT': 'child_abuse_neglect',
    'TYPE OF VIOLENCE_OTHER': 'other_violence'
}

# Rename the columns
df.rename(columns=violence_cols_map, inplace=True)

# Every renamed column except the free-text 'other_violence' is a Yes/No flag
bool_cols = [col for col in violence_cols_map.values() if col != 'other_violence']


def _is_yes(cell):
    """Return True when a cell means "yes" (case/whitespace-insensitive), else False.

    Missing values stringify to 'nan' and therefore count as False.
    'true' is accepted as well so that re-running this cell on already converted
    boolean columns keeps their values instead of resetting every flag to False.
    """
    return str(cell).strip().lower() in ('yes', 'true')


# Element-wise conversion via Series.map, column by column.
# DataFrame.applymap is deprecated in recent pandas; Series.map is available in
# every version and applies the function to missing values as well.
df[bool_cols] = df[bool_cols].apply(lambda column: column.map(_is_yes))

In [None]:
# Normalise the free-text column: missing -> empty string, trimmed, lower-case.
# The result is kept in a local Series instead of `df[col].fillna('', inplace=True)`,
# which is a chained in-place update that recent pandas warns about and that has
# no effect on `df` under Copy-on-Write.
_other_violence_text = df['other_violence'].fillna('').str.strip().str.lower()

# Remember the blank rows BEFORE mapping. Previously the `== ''` test ran after
# .map(), when every blank had already become NaN, so blanks ended up as 'Other'
# and the 'Unknown' label was never assigned.
_other_violence_blank = _other_violence_text.eq('')

# Normalised raw description -> category.
mapping_dict = {
    'threat to life': 'Threats',
    'threat to kill with a cutlass': 'Threats',
    'neglect': 'Neglect',
    'child neglect': 'Neglect',
    'neglect of parental responsibility': 'Neglect',
    'wife battery': 'Physical/Sexual Assault',
    'physical assault': 'Physical/Sexual Assault',
    'defilement and sexual assault': 'Physical/Sexual Assault',
    'emotional/psychosocial assault': 'Emotional/Verbal Abuse',
    'verbal abuse': 'Emotional/Verbal Abuse',
    'access to children denial': 'Custody/Access Issues',
    'denied access to children': 'Custody/Access Issues',
    'custody of children': 'Custody/Access Issues',
    'abduction': 'Custody/Access Issues',
    'financial/economic': 'Financial/Economic Abuse',
    'denial of resources': 'Financial/Economic Abuse',
    'human trafficking': 'Other',
    'defilement': 'Physical/Sexual Assault',
}

# Final categories:
#   * description found in the mapping -> its category
#   * description present, not mapped  -> 'Other'
#   * empty / missing description      -> 'Unknown'
df['other_violence'] = (
    _other_violence_text
    .map(mapping_dict)
    .fillna('Other')
    .mask(_other_violence_blank, 'Unknown')
)

# Verify the resulting distribution
print("Value counts for the cleaned 'other_violence' column:")
print(df['other_violence'].value_counts())

In [None]:
# Source columns: three numbered perpetrator slots plus an "OTHER" slot, for sex and age
sex_cols = ['SEX OF PERPETRATOR_1', 'SEX OF PERPETRATOR_2', 'SEX OF PERPETRATOR_3', 'SEX OF PERPETRATOR_OTHER']
age_cols = ['AGE OF PERPETRATOR_1', 'AGE OF PERPETRATOR_2', 'AGE OF PERPETRATOR_3', 'AGE OF PERPETRATOR_OTHER']

# 1. Per row, join the sex entries into one comma-separated string.
#    After the cast to str, missing cells read 'nan' / 'NaN' / '<NA>' and are left out.
_sex_as_text = df[sex_cols].astype(str)
df['perpetrator_sex'] = _sex_as_text.apply(
    lambda row: ','.join(cell for cell in row if cell not in ('nan', 'NaN', '<NA>')),
    axis=1,
)

# Cleaning function for sex
def clean_sex_of_perpetrator(value):
    """Collapse a combined perpetrator-sex entry into a single category.

    Parameters
    ----------
    value : str
        Comma-separated sex entries for one record, e.g. 'Male,Male' or
        'Male,Female'. Matching ignores case and surrounding whitespace.

    Returns
    -------
    str
        'Male'          - every entry is male (one or several perpetrators)
        'Female'        - every entry is female
        'Male & Female' - both sexes appear, or the text names a mixed group
        'Unknown'       - anything else, including empty / placeholder entries
    """
    text = str(value).upper().strip()

    # Compare whole comma-separated entries rather than substrings: 'FEMALE'
    # contains 'MALE', so substring tests classify all-female records as mixed,
    # and repeated 'MALE' entries must stay 'Male' rather than become mixed.
    entries = {entry.strip() for entry in text.split(',')} - {''}
    if entries == {'MALE'}:
        return 'Male'
    if entries == {'FEMALE'}:
        return 'Female'

    # Both sexes present as whole words (covers 'MALE,FEMALE', 'MALE & FEMALE',
    # 'MALE AND FEMALE', ...). \b stops 'MALE' from matching inside 'FEMALE'.
    has_male = re.search(r'\bMALE\b', text) is not None
    has_female = re.search(r'\bFEMALE\b', text) is not None
    if has_male and has_female:
        return 'Male & Female'

    # Descriptive free-text values and placeholders, keyed by the text with commas removed
    mapping = {
        'PARENTS': 'Male & Female',
        'HUSBAND AND WIFE': 'Male & Female',
        'BOTH PARENTS FATHER-50YEARS. MOTHER-47YEARS': 'Male & Female',
        'FAMILY/HUSBAND TO BE': 'Male & Female',
        'INLAW': 'Male & Female',
        'OTHER': 'Unknown',
        'NAN': 'Unknown',
        '<NA>': 'Unknown',
    }
    return mapping.get(text.replace(',', '').strip(), 'Unknown')

# Reduce every combined sex string to a single category
_sex_category = df['perpetrator_sex'].apply(clean_sex_of_perpetrator)
df['perpetrator_sex'] = _sex_category

# 2. Per row, join the age entries into one comma-separated string.
#    After the cast to str, missing cells read 'nan' / 'NaN' / '<NA>' and are left out.
_age_as_text = df[age_cols].astype(str)
df['perpetrator_age_raw'] = _age_as_text.apply(
    lambda row: ','.join(cell for cell in row if cell not in ('nan', 'NaN', '<NA>')),
    axis=1,
)

# Cleaning function for age
def get_correct_age(age_entry):
    """Pick one plausible age from a comma-separated string of ages.

    Rules:
    - a non-string input yields NaN;
    - empty tokens are skipped; if any remaining token is not a number the
      whole entry yields NaN;
    - the first age is used unless it is 0, in which case the largest age in
      the entry is used instead;
    - the chosen age must satisfy 0 < age <= 120, otherwise NaN is returned.

    Returns a float age or np.nan.
    """
    if isinstance(age_entry, str):
        try:
            parsed = [float(token) for token in age_entry.split(',') if token]
        except (ValueError, TypeError):
            return np.nan
        if parsed:
            leading = parsed[0]
            # A leading 0 is treated as a placeholder: fall back to the largest age
            candidate = max(parsed) if leading == 0 else leading
            if 0 < candidate <= 120:
                return candidate
    return np.nan

# Extract one plausible age per row (NaN where none can be determined)
df['perpetrator_age'] = df['perpetrator_age_raw'].apply(get_correct_age)

# Rows without a usable age receive the mean of the valid ages;
# the cast to int then truncates any fractional part.
_known_ages = df['perpetrator_age'].dropna()
mean_age = _known_ages.mean()
df['perpetrator_age'] = df['perpetrator_age'].fillna(mean_age).astype(int)

# 3. The source columns and the intermediate raw-age string are no longer needed
_perpetrator_helper_cols = sex_cols + age_cols + ['perpetrator_age_raw']
df.drop(columns=_perpetrator_helper_cols, inplace=True)

# Quick look at the two resulting columns
print(df[['perpetrator_sex', 'perpetrator_age']].head())


In [45]:
# Show the column labels currently present in df
df.columns

Index(['Time Stamp Date', 'Time Stamp Time', 'Type_of_Organisation',
       'Location of Organisation State', 'Organisation LGA', 'Contact Channel',
       'Was the Violence Fatal', 'Who Reported the Incident',
       'Sex of survivor', 'Age of survivor', 'MARITAL STATUS',
       'employment_status_main', 'employment_status_victim_main', 'PLWD',
       'PLHIV', 'female_sex_worker', 'IDP', 'drug_user', 'widow',
       'out_of_school_child', 'minor', 'household_help', 'child_apprentice',
       'orphans', 'not_applicable', 'other_vulnerability',
       'EDUCATIONAL STATUS', 'EDUCATIONAL STATUS_OTHER',
       'WHO DOES THE SURVIVOR/VICTIM LIVE WITH',
       'WHO DOES THE SURVIVOR/VICTIM LIVE WITH_OTHER',
       'ESTIMATED AVERAGE MONTHLY INCOME', 'location_state', 'location_lga',
       'location_ward', 'PLACE OF INCIDENT', 'PLACE OF INCIDENT_OTHER',
       'TIME OF THE DAY THAT INCIDENT TOOK PLACE', 'sexual_assault',
       'physical_assault', 'financial_economic', 'online_cyber', 'rape'

#### **Drop redundant dates**
* 'Time Stamp Date'
* 'Time Stamp Time' 
* 'DATE JUSTICE WAS RECEIVED',
* 'DATE CASE WAS CLOSED',
* 'APPROVED BY ORG. SUPERVISOR_DATE',
* 'APPROVED BY LGA SUPERVISOR_DATE',
* 'APPROVED BY STATE SUPERVISOR_DATE'

These date and time columns are redundant; they would be irrelevant noise for the model.

In [None]:
#@title Drop 'Time Stamp Date' and 'Time Stamp Time', and other redundant dates
# Timestamp, case-closure and supervisor-approval dates that are removed from df
_redundant_date_cols = [
    'Time Stamp Date',
    'Time Stamp Time',
    'DATE JUSTICE WAS RECEIVED',
    'DATE CASE WAS CLOSED',
    'APPROVED BY ORG. SUPERVISOR_DATE',
    'APPROVED BY LGA SUPERVISOR_DATE',
    'APPROVED BY STATE SUPERVISOR_DATE',
]
df = df.drop(columns=_redundant_date_cols)

#### 'SEX OF PERPETRATOR'
1. Standardized Values: All entries with only "Male" or "Female" (e.g., Male,Male,Male) were consolidated into a single "Male" or "Female" category.
2. Identified Mixed Genders: Any entry containing both "Male" and "Female" was grouped into a new category, "Male & Female." This includes descriptive phrases like Parents, husband and wife, and INLAW.
3. Handled Missing and Unspecified Data: All missing values (`<NA>`, `NaN`) and "Other" entries were consolidated into a single "Unknown" category.

In [None]:
# Work on a temporary column: cast to string, then upper-case and trim so that
# all spelling variants compare equal.
# NOTE(review): the perpetrator cell earlier in this notebook works on the split
# 'SEX OF PERPETRATOR_1..3/_OTHER' columns - confirm that a single
# 'SEX OF PERPETRATOR' column exists in the loaded data before running this cell.
_sex_text = df['SEX OF PERPETRATOR'].astype(str)
df['SEX OF PERPETRATOR_cleaned'] = _sex_text.str.upper().str.strip()

# Define the replacement logic
def clean_sex_of_perpetrator(value):
    """Collapse a perpetrator-sex entry into a single category.

    Parameters
    ----------
    value : str
        One entry of the sex column, possibly a comma-separated list such as
        'MALE,MALE'. Matching ignores case and surrounding whitespace.

    Returns
    -------
    str
        'Male'          - every listed entry is male (one or several perpetrators)
        'Female'        - every listed entry is female
        'Male & Female' - both sexes appear, or the text names a mixed group
        'Unknown'       - placeholder values ('OTHER', 'NAN', '<NA>')
        otherwise the input value is returned unchanged.
    """
    text = str(value).upper().strip()

    # Compare whole comma-separated entries rather than substrings: 'FEMALE'
    # contains 'MALE', so substring tests classify all-female records as mixed,
    # and repeated 'MALE' entries must stay 'Male' rather than become mixed.
    entries = {entry.strip() for entry in text.split(',')} - {''}
    if entries == {'MALE'}:
        return 'Male'
    if entries == {'FEMALE'}:
        return 'Female'

    # Both sexes present as whole words (covers 'MALE,FEMALE', 'MALE & FEMALE',
    # 'MALE AND FEMALE', ...). \b stops 'MALE' from matching inside 'FEMALE'.
    has_male = re.search(r'\bMALE\b', text) is not None
    has_female = re.search(r'\bFEMALE\b', text) is not None
    if has_male and has_female:
        return 'Male & Female'

    # Descriptive free-text values and placeholders
    mapping = {
        'PARENTS': 'Male & Female',
        'HUSBAND AND WIFE': 'Male & Female',
        'BOTH PARENTS FATHER-50YEARS. MOTHER-47YEARS': 'Male & Female',
        'FAMILY/HUSBAND TO BE': 'Male & Female',
        'INLAW': 'Male & Female',
        'OTHER': 'Unknown',
        'NAN': 'Unknown',
        '<NA>': 'Unknown',
    }
    # Values that match nothing are passed through untouched so they stay visible
    # in the value counts
    return mapping.get(text, value)

# Categorise the normalised values and write them straight back over the original column
df['SEX OF PERPETRATOR'] = df['SEX OF PERPETRATOR_cleaned'].apply(clean_sex_of_perpetrator)

# The temporary working column has served its purpose
df.drop(columns=['SEX OF PERPETRATOR_cleaned'], inplace=True)

# Verify the resulting distribution (NaN included)
print("Value counts after cleaning:")
print(df['SEX OF PERPETRATOR'].value_counts(dropna=False))

#### Handling 'AGE OF PERPETRATOR' 
* Define a function that extracts the correct age from a single 'AGE OF PERPETRATOR' entry, using the revised logic:
  * If the first value is non-zero, it is used.
  * If the first value is zero, the highest value in that entry is used.
  * Unrealistic ages (e.g., > 120) are filtered out.
* Apply the Age Function and Create a Processed Age Column
* Calculate the Mean Age for Imputation
* Impute Missing and Zero Age Values 

#### Handling 'RELATIONSHIP WITH PERPETRATOR' and 'TYPE OF VIOLENCE' 
* Define Mappings for 'RELATIONSHIP WITH PERPETRATOR' and 'TYPE OF VIOLENCE' 
* Helper Function to Create Mapper -  Create mappers for relationship and violence
* Apply Mappings for 'RELATIONSHIP WITH PERPETRATOR' and 'TYPE OF VIOLENCE'
* Categorize Vulnerable Population and apply the function to the 'VULNERABLE POPULATION' column


In [None]:
# --- 1. Define a Function to Extract the Correct Age ---
def get_correct_age(age_entry):
    """
    Extract one plausible age from a single 'AGE OF PERPETRATOR' entry.

    Parameters
    ----------
    age_entry : str, int or float
        Either a comma-separated string of ages (e.g. '0,35') or a plain number.
        Numbers are accepted because this function is applied to the raw column,
        where numeric cells are not strings; rejecting them would discard every
        such age. Any other type (None, bool, ...) yields NaN.

    Returns
    -------
    float
        - The first value if it is non-zero.
        - If the first value is zero, the highest value in the entry.
        - np.nan when the chosen value is not within 0 < age <= 120, when the
          entry is empty, or when a token cannot be parsed as a number.
    """
    if isinstance(age_entry, str):
        try:
            # Blank tokens (e.g. from a trailing comma) are skipped instead of
            # invalidating the whole entry
            ages = [float(token) for token in age_entry.split(',') if token.strip()]
        except (ValueError, TypeError):
            return np.nan
    elif isinstance(age_entry, (int, float, np.integer, np.floating)) and not isinstance(age_entry, bool):
        ages = [float(age_entry)]
    else:
        return np.nan

    if not ages:
        return np.nan

    first_age = ages[0]
    if 0 < first_age <= 120:
        return first_age
    if first_age == 0:
        # A leading 0 is treated as a placeholder: use the largest age in the entry
        max_age = max(ages)
        return max_age if 0 < max_age <= 120 else np.nan
    # Negative, NaN or > 120
    return np.nan

# --- 2. Replace each raw entry by the single age extracted from it (NaN if none) ---
# NOTE(review): the perpetrator cell earlier in this notebook works on the split
# 'AGE OF PERPETRATOR_1..3/_OTHER' columns - confirm that a single
# 'AGE OF PERPETRATOR' column exists in the loaded data before running this cell.
_age_col = 'AGE OF PERPETRATOR'
df[_age_col] = df[_age_col].apply(get_correct_age)

# --- 3. Mean of the valid ages, used as the imputation value ---
valid_ages = df[_age_col].dropna()
mean_age = valid_ages.mean()
print(f"\nCalculated mean age for imputation (from valid, realistic values): {mean_age:.2f}")

# --- 4. Fill the gaps with the mean; the cast to int truncates the fractional part ---
df[_age_col] = df[_age_col].fillna(mean_age).astype(int)


# --- 5. Define Mappings for 'RELATIONSHIP WITH PERPETRATOR' ---
# Structure: target category -> list of raw labels that belong to it.
# get_category_mapper() (defined further down) lower-cases every label and removes
# its spaces before building the lookup, so case variants listed here
# ('Spouse' / 'spouse' / 'SPOUSE') collapse to the same lookup key.
# Misspellings seen in the data ('sp[ouse', 'sp0use') are listed explicitly.
relationship_mapping = {
    'Spouse/Partner': ['Spouse', 'Husband', 'Wife', 'Partner', 'spouse', 'Ex-partner', 'Boyfriend', 'Girlfriend', 'Intimate Partner', 'Ex-Boyfriend', 'Ex-Husband', 'Spouse/cohabiting', 'SPOUSE', 'HUSBAND', 'WIFE', 'Ex husband', 'ex-husband', 'ex-boyfriend', 'ex-wife', 'Concubine', 'Co-habiting partner', 'partner', 'sp[ouse', 'spouse  ', 'sp0use'],
    'Family Member': ['Father', 'Mother', 'Son', 'Daughter', 'Brother', 'Sister', 'Parent', 'Sibling', 'Family member', 'family member', 'Step-father', 'Step-mother', 'Step-son', 'Step-daughter', 'Step-brother', 'Step-sister', 'FATHER', 'MOTHER', 'SON', 'DAUGHTER', 'BROTHER', 'SISTER', 'Family Member', 'Father inlaw', 'Mother inlaw', 'Brother inlaw', 'Sister inlaw', 'In-law', 'in-law', 'child', 'children'],
    'Extended Family': ['Uncle', 'Aunt', 'Cousin', 'Grandfather', 'Grandmother', 'Nephew', 'Niece', 'Relative', 'relative', 'UNCLE', 'AUNT', 'COUSIN', 'GRANDFATHER', 'GRANDMOTHER', 'Guardian', 'Foster father', 'Foster mother', 'Family relative'],
    'Acquaintance': ['Friend', 'Neighbor', 'Neighbour', 'Acquaintance', 'Co-worker', 'Colleague', 'Classmate', 'FRIEND', 'NEIGHBOUR', 'neighbor', 'Friend of the family', 'friends'],
    'Authority Figure': ['Teacher', 'Employer', 'Boss', 'Landlord', 'Religious leader', 'Clergy', 'Police', 'Security personnel', 'Doctor', 'Health worker', 'Coach', 'TEACHER', 'BOSS', 'Police Officer', 'Security guard', 'security guard'],
    'Stranger': ['Stranger', 'Unknown to survivor', 'STRANGER'],
    'Other': ['Other', 'others', 'OTHER'],
    'Unknown/Not Stated': ['Unknown', 'Not Stated', 'Not reported', 'Not Applicable', 'unknown', 'NOT APPLICABLE']
}

# --- 6. Define Mappings for 'TYPE OF VIOLENCE' ---
# Structure: target category -> list of raw labels that belong to it.
# A raw label may itself be a comma-separated combination of violence types;
# each listed combination is assigned to exactly one category.
violence_mapping = {
    'Sexual Violence': [
        'SEXUAL ASSAULT', 'RAPE', 'SEXUAL ASSAULT, RAPE', 'SEXUAL ASSAULT, PHYSICAL ASSAULT',
        'SEXUAL ASSAULT, PHYSICAL ASSAULT, PSYCHOLOGICAL/EMOTIONAL ABUSE',
        'SEXUAL ASSAULT, PSYCHOLOGICAL/EMOTIONAL ABUSE',
        'SEXUAL ASSAULT, RAPE, PSYCHOLOGICAL/EMOTIONAL ABUSE',
        'SEXUAL ASSAULT, PHYSICAL ASSAULT, RAPE',
        'SEXUAL ASSAULT, PHYSICAL ASSAULT, RAPE, PSYCHOLOGICAL/EMOTIONAL ABUSE',
        'SEXUAL ASSAULT, DEFILEMENT', 'SEXUAL ASSAULT, RAPE, DEFILEMENT',
        'RAPE, DEFILEMENT'
    ],
    'Physical Violence': [
        'PHYSICAL ASSAULT', 'PHYSICAL ASSAULT, CHILD ABUSE AND NEGLECT',
        'PHYSICAL ASSAULT, PSYCHOLOGICAL/EMOTIONAL ABUSE, CHILD ABUSE AND NEGLECT',
        'PHYSICAL ASSAULT, DEFILEMENT', 'PHYSICAL ASSAULT, FINANCIAL/ECONOMIC',
        'PHYSICAL ASSAULT, FINANCIAL/ECONOMIC, DENIAL OF RESOURCES, PSYCHOLOGICAL/EMOTIONAL ABUSE'
    ],
    'Emotional/Psychological Abuse': [
        'PSYCHOLOGICAL/EMOTIONAL ABUSE', 'DENIAL OF RESOURCES', 'DENIAL OF RESOURCES, CHILD ABUSE AND NEGLECT',
        'DENIAL OF RESOURCES, PSYCHOLOGICAL/EMOTIONAL ABUSE, CHILD ABUSE AND NEGLECT',
        'FINANCIAL/ECONOMIC, DENIAL OF RESOURCES, PSYCHOLOGICAL/EMOTIONAL ABUSE',
        'PHYSICAL ASSAULT, DENIAL OF RESOURCES, PSYCHOLOGICAL/EMOTIONAL ABUSE, VIOLATION OF PROPERTY & INHERITANCE RIGHTS'
    ],
    'Other Forms of Violence': [
        'FORCED MARRIAGE', 'VIOLATION OF PROPERTY & INHERITANCE RIGHTS', 'FINANCIAL/ECONOMIC', 'CHILD ABUSE AND NEGLECT'
    ]
}


# --- 7. Helper Function to Create Mapper ---
def get_category_mapper(mapping):
    """Invert a {category: [raw labels]} dict into a {normalised label: category} lookup.

    Each raw label is converted to str, trimmed, lower-cased and stripped of all
    spaces, so the lookup is insensitive to case and spacing. If two labels
    normalise to the same key, the category listed later in `mapping` wins.
    """
    return {
        str(raw_label).strip().lower().replace(" ", ""): category
        for category, raw_labels in mapping.items()
        for raw_label in raw_labels
    }

# Build the normalised-label -> category lookups for both columns
relationship_category_map = get_category_mapper(relationship_mapping)
violence_category_map = get_category_mapper(violence_mapping)


def _normalise_labels(labels):
    """Normalise a string Series the same way get_category_mapper normalises its keys.

    Trims, lower-cases and removes every space (literal replacement, no regex).
    """
    return labels.str.strip().str.lower().str.replace(" ", "", regex=False)


# --- 8. Apply Mappings ---
# Relationship: the lookup keys have their spaces removed, so the column values
# must be normalised identically. Without the space removal, multi-word labels
# such as 'family member' or 'ex husband' never match and fall through to 'Other'.
# Missing and unmapped values become 'Other'.
df['RELATIONSHIP WITH PERPETRATOR'] = (
    _normalise_labels(df['RELATIONSHIP WITH PERPETRATOR'])
    .map(relationship_category_map)
    .fillna('Other')
)

# Violence type: same normalisation; missing and unmapped values become
# 'Other Forms of Violence'.
df['TYPE OF VIOLENCE'] = (
    _normalise_labels(df['TYPE OF VIOLENCE'])
    .map(violence_category_map)
    .fillna('Other Forms of Violence')
)


# --- 9. Categorize Vulnerable Population ---
# Create a function to categorize the vulnerable population based on keywords
def categorize_vulnerability(value):
    """Assign one vulnerability category to a raw 'VULNERABLE POPULATION' entry.

    - Missing values and the literal '<NA>' give 'Unknown'.
    - An entry that is only 'NOT APPLICABLE' (any case) gives 'No Vulnerability';
      when it is combined with other text, 'NOT APPLICABLE' is removed and the
      remainder is categorised.
    - The keyword groups are tested in a fixed order and the first group with a
      keyword contained in the upper-cased text wins.
    - Entries matching no keyword give 'Unknown'.
    """
    if pd.isna(value) or value == '<NA>':
        return 'Unknown'

    text = str(value).upper()

    if 'NOT APPLICABLE' in text:
        if text.strip() == 'NOT APPLICABLE':
            return 'No Vulnerability'
        # Combined with a real vulnerability: keep only the remainder
        text = text.replace('NOT APPLICABLE', '').strip()

    # Order matters: earlier groups take priority when several keywords occur
    keyword_groups = (
        ('Child/Youth Vulnerability', ('MINOR', 'CHILD', 'YOUTH', 'ORPHANS', 'CHILD APPRENTICE')),
        ('Health-Related Vulnerability', ('PLHIV', 'DRUG USER', 'DISABILITY')),
        ('Gender/Social Vulnerability', ('WIDOW', 'SEX WORKER')),
        ('Displacement/Labor Vulnerability', ('IDP', 'HOUSE MAIDS/DOMESTIC STAFF', 'DOMESTIC STAFF')),
    )
    return next(
        (
            category
            for category, keywords in keyword_groups
            if any(keyword in text for keyword in keywords)
        ),
        'Unknown',
    )

# Replace every raw 'VULNERABLE POPULATION' entry by its category
df['VULNERABLE POPULATION'] = df['VULNERABLE POPULATION'].apply(categorize_vulnerability)

# Report the distribution of each standardised column.
# Tuple layout: (heading, column, dropna flag passed to value_counts)
_standardised_reports = (
    ("\n--- Value Counts for Standardized Results---", 'AGE OF PERPETRATOR', True),
    ("\n--- Vulnerable Population Processing Results (Sample) ---", 'VULNERABLE POPULATION', False),
    ("\n--- Value Counts for Standardized 'RELATIONSHIP' ---", 'RELATIONSHIP WITH PERPETRATOR', True),
    ("\n--- Value Counts for Standardized 'VIOLENCE TYPE' ---", 'TYPE OF VIOLENCE', True),
)
for _heading, _report_col, _drop_missing in _standardised_reports:
    print(_heading)
    print(df[_report_col].value_counts(dropna=_drop_missing))

#### Remove empty rows
* There were 10 empty rows that do not contain any useful data.
* Drop these rows by removing the records whose 'Type_of_Organisation' is NaN.

In [None]:
# Column whose NaN values identify the empty rows
_org_col = 'Type_of_Organisation'

# State before the drop: overall shape and category counts (NaN included)
print(f"Original shape of the DataFrame: {df.shape}")
print("\nValue counts of 'Type_of_Organisation' before dropping NaN:")
print(df[_org_col].value_counts(dropna=False))

# Remove, in place, every row whose organisation type is missing
df.dropna(subset=[_org_col], inplace=True)

# State after the drop, for comparison
print(f"\nShape of the DataFrame after dropping rows with NaN: {df.shape}")
print("\nValue counts of 'Type_of_Organisation' after dropping NaN:")
print(df[_org_col].value_counts(dropna=False))

#### 'ESTIMATED AVERAGE MONTHLY INCOME' is empty
* Remove 'ESTIMATED AVERAGE MONTHLY INCOME', it does not contain any useful data 

In [None]:
# The income column is empty and carries no useful data, so it is removed in place
df.drop('ESTIMATED AVERAGE MONTHLY INCOME', axis=1, inplace=True)

#### Check the concise summary of a DataFrame, including the data types, non-null values, and memory usage.

In [None]:
# Print the per-column dtypes, non-null counts and memory usage of df
df.info()

#### 'LOCATION OF VIOLENCE (WARD)' has some missing values
* Replace all NaN values in the 'LOCATION OF VIOLENCE (WARD)' column with the string 'Unknown'

In [None]:
# Replace all NaN values in the ward column with the string 'Unknown'.
# The location cell earlier in this notebook renames 'LOCATION OF VIOLENCE (WARD)'
# to 'location_ward', so addressing the raw name here raises a KeyError on
# Restart & Run All. Use whichever of the two names is present.
ward_col = 'location_ward' if 'location_ward' in df.columns else 'LOCATION OF VIOLENCE (WARD)'
df[ward_col] = df[ward_col].fillna('Unknown')

# Show a small sample instead of dumping the entire DataFrame
print("\nDataFrame after replacing NaN with 'Unknown':")
print(df.head())
print("\nNew value counts for the target column:")
print(df[ward_col].value_counts(dropna=False))

In [None]:
# Count the missing values in each column
df.isna().sum()

#### Save preprocessed data

In [None]:
# Write the cleaned data next to the input file.
# `data_folder_path` is the '<project>/data' directory computed in the first cell
# of this notebook. It is reused here instead of being overwritten with a
# hard-coded absolute path: the previous literal pointed to 'c:\08_AHFID\...',
# which differs from the project directory printed by the first cell and exists
# only on one machine.
output_filename = 'processed_data.csv'

# Combine the folder and filename into a single file path
output_file_path = os.path.join(data_folder_path, output_filename)

# Save the DataFrame without the index column
df.to_csv(output_file_path, index=False)

print(f"\nProcessing complete. Final cleaned data saved to '{output_file_path}'")