# Family Planning Data Cleaning 

This project involves cleaning a messy dataset related to family planning services. The goal is to prepare the dataset for further analysis by handling missing values, renaming columns, correcting data types, and removing irrelevant information.



In [98]:
import re
import os
import time
import pandas as pd
import numpy as np
from dateutil import parser
from datetime import datetime
from dotenv import load_dotenv
from tqdm import tqdm
from openai import OpenAI

import seaborn as sns 
import plotly.express as px        
import matplotlib.pyplot as plt  


## Reading the CSV File

We begin by reading the CSV file containing family planning data using `pandas`

In [122]:
#read the document it is csv file
df=pd.read_csv("C:/Users/COCOCE/NHIC/Family_planning.csv")
df.head()

  df=pd.read_csv("C:/Users/COCOCE/NHIC/Family_planning.csv")


Unnamed: 0,name,creation,modified,modified_by,owner,docstatus,idx,wife,wife_name,wife_date_of_birth,...,postpartum_family_planning,the_period_the_method_will_last,discharging,discharging_status,discharge_date,Unnamed: 69,Unnamed: 70,Unnamed: 71,Unnamed: 72,Unnamed: 73
0,FP-2408-0096384,43:20.4,44:08.1,7c87dee9d0afc9803b2aeac969e31f0187afdd196e9670...,88f56eaf0d0a2a71cfb6d8f6e860fe6dc083276e4eb3f5...,0,0,,,,...,No,3M,0,,,,,,,
1,FP-2408-0096411,57:39.0,25:07.7,7c87dee9d0afc9803b2aeac969e31f0187afdd196e9670...,88f56eaf0d0a2a71cfb6d8f6e860fe6dc083276e4eb3f5...,0,0,,,,...,,2/12/2025,No,3 MONTHS,0.0,,,,,
2,FP-2408-0096436,07:19.1,49:52.9,7c87dee9d0afc9803b2aeac969e31f0187afdd196e9670...,88f56eaf0d0a2a71cfb6d8f6e860fe6dc083276e4eb3f5...,0,0,,,,...,No,,0,,,,,,,
3,FP-2408-0096523,45:05.0,45:05.0,3c2e3e344106fa3217a5fe2ba2d871d4708c1db7059e33...,3c2e3e344106fa3217a5fe2ba2d871d4708c1db7059e33...,0,0,,,,...,,,0,,,,,,,
4,FP-2408-0096587,06:50.8,06:50.8,3c2e3e344106fa3217a5fe2ba2d871d4708c1db7059e33...,3c2e3e344106fa3217a5fe2ba2d871d4708c1db7059e33...,0,0,,,,...,,,0,,,,,,,


In [123]:
# View column names
print("Column names:\n", df.columns.tolist())

# View shape
print("\nDataset shape:", df.shape)


Column names:
 ['name', 'creation', 'modified', 'modified_by', 'owner', 'docstatus', 'idx', 'wife', 'wife_name', 'wife_date_of_birth', 'wife_telephone', 'number_of_children', 'inscription_date', 'husband_name', 'husband_date_of_birth', 'husband_telephone', 'method_initiation_date', 'province', 'sector', 'village', 'district', 'cell', 'existing_family_plan', 'existing_family_plan_method', 'company', '_user_tags', '_comments', '_assign', '_liked_by', 'patient', 'patient_name', 'date_of_birth', 'gender', 'telephone', 'consultation_date', 'practitioner', 'practitioner_name', 'education_level', 'profession', 'catchment_area', 'marital_status', 'accompanied_by_partner', 'partner_name', 'partner_date_of_birth', 'partner_telephone', 'gravidity', 'parity', 'children_in_life', 'desired_number_of_children', 'date_of_last_delivery', 'date_of_last_abortion', 'number_of_children_died', 'birth_spacing', 'birth_limitation', 'desired_method', 'first_date_of_last_menstrual_period_', 'previous_utilizatio

##  Identifying Columns with High Missing Values

To clean the dataset, we first calculate the proportion of missing values in each column. This helps identify which columns have too many missing values and may need to be dropped.

The code below displays the top 50 columns with the highest proportion of missing data.


In [124]:
#check columns to drop
#drop those columns with high missing data

((df.isnull().sum()/len(df)).sort_values(ascending=False)).head(50)

husband_telephone                       1.000000
inscription_date                        1.000000
_user_tags                              1.000000
existing_family_plan_method             1.000000
existing_family_plan                    1.000000
husband_date_of_birth                   1.000000
husband_name                            1.000000
wife                                    1.000000
wife_name                               1.000000
wife_date_of_birth                      1.000000
wife_telephone                          1.000000
number_of_children                      1.000000
Unnamed: 73                             0.999868
Unnamed: 72                             0.999802
Unnamed: 71                             0.999209
Unnamed: 70                             0.997891
Unnamed: 69                             0.997628
_assign                                 0.994860
_comments                               0.994662
_liked_by                               0.994201
partner_telephone   

In [125]:
# we drop column that have 60% and above of missing values
columns_to_drop=['husband_telephone', 'inscription_date', '_user_tags', 'existing_family_plan_method', 'existing_family_plan',
                 'husband_date_of_birth', 'husband_name', 'wife', 'wife_name', 'wife_date_of_birth', 'wife_telephone', 'number_of_children',
                 'Unnamed: 73', 'Unnamed: 72', 'Unnamed: 71', 'Unnamed: 70', 'Unnamed: 69', '_assign', '_comments', '_liked_by', 'partner_telephone', 
                 'partner_name', 'partner_date_of_birth', 'family_planning_method', 'date_of_last_abortion', 'discharging_status', 'discharge_date', 
                 'modified_by', 'owner','creation','modified','docstatus', 'idx','telephone', 'practitioner', 
                 'practitioner_name','patient_name','patient']

# drop columns

df.drop(columns_to_drop, axis=1, inplace=True)

In [127]:
df.info()
print(f'\n Shape of the data {df.shape}')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15174 entries, 0 to 15173
Data columns (total 36 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   name                                   15174 non-null  object 
 1   method_initiation_date                 11581 non-null  object 
 2   province                               14927 non-null  object 
 3   sector                                 14926 non-null  object 
 4   village                                14857 non-null  object 
 5   district                               14927 non-null  object 
 6   cell                                   14926 non-null  object 
 7   company                                15174 non-null  object 
 8   date_of_birth                          15166 non-null  object 
 9   gender                                 15164 non-null  object 
 10  consultation_date                      15174 non-null  object 
 11  ed

#### Our data contains a lot of mistakes that we need to address. We will now proceed to clean the data column by column.

##  Converting Date Columns to Datetime Format

To ensure proper handling of date-related columns in the dataset, we define a function that automatically detects and converts columns containing the word `'date'` in their name to `datetime` format.



In [128]:
#1.convert date columns
def convert_columns_to_datetime(df, columns):
    """
    Converts specified columns of a DataFrame to datetime format.

    Parameters:
    - df (pd.DataFrame): The DataFrame to modify.
    - columns (list): List of column names to convert.

    Returns:
    - pd.DataFrame: The DataFrame with specified columns converted to datetime.
    """
    date_cols = [col for col in df.columns if 'date' in col.lower()]
    for col in date_cols:
        try:
            df[col] = pd.to_datetime(df[col], errors='coerce')
        except Exception as e:
            print(f"Could not convert column '{col}' to datetime: {e}")
    return df


In [129]:
columns_to_convert = [
    'method_initiation_date',
    'date_of_birth',
    'consultation_date',
    'date_of_last_delivery',
    'first_appointment_date',
    'first_date_of_last_menstrual_period_'
]

df = convert_columns_to_datetime(df, columns_to_convert)


In [130]:
df[columns_to_convert]

Unnamed: 0,method_initiation_date,date_of_birth,consultation_date,date_of_last_delivery,first_appointment_date,first_date_of_last_menstrual_period_
0,2024-08-14,1991-04-10,2024-08-14,2021-08-20,2024-11-14,2024-07-12
1,2024-11-12,NaT,NaT,NaT,NaT,NaT
2,NaT,2001-10-11,2024-08-14,2020-02-01,NaT,2024-07-01
3,2024-08-14,2002-07-07,2024-08-14,2024-07-03,2024-08-21,2024-08-01
4,2024-08-14,2003-02-02,2024-08-14,2024-06-26,2024-08-21,2024-07-11
...,...,...,...,...,...,...
15169,2025-03-27,2000-01-01,2025-03-27,2024-01-31,2025-06-26,2025-03-25
15170,NaT,1976-01-01,2025-03-27,NaT,NaT,NaT
15171,2022-06-02,1993-01-01,2025-03-27,2023-02-24,2025-03-27,NaT
15172,2025-03-27,1996-01-01,2025-03-27,2025-03-25,2025-04-03,NaT


##  Cleaning the 'Province' Column

The `province` column contains inconsistent entries, such as names with parentheses or extra keywords like "Province" or "City". 

To clean this, we define a function that:
- Checks for missing values.
- Extracts province names from within parentheses when present.
- Splits strings by `'wa'` to isolate the name when parentheses aren't used.
- Removes unnecessary terms like `'Province'` or `'City'`.

The cleaned names are then applied back to the `province` column.


In [131]:
#clean province
#remove inconsistence
def extract_province_name(val):
    #check if there is null values
    if pd.isna(val):
        return None

    # If it contains parentheses, extract inside them
    if '(' in val and ')' in val:
        name = val.split('(')[1].split(')')[0]
    else:
        # If no parentheses, try to split by '#' and take last part
        parts = val.split('wa')
        
        name = parts[-1].strip()

    # Remove the word "Province" or "City" if it appears
    name = name.replace('Province', '').replace('City', '').strip()
    
    return name
df['province'] = df['province'].apply(extract_province_name)

In [132]:
df['province'].value_counts()

province
Kigali      7070
Northern    6433
Eastern     1315
Southern      88
Western       21
Name: count, dtype: int64

## Extracting and Cleaning District, Sector, Cell, and Village Names

The location fields (`district`, `sector`, `cell`, and `village`) may contain inconsistent formatting such as:
- Extra spaces.
- Prefixes separated by `#`.
- Inconsistent capitalization.

We define a function that:
- Checks if the value is a string.
- Splits the text on `#` and takes the last part.
- Strips extra spaces.
- Converts names to title case 


In [133]:
# extract names of districts, sector, and cless
def clean_location_name(value):
    """
    This function processes the values for district, sector, cell, or village columns:
    - Strips any extra spaces.
    - Splits on '#' if applicable.
    - Capitalizes the first letter of each word.
    """
    if isinstance(value, str):  # Check if the value is a string
        cleaned_name = value.split('#')[-1].strip()  # Split and remove extra spaces
        return cleaned_name.title()  # Capitalize the first letter of each word
    return value  # Return the value as it is if it's not a string (e.g., NaN)

# Apply the function to the desired columns
columns_to_clean = ['district', 'sector', 'cell', 'village']

for col in columns_to_clean:
    df[col] = df[col].apply(clean_location_name)


In [135]:
df[['district', 'sector', 'cell', 'village']].head()

Unnamed: 0,district,sector,cell,village
0,Kicukiro,Gatenga,Nyanza,Juru
1,Kicukiro,Kagarama,Kanserege,Kanserege
2,Kicukiro,Gatenga,Nyarurama,Bigo
3,Nyarugenge,Gitega,Gacyamo,
4,Gasabo,Gisozi,Musezero,Byimana


In [136]:
df['district'].value_counts()

district
Rulindo       6147
Kicukiro      2948
Gasabo        2603
Nyarugenge    1518
Bugesera       646
Nyagatare      636
Gakenke        150
Gicumbi        121
Kamonyi         66
Rwamagana       22
Burera          12
Gatsibo          6
Muhanga          6
Huye             6
Rusizi           5
Nyamasheke       4
Nyanza           3
Karongi          3
Gisagara         3
Ruhango          3
Rutsiro          3
Kayonza          3
Rubavu           3
Ngororero        3
Musanze          3
Ngoma            2
Nyaruguru        2
Name: count, dtype: int64

## Cleaning the Health Center ('company') Column

The `company` column, which represents health centers, contains inconsistent naming formats. To standardize this:

- We remove the phrase `'Health Center'` from the name.
- We optionally append `'_hc'` if the original name included `'Health Center'`.
- We also strip out any extra whitespace.

This ensures uniformity in naming, which is essential for analysis and grouping operations.


In [137]:
# crean company column 
def clean_company_name(value):
    """
    This function processes the health center names:
    - Removes 'Health Center'.
    - Removes any text inside brackets.
    - Appends '_hc' to names that contain 'Health Center'.
    """
    if isinstance(value, str):  # Check if the value is a string
        # Remove 'Health Center' 
        cleaned_name = re.sub(r'\s?Health Center', '', value).strip()
        
        # If the name originally contains 'Health Center', add '_hc' to the name
        if 'Health Center' in value:
            return cleaned_name + "_hc"
        
        return cleaned_name
    return value  # Return the value as it is if it's not a string

# Apply the function to the 'health_center' column
df['company'] = df['company'].apply(clean_company_name)


In [138]:
df['company'].value_counts()

company
Busanza_hc                  918
Mushongi_hc                 853
Gahanga_hc                  745
Bethsaida_hc                703
Buyoga_hc                   642
Bubangu_hc                  640
Tare_hc                     605
Nyarugunga_hc               560
Rugarama (Nyarugenge)_hc    512
Gitega_hc                   503
Tumba_hc                    493
Kagugu_hc                   479
Muhima_hc                   475
Nyacyonga_hc                474
Rwahi_hc                    435
Rulindo_hc                  428
Kabuye_hc                   426
Kisaro_hc                   415
Kiyanza_hc                  382
Nyakigando_hc               377
Kinihira_hc                 364
Remera-Mbogo_hc             360
Solace Ministries_hc        349
Kajevuba_hc                 284
Kagitumba_hc                266
Kairos_hc                   246
Murambi_hc                  245
Kabusunzu_hc                241
Nduba_hc                    238
Gakurazo_hc                 221
Burega_hc                   211


## Cleaning the 'Gender' Column

Gender values are often inconsistent due to variations in spelling, case, or abbreviations (e.g., `'f'`, `'female'`, `'Woman'`). 

To standardize these entries, we:
- Strip whitespace.
- Convert all values to lowercase.
- Map common variations to either `'Male'` or `'Female'`.
- Assign `NaN` to unrecognized or missing entries.

This helps ensure reliable gender-based analysis.


In [139]:
def gender_clean(value):
    if isinstance(value, str):
        value = value.strip().lower()
        if value in ['female', 'f', 'woman']:
            return 'Female'
        elif value in ['male', 'm', 'man']:
            return 'Male'
    return  np.nan 
df['gender']=df['gender'].apply(gender_clean)

In [140]:
df['gender'].value_counts()

gender
Female    15048
Male         47
Name: count, dtype: int64

In [141]:
df_gender=df[df['gender']=='Male']

In [142]:
df_gender

Unnamed: 0,name,method_initiation_date,province,sector,village,district,cell,company,date_of_birth,gender,...,previous_utilization_of_contraception,family_plan_method_used,duration_of_utilization,reason_to_stop,choosen_method,method_offered,first_appointment_date,postpartum_family_planning,the_period_the_method_will_last,discharging
336,FP-2409-0174205,NaT,Kigali,Kicukiro,Kicukiro,Kicukiro,Kicukiro,Bethsaida_hc,2000-01-01,Male,...,Yes,⁠Implant/implanon,3,next appointment,,,NaT,,,0
796,FP-2409-0236138,2024-09-08,,,,,,Nyarugunga_hc,2004-01-01,Male,...,No,,,,,,2024-12-08,,3MONTH,0
1270,FP-2410-0312700,2024-10-17,Kigali,Kicukiro,Isangano,Kicukiro,Ngoma,Bethsaida_hc,2000-08-19,Male,...,Yes,Implant/jadelle (MJ),4YEARS,BLEEDING,Injectables (Depo-Provera/Injectables (Depo-Pr...,Injectables (Depo-Provera/Injectables (Depo-Pr...,2028-01-18,,3months,0
1437,FP-2410-0339962,2024-10-23,Kigali,Masaka,Nyirakavomo,Kicukiro,Ayabaraya,Bethsaida_hc,1979-03-16,Male,...,Yes,Pillule ombinee (PC),6 months,none,Pillule ombinee (PC),Pillule ombinee (PC),2025-01-15,,3 months,0
1545,FP-2410-0353993,2024-10-25,Northern,Murambi,Mayange,Rulindo,Bubangu,Bubangu_hc,1991-04-22,Male,...,No,,,,⁠Implant/implanon,⁠Implant/implanon,2024-10-25,,3 YEAR,0
1904,FP-2410-0389001,NaT,Northern,Cyinzuzi,Kanyoni,Rulindo,Budakiranya,Remera-Mbogo_hc,1995-12-12,Male,...,Yes,INJECTION,2,accouchement,,,2024-10-31,,,0
2083,FP-2411-0415918,2024-11-04,Northern,Shyorongi,Karama,Rulindo,Muvumu,Rwahi_hc,1981-01-01,Male,...,Yes,Combined pills,3 year,pregnancy,Combined pills,Combined pills,2025-01-30,,3,0
2536,FP-2411-0474967,2024-11-11,Northern,Murambi,Karambo,Rulindo,Bubangu,Bubangu_hc,1983-01-01,Male,...,No,,,,Barriers,Barriers,2024-11-11,,daily,0
2546,FP-2411-0475569,2024-11-11,Northern,Ntarabana,Gatobotobo,Rulindo,Kiyanza,Kiyanza_hc,2001-09-19,Male,...,No,,,,Implant/jadelle (MJ),Implant/jadelle (MJ),2024-11-18,,5Years,0
2704,FP-2411-0491463,2024-11-13,Kigali,Gitega,Umurabyo,Nyarugenge,Akabahizi,Bethsaida_hc,2000-01-01,Male,...,No,,,,,⁠Implant/implanon,2025-02-13,,3years,0


## Standardizing Education Levels

The `education_level` column contains various forms of educational qualifications, which may vary in wording or capitalization. 

To make analysis easier, we:
- Define a dictionary to map general categories like `'University'`, `'Secondary'`, `'Primary'`, and `'No Education'` to their simplified labels.
- Use a function that checks whether the target keyword is part of the entry (case-insensitive).
- Assign `'unknown'` if no match is found.

This ensures consistency across educational categories.


In [143]:
# education
education_categories={'university':'University',
                       'Secondary':'Secondary',
                       'primary':'Primary',
                       'no education':'No Education'}
def education_cat(edu_level):
    for high_edu, education in education_categories.items():
        if edu_level in education:
            return high_edu
    return np.nan

df['education_level']=df['education_level'].apply(education_cat)

In [144]:
df['education_level'].value_counts()

education_level
university      7506
Secondary       3941
primary         3482
no education     167
Name: count, dtype: int64

## Code to Clean and Standardize the 'profession' Column

In [145]:
#clean profession column and remove inconsistence
# First lowercase and strip spaces (your existing code is good)
df['profession'] = df['profession'].str.lower().str.strip()

# Create mapping dictionary programmatically with all profession variations
profession_mapping = {}

# HEALTHCARE PROFESSIONS
# Nurse variations
nurse_variations = ['nurse', 'nurs', 'rn', 'nurde', 'nures', 'niurse', 'nurses', 'infirmiere', 'infirmier', 
                   'nursin', 'nurese', 'nirse', 'nurfe', 'nuse', 'nurusin', 'nursse', 'nuser', 'narse', 
                   'surse', 'nursr', 'infir', 'nurce', 'nusre', 'murse', 'nburse', 'bnurse', 'nerse', 
                   'nursa', 'nure', 'nusre', 'nus', 'nusse', 'n urse', 'nurse a0', 'nurse a1', 'nursea0', 
                   'nulse', 'nurase', 'nursin', 'nursing', 'clinical nurse', 'register nurse', 
                   'associate nurse', 'nurs.', 'nurse.', 'niurse.', 'nurs.e', 'nutrse', 'nuse.', 'nurse,', 
                   'niurse,', 'nurs,', 'nursed,', 'r.n', 'rna1', 'rna2', 'rn a1', 'nures rn']

for variation in nurse_variations:
    profession_mapping[variation] = 'unknown'

# Midwife variations
midwife_variations = ['midwife', 'midwifery', 'maidwife', 'midwifwe', 'midwifde', 'midwaf', 'midwef', 
                     'midweve', 'midwife ', 'miwife', 'miwdife', 'midiwife', 'middwife', 'mi', 'mid', 'mide', 
                     'midi', 'midwif', 'midwe', 'midwive', 'miduife', 'midiwifery', 'midewife', 'midewifery', 
                     'midiwifely', 'midwifely', 'nidwifery', 'miudwife', 'midwide', 'miduaf', 'rm', 
                     'umubyaza', 'sage femme', 'midy wife', 'mmidwife', ',midwife', 'midwifey', 'm.w', 'mw']

for variation in midwife_variations:
    profession_mapping[variation] = 'unknown'
    
# Manager variations
manager_of_hc_variations = ['hc manager', 'hc manager ', 'head of hc', 'head 0f hc', 'head of health center']
for variation in manager_of_hc_variations:
    profession_mapping[variation] = 'unknown'
    
# Clinical officer variations
clinical_variations = ['clinical officer', 'clinical', 'medical clinical officer']
for variation in clinical_variations:
    profession_mapping[variation] = 'unknown'

# Nutritionist variations
nutritionist_variations = ['nutritionist', 'nutritioniste']
for variation in nutritionist_variations:
    profession_mapping[variation] = 'nutritionist'

# Lab technician variations
lab_variations = ['labolantin']
for variation in lab_variations:
    profession_mapping[variation] = 'lab technician'

# comunity_health
community_health_worker_variation=['chw']
for variation in community_health_worker_variation:
    profession_mapping[variation] = 'community health worker'
    

# AGRICULTURE PROFESSIONS
# Farmer variations
farmer_variations = ['farmer', 'agriculture', 'cultivation', 'cultvatoer', 'cultivartrice', 'cultvatricei',
                    'cultivatrrice', 'cultivatrce', 'culctivatric', 'cultivatyrice', 'cultivatreur', 'cultivatricxe', 
                    'cultivatoe', 'cultivateu', 'cultivateur', 'cultivatuer', 'cultateur', 'culvateur', 
                    'culivateur', 'cutluvatop', 'cultiuvatr', 'culitivateur', 'cultiviteur', 'cultivatrice', 
                    'culitivatrice', 'cultiuvatrice', 'curtivatrice', 'cutivatrice', 'culivatrice', 'cultitrice', 
                    'cultvatrice', 'culvatrice', 'cultrivatrice', 'cultrivatrice', 'culktivatric', 'cuitrivatrice', 
                    'cultuvatrice', 'cultivavatrice', 'cultivatetrice', 'cultivatice', 'cultivatrie', 'cultivatrce', 
                    'cultivatrc', 'cultitrice', 'cltivatrice', 'crivatrice', 'cultivator', 'cultivtor', 'cultuvator', 
                    'cultuvetor', 'culvator', 'cultvator', 'cultvatoer', 'cultivetor', 'cultivor', 'cutivator', 
                    'cutluvator', 'cultivate', 'cultuvateur', 'cult', 'cul', 'cu', 'cut', 'culc', 'cuil', 'culti', 
                    'agri', 'agric', 'agricul', 'agricult', 'agr', 'farm', 'famer', 'farmer', 'framer', 'farmar', 
                    'fermer', 'fumer', 'former', 'faremer', 'fammer', 'femer', 'hinzi', 'umhinzi', 'umuhizi', 
                    'umuhinzi', 'umworozi', 'agriculture', 'agriclture', 'agricuture', 'agriculiture', 'agriculuture', 
                    'agriculiture', 'agriculturer', 'agricul']

for variation in farmer_variations:
    profession_mapping[variation] = 'farmer'

# Agronomist variations
agronomist_variations = ['agronome']
for variation in agronomist_variations:
    profession_mapping[variation] = 'agronomist'

# TRADE PROFESSIONS
# Trader variations
trader_variations = ['trader', 'trading', 'trade', 'shopkeeper', 'merchant', 'seller', 'vendor', 'vendeuse', 
                    'business', 'business woman', 'business women', 'businesswomen', 'busines', 'busnesswomen', 
                    'bussines', 'businwess', 'businness', 'busines women', 'bussiness', 'bussiness women', 'bisinesse', 
                    'biziness', 'buziness', 'buzisen', 'buzisess', 'buziess', 'bussines', 'commercante', 'commercant', 
                    'comercante', 'comercant', 'c0mmercante', 'c0mmercant', 'commerce', 'c0mmerce', 'comerce', 'comercial', 
                    'commercr', 'commercer', 'commence', 'commercante ', 'commercente ', 'commercote', 'commmercante', 
                    'comernce', 'comm', 'petite commerce', 'petit commerce', 'petit commece', 'petite commercant', 
                    'petitit commerce', 'petit commernce', 'petite commercicente', 'entrepreneur', 'self employed', 
                    'self-employee', 'self employment', 'self worker', 'self work', 'agent', 'agent mtn', 'agent momo', 
                    'boutiquiere', 'angent', 'agente', 'umucuruzi', 'umucurizi', 'vandeur', 'vendeur', 'traider', 
                    'umcuruzi', 'curuzi']

for variation in trader_variations:
    profession_mapping[variation] = 'trader'

# EDUCATION PROFESSIONS
# Teacher variations
teacher_variations = ['teacher', 'theacher', 'teach', 'enseignante', 'enseignant', 'enseingante', 'ensegnante', 
                     'einsegnante', 'ensignante', 'metraisse', 'primary', 'umwarimu', 'mwarimu', 'umurenzi']

for variation in teacher_variations:
    profession_mapping[variation] = 'teacher'
    

# Student variations
student_variations = ['student', 'etudiante', 'eleve']
for variation in student_variations:
    profession_mapping[variation] = 'student'

# SERVICE PROFESSIONS
# Hairdresser variations
hairdresser_variations = ['hairdresser', 'coiffeur', 'coiffeure', 'salon', 'saloon', 'salon de coiffeur', 
                         'hair maker', 'hair dressing', 'coufure', 'tressage', 'agent au salon', 'arasuka']

for variation in hairdresser_variations:
    profession_mapping[variation] = 'hairdresser'

# Tailor variations
tailor_variations = ['tailor', 'tailleur', 'taileur', 'taiyeur', 'talleur', 'tailleure', 'tailler', 'taille', 
                    'tayeli', 'taleur', 'traloring', 'couture', 'umudozi']

for variation in tailor_variations:
    profession_mapping[variation] = 'tailor'

# Cook variations
cook_variations = ['cook', 'cuisiniere', 'hotererie', 'hoterelie', 'hotelerie']
for variation in cook_variations:
    profession_mapping[variation] = 'cook'

# Cleaner variations
cleaner_variations = ['cleaner', 'clearner', 'crener', 'criner']
for variation in cleaner_variations:
    profession_mapping[variation] = 'cleaner'

# Driver variations
driver_variations = ['driver', 'chauffeur']
for variation in driver_variations:
    profession_mapping[variation] = 'driver'

# Security officer variations
security_variations = ['security', 'security officer', 'security guard', 'security guide', 'security guardian', 
                      'securte', 'securite', 'irondo']

for variation in security_variations:
    profession_mapping[variation] = 'security officer'

# HOUSEHOLD PROFESSIONS
# Housewife variations
housewife_variations = ['housewife', 'household', 'menagere', 'menaagere', 'menager', 'memagere', 'manegere', 
                       'managere', 'meangere', 'meanagere', 'menagre', 'nenagere', 'menagieri', 'menagiere', 
                       'menogese', 'menarege', 'mewnagere', 'stay at home']

for variation in housewife_variations:
    profession_mapping[variation] = 'housewife'

# Housekeeper variations
housekeeper_variations = ['housekeeper', 'home walker', 'home made', 'children care', 'domestic', 'domestique', 
                         'dostique']

for variation in housekeeper_variations:
    profession_mapping[variation] = 'housekeeper'

# PROFESSIONAL OCCUPATIONS
# Accountant variations
accountant_variations = ['accountant', 'comptable', 'comtabilite']
for variation in accountant_variations:
    profession_mapping[variation] = 'accountant'

# Engineer variations
engineer_variations = ['engineer', 'engeneer']
for variation in engineer_variations:
    profession_mapping[variation] = 'engineer'

# Designer variations
designer_variations = ['designer', 'disigner']
for variation in designer_variations:
    profession_mapping[variation] = 'designer'

# Photographer variations
photographer_variations = ['photographer']
for variation in photographer_variations:
    profession_mapping[variation] = 'photographer'

# Secretary variations
secretary_variations = ['secretary', 'secretaire']
for variation in secretary_variations:
    profession_mapping[variation] = 'secretary'

# Data entry clerk variations
data_entry_variations = ['data entry', 'data entry clerk']
for variation in data_entry_variations:
    profession_mapping[variation] = 'data entry clerk'

# OTHER OCCUPATIONS


# Mason variations
mason_variations = ['mason', 'massonary', 'macon', 'masonery', 'maconnnerie', 'aide macon', 'aide-macon', 'construction']
for variation in mason_variations:
    profession_mapping[variation] = 'mason'

# Military variations
military_variations = ['military', 'militaire', 'militaty', 'multaire']
for variation in military_variations:
    profession_mapping[variation] = 'military'

# Police officer variations
police_variations = ['police', 'police officer', 'policewoman', 'porice']
for variation in police_variations:
    profession_mapping[variation] = 'police officer'

# Sex worker variations
sex_worker_variations = ['sex worker', 'sex workers', 'sex warkers', 'sex woker', 'sex wokers', 'fsw', 'uburaya']
for variation in sex_worker_variations:
    profession_mapping[variation] = 'sex worker'

# Hostess variations
hostess_variations = ['hostess', 'hotesse']
for variation in hostess_variations:
    profession_mapping[variation] = 'hostess'
    
# reception variation
receptionist_variation=['RECEPTIONNISTE', 'hosptality' ]
for variation in receptionist_variation:
    profession_mapping[variation]='receptionist'
    
# Jobless variations
jobless_variations = ['jobless', 'unemployed', 'no job', 'none', 'non', 'nothing', 'pas de travail', 'rien fait',
                     'chomeur', 'choumeur', 'chomeur ', 'chauimeur', 'chaummeur', 'chaomeur', 'chaumeur', 'joblesss',
                     'non job', 'nono', 'nona']

for variation in jobless_variations:
    profession_mapping[variation] = 'jobless'

# Community worker variations
community_worker_variations = ['community worker', 'community work']
for variation in community_worker_variations:
    profession_mapping[variation] = 'community worker'
    

# Freelancer variations
freelancer_variations = ['freelancer', 'occassionaly', 'ibiraka']
for variation in freelancer_variations:
    profession_mapping[variation] = 'freelancer'

# Public worker variations
public_worker_variations = ['public worker', 'pubric worker', 'public']
for variation in public_worker_variations:
    profession_mapping[variation] = 'public worker'

# Unknown variations
unknown_variations = ['unknown', 'other', 'autre', 'x', 'v', 'nm', 'nh', 'co', 'sans', '1', '0',
                     'private', 'private sector', 'company', "company's employee", 'usine', 'industrial worker',
                     'casual labor', 'double work', 'med', 'a2', 'a1','restaurant', 'restaurent',]

for variation in unknown_variations:
    profession_mapping[variation] = 'unknown'
    
# check all unusual characters


for key in list(df['profession'].unique()):
    if pd.isnull(key):
        continue  # skip NaNs

    key_str = str(key).strip()  # Convert to string and remove surrounding whitespace

    # If it looks like a hash (long alphanumeric string)
    if re.match(r'^[a-f0-9]{64}$', key_str) or re.match(r'^[a-f0-9]{32}$', key_str):
        profession_mapping[key] = 'unknown'

    # If it's just a hyphen like "-"
    elif key_str == '-':
        profession_mapping[key] = 'unknown'

    # If it’s only numbers (int or string of digits)
    elif re.match(r'^\d+$', key_str):
        profession_mapping[key] = 'unknown'

    # You can add more conditions below for cleanup if needed


# Apply mapping and handle missing values
df['profession_cleaned'] = df['profession'].map(profession_mapping).fillna('unknown')

# Create profession categories for broader analysis
profession_categories = {
    # 'healthcare': [
    #     'nurse', 'midwife', 'clinical officer', 'head of health center', 'nutritionist', 
    #     'lab technician', 'community health worker'
    # ],
    'agriculture': ['farmer', 'agronomist'],
    'education': ['teacher', 'student'],
    'services': [
        'trader', 'hairdresser', 'tailor', 'cook', 'cleaner', 'driver', 
        'security officer', 'receptionist', 'hostess', 'freelancer', 'mason'
    ],
    'professional': [
        'accountant', 'engineer', 'designer', 'photographer', 
        'secretary', 'data entry clerk', 'manager'
    ],
    'public_sector': ['military', 'police officer', 'public worker', 'community worker'],
    'household': ['housewife', 'housekeeper'],
    'unemployed': ['jobless'],
    # 'other': ['sex worker', 'unknown']
}

# Function to categorize professions
def categorize_profession(profession):
    for category, prof_list in profession_categories.items():
        if profession in prof_list:
            return category
    return 'other'

# Add category column
df['profession_category'] = df['profession_cleaned'].apply(categorize_profession)

In [39]:
df[['profession_category','profession_cleaned']].head()

Unnamed: 0,profession_category,profession_cleaned
0,services,trader
1,other,unknown
2,household,housekeeper
3,household,housekeeper
4,services,tailor


In [146]:
df['profession_category'].value_counts()

profession_category
other            10436
agriculture       2701
household          658
services           656
unemployed         612
education           84
public_sector       18
professional         9
Name: count, dtype: int64

## Catchment Area

In [147]:
#catchment area 
catchment_area = ['Hors Zone', 'Zone', 'Hors District']

def catch_area(value):
    if isinstance(value, str):
        for area in catchment_area:
            if area in value:
                return area
    return np.nan  # Return NaN only if no match found or value is not a string

df['catchment_area'] = df['catchment_area'].apply(catch_area)


In [148]:
df['catchment_area'].value_counts()

catchment_area
Zone             10772
Hors Zone         3328
Hors District      996
Name: count, dtype: int64

##  Marital Status

The `marital_status` column contains various forms of marital status entries, which may differ in wording or representation. 


In [149]:
#marital status
# Define the valid marital status categories
valid_marital_status = ['Married', 'Single', 'Separated', 'Widow/Widower', 'Divorced']


def clean_marital_status(value):
    """
    Function to clean and categorize marital status
    """

    if value in valid_marital_status:
        return value
    else:
        return np.nan

# Apply the cleaning function
df['marital_status'] = df['marital_status'].apply(clean_marital_status)


In [150]:
df['marital_status'].value_counts()

marital_status
Married          10293
Single            4653
Separated           82
Widow/Widower       36
Divorced            32
Name: count, dtype: int64

## YES & NO containing  Columns

The columns listed in `cols_with_bool` contain values like `'Yes'` and `'No'`, which represent boolean-like responses. 

To standardize the data:

In [151]:
def clean_columns_with_bool(value):
    # Check if the value is "Yes" or "No"
    if value in ['Yes', 'No']:
        return value
    else:
        return np.nan
cols_with_bool=['accompanied_by_partner', 'birth_spacing', 'birth_limitation', 'previous_utilization_of_contraception', 'postpartum_family_planning',]
for col in cols_with_bool:
    df[col]=df[col].apply(clean_columns_with_bool)

In [None]:
df['']

In [152]:
df['accompanied_by_partner'].value_counts()

accompanied_by_partner
No     14900
Yes      196
Name: count, dtype: int64

##  Standardizing Numeric Columns

The columns in `numeric_cols` contain numerical values, but they might include non-numeric entries. 

To clean these columns:

In [153]:
numeric_cols = [
    'gravidity', 
    'parity', 
    'children_in_life', 
    'desired_number_of_children', 
    'number_of_children_died'
]

def clean_numeric_columns(df, columns):
    for col in columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    return df

df = clean_numeric_columns(df, numeric_cols)


In [154]:
df[['gravidity', 'parity', 'children_in_life', 'desired_number_of_children', 'number_of_children_died']]

Unnamed: 0,gravidity,parity,children_in_life,desired_number_of_children,number_of_children_died
0,1.0,1.0,1.0,3.0,0.0
1,,,1.0,1.0,
2,1.0,1.0,1.0,4.0,0.0
3,1.0,1.0,1.0,2.0,0.0
4,1.0,1.0,1.0,2.0,0.0
...,...,...,...,...,...
15169,2.0,2.0,2.0,4.0,0.0
15170,7.0,7.0,0.0,0.0,0.0
15171,3.0,2.0,2.0,2.0,0.0
15172,3.0,3.0,3.0,4.0,0.0


In [155]:
#birth spacing
df['birth_spacing'].value_counts()

birth_spacing
Yes    8522
No     2578
Name: count, dtype: int64

## Contraceptive Methods

The `contraceptive_method` column contains various representations of contraceptive methods with inconsistent formatting.

To ensure consistency:
- A `contraceptive_dict` is defined to map different representations to standardized method names.
- A function `clean_contraceptive_column()` uses the `.map()` method to apply this dictionary for value replacement.
- Unmatched entries are replaced with `NaN` for clarity.

This standardization supports cleaner analysis and avoids duplication of similar method names.


In [156]:
# Unified contraceptive method dictionary
contraceptive_dict = {
    'Injectables (Depo-Provera/Injectables (Depo-Provera))': 'Depo-Provera',
    'Injectables_DMPA-SC/Injectables_DMPA SC': 'Sayana Press',
    'Injectables_DMPA-SC/Injectables DMPA SC': 'Sayana Press',
    'INJECTION': 'Injectable',
    'Injectables(norristerat)': 'Noristerat',
    'depo-provera': 'Depo-Provera',
    'Implant/implanon': 'Implant/Implanon',
    'Implant/jadelle (MJ)': 'Implant/Jadelle',
    'Combined pills': 'Combined Oral Pills (COCs)',
    'Pillule ombinee (PC)': 'Combined Oral Pills (COCs)',
    'Oral Contraceptives, combined /Contraceptifs oraux, combiné': 'Combined Oral Pills (COCs)',
    'Oral Contraceptives': 'Oral Contraceptives',
    'Oral Contraceptives= oral contraceptives': 'Oral Contraceptives',
    'Pillule progestative (PP)': 'Progestin-Only Pills (POPs)',
    'Oral Contraceptives, progestative / Contraceptifs oraux, progestatif': 'Progestin-Only Pills (POPs)',
    'progestative / Contraceptifs oraux, progestatif': 'Progestin-Only Pills (POPs)',
    'Pillule implants': 'Implant',
    'IUD': 'Intrauterine Device (IUD)',
    'Intrauterine device': 'Intrauterine Device (IUD)',
    'Dispositif intra utérine': 'Intrauterine Device (IUD)',
    'Lactational amenorrhea method': 'Lactational Amenorrhea Method',
    'Male Condoms': 'Male Condoms',
    'Female condom': 'Female Condom',
    'Barriers': 'Barrier Methods',
    'PFN': 'Natural Family Planning (NFP)',
    'Cycle beads': 'Cycle Beads',
    'Collier': 'Cycle Beads',
    'Auto-observation': 'Self-Observation',
    'Progesterone': 'Progesterone',
    'Tubal ligation': 'Tubal Ligation'
}

# Define a function to clean any column using the contraceptive_dict
def clean_contraceptive_column(df, column_name, dictionary=contraceptive_dict):
    """
    Cleans a column in the DataFrame using the provided contraceptive dictionary.
    
    Args:
        df (pd.DataFrame): The DataFrame to clean.
        column_name (str): The column name to standardize.
        dictionary (dict): The mapping dictionary for standardization.
        
    Returns:
        pd.Series: Cleaned column with standardized values or NaN if not found.
    """
    return df[column_name].map(dictionary).fillna(np.nan)


In [157]:
# Clean the columns using the function
df['family_plan_method_used'] = clean_contraceptive_column(df, 'family_plan_method_used')
df['choosen_method'] = clean_contraceptive_column(df, 'choosen_method')
df['method_offered'] = clean_contraceptive_column(df, 'method_offered')

In [158]:
df['family_plan_method_used'].value_counts()

family_plan_method_used
Depo-Provera                     1940
Combined Oral Pills (COCs)       1743
Implant/Jadelle                  1548
Injectable                        877
Sayana Press                      622
Progestin-Only Pills (POPs)       346
Intrauterine Device (IUD)         153
Male Condoms                       37
Noristerat                         27
Lactational Amenorrhea Method       5
Progesterone                        4
Female Condom                       3
Natural Family Planning (NFP)       2
Barrier Methods                     1
Cycle Beads                         1
Name: count, dtype: int64

In [159]:
df['choosen_method'].value_counts()

choosen_method
Depo-Provera                     2260
Combined Oral Pills (COCs)       2017
Implant/Jadelle                  1428
Sayana Press                     1302
Injectable                        928
Progestin-Only Pills (POPs)       767
Intrauterine Device (IUD)         334
Male Condoms                      102
Lactational Amenorrhea Method      30
Barrier Methods                     9
Female Condom                       4
Natural Family Planning (NFP)       4
Progesterone                        2
Cycle Beads                         2
Self-Observation                    1
Tubal Ligation                      1
Noristerat                          1
Name: count, dtype: int64

In [160]:
df['method_offered'].value_counts()

method_offered
Depo-Provera                     2324
Combined Oral Pills (COCs)       1991
Sayana Press                     1463
Implant/Jadelle                  1444
Progestin-Only Pills (POPs)       763
Injectable                        557
Intrauterine Device (IUD)         382
Male Condoms                      111
Lactational Amenorrhea Method      35
Barrier Methods                     9
Female Condom                       5
Natural Family Planning (NFP)       4
Noristerat                          4
Progesterone                        2
Cycle Beads                         2
Self-Observation                    1
Name: count, dtype: int64

##  Duration into Months

The `duration_of_utilization` column contains values in mixed formats, such as years, months, or decimals (e.g., "2 years 3 months" or "1.5").



In [161]:
def convert_to_months(val):
    if pd.isna(val):
        return np.nan

    val = str(val).lower().strip()

    # Normalize text
    val = re.sub(r'[^a-z0-9\s./]', '', val)
    val = re.sub(r'\s+', ' ', val)

    # Replace French & variants
    val = val.replace('ans', 'years').replace('an', 'year').replace('mois', 'months')
    val = val.replace('mounths', 'months').replace('mounth', 'month').replace('moths', 'month').replace('m','month')
    val = val.replace('yrs', 'years').replace('yrs', 'years')
    val = val.replace('months', 'month').replace('years', 'year')

    # Match "X year(s)" and "Y month(s)"
    year_match = re.search(r'(\d+\.?\d*)\s*year', val)
    month_match = re.search(r'(\d+\.?\d*)\s*month', val)

    years = float(year_match.group(1)) if year_match else 0
    months = float(month_match.group(1)) if month_match else 0

    if not year_match and not month_match:
        # Handle cases like "1.6", "2.3" etc.
        try:
            val_float = float(val)
            int_part = int(val_float)
            decimal_part = (val_float - int_part) * 10  # Treat decimal as months
            return int_part * 12 + decimal_part
        except:
            return np.nan

    total_months = int(round(years * 12 + months))
    return total_months

# Apply to column
df['duration_of_utilization'] = df['duration_of_utilization'].apply(convert_to_months)


In [57]:
df['duration_of_utilization'].isnull().sum()

7055

In [162]:
def extract_duration_months(value):
    """
    Extract contraceptive duration in months from various text formats.
    Returns None for dates or unrecognized formats.
    """
    # Handle missing values
    if pd.isna(value) or value in ['', ' ', 'no', 'none', 'NO', 'N', '0']:
        return None
    
    # Convert to string and lowercase for consistent processing
    value = str(value).strip().lower()
    
    # Skip values that look like dates
    if re.search(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', value) or re.search(r'\d{1,2}\.\d{1,2}\.\d{2,4}', value):
        return None
    
    # Common contraceptive types with standard durations
    if any(term in value for term in ['iud', 'diu', 'implant', 'jadelle', 'implanon']):
        return 
    
    if any(term in value for term in ['depo', 'injection', 'injectable']):
        return np.nan
    
    if any(term in value for term in ['pill', 'microgynon', 'microlute']):
        return np.nan
    
    # Check for years pattern (e.g., "5 years", "5y", "5ans")
    years_match = re.search(r'(\d+(?:\.\d+)?)\s*(?:y|year|years|yrs|ans|an)', value)
    if years_match:
        years = float(years_match.group(1))
        return int(years * 12) if years.is_integer() else years * 12
    
    # Check for months pattern (e.g., "3 months", "3m", "3mois")
    months_match = re.search(r'(\d+(?:\.\d+)?)\s*(?:m|mo|month|months|mois)', value)
    if months_match:
        return float(months_match.group(1))
    
    # Check for weeks pattern (e.g., "12 weeks", "12w")
    weeks_match = re.search(r'(\d+(?:\.\d+)?)\s*(?:w|week|weeks|wks|semaine)', value)
    if weeks_match:
        weeks = float(weeks_match.group(1))
        return round(weeks / 4.3, 1)  # Convert weeks to months
    
    # Check for days pattern (e.g., "90 days", "84d")
    days_match = re.search(r'(\d+(?:\.\d+)?)\s*(?:d|day|days|jour)', value)
    if days_match:
        days = float(days_match.group(1))
        return round(days / 30, 1)  # Convert days to months
    
    # Just a number (e.g., "3", "5")
    if re.match(r'^\d+(?:\.\d+)?$', value):
        num = float(value)
        if num:
            return np.nan # because we don't know if the it is year month or something
        elif num <= 90:
            return np.nan
        else:
            return None  
    
    # If nothing matches, return None
    return None

df['the_period_the_method_will_last'] = df['the_period_the_method_will_last'].apply(extract_duration_months)

In [163]:
df['the_period_the_method_will_last'].isnull().sum()

6538

In [164]:
df['reason_to_stop'].value_counts().sort_values(ascending=False).head(50)

reason_to_stop
continue                    403
none                        332
expired                     227
NONE                        223
continuer                   181
grossesse                   161
no                          160
EN COURS                    146
pregnancy                   136
CONTINUE                    131
cont                        126
termine                     122
no stop                     119
continuation                114
CONTINUATION                112
-                           104
accouchement                 97
EXPIRED                      96
NO STOP                      91
volontary                    84
change                       83
expiration                   79
followup                     75
volontaire                   75
GROSSESSE                    71
EXPIRATION                   68
side effect                  68
PREGNANCY                    67
NO                           65
side effects                 64
no reason                

## Contraceptive Discontinuation Classification using OpenAI API
 I have commented this code because data was being reduced up to 2116, so i didn't use it, i was planing to ask you if can use it

In [165]:
#api_key= your openai api here
client = OpenAI(api_key=api_key)

In [170]:
# # Load API key from environment variables
# load_dotenv(override=True)
# api_key = os.getenv('OPENAI_API_KEY')
# client = OpenAI(api_key=api_key)
print(f"API key loaded: {'Yes' if api_key else 'No'}")

def is_meaningful(text):
    """
    Check if text contains meaningful content worth classifying.
    
    Args:
        text: Input text to check
        
    Returns:
        bool: True if text has meaningful content, False otherwise
    """
    if pd.isna(text):
        return False
        
    text_str = str(text).strip().lower()
    non_meaningful = ['nan', 'none', 'n/a', '', '-', 'unknown', 'not applicable', 'not specified']
    
    return text_str not in non_meaningful and len(text_str) > 1

def classify_contraceptive_reasons(reason_texts, batch_size=20, model="gpt-4o-mini"):
    """
    Classify a series of contraceptive discontinuation reasons into predefined categories
    using the OpenAI API. Preserves original values that shouldn't be classified.
    
    Args:
        reason_texts (list): List of text reasons to classify
        batch_size (int): Number of items to process in each API call
        model (str): OpenAI model to use
        
    Returns:
        list: Classifications for each input text or preserved original values
    """
    # Create a copy of the input list to store results
    final_results = []
    for text in reason_texts:
        if not is_meaningful(text):
            final_results.append(pd.NA)  # Set non-meaningful to NaN
        else:
            final_results.append(text)  # Temporarily store text, will be replaced with category
    
    # Identify which texts should be classified
    meaningful_indices = [i for i, text in enumerate(reason_texts) if is_meaningful(text)]
    
    if not meaningful_indices:
        return final_results  # Return original list with all NaNs if nothing to classify
    
    # Group identical texts to avoid redundant API calls
    text_to_indices = {}
    for idx in meaningful_indices:
        text = reason_texts[idx]
        if text not in text_to_indices:
            text_to_indices[text] = []
        text_to_indices[text].append(idx)
    
    # Create a list of unique texts to classify
    unique_texts = list(text_to_indices.keys())
    unique_results = {}  # Will store classification results for unique texts
    
    # Process unique entries in batches
    batches = [unique_texts[i:i+batch_size] for i in range(0, len(unique_texts), batch_size)]
    
    classification_prompt = """
        Enhanced Medical Contraception Classification Prompt
        You are a specialized medical researcher focusing on contraceptive use patterns with extensive knowledge of medical terminology and contraceptive methods. Your task is to classify reasons for stopping contraception.
        
        IMPORTANT: You must classify EVERY entry into EXACTLY ONE specific category - no exceptions. Never use "Unknown" unless absolutely nothing can be determined from the text.
        
        Language Detection & Translation:
        • First detect the language of the input
        • If it's not English, translate it to English before categorizing
        
        Categories (CHOOSE EXACTLY ONE for each entry):
        
        Side Effects — Any physical or psychological reactions to the method
        • Examples: dizziness, bleeding, headache, infection, pain, discomfort, weight gain, nausea
        • French terms: vertige (dizziness), DYSPARENIA (pain during intercourse), metrolagia/metrorrhagia (abnormal bleeding), menstruation disorder
        
        Expired — The method's validity ended or time-based removal
        • Examples: expired, time to remove, method finished, end of period, due date for removal, completed cycle,it is time to be removed,
        • French terms: EXPIREE (expired), expiration
        
        Finished_Med — The contraceptive course or period has finished
        •Loss of combined pills todays,sold out
        • French terms: termine (finished), DURE TERMINE (duration ended), DEJA TERMINEE (already finished)
        
        Pregnacy- pregnantor, became pregnant, GROSSES (instead of GROSSESSE), pregnance, pregnancy
        
        Wanted To Have Child — Desire to get pregnant 
        • Examples: wants child, gave birth, trying to conceive, family planning,REEPROCREATION,SHE WHAT BIRTH
        • French terms: DESIRE DES ENFANTS (desire for children), DESIRE OF OTHER CHILD (desire for another child), 
          DESIRE/DESIR DE GROSSESSE (desire for pregnancy), DESIRE/DESIR D'UN ENFANT (desire for a child), 
          DESIR L'ENFANT (desire for the child), DESIR UN ENFANT (desire for a child)
        • Note: Also recognize typo variants like ENFANF, LÉNFANT, ENFENT, 
        
        Switch — Changed or wanted to change contraceptive method
        • Examples: changing to another method, preferred different method, stop to continue long method,LONG LIFE METHOD,contenue
        • French terms: CHANGEMENT (change), CHANGEMENT DE LA METHODE (change of method)
        
        Adherence — Issues related to remembering or using method correctly
        • Examples: forgot to take pill, missed dose, risk for forgetting, inconsistent use
        • French terms: OUBLIE (forgot), NO RESPECT DE RV (no respect for appointment)
        
        Voluntary — Stopped by personal choice, preference or will
        • Examples: own will, voluntary decision, personal choice, no longer needed, time to remove, enlevement,volunteer,NO REASON,PRIVE
        • French terms: enlevement (removal), volontary/volontaire (voluntary), raison personnel/RAISON PERSNNEL (personal reason)

        Social — Influenced by partner, family, relocation
        • Examples: partner moved, separated, migrated, husband/partner preference
        • French terms: vient ailleur/VIENT AILLEUR (comes from elsewhere)
        
        Medical — Underlying health condition or contraindication
        • Examples: hypertension, severe headache, allergy, doctor's recommendation, health concerns, difficult to drink
        
        Loss — Due to death of child or infant
        • Examples: baby died, infant loss, child mortality
        Unknown - unintelligence
        • Examples: un known, hhhh, and more others
        
        Ongoing — Indicates method is still in use or continuation
        • Examples: no change, ongoing use, followup, continuing method, she is on going, continue, 'ARACYAKOMEJE','ARAKOMEJE,NO STOP,contenue
        • French terms: suivi (follow-up), en cours (ongoing/ongoing use), CONTINUATION (continuation)
        
        CLASSIFICATION RULES:
        • "INJECTION" likely refers to a contraceptive injection method - classify as "Switch" if context suggests changing methods
        • "Difficult to drink" refers to side effects of the contraceptive - classify as "Medical"
        • French phrases must be properly translated and understood within context
        • Typos in French terms should be recognized and interpreted correctly
        • Detailed analysis is required - examine each entry thoroughly for meaning
        • If you're unsure about specific medical terminology, make your best determination based on context
        • You MUST choose one of the above categories - never leave anything as "Unknown" unless it's completely unintelligible
        
        FORMAT YOUR RESPONSE:
        Return ONLY the entry number and category separated by a colon (e.g., "1: Side Effects")
        No additional commentary or explanation - just the numbered classifications
        

    """
    
    for batch_idx, batch in enumerate(tqdm(batches, desc="Processing batches")):
        try:
            # Create batch prompt
            batch_prompt = classification_prompt
            for i, text in enumerate(batch):
                batch_prompt += f"\n{i+1}. {text}"
            
            # Make API call
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a specialized medical classifier with expertise in contraceptive methods."},
                    {"role": "user", "content": batch_prompt}
                ],
                max_tokens=500,
                temperature=0
            )
            
            # Parse response
            response_text = response.choices[0].message.content.strip()
            lines = response_text.split('\n')
            
            for line in lines:
                if ':' in line:
                    try:
                        idx_str, category = line.split(':', 1)
                        idx = int(idx_str.strip()) - 1
                        if idx < len(batch):
                            text = batch[idx]
                            category = category.strip()
                            # Convert "Unknown" categories to NaN
                            if category.lower() == "unknown":
                                unique_results[text] = pd.NA
                            else:
                                unique_results[text] = category
                    except (ValueError, IndexError) as e:
                        print(f"Error parsing line '{line}': {e}")
                        continue
            
            # Check for any missing classifications in this batch
            for i, text in enumerate(batch):
                if text not in unique_results:
                    print(f"Warning: Missing classification for '{text}' in batch {batch_idx+1}")
                    # Apply fallback classification rules for common terms
                    text_lower = text.lower()
                    
                    if any(term in text_lower for term in ['inject', 'shot']):
                        unique_results[text] = "Switch"
                    elif any(term in text_lower for term in ['pain', 'bleed', 'discomfort', 'side','metrolagie']):
                        unique_results[text] = "Side Effects"
                    elif any(term in text_lower for term in ['forgot', 'miss', 'remember']):
                        unique_results[text] = "Adherence"
                    elif any(term in text_lower for term in ['expire', 'finish', 'end', 'complet']):
                        unique_results[text] = "Expired"
                    elif any(term in text_lower for term in ['pregnan', 'conceive']):
                        unique_results[text] = "Pregnancy"
                    elif any(term in text_lower for term in ['child', 'birth', 'conceive']):
                        unique_results[text] = "Wanted To Have Child"
                    elif any(term in text_lower for term in ['change', 'switch', 'other method', 'long method']):
                        unique_results[text] = "Switch"
                    elif any(term in text_lower for term in ['child', 'birth', 'conceive']):
                        unique_results[text] = "Wanted To Have Child"
                    else:
                        # If classification fails, set to NaN instead of "Unknown"
                        unique_results[text] = pd.NA
                    
            # Rate limiting to avoid hitting API limits
            time.sleep(0.5)
            
        except Exception as e:
            print(f"Error with batch {batch_idx+1}: {e}")
            # For failed batches, apply the same fallback logic
            for text in batch:
                text_lower = text.lower()
                
                if any(term in text_lower for term in ['inject', 'shot', 'long life method']):
                    unique_results[text] = "Switch"
                elif any(term in text_lower for term in ['pain', 'bleed', 'discomfort', 'side','metrolagie',]):
                    unique_results[text] = "Side Effects"
                elif any(term in text_lower for term in ['forgot', 'miss', 'remember']):
                    unique_results[text] = "Adherence"
                elif any(term in text_lower for term in ['expire', 'finish', 'end', 'complet']):
                    unique_results[text] = "Expired"
                elif any(term in text_lower for term in ['pregnan', 'conceive']):
                        unique_results[text] = "Pregnancy"
                elif any(term in text_lower for term in ['change', 'switch', 'other method', 'long method']):
                    unique_results[text] = "Switch"
                
                else:
                    # If classification fails, set to NaN
                    unique_results[text] = unique_results[text]
    
    # Apply classifications to all indices
    for text, indices in text_to_indices.items():
        category = unique_results.get(text, pd.NA)
        for idx in indices:
            final_results[idx] = category
    
    return final_results

def classify_dataframe_column(df, column_name):
    """
    Apply classification to a DataFrame column and return a Series of classifications.
    Non-meaningful values and "Unknown" classifications are converted to NaN.
    
    Args:
        df (pandas.DataFrame): DataFrame containing the column to classify
        column_name (str): Name of the column to classify
        
    Returns:
        pandas.Series: Series of classifications with NaN for unknown or non-meaningful values
    """
    # Convert column to string type to handle any numeric values
    values = df[column_name].fillna('nan').astype(str).tolist()
    
    # Classify the values
    classified_values = classify_contraceptive_reasons(values)
    
    # Create a Series with the same index as the DataFrame
    result_series = pd.Series(classified_values, index=df.index)
    
    return result_series

# Example usage - Classification check function
def check_classification_coverage(df, input_column, output_column):
    """
    Check what percentage of values (excluding NaN) have been successfully classified.
    
    Args:
        df (pandas.DataFrame): DataFrame to check
        input_column (str): Name of the original column
        output_column (str): Name of the classification column
        
    Returns:
        float: Percentage of successful classification (0-100)
    """
    # Count only meaningful values in original column
    meaningful_mask = df[input_column].apply(is_meaningful)
    total_meaningful = meaningful_mask.sum()
    
    # Count NaN values in result column for meaningful inputs
    nan_results = df.loc[meaningful_mask, output_column].isna().sum()
    
    if total_meaningful == 0:
        return 100.0
    
    coverage = 100 * (1 - nan_results / total_meaningful)
    
    print(f"Classification coverage: {coverage:.2f}%")
    print(f"Total meaningful values: {total_meaningful}")
    print(f"Values that couldn't be classified (NaN): {nan_results}")
    
    # Show examples of items that couldn't be classified
    if nan_results > 0:
        print("\nSamples of values that couldn't be classified:")
        samples = df.loc[meaningful_mask & df[output_column].isna(), input_column].sample(min(5, nan_results))
        for item in samples:
            print(f"- '{item}'")
    
    return coverage

# Example usage
if __name__ == "__main__":
   
    df['reason_to_stop_category'] = classify_dataframe_column(df, 'reason_to_stop')
    print(df[['reason_to_stop', 'reason_to_stop_category']])
    
    check_classification_coverage(df, 'reason_to_stop', 'reason_to_stop_category')

API key loaded: Yes


Processing batches: 100%|██████████████████████████████████████████████████████████████| 88/88 [04:45<00:00,  3.24s/it]

                              reason_to_stop reason_to_stop_category
0                                side effect            Side Effects
1                                  INJECTION                  Switch
2      stop in order to continue long method                  Switch
3                                        NaN                    <NA>
4                         difficult to drink                 Medical
...                                      ...                     ...
15169                    risk for forgetting               Adherence
15170                             EXPIRATION                 Expired
15171                               FOLLOWUP                 Ongoing
15172                  END AND NEW PREGNANCY    Wanted To Have Child
15173                              PREGNANCY               Pregnancy

[15174 rows x 2 columns]
Classification coverage: 89.92%
Total meaningful values: 8129
Values that couldn't be classified (NaN): 819

Samples of values that couldn't be cl




In [171]:
df['reason_to_stop_category'].value_counts()

reason_to_stop_category
Ongoing                 2435
Voluntary                878
Side Effects             799
Expired                  762
Switch                   662
Wanted To Have Child     645
Pregnacy                 312
Pregnancy                286
Finished_Med             279
Adherence                107
Social                    68
Medical                   65
Loss                      12
Name: count, dtype: int64

In [172]:
df[['reason_to_stop','reason_to_stop_category']].head(50)

Unnamed: 0,reason_to_stop,reason_to_stop_category
0,side effect,Side Effects
1,INJECTION,Switch
2,stop in order to continue long method,Switch
3,,
4,difficult to drink,Medical
5,,
6,bleeding on implanon for long time,Side Effects
7,to use another method,Switch
8,bleeding for long time,Side Effects
9,,


In [173]:
df['reason_to_stop_category'].unique()

array(['Side Effects', 'Switch', <NA>, 'Medical', 'Expired', 'Voluntary',
       'Finished_Med', 'Wanted To Have Child', 'Ongoing', 'Pregnancy',
       'Adherence', 'Social', 'Pregnacy', 'Loss'], dtype=object)

In [174]:
dictionary={'Side Effects':'Side Effects',
            'Switch':'Wanted a more effective method',
            
            'Medical':'Health concerns',
            'Expired':'Access/availability issues', 
            'Voluntary':'Voluntary',
            'Finished_Med':'Access/availability issues', 
            'Ongoing':'Ongoing', 
            'Wanted To Have Child':'Wanted To Have Child', 
            'Adherence':'Inconvenient to use',
            'Pregnacy':'Pregnancy',
            'Social':'Social', 
            'Pregnancy':'Pregnancy', 
            'Loss':'Wanted To Have Child'}
                   

In [175]:
df['reason_to_stop_category']=df['reason_to_stop_category'].map(dictionary)

In [176]:
df['reason_to_stop_category'].value_counts()

reason_to_stop_category
Ongoing                           2435
Access/availability issues        1041
Voluntary                          878
Side Effects                       799
Wanted a more effective method     662
Wanted To Have Child               657
Pregnancy                          598
Inconvenient to use                107
Social                              68
Health concerns                     65
Name: count, dtype: int64

In [177]:
df.columns

Index(['name', 'method_initiation_date', 'province', 'sector', 'village',
       'district', 'cell', 'company', 'date_of_birth', 'gender',
       'consultation_date', 'education_level', 'profession', 'catchment_area',
       'marital_status', 'accompanied_by_partner', 'gravidity', 'parity',
       'children_in_life', 'desired_number_of_children',
       'date_of_last_delivery', 'number_of_children_died', 'birth_spacing',
       'birth_limitation', 'desired_method',
       'first_date_of_last_menstrual_period_',
       'previous_utilization_of_contraception', 'family_plan_method_used',
       'duration_of_utilization', 'reason_to_stop', 'choosen_method',
       'method_offered', 'first_appointment_date',
       'postpartum_family_planning', 'the_period_the_method_will_last',
       'discharging', 'profession_cleaned', 'profession_category',
       'reason_to_stop_category'],
      dtype='object')

In [178]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15174 entries, 0 to 15173
Data columns (total 39 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   name                                   15174 non-null  object        
 1   method_initiation_date                 11580 non-null  datetime64[ns]
 2   province                               14927 non-null  object        
 3   sector                                 14926 non-null  object        
 4   village                                14857 non-null  object        
 5   district                               14927 non-null  object        
 6   cell                                   14926 non-null  object        
 7   company                                15174 non-null  object        
 8   date_of_birth                          15094 non-null  datetime64[ns]
 9   gender                                 15095 non-null  object

In [180]:
df.to_csv('family_planning_cleaned.csv', index=False)