In [81]:
import pandas as pd 
import numpy as np

In [82]:
# Load the raw survey extract.
# The file carries a stray "Unnamed: 0" column (presumably a row index that
# was written out when the CSV was produced - TODO confirm). It is not a real
# feature, so drop it at load time; otherwise a fresh top-to-bottom run would
# carry it into the cleaned output. errors="ignore" keeps the load working if
# the file is ever re-exported without that column.
df = pd.read_csv("financial_inclusion_dataset.csv").drop(
    columns=["Unnamed: 0"], errors="ignore"
)

In [83]:
# Preview the frame exactly as loaded, before any cleaning is applied.
df

Unnamed: 0,age,gender,country,education_level,has_bank_account,mobile_money_user,loan_access,monthly_income
0,56,Male,Tanzania,Primary,1,0.0,0.0,503.20
1,69,Female,Burundi,Secondary,1,1.0,1.0,432.48
2,46,Female,Kenya,Secondary,1,1.0,0.0,206.95
3,32,Male,Rwanda,Secondary,,0.0,1.0,629.71
4,60,Male,Rwanda,Secondary,0,1.0,1.0,424.83
...,...,...,...,...,...,...,...,...
49995,66,Male,Rwanda,Primary,0,0.0,0.0,673.73
49996,35,Female,Burundi,Primary,yes,1.0,1.0,485.15
49997,28,Male,Uganda,Secondary,1,1.0,0.0,325.05
49998,59,Female,Uganda,Secondary,1,0.0,0.0,910.17


In [84]:
def normalize_yes_no_binary(x):
    """Coerce one messy yes/no cell to 1, 0 or NaN.

    Parameters
    ----------
    x : scalar
        A single cell value: a number, a string, or a missing value.

    Returns
    -------
    int or float
        1 for an affirmative value, 0 for a negative value, and ``np.nan``
        when the value is missing or cannot be interpreted as a binary flag.

    Notes
    -----
    Accepted spellings (case-insensitive, surrounding whitespace ignored):
    ``yes / y / true / t`` and ``no / n / false / f``, plus any numeric text
    that equals exactly 0 or 1 (``"1"``, ``"0.0"``, ``"1.0"`` ...).
    """
    if pd.isna(x):
        return np.nan

    # Real numbers: only an exact 0 or 1 is a valid flag.
    if isinstance(x, (int, float, np.integer, np.floating)):
        return int(x) if x in (0, 1) else np.nan

    s = str(x).strip().lower()

    if s in {"yes", "y", "true", "t", "1"}:
        return 1
    if s in {"no", "n", "false", "f", "0"}:
        return 0

    # A column that mixes floats with text tokens is read as strings, so a
    # valid flag can arrive as "1.0" / "0.0". Parse it instead of discarding
    # it as NaN (which a later fill would turn into 0).
    try:
        number = float(s)
    except ValueError:
        return np.nan
    return int(number) if number in (0, 1) else np.nan

def impute_income_grouped(df, income_col="monthly_income",
                          group_cols=("country", "education_level"),
                          fallback_col="country"):
    """Fill missing incomes with progressively coarser medians.

    Missing values in ``income_col`` are filled, in order, with:

    1. the median of the row's ``group_cols`` group,
    2. the median of the row's ``fallback_col`` group (skipped when
       ``fallback_col`` is ``None`` or is not a column of ``df``),
    3. the overall median of ``income_col``.

    Every median is computed from the values present when the function is
    called, so values filled at one level never influence the medians used
    at a coarser level.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. The ``income_col`` column is overwritten in place.
    income_col : str
        Name of the numeric column to impute.
    group_cols : sequence of str
        Columns defining the finest grouping level.
    fallback_col : str or None
        Column defining the intermediate grouping level.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``income_col`` imputed.

    Raises
    ------
    KeyError
        If ``income_col`` or any of ``group_cols`` is not a column of ``df``.
    """
    # Take all statistics before writing anything back, so that they are
    # based on observed values only.
    group_median = df.groupby(list(group_cols))[income_col].transform("median")

    fallback_median = None
    if fallback_col is not None and fallback_col in df.columns:
        fallback_median = df.groupby(fallback_col)[income_col].transform("median")

    overall_median = df[income_col].median()

    # Finest level first; each later fill only touches values still missing.
    filled = df[income_col].fillna(group_median)
    if fallback_median is not None:
        filled = filled.fillna(fallback_median)

    df[income_col] = filled.fillna(overall_median)
    return df

In [87]:
# Step 1 - coerce the mixed-type yes/no indicator columns to 0 / 1 / NaN.
binary_cols = ["has_bank_account", "mobile_money_user", "loan_access"]
for col in binary_cols:
    if col not in df.columns:
        continue
    df[col] = df[col].apply(normalize_yes_no_binary)

# Step 2 - a negative income is invalid: blank it so it is imputed in step 5.
income_col = "monthly_income"
if income_col in df.columns:
    df.loc[df[income_col].lt(0), income_col] = np.nan

# Step 3 - bank account: an unknown answer is treated as "no account" (0).
if "has_bank_account" in df.columns:
    df["has_bank_account"] = df["has_bank_account"].fillna(0).astype(int)

# Step 4 - gender: label missing values explicitly as "Unknown".
if "gender" in df.columns:
    df["gender"] = df["gender"].fillna("Unknown")

# Step 5 - income: fill the gaps with medians grouped by country and
# education level.
if income_col in df.columns:
    df = impute_income_grouped(
        df,
        income_col=income_col,
        group_cols=("country", "education_level"),
    )

In [89]:
# Label missing values in every remaining text column as "Unknown".
# "gender" is left out here because it is already filled in an earlier cell.
for col in df.select_dtypes(include="object").columns:
    if col == "gender":
        continue
    df[col] = df[col].fillna("Unknown")


In [93]:
# Final pass over the indicator columns: any value still missing becomes 0,
# and each column is stored as plain integers.
binary_cols = ["has_bank_account", "mobile_money_user", "loan_access"]

for col in binary_cols:
    if col not in df.columns:
        continue
    df[col] = df[col].fillna(0).astype(int)

In [94]:
# Sanity check: count the values still missing in each column.
df.isna().sum()

age                  0
gender               0
country              0
education_level      0
has_bank_account     0
mobile_money_user    0
loan_access          0
monthly_income       0
dtype: int64

In [91]:
# Persist the cleaned frame (row index omitted), then confirm on stdout.
df.to_csv(path_or_buf="cleaned_financial_inclusion.csv", index=False)
print("Saved: cleaned_financial_inclusion.csv")

Saved: cleaned_financial_inclusion.csv
