## Imports + paths

In [2]:
import pandas as pd 
import numpy as np 
from pathlib import Path

### Define the paths

#### Folder paths

In [3]:
ROOT = Path(r"D:\DATA SCIENCE PROJECTS\FHI_SOUTH_AFRICA")
DATA_RAW = ROOT / "data" /"raw"
OUT_PROF = ROOT / "outputs" / "profiling"
OUT_FIG = ROOT /"outputs" / "figures"

OUT_PROF.mkdir(parents=True, exist_ok=True)
OUT_FIG.mkdir(parents=True, exist_ok=True)

#### Data Paths

In [4]:
train_path = DATA_RAW / "Train.csv"
test_path = DATA_RAW / "Test.csv"
vardef_path = DATA_RAW / "VariableDefinitions.csv"

print("Train", train_path.exists(), train_path)
print("Test", test_path.exists(), test_path)
print("Variable Definitions", vardef_path.exists(), vardef_path)

Train True D:\DATA SCIENCE PROJECTS\FHI_SOUTH_AFRICA\data\raw\Train.csv
Test True D:\DATA SCIENCE PROJECTS\FHI_SOUTH_AFRICA\data\raw\Test.csv
Variable Definitions True D:\DATA SCIENCE PROJECTS\FHI_SOUTH_AFRICA\data\raw\VariableDefinitions.csv


## Load Data

In [5]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (9618, 39)
Test shape: (2405, 38)


In [6]:
train_df.head(3)

Unnamed: 0,ID,country,owner_age,attitude_stable_business_environment,attitude_worried_shutdown,compliance_income_tax,perception_insurance_doesnt_cover_losses,perception_cannot_afford_insurance,personal_income,business_expenses,...,has_internet_banking,has_debit_card,future_risk_theft_stock,business_age_months,medical_insurance,funeral_insurance,motivation_make_more_money,uses_friends_family_savings,uses_informal_lender,Target
0,ID_3CFL0U,eswatini,63.0,Yes,No,No,No,Yes,3000.0,6000.0,...,Never had,Never had,,6.0,Never had,Used to have but don’t have now,,Never had,Never had,Low
1,ID_XWI7G3,zimbabwe,39.0,No,Yes,Yes,No,Yes,,,...,,,No,3.0,Never had,Never had,,,,Medium
2,ID_TY93LV,malawi,34.0,Don’t know or N/A,No,No,Don't know,Yes,30000.0,6000.0,...,Never had,Never had,Yes,,,,Yes,,,Low


### Saninty Checks

* Check key columns
* Check ID/Business_id
* Check countries

In [7]:
# Columns check
cols = set(train_df.columns)
print("Has Target:", "Target" in cols)

Has Target: True


In [8]:
# ID/Business ID check
print("Has ID:", "ID" in cols, "| Has Business ID:", "business_id" in cols)

Has ID: True | Has Business ID: False


In [9]:
# country check
print("Has Country:", "country" in cols)

Has Country: True


In [10]:
# train/test column mismatch
test_cols = set(test_df.columns)
train_cols = set(train_df.columns)

print("Columns in train not in test:", sorted(list(train_cols - test_cols))[:30], "...")
print("Columns in test not in train:", sorted(list(test_cols - train_cols))[:30], "...")

Columns in train not in test: ['Target'] ...
Columns in test not in train: [] ...


## Target Distribution

In [11]:
target_count = train_df["Target"].value_counts(dropna=False)
target_pct = (target_count / len(train_df) * 100).round(2)

target_dist = pd.DataFrame({"Count": target_count, "Percentage": target_pct})
target_dist.to_csv(OUT_PROF / "target_distribution_overall.csv", index=True)

target_dist

Unnamed: 0_level_0,Count,Percentage
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
Low,6280,65.29
Medium,2868,29.82
High,470,4.89


## Target Distribution by Country

In [12]:
if "country" in train_df.columns:
    by_country = (
        train_df.groupby("country")["Target"]
        .value_counts(normalize=False)
        .rename("count")
        .reset_index()
    )

    by_country["pct_within_country"] = (
        by_country.groupby("country")["count"].transform(lambda x: (x / x.sum() * 100).round(2))
    )

    by_country.to_csv(OUT_PROF / "target_distribution_by_country.csv", index=False)
    by_country

else:
    print("No country column found.")

In [22]:
by_country.head(20)

Unnamed: 0,country,Target,count,pct_within_country
0,eswatini,Low,1375,51.42
1,eswatini,Medium,992,37.1
2,eswatini,High,307,11.48
3,lesotho,Low,1174,60.39
4,lesotho,Medium,764,39.3
5,lesotho,High,6,0.31
6,malawi,Low,1940,81.24
7,malawi,Medium,352,14.74
8,malawi,High,96,4.02
9,zimbabwe,Low,1791,68.57


## Missing Data

In [14]:
def missingness_table(df: pd.DataFrame, name: str) -> pd.DataFrame:
    miss = df.isna().sum()
    pct = (miss / len(df) * 100).round(2)
    out = pd.DataFrame({"missing_count": miss, "missing_pct": pct}).sort_values("missing_pct", ascending=False)
    out.to_csv(OUT_PROF / f"missingness_{name}.csv", index=True)
    return out

miss_train = missingness_table(train_df, "train")
miss_test = missingness_table(test_df, "test")

miss_train.head(10)

Unnamed: 0,missing_count,missing_pct
uses_informal_lender,4489,46.67
uses_friends_family_savings,4488,46.66
motivation_make_more_money,4291,44.61
funeral_insurance,4188,43.54
medical_insurance,4188,43.54
business_age_months,4111,42.74
future_risk_theft_stock,4100,42.63
has_debit_card,4003,41.62
has_internet_banking,4003,41.62
has_loan_account,3999,41.58


## Numeric vs Categorical Columns

In [15]:
# Drop target from feature typing
feature_cols = [c for c in train_df.columns if c != "Target"]

numeric_cols = train_df[feature_cols].select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = [c for c in feature_cols if c not in numeric_cols]

pd.Series({
    "n_features_total": len(feature_cols),
    "n_numeric": len(numeric_cols),
    "n_categorical": len(categorical_cols)
})

n_features_total    38
n_numeric            6
n_categorical       32
dtype: int64

In [16]:
# save list
pd.Series(numeric_cols).to_csv(OUT_PROF / "numeric_columns.csv", index=False, header=["column"])
pd.Series(categorical_cols).to_csv(OUT_PROF / "categorical_columns.csv", index=False, header=["column"])

numeric_cols[:10], categorical_cols[:10]

(['owner_age',
  'personal_income',
  'business_expenses',
  'business_turnover',
  'business_age_years',
  'business_age_months'],
 ['ID',
  'country',
  'attitude_stable_business_environment',
  'attitude_worried_shutdown',
  'compliance_income_tax',
  'perception_insurance_doesnt_cover_losses',
  'perception_cannot_afford_insurance',
  'motor_vehicle_insurance',
  'has_mobile_money',
  'current_problem_cash_flow'])

## Outlier & Skew Check for Numeric Columns

In [17]:
def numeric_profile(df: pd.DataFrame, cols:list[str]) ->pd.DataFrame:
    rows = []
    for c in cols:
        s = df[c]
        rows.append({
            "column": c,
            "non-null": int(s.notna().sum()),
            "missing_pct": float((s.isna().mean()*100)),
            "min": float(np.nanmin(s.values)) if s.notna().any() else np.nan,
            "p01": float(np.nanpercentile(s.values, 1)) if s.notna().any() else np.nan,
            "p50": float(np.nanpercentile(s.values, 50)) if s.notna().any() else np.nan,
            "p99": float(np.nanpercentile(s.values, 99)) if s.notna().any() else np.nan,
            "max": float(np.nanmax(s.values)) if s.notna().any() else np.nan,
            "mean": float(np.nanmean(s.values)) if s.notna().any() else np.nan,

        })

    out = pd.DataFrame(rows).sort_values("missing_pct", ascending=False)
    out.to_csv(OUT_PROF / "numeric_profile.csv", index=False)
    return out

num_prof = numeric_profile(train_df, numeric_cols)
num_prof

Unnamed: 0,column,non-null,missing_pct,min,p01,p50,p99,max,mean
5,business_age_months,5507,42.742774,0.0,0.0,3.0,11.0,11.0,3.636281
4,business_age_years,9366,2.620087,0.0,0.0,4.0,37.0,60.0,7.030536
2,business_expenses,9389,2.380952,0.0,40.0,3000.0,8019200.0,500000000.0,458383.8
3,business_turnover,9402,2.245789,0.0,97.0,6000.0,24995000.0,420000000.0,1348210.0
1,personal_income,9509,1.133292,0.0,20.0,2000.0,3129200.0,150000000.0,262734.5
0,owner_age,9618,0.0,18.0,20.0,40.0,76.0,103.0,41.70534


## Categorical Cardinality + Top Values

* Spot messy survey tokens

In [18]:
def categorical_profile(df: pd.DataFrame, cols: list[str], top_n: int = 10) ->pd.DataFrame:
    rows = []
    for c in cols:
        s = df[c].astype("object")
        rows.append({
            "column": c,
            "n_unique": int(s.nunique(dropna=True)),
            "missing_pct": float((s.isna().mean()*100)),
            "top_values": str(s.value_counts(dropna=False).head(top_n).to_dict()),
        })

        out = pd.DataFrame(rows).sort_values(["missing_pct", "n_unique"], ascending=[False, False])
        out.to_csv(OUT_PROF / "categorical_profile_top_values.csv", index=False)
    return out

cat_prof = categorical_profile(train_df, categorical_cols, top_n=8)
cat_prof.head(30)

Unnamed: 0,column,n_unique,missing_pct,top_values
31,uses_informal_lender,6,46.672905,"{nan: 4489, 'Never had': 2864, ""Used to have b..."
30,uses_friends_family_savings,6,46.662508,"{nan: 4488, 'Never had': 2895, ""Used to have b..."
29,motivation_make_more_money,2,44.614265,"{nan: 4291, 'Yes': 3462, 'No': 1865}"
27,medical_insurance,6,43.543356,"{'Never had': 4794, nan: 4188, 'Have now': 404..."
28,funeral_insurance,5,43.543356,"{nan: 4188, 'Never had': 3615, 'Have now': 160..."
26,future_risk_theft_stock,2,42.628405,"{nan: 4100, 'No': 3191, 'Yes': 2327}"
24,has_internet_banking,6,41.619879,"{'Never had': 4713, nan: 4003, 'Have now': 550..."
25,has_debit_card,5,41.619879,"{nan: 4003, 'Never had': 3864, 'Have now': 143..."
23,has_loan_account,6,41.578291,"{'Never had': 4501, nan: 3999, ""Used to have b..."
9,current_problem_cash_flow,3,39.280516,"{nan: 3778, 'Yes': 2688, 'No': 1887, '0': 1265}"


## Detect "Weird Token" Inconsistencies & Normalize

In [19]:
# common messy tokens to look fo in categorical fields

tokens = ["Don't", "Don`t", "Refused", "N/A", "NA", "None", "none", "uknown", "Unknown"]

hits = []

for c in categorical_cols:
    s = train_df[c].astype("object")
    vc = s.value_counts(dropna=False)
    present = [t for t in tokens if any (vc.index.astype(str).str.contains(t, na=False))]
    if present:
        hits.append({"column": c, "tokens_found": present})

hits_df = pd.DataFrame(hits).sort_values("column")
hits_df.to_csv(OUT_PROF / "token_inconsistencies.csv", index=False)

hits_df.head(50)

Unnamed: 0,column,tokens_found
0,ID,[NA]
13,attitude_more_successful_next_year,"[Don't, N/A]"
8,attitude_satisfied_with_achievement,"[Don't, N/A]"
1,attitude_stable_business_environment,[N/A]
2,attitude_worried_shutdown,[N/A]
3,compliance_income_tax,[Refused]
12,covid_essential_service,[Don't]
18,funeral_insurance,[Don't]
9,has_credit_card,[Don't]
16,has_debit_card,[Don't]


## Leakage Scan

* Flag Columns suspiciously correlated with the target after simple Encoding -> any numeric columns that separate classes too perfectly

In [20]:
if len(numeric_cols) > 0:
    tmp = train_df[numeric_cols + ["Target"]].copy()
    stats = tmp.groupby("Target")[numeric_cols].agg(["mean", "median"]).transpose()
    stats.to_csv(OUT_PROF / "numeric_by_target_summary.csv")
    stats.head(30)
else:
    print("No numeric columns found.")

In [21]:
stats.head(30)

Unnamed: 0,Target,High,Low,Medium
owner_age,mean,44.61064,40.73296,43.358438
owner_age,median,42.0,39.0,42.0
personal_income,mean,882802.2,254094.8,180199.970205
personal_income,median,5000.0,2000.0,1500.0
business_expenses,mean,1062801.0,485568.6,297952.644548
business_expenses,median,10100.0,3000.0,2000.0
business_turnover,mean,5598924.0,1253353.0,852437.068719
business_turnover,median,45000.0,6000.0,4500.0
business_age_years,mean,8.015021,7.033745,6.861239
business_age_years,median,5.0,4.0,4.0
