## 1. Column Mapping

#### Setup & Load Data

In [1]:
import pandas as pd 
import numpy as np 
from pathlib import Path 
import re

In [4]:
# Project directory layout: raw inputs under data/raw, profiling artefacts
# under outputs/profiling.
ROOT = Path(r"D:\DATA SCIENCE PROJECTS\FHI_SOUTH_AFRICA")
DATA_RAW = ROOT.joinpath("data", "raw")
OUT_PROF = ROOT.joinpath("outputs", "profiling")

# Create the profiling folder (and any missing parents); a pre-existing
# folder is not an error.
OUT_PROF.mkdir(parents=True, exist_ok=True)

In [5]:
# Locations of the raw train/test CSV files inside the raw-data folder.
train_path, test_path = (DATA_RAW / file_name for file_name in ("Train.csv", "Test.csv"))

In [6]:
# Load both raw splits using pandas' default CSV parsing options.
train_df = pd.read_csv(filepath_or_buffer=train_path)
test_df = pd.read_csv(filepath_or_buffer=test_path)

In [7]:
# Report (rows, columns) for both splits, then preview the first two training rows.
print(f"Train: {train_df.shape} | Test: {test_df.shape}")
train_df.head(2)

Train: (9618, 39) | Test: (2405, 38)


Unnamed: 0,ID,country,owner_age,attitude_stable_business_environment,attitude_worried_shutdown,compliance_income_tax,perception_insurance_doesnt_cover_losses,perception_cannot_afford_insurance,personal_income,business_expenses,...,has_internet_banking,has_debit_card,future_risk_theft_stock,business_age_months,medical_insurance,funeral_insurance,motivation_make_more_money,uses_friends_family_savings,uses_informal_lender,Target
0,ID_3CFL0U,eswatini,63.0,Yes,No,No,No,Yes,3000.0,6000.0,...,Never had,Never had,,6.0,Never had,Used to have but don’t have now,,Never had,Never had,Low
1,ID_XWI7G3,zimbabwe,39.0,No,Yes,Yes,No,Yes,,,...,,,No,3.0,Never had,Never had,,,,Medium


#### Utility `->` Build Expected Feature List From Train

In [8]:
# Column-name constants used when selecting columns from the raw frames.
TARGET_COL = "Target"
# The raw files spell this column in lowercase ("country"); a capitalised
# constant would not match any column and lookups would raise KeyError.
COUNTRY_COL = "country"

# Canonical feature list = train columns excluding target
CANONICAL_FEATURES = [c for c in train_df.columns if c != TARGET_COL]

print("Canonical Feature Count:", len(CANONICAL_FEATURES))
print("First 10 Canonical Features:", CANONICAL_FEATURES[:10])

Canonical Feature Count: 38
First 10 Canonical Features: ['ID', 'country', 'owner_age', 'attitude_stable_business_environment', 'attitude_worried_shutdown', 'compliance_income_tax', 'perception_insurance_doesnt_cover_losses', 'perception_cannot_afford_insurance', 'personal_income', 'business_expenses']


#### Schema Mapping (ID `->` business_id)

In [9]:
def map_schema(df: pd.DataFrame) -> tuple[pd.DataFrame, list[str]]:
    """
    Standardize the identifier column name to ``business_id``.

    Rules, applied in this order:
    - If ``business_id`` exists, keep it unchanged.
    - Else if ``ID`` exists, rename it to ``business_id``.
    - Else generate ``business_id`` values ``auto_000001``, ``auto_000002``, ...

    The input frame is not modified; a copy is returned.

    Returns:
        (mapped_df, notes) where ``notes`` is a list of human-readable
        messages describing what was done.
    """
    notes = []

    df = df.copy()

    if "business_id" in df.columns:
        notes.append("business_id already present.")
    elif "ID" in df.columns:
        df = df.rename(columns={"ID": "business_id"})
        notes.append("Renamed ID -> business_id.")
    else:
        # Generated ids are 1-based and follow row position (not index labels),
        # so they are only reproducible while the row order stays the same.
        # The "auto_" prefix matches the note emitted below.
        df["business_id"] = [f"auto_{i:06d}" for i in range(1, len(df) + 1)]
        notes.append("No ID/business_id found. Generated business_id as auto_000001...")

    return df, notes

## 2. Normalize Text Values

In [10]:
# Lower-cased tokens that mean "no answer". Values are stripped and
# whitespace-collapsed before lookup, so blank and whitespace-only cells all
# reduce to the empty string.
MISSING_TOKENS = {
    "", " ", "  ", "n/a", "na", "none", "null", "nan", "refused", "prefer not to say", "not applicable"
    }

# Lower-cased spellings of "don't know". Curly apostrophes are converted to
# straight ones before lookup, so only the straight, bare and backtick
# variants need to be listed.
_DONT_KNOW_TOKENS = frozenset({"don't know", "dont know", "don`t know"})


def normalize_text_value(x):
    """
    Normalize one categorical cell value to a clean string.

    - NA values (as judged by ``pd.isna``) become ``"missing"``.
    - Anything else is cast to ``str``, stripped, has curly apostrophes
      replaced by straight ones and runs of whitespace collapsed to one space.
    - Values whose lower-cased form is in ``MISSING_TOKENS`` become ``"missing"``.
    - "Don't know" style answers (case-insensitive; straight, curly, backtick
      or no apostrophe) become ``"dont_know"``.
    - Every other value is returned cleaned, with its original casing.
    """
    if pd.isna(x):
        return "missing"

    # cast to str
    s = str(x).strip()

    # Normalize curly apostrophes to straight
    s = s.replace("’", "'").replace("‘", "'")

    # collapse repeated whitespace
    s = re.sub(r"\s+", " ", s).strip()

    lowered = s.lower()

    # Handle missing tokens
    if lowered in MISSING_TOKENS:
        return "missing"

    # Standardize every "don't know" spelling to a single token
    if lowered in _DONT_KNOW_TOKENS:
        return "dont_know"
    return s


In [11]:
def normalize_categoricals(df: pd.DataFrame, categorical_cols: list[str]) -> tuple[pd.DataFrame, list[str]]:
    """
    Normalize categorical columns to reduce duplicate categories and stabilize encoding.

    Each listed column that exists in ``df`` is cast to ``object`` and mapped
    through ``normalize_text_value``. Listed columns that are not in ``df``
    are skipped and reported in the notes instead of raising ``KeyError``.
    No dtype check is made: the caller decides which columns are categorical,
    and every listed column that is present is converted to normalized strings.

    The input frame is not modified; a copy is returned.

    Returns:
        (cleaned_df, notes) where ``notes`` is a list of human-readable
        messages describing what was done.
    """

    df = df.copy()
    notes = []

    present = [c for c in categorical_cols if c in df.columns]
    absent = [c for c in categorical_cols if c not in df.columns]

    for c in present:
        # Cast to object first so NA cells reach the mapper and come back as "missing".
        df[c] = df[c].astype("object").map(normalize_text_value)

    if absent:
        notes.append(f"Skipped {len(absent)} categorical columns not present in the frame: {absent}.")
    # Report the number of columns actually processed, not the number requested.
    notes.append(f"Normalized {len(present)} categorical columns (apostrophes, missing tokens, dont_know).")
    return df, notes

#### Identify Numeric vs Categorical Columns From Profiling Logic

In [12]:
# Column types are inferred from the training split only.
feature_cols = [col for col in train_df.columns if col != TARGET_COL]

# Numeric = any NumPy number dtype; every remaining feature is treated as categorical.
numeric_cols = list(train_df[feature_cols].select_dtypes(include=[np.number]).columns)
categorical_cols = [col for col in feature_cols if col not in numeric_cols]

print("Numeric:", numeric_cols)
print("Categorical count:", len(categorical_cols))

Numeric: ['owner_age', 'personal_income', 'business_expenses', 'business_turnover', 'business_age_years', 'business_age_months']
Categorical count: 32


## Safe Feature Derivation

In [13]:
def derive_features(df: pd.DataFrame) -> tuple[pd.DataFrame, list[str]]:
    """
    Fill missing ``business_age_years`` from ``business_age_months`` where possible.

    A year value counts as missing when it is NA or the string ``"missing"``.
    It is filled with ``floor(months / 12)`` only on rows where the month
    value parses as a number. Rows without a usable month value are left
    untouched. ``business_age_months`` is never modified.

    If either column is absent, nothing is derived and a note says so.
    Does NOT guess money units or invent financial values.

    The input frame is not modified; a copy is returned.

    Returns:
        (df_with_features, notes) where ``notes`` is a list of human-readable
        messages describing what was done.
    """
    df = df.copy()
    notes = []

    required = ("business_age_years", "business_age_months")
    if any(col not in df.columns for col in required):
        # Guard the whole derivation: indexing an absent column raises KeyError.
        notes.append("business_age_years/business_age_months not both present; no features derived.")
        return df, notes

    # If years missing but months present
    years_missing = df["business_age_years"].isna() | (df["business_age_years"] == "missing")
    months_numeric = pd.to_numeric(df["business_age_months"], errors="coerce")

    # Only touch rows that can really be filled, so the note below is truthful
    # and "missing" placeholders are not overwritten with NaN.
    fillable = years_missing & months_numeric.notna()
    if fillable.any():
        df.loc[fillable, "business_age_years"] = np.floor(months_numeric[fillable] / 12)
        notes.append("Derived business_age_years from business_age_months where missing.")

    return df, notes