## Import + Load Train + Run Contract Engine

In [93]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys

In [94]:
# Import the project's contract modules under short aliases so they can be
# reloaded below.
import importlib
import src.contract.normalizer as norm
import src.contract.engine as eng

# Re-execute both modules so that edits made to their source files are picked
# up without restarting the kernel.
# NOTE(review): normalizer is reloaded before engine, presumably because engine
# depends on it — confirm against the module imports.
importlib.reload(norm)
# Last expression of the cell: the reloaded engine module is displayed as the
# cell's output.
importlib.reload(eng)

<module 'src.contract.engine' from 'D:\\DATA SCIENCE PROJECTS\\FHI_SOUTH_AFRICA\\src\\contract\\engine.py'>

In [95]:
# Import sklearn dependencies
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

import joblib

In [96]:
# Project layout. ROOT is a machine-specific absolute path; every other
# location is derived from it, so it is the only line to edit when the project
# lives somewhere else.
ROOT = Path(r"D:\DATA SCIENCE PROJECTS\FHI_SOUTH_AFRICA")
DATA_RAW = ROOT / "data" / "raw"
OUT_MODELS = ROOT / "outputs" / "models" / "staging"
OUT_REPORTS = ROOT / "outputs" / "reports"

# Put the project root on the import path. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry each time.
# NOTE(review): the `src.contract` imports earlier in the notebook execute
# before this line, so on a fresh kernel they presumably rely on the working
# directory already being ROOT — confirm, or move this step above them.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

# Create the output folders up front; both calls are no-ops when the folders
# already exist.
OUT_MODELS.mkdir(parents=True, exist_ok=True)
OUT_REPORTS.mkdir(parents=True, exist_ok=True)

# Raw training data, including the label column named by TARGET_COL.
train_df = pd.read_csv(DATA_RAW / "Train.csv")

TARGET_COL = "Target"

#### *Canonical Features + Type List*

In [97]:
# Every column of the training frame except the target is a canonical feature.
canonical_features = [col for col in train_df.columns if col != TARGET_COL]

# The feature list is the canonical list itself (same list object, not a copy).
features_cols = canonical_features

# Split the features by dtype: numeric columns first, the remainder are
# treated as categorical.
numeric_cols = (
    train_df[features_cols]
    .select_dtypes(include=[np.number])
    .columns
    .tolist()
)
categorical_cols = [col for col in features_cols if col not in numeric_cols]

print("n_features:", len(features_cols), "| n_numeric:", len(numeric_cols), "| n_cat:", len(categorical_cols))

n_features: 38 | n_numeric: 6 | n_cat: 32


#### *Run Contract Engine on Train Features*

In [104]:
from src.contract.engine import run_contract_engine


# Separate the label from the raw feature frame; copies keep train_df untouched.
y = train_df[TARGET_COL].copy()
X_raw = train_df.drop(columns=[TARGET_COL]).copy()

# Run the contract engine on the raw features. The result is read below as a
# mapping with the keys "df", "compatibility_grade" and "compatibility_notes".
out = run_contract_engine(X_raw, canonical_features_raw=canonical_features, categorical_cols=categorical_cols)
X = out["df"].copy()

# Diagnostics on the engine output.
print("Compatibility grade:", out["compatibility_grade"])
print("X:", X.shape, "y:", y.shape)
# Label typo fixed ("busieness_id" -> "business_id") so it matches the column checked.
print("Has business_id:", "business_id" in X.columns)
# Case- and whitespace-insensitive match on the column name.
print("Has country:", any(c.strip().lower() == "country" for c in X.columns))
print("X has ID:", "ID" in X.columns)
# Check whether any "Unnamed" index columns exist
print("Unnamed cols:", [c for c in X.columns if "unnamed" in c.lower()])

# Check for duplicated column names
print("Duplicate columns?:", X.columns.duplicated().any())

# Show at most the first ten compatibility notes reported by the engine
print("\nNotes (first 10):")
for n in out["compatibility_notes"][:10]:
    print("-", n)

Compatibility grade: 1
X: (9618, 38) y: (9618,)
Has busieness_id: True
Has country: True
X has ID: False
Unnamed cols: []
Duplicate columns?: False

Notes (first 10):
- Renamed ID -> business_id.
- Categorical cols mapped (raw->32 to mapped->32).
- Normalized 32 categorical columns.
- Derived business_age_months from business_age_years where missing.
- Derived business_age_years from business_age_months where missing.
- Detected country column: 'country'
- Signal summary: basics=3, financial_activity=3, access_resilience=12
- Grade 1: Sufficient signals for reliable scoring.


## Build `X_model`

In [105]:
# Columns treated as row identifiers; they are removed from the model matrix.
IDENTIFIER_COLS = ["business_id"]

# errors="ignore" skips any identifier that is not present in X, so no
# pre-filtering of the list is needed.
X_model = X.drop(columns=IDENTIFIER_COLS, errors="ignore")

# Sanity checks: identifier removed, and rows still line up with the labels.
print("X_model shape:", X_model.shape)
print("business_id in X_model?:", "business_id" in X_model.columns)
print("Row count match?:", X_model.shape[0] == len(y))

X_model shape: (9618, 37)
business_id in X_model?: False
Row count match?: True


## Baseline Model: `Logistic Regression`

#### *Split (Stratified)*

In [106]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The split is stratified on the
# target and uses a fixed seed so it is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_model,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the split sizes and the class proportions in the training part.
print("Train:", X_train.shape, "Val:", X_val.shape)
print("y_train distribution:\n", y_train.value_counts(normalize=True).round(3))

Train: (7694, 37) Val: (1924, 37)
y_train distribution:
 Target
Low       0.653
Medium    0.298
High      0.049
Name: proportion, dtype: float64
