## Import + Load Train + Run Contract Engine

In [37]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd

In [38]:
# Import sklearn dependencies
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

import joblib

In [39]:
# Define paths
# Project root: defaults to the original local checkout, but can be redirected
# by setting the FHI_ROOT environment variable so the notebook is runnable on
# machines where the project does not live on D:\.
ROOT = Path(os.environ.get("FHI_ROOT", r"D:\DATA SCIENCE PROJECTS\FHI_SOUTH_AFRICA"))
DATA_RAW = ROOT / "data" / "raw"
OUT_MODELS = ROOT / "outputs" / "models" / "staging"
OUT_REPORTS = ROOT / "outputs" / "reports"

# Put the project root on the import path (a later cell imports
# `src.contract.engine`). Guarded so that re-running this cell does not keep
# appending duplicate entries to sys.path.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

# Output folders are created up front; exist_ok keeps the cell re-runnable.
OUT_MODELS.mkdir(parents=True, exist_ok=True)
OUT_REPORTS.mkdir(parents=True, exist_ok=True)

TARGET_COL = "Target"

train_df = pd.read_csv(DATA_RAW / "Train.csv")

# Fail fast if the label is absent: otherwise the feature list built in the
# next cell would silently treat every column as a feature.
assert TARGET_COL in train_df.columns, (
    f"Expected target column {TARGET_COL!r} in {DATA_RAW / 'Train.csv'}"
)

#### *Canonical Features + Type List*

In [40]:
# Every column of the training frame other than the target is a canonical feature.
canonical_features = [col for col in train_df.columns if col != TARGET_COL]

# Alias, not a copy: both names refer to the same list object.
features_cols = canonical_features

# Split the features by dtype: numpy-numeric columns vs. everything else.
numeric_cols = list(
    train_df[features_cols].select_dtypes(include=[np.number]).columns
)
categorical_cols = [col for col in features_cols if col not in numeric_cols]

# NOTE(review): any identifier column in Train.csv (the recorded engine notes
# mention an `ID` column) is counted here as a categorical feature — confirm
# that this is intended.
print(
    "n_features:", len(features_cols),
    "| n_numeric:", len(numeric_cols),
    "| n_cat:", len(categorical_cols),
)

n_features: 38 | n_numeric: 6 | n_cat: 32


#### *Run Contract Engine on Train Features*

In [41]:
# Project-local import: kept in this cell because it only resolves once the
# path-setup cell has put ROOT on sys.path.
from src.contract.engine import run_contract_engine

# Separate features from the label; copies leave train_df untouched.
X_raw = train_df.drop(columns=[TARGET_COL]).copy()
y = train_df[TARGET_COL].copy()

# The engine returns a dict; this cell reads its "df", "compatibility_grade"
# and "compatibility_notes" entries.
out = run_contract_engine(
    X_raw,
    canonical_features=canonical_features,
    categorical_cols=categorical_cols,
)
X = out["df"]

print("Compatibility grade:", out["compatibility_grade"])
print("X:", X.shape, "y:", y.shape)
print("Has busieness_id:", "business_id" in X.columns)
# Country detection ignores case and surrounding whitespace in the column name.
print("Has country:", any(col.strip().lower() == "country" for col in X.columns))

# Show only the first ten engine notes to keep the output short.
print("\nNotes (first 10):")
for note in out["compatibility_notes"][:10]:
    print("-", note)

Compatibility grade: 1
X: (9618, 39) y: (9618,)
Has busieness_id: True
Has country: True

Notes (first 10):
- Renamed ID -> business_id.
- Added missing column 'ID' as NaN (schema alignment).
- Normalized 32 categorical columns.
- Derived business_age_months from business_age_years where missing.
- Derived business_age_years from business_age_months where missing.
- Detected country column: 'country'
- Signal summary: basics=3, financial_activity=3, access_resilience=12
- Grade 1: Sufficient signals for reliable scoring.


In [42]:
# Dimensional sanity check: number of feature columns vs. number of labels.
print("X columns count:", X.shape[1])
print("y lenth:", y.shape[0])

X columns count: 39
y lenth: 9618


In [43]:
# X has one column more than the 38 canonical features. First rule out stray
# index-like columns, i.e. names containing "unnamed" or "index" in any casing.
suspicious_col = [
    name
    for name in X.columns
    if any(token in name.lower() for token in ("unnamed", "index"))
]
print("Suspicious column:", suspicious_col)

Suspicious column: []


In [44]:
# Leakage guard: the label column must not be among the feature columns.
# (Membership on a DataFrame tests its column labels.)
print("Target in X?:", TARGET_COL in X)

Target in X?: False


In [45]:
# Diff the engine's output columns against the canonical feature list, in both
# directions; sorted so the printed lists are stable.
extra = sorted(set(X.columns).difference(canonical_features))
missing = sorted(set(canonical_features).difference(X.columns))

print("Extra columns vs canonical_features:", extra)
print("Missing columns vs canonical_features:", missing)

Extra columns vs canonical_features: ['business_id']
Missing columns vs canonical_features: []


In [46]:
# Remove the non-canonical columns (errors="ignore" keeps the cell re-runnable
# once they are already gone), then select the canonical features so that X's
# column order matches canonical_features exactly.
# NOTE(review): the recorded engine notes say `ID` was renamed to `business_id`
# and a fresh `ID` column was added as NaN; dropping `business_id` here
# therefore presumably leaves an all-NaN `ID` column in X — confirm before
# feeding it to a model.
X = (
    X.drop(columns=extra, errors="ignore")
    [canonical_features]
)

print("\nFixed X shape:", X.shape)
print("\nTarget in X?:", TARGET_COL in X.columns)


Fixed X shape: (9618, 38)

Target in X?: False


### *Create correct canonical schema*

In [47]:
# Freeze the canonical schema as the exact, ordered column list of the aligned
# X, so it mirrors what remains of the contract engine's output.
canonical_data_cols = X.columns.tolist()

print("Canonical data cols count:", len(canonical_data_cols))
print("Has business_id", "business_id" in canonical_data_cols)
print("Has ID:", "ID" in canonical_data_cols)

Canonical data cols count: 38
Has business_id False
Has ID: True
