# BankChurn Predictor — Demo
Quick run: train on a small sample, predict on a held-out split, and show a simple feature-importance plot.

In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split

from src.bankchurn.config import BankChurnConfig
from src.bankchurn.training import ChurnTrainer
from src.bankchurn.prediction import ChurnPredictor

print('Imports ready')

In [None]:
# Load data (expects data/raw/Churn.csv)
def make_synthetic_churn(n=500, seed=42):
    """Build a random synthetic customer table for when the real CSV is missing.

    A dedicated, seeded generator is used instead of the global ``np.random``
    state so that repeated runs of the notebook produce the same sample (and
    therefore the same demo metrics).

    Args:
        n: Number of rows to generate.
        seed: Seed for the NumPy random generator; equal seeds give equal frames.

    Returns:
        A DataFrame with ``n`` rows and the columns RowNumber, CustomerId,
        Surname, CreditScore, Geography, Gender, Age, Tenure, Balance,
        NumOfProducts, HasCrCard, IsActiveMember, EstimatedSalary and Exited.
    """
    rng = np.random.default_rng(seed)
    return pd.DataFrame({
        'RowNumber': range(1, n + 1),
        'CustomerId': range(10000, 10000 + n),
        'Surname': [f'Customer_{i}' for i in range(n)],
        # Generator.integers excludes the upper bound, like np.random.randint.
        'CreditScore': rng.integers(350, 851, n),
        'Geography': rng.choice(['France', 'Spain', 'Germany'], n),
        'Gender': rng.choice(['Male', 'Female'], n),
        'Age': rng.integers(18, 93, n),
        'Tenure': rng.integers(0, 11, n),
        'Balance': rng.uniform(0, 250000, n),
        'NumOfProducts': rng.integers(1, 5, n),
        'HasCrCard': rng.choice([0, 1], n),
        'IsActiveMember': rng.choice([0, 1], n),
        'EstimatedSalary': rng.uniform(11, 200000, n),
        # Roughly 20% of the synthetic rows are labelled as exited.
        'Exited': rng.choice([0, 1], n, p=[0.8, 0.2]),
    })


csv_path = Path('data/raw/Churn.csv')
if csv_path.exists():
    df = pd.read_csv(csv_path)
else:
    # No real data on disk: fall back to a small reproducible synthetic sample.
    df = make_synthetic_churn()
df.head()

In [None]:
# Build the trainer from the YAML config
config = BankChurnConfig.from_yaml('configs/config.yaml')
config.mlflow.enabled = False  # MLflow stays off so the demo adds no clutter

trainer = ChurnTrainer(config)

# Derive the feature matrix and the target from the raw frame
X, y = trainer.prepare_features(df)

# Hold out 20% of the rows for the demo check, stratified on the target
# (manual split for demo purposes; the trainer is said to handle CV internally)
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Fit on the training portion only (use_cv=False)
model, metrics = trainer.train(X_train, y_train, use_cv=False)
print("Training Metrics:", metrics)

# Wrap the fitted model in a predictor and score the held-out rows
predictor = ChurnPredictor(model)
preds_df = predictor.predict(X_test)

# Demo metric: share of held-out rows whose prediction equals the label
matches = preds_df["prediction"].values == y_test.values
acc = matches.mean()
summary = {'accuracy_demo': float(acc)}
print(json.dumps(summary, indent=2))

In [None]:
# Simple feature importance snapshot
# The model is a VotingClassifier, which has no single "feature_importances_"
# array, so we look at its RandomForest member ('rf') instead.
# Expected nesting (per the notebook author):
#   Pipeline -> 'classifier' (ResampleClassifier) -> 'estimator_' (VotingClassifier) -> named_estimators_

TOP_K = 10  # maximum number of bars to draw

try:
    pipeline = trainer.model_
    classifier_step = pipeline.named_steps['classifier']

    # ResampleClassifier wraps the real estimator in 'estimator_'. If the step is
    # not wrapped, try the step itself; any mismatch raises below and lands in the
    # fallback instead of silently drawing nothing.
    voting_clf = getattr(classifier_step, 'estimator_', classifier_step)
    rf_model = voting_clf.named_estimators_['rf']

    importances = np.asarray(rf_model.feature_importances_)
    # Never ask for more bars than there are features (avoids a shape mismatch).
    top_k = min(TOP_K, len(importances))
    indices = np.argsort(importances)[::-1][:top_k]

    # Best effort: label bars with the preprocessor's output feature names.
    # Not every transformer exposes them, so fall back to positional labels.
    try:
        names = np.asarray(pipeline.named_steps['preprocessor'].get_feature_names_out())
    except Exception:
        names = None
    if names is not None and len(names) == len(importances):
        labels = [str(name) for name in names[indices]]
    else:
        labels = [f"feature {i}" for i in indices]

    # All values are computed before the figure is created, so a failure above
    # does not leave an empty figure behind.
    plt.figure(figsize=(10,6))
    plt.title("Feature Importances (Random Forest component)")
    plt.bar(range(top_k), importances[indices])
    plt.xticks(range(top_k), labels, rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

except Exception as e:
    # Deliberately broad: this is a best-effort demo cell and any failure in the
    # model introspection should degrade to the proxy plot rather than crash.
    print(f"Could not extract feature importances directly: {e}")
    # Fallback to variance proxy
    imp = X.select_dtypes(include=np.number).var().sort_values(ascending=False).head(10)
    imp.plot(kind='bar', title='Feature importance (proxy)');