In [13]:
import os
import joblib
import pandas as pd
import logging
from src.data.demo_data_preprocessing import (
    TargetExtractor,
    NaNDuplicatesRemover,
    OutlierRemover,
    FeatureTransformer,
    ColumnAligner,
)


# Set up logging.
# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers (for example when this cell is re-run in the same kernel), so the
# format below only applies the first time logging is configured.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

def preprocess_demo_data(demo_csv_path, output_path="artifacts/demo_processed.csv"):
    """Run the transform-only preprocessing steps on a demo CSV and save the result.

    The fitted state (outlier thresholds, feature transformers, reference
    columns) is loaded from the ``artifacts/`` directory and injected into
    freshly constructed pipeline steps; nothing is re-fitted on the demo data.

    Args:
        demo_csv_path: Path of the CSV file to preprocess.
        output_path: Destination CSV path. Its parent directory is created
            when it does not exist yet.

    Returns:
        The processed data as returned by the inference pipeline (the code
        below relies on it exposing ``.shape`` and ``.to_csv``).
    """
    # sklearn is not imported at the top of this cell, so import it here.
    from sklearn.pipeline import Pipeline

    # SECURITY: joblib.load unpickles arbitrary objects -- only load artifacts
    # produced by a trusted training run.
    transformers = joblib.load("artifacts/transformers.pkl")
    outlier_thresholds = joblib.load("artifacts/outlier_thresholds.pkl")
    reference_columns = joblib.load("artifacts/feature_columns.pkl")

    # Load demo data
    demo_df = pd.read_csv(demo_csv_path)
    logging.info(f"Loaded demo data with shape: {demo_df.shape}")

    # Recreate minimal pipeline for inference (exclude target extraction)
    inference_pipeline = Pipeline(steps=[
        ('cleaning', NaNDuplicatesRemover()),
        ('outlier_removal', OutlierRemover()),
        ('feature_transform', FeatureTransformer()),
        ('column_alignment', ColumnAligner(reference_columns=reference_columns)),
    ])

    # Inject fitted parameters instead of fitting on the demo data
    inference_pipeline.named_steps['outlier_removal'].thresholds_ = outlier_thresholds
    inference_pipeline.named_steps['feature_transform'].transformers_ = transformers

    # Transform demo data
    processed_demo = inference_pipeline.transform(demo_df)
    logging.info(f"Processed demo data shape: {processed_demo.shape}")

    # Create the directory the output is actually written to (previously the
    # hard-coded "artifacts" directory was created regardless of output_path).
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    processed_demo.to_csv(output_path, index=False)
    logging.info(f"Processed demo data saved to: {output_path}")

    return processed_demo

In [14]:
# Preview the first rows of the loan dataset.
# NOTE(review): `df` is not assigned in any cell shown before this one; it
# presumably comes from an earlier or deleted cell -- confirm it is loaded
# before this cell when running on a fresh kernel.
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [17]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictor columns.
y = df['loan_status']
X = df.drop(columns='loan_status')

In [18]:
# Preprocess the full loan dataset and write the result under ./demo_artifacts/.
preprocess_demo_data('./notebooks/loan_data.csv', './demo_artifacts/demo_processed.csv')

2025-05-06 12:11:06,875 - INFO - Loaded demo data with shape: (45000, 14)
2025-05-06 12:11:06,875 - INFO - Initial shape before NaN/Duplicate removal: (45000, 14)
2025-05-06 12:11:06,918 - INFO - Shape after NaN/Duplicate removal: (45000, 14)
2025-05-06 12:11:06,918 - INFO - Initial shape before outlier removal: (45000, 14)
2025-05-06 12:11:06,950 - INFO - Shape after outlier removal: (37515, 14)
2025-05-06 12:11:06,965 - INFO - Processed demo data shape: (37515, 13)
2025-05-06 12:11:07,361 - INFO - Processed demo data saved to: ./demo_artifacts/demo_processed.csv


In [26]:
import os
import joblib
import pandas as pd
import numpy as np
import logging
import yaml
from sklearn.pipeline import Pipeline

# Import data preprocessing modules
from src.data.data_transformation import (
    load_feature_config,
    create_preprocessing_pipeline
)

# Import feature engineering modules
from src.features.feature_engineering import (
    ColumnStandardizer,
    OneHotEncoder,
    TargetSeparator,
    SMOTEBalancer,
    FeatureColumnSaver,
    create_feature_engineering_pipeline,
    load_feature_store_params
)

# Import preprocessing components
from src.data.demo_data_preprocessing import (
    NaNDuplicatesRemover,
    OutlierRemover,
    FeatureTransformer,
    ColumnAligner
)

# Import utility functions
from src.logger import logging, section

# Set up logging.
# NOTE(review): `logging` was rebound by `from src.logger import logging, section`
# above, so this call goes through whatever object src.logger exports under
# that name. The recorded output of this cell uses a "[✓] ... | root | INFO |"
# layout rather than the format below, which suggests logging was already
# configured and this call had no effect -- confirm whether it is still needed.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

def preprocess_demo_data(demo_csv_path, output_path="artifacts/demo_processed.csv"):
    """Preprocess demo data with the saved, already-fitted preprocessing state.

    The individual steps are rebuilt and the fitted state (outlier thresholds,
    feature transformers, reference columns) is injected from ``artifacts/``.
    Each step is applied in sequence with the resulting shape logged.

    Args:
        demo_csv_path: Path of the CSV file to preprocess.
        output_path: Destination CSV path. A bare filename (no directory
            component) is written to the current working directory.

    Returns:
        The fully processed data after column alignment.
    """
    section("PREPROCESSING DEMO DATA", level=logging.INFO)

    # SECURITY: joblib.load unpickles arbitrary objects -- only load artifacts
    # produced by a trusted training run.
    transformers = joblib.load("artifacts/transformers.pkl")
    outlier_thresholds = joblib.load("artifacts/outlier_thresholds.pkl")
    reference_columns = joblib.load("artifacts/feature_columns.pkl")

    # Load demo data
    demo_df = pd.read_csv(demo_csv_path)
    logging.info(f"Loaded demo data with shape: {demo_df.shape}")

    # Recreate the inference steps and inject their fitted state
    cleaning = NaNDuplicatesRemover()
    outlier_removal = OutlierRemover()
    outlier_removal.thresholds_ = outlier_thresholds
    feature_transform = FeatureTransformer()
    feature_transform.transformers_ = transformers
    column_alignment = ColumnAligner(reference_columns=reference_columns)

    # Sequentially apply transformations with logging; work on a copy so the
    # loaded frame is left untouched.
    data = demo_df.copy()

    logging.info(f"Step 1: Raw input shape: {data.shape}")
    # NOTE(review): this is the only step that is fitted on the demo data;
    # presumably the cleaner is stateless -- confirm.
    data = cleaning.fit_transform(data)
    logging.info(f"Step 2: After NaN/Duplicates Removal: {data.shape}")

    data = outlier_removal.transform(data)
    logging.info(f"Step 3: After Outlier Removal: {data.shape}")

    data = feature_transform.transform(data)
    logging.info(f"Step 4: After Feature Transformation: {data.shape}")

    data = column_alignment.transform(data)
    logging.info(f"Step 5: After Column Alignment: {data.shape}")

    # os.makedirs("") raises FileNotFoundError, so only create the directory
    # when output_path actually has a directory component.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    data.to_csv(output_path, index=False)
    logging.info(f"Processed demo data saved to: {output_path}")

    return data

def apply_feature_engineering(processed_data, config_path, output_path="artifacts/demo_features.csv"):
    """Apply the feature engineering pipeline to preprocessed data and persist the results.

    Args:
        processed_data: Output of the preprocessing stage.
        config_path: Path of the feature store configuration passed to
            ``load_feature_store_params``.
        output_path: Destination CSV for the engineered features. The pipeline
            itself is saved next to it as ``feature_engineering_pipeline.joblib``.
            A bare filename is written to the current working directory.

    Returns:
        The engineered features.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    section("FEATURE ENGINEERING PIPELINE", level=logging.INFO, char='*', length=80)

    try:
        # Load feature configuration
        feature_config = load_feature_store_params(config_path)

        # Use the first configured target column, falling back to a default
        if 'target_cols' in feature_config and feature_config['target_cols']:
            target_col = feature_config['target_cols'][0].lower().strip()
        else:
            target_col = 'loan_status'  # Default target column
            logging.warning(f"No target column specified in config. Using default: {target_col}")

        # Create feature engineering pipeline for inference
        feat_pipeline, _ = create_feature_engineering_pipeline(
            feature_config,
            target_col=target_col,
            mode='test'  # Use test mode for demo data (inference)
        )

        # Apply feature engineering
        # NOTE(review): fit_transform re-fits the pipeline on the demo data even
        # though mode='test' is requested; if any step learns state (scaling,
        # encoding) this will not match training -- confirm against
        # create_feature_engineering_pipeline.
        logging.info("Applying feature engineering to demo data")
        features = feat_pipeline.fit_transform(processed_data)
        logging.info(f"Feature engineering completed. Output shape: {features.shape}")

        # os.makedirs("") raises FileNotFoundError, so only create the
        # directory when output_path has a directory component.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        features.to_csv(output_path, index=False)
        logging.info(f"Feature engineered data saved to: {output_path}")

        # Save the feature engineering pipeline next to the features file
        pipeline_path = os.path.join(output_dir, "feature_engineering_pipeline.joblib")
        joblib.dump(feat_pipeline, pipeline_path)
        logging.info(f"Feature engineering pipeline saved to: {pipeline_path}")

        # Return the engineered features
        return features

    except Exception as e:
        logging.error(f"Feature engineering pipeline failed: {str(e)}")
        section("FEATURE ENGINEERING PIPELINE FAILED", level=logging.ERROR, char='!', length=80)
        raise

def demo_complete_pipeline(demo_csv_path, config_path="references/feature_store.yaml"):
    """Run preprocessing followed by feature engineering on a demo CSV.

    Args:
        demo_csv_path: Path of the raw demo CSV.
        config_path: Feature store configuration handed to the feature
            engineering stage.

    Returns:
        The engineered features produced by ``apply_feature_engineering``.

    Raises:
        Exception: Any failure from either stage is logged and re-raised.
    """
    banner = dict(char='#', length=80)
    section("COMPLETE PIPELINE DEMONSTRATION", level=logging.INFO, **banner)

    try:
        # Stage 1: clean and transform the raw rows
        logging.info("Step 1: Preprocessing demo data")
        cleaned = preprocess_demo_data(demo_csv_path)

        # Stage 2: derive model features from the cleaned rows
        logging.info("Step 2: Applying feature engineering")
        engineered = apply_feature_engineering(cleaned, config_path)

        section("PIPELINE DEMONSTRATION COMPLETED SUCCESSFULLY", level=logging.INFO, **banner)
        return engineered

    except Exception as err:
        logging.error("Complete pipeline demonstration failed: " + str(err))
        section("PIPELINE DEMONSTRATION FAILED", level=logging.ERROR, char='!', length=80)
        raise

def main():
    """Entry point: verify the demo inputs exist, then run the complete pipeline.

    Logs an error and returns early when the demo CSV or the feature store
    configuration is missing; any other failure is logged and re-raised.
    """
    try:
        # Make sure the working directories exist
        for folder in ("artifacts", "data"):
            os.makedirs(folder, exist_ok=True)

        demo_data_path = "data/external/demo.csv"
        config_path = "references/feature_store.yaml"

        # Inputs are checked in order; the first missing one aborts the run
        required_inputs = (
            (demo_data_path, f"Demo data not found at {demo_data_path}. Please provide demo data."),
            (config_path, f"Config not found at {config_path}. Please provide configuration."),
        )
        for path, complaint in required_inputs:
            if not os.path.exists(path):
                logging.error(complaint)
                return

        # Run the complete pipeline
        demo_complete_pipeline(demo_data_path, config_path)

    except Exception as err:
        logging.error("Main function failed: " + str(err))
        raise

if __name__ == "__main__":
    main()

[✓] 2025-05-06 12:51:06 | root            | INFO  | 
################################################################################
                        COMPLETE PIPELINE DEMONSTRATION                         
################################################################################
[✓] 2025-05-06 12:51:06 | root            | INFO  | Step 1: Preprocessing demo data
[✓] 2025-05-06 12:51:06 | root            | INFO  | 
             PREPROCESSING DEMO DATA              
[✓] 2025-05-06 12:51:06 | root            | INFO  | Loaded demo data with shape: (4, 13)
[✓] 2025-05-06 12:51:06 | root            | INFO  | Step 1: Raw input shape: (4, 13)
[✓] 2025-05-06 12:51:06 | root            | INFO  | Initial shape before NaN/Duplicate removal: (4, 13)
[✓] 2025-05-06 12:51:06 | root            | INFO  | Shape after NaN/Duplicate removal: (4, 13)
[✓] 2025-05-06 12:51:06 | root            | INFO  | Step 2: After NaN/Duplicates Removal: (4, 13)
[✓] 2025-05-06 12:51:06 | root            | I

In [42]:
# NOTE(review): the three "options" below are alternatives, but as written this
# cell executes all of them in sequence: the pipeline is unpickled three times
# and the demo data is transformed three times. Keep only the one that matches
# how ./models/pipe.pkl was created.
# SECURITY: joblib.load unpickles arbitrary objects -- only load trusted files.
# The recorded output below shows the transform failing in the outlier step on
# 'person_age' with a float-vs-string comparison; presumably a threshold or the
# column itself is loaded as a string -- TODO confirm.

# Option 1: import the helper from the project module so the pickle can resolve it
import pandas as pd
from src.features.demo_engineering import drop_target_column

# Load the pipeline directly (more reliable)
import joblib
pipeline = joblib.load('./models/pipe.pkl')

# Load your data
df = pd.read_csv("data/external/demo.csv")

# Use the pipeline
processed_data = pipeline.transform(df)

# Option 2: Define everything locally (most reliable)
import pandas as pd
import joblib

# Define the same function with the EXACT same name and implementation.
# NOTE: this rebinds the name imported from src.features.demo_engineering above.
def drop_target_column(X):
    """Drop the target column from the DataFrame."""
    return X.drop(columns=['loan_status'], errors='ignore')

# Now load the pipeline
pipeline = joblib.load('./models/pipe.pkl')

# Load your data
df = pd.read_csv("data/external/demo.csv")

# Use the pipeline
processed_data = pipeline.transform(df)

# Option 3: If you want to fix your original code
# Make sure you're importing the right function and loading the pipeline yourself.
# NOTE: this import rebinds drop_target_column again, replacing the local definition.
from src.features.demo_engineering import drop_target_column

# Load your data
df = pd.read_csv("data/external/demo.csv")

# Load the pipeline explicitly
import joblib
pipeline = joblib.load('./models/pipe.pkl')

# Now transform
processed_data = pipeline.transform(df)

[✓] 2025-05-06 14:04:33 | root            | INFO  | 
                  DATA CLEANING                   
[✓] 2025-05-06 14:04:33 | root            | INFO  | No NaN values found in the dataset
[✓] 2025-05-06 14:04:33 | root            | INFO  | No duplicate rows found in the dataset
[✓] 2025-05-06 14:04:33 | root            | INFO  | After cleaning: (3, 13)
[✓] 2025-05-06 14:04:33 | root            | INFO  | 
           OUTLIER REMOVAL - TRANSFORM            
[✓] 2025-05-06 14:04:33 | root            | INFO  | Processing outliers for column: person_age
[✗] 2025-05-06 14:04:33 | root            | ERROR | Error processing outliers for column 'person_age': ufunc 'less_equal' did not contain a loop with signature matching types (<class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.StrDType'>) -> None


TypeError: _UFuncNoLoopError.__init__() missing 1 required positional argument: 'dtypes'

In [51]:
import numpy as np
import pandas as pd
import yaml
import logging
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, precision_recall_curve, auc
from catboost import CatBoostClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# Set up logging.
# The level field must be the named placeholder %(levelname)-5s: a bare "%-5s"
# in a mapping-style format string raises "not enough arguments for format
# string" inside the handler every time a record is emitted.
LOG_FORMAT = '[✓] %(asctime)s | %(name)-15s | %(levelname)-5s | %(message)s'
LOG_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

# NOTE: basicConfig is a no-op when the root logger already has handlers.
logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
    datefmt=LOG_DATE_FORMAT
)
logger = logging.getLogger(__name__)

# Custom transformer for IQR-based outlier handling with detailed logging
class IQROutlierRemover(BaseEstimator, TransformerMixin):
    """Replace IQR outliers in selected columns with a median learned during fit.

    Despite the name, no rows are removed: values outside
    ``[Q1 - factor * IQR, Q3 + factor * IQR]`` (and existing NaNs) are replaced
    by the per-column median of the in-range values seen during ``fit``.

    Parameters:
        cols: Columns to process. ``None`` means "auto-detect int64/float64
            columns at fit time". The parameter itself is never modified.
        factor: Multiplier applied to the IQR to obtain the bounds.

    Fitted attributes:
        cols_: Columns resolved during the most recent ``fit``.
        Q1, Q3: Per-column first and third quartiles.
        medians_: Per-column median of the in-range training values.
    """

    def __init__(self, cols=None, factor=1.5):
        self.cols = cols
        self.factor = factor
        self.Q1 = {}
        self.Q3 = {}
        logger.info(f"Initialized IQROutlierRemover with factor={factor}")

    def _bounds(self, col):
        """Return ``(IQR, lower_bound, upper_bound)`` for a fitted column."""
        iqr = self.Q3[col] - self.Q1[col]
        return iqr, self.Q1[col] - (self.factor * iqr), self.Q3[col] + (self.factor * iqr)

    def fit(self, X, y=None):
        """Learn quartiles and the replacement median for each column of X."""
        logger.info(f"Fitting IQROutlierRemover on data with shape {X.shape}")
        logger.info(f"Input columns: {list(X.columns)}")

        # Resolve the columns into a fitted attribute instead of overwriting
        # the constructor parameter, so a refit on different data re-detects them.
        if self.cols is None:
            cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
            logger.info(f"Auto-detected numeric columns: {cols}")
        else:
            cols = list(self.cols)
            logger.info(f"Using specified columns: {cols}")
        self.cols_ = cols

        # Start from a clean slate so a refit never keeps stale statistics.
        self.Q1, self.Q3, self.medians_ = {}, {}, {}
        for col in cols:
            series = X[col]
            self.Q1[col] = series.quantile(0.25)
            self.Q3[col] = series.quantile(0.75)
            logger.info(f"Column '{col}': Q1={self.Q1[col]:.4f}, Q3={self.Q3[col]:.4f}")

            # Median of the in-range training values: the same value the
            # transform step would compute on the training data itself.
            _, lower_bound, upper_bound = self._bounds(col)
            self.medians_[col] = series[series.between(lower_bound, upper_bound)].median()

        return self

    def transform(self, X, y=None):
        """Return a copy of X with outliers and NaNs replaced by the fitted median."""
        logger.info(f"Transforming data with IQROutlierRemover, input shape: {X.shape}")
        X_ = X.copy()

        # Fall back to the legacy attributes for instances fitted (or
        # unpickled) before cols_/medians_ existed.
        cols = getattr(self, 'cols_', self.cols)
        medians = getattr(self, 'medians_', None)

        outlier_counts = {}
        for col in cols:
            IQR, lower_bound, upper_bound = self._bounds(col)

            logger.info(f"Column '{col}': IQR={IQR:.4f}, lower_bound={lower_bound:.4f}, upper_bound={upper_bound:.4f}")

            # Count outliers before replacing
            below = X_[col] < lower_bound
            above = X_[col] > upper_bound
            lower_outliers = below.sum()
            upper_outliers = above.sum()
            outlier_counts[col] = {'lower': lower_outliers, 'upper': upper_outliers, 'total': lower_outliers + upper_outliers}

            # Replace outliers with NaN
            X_[col] = np.where(below | above, np.nan, X_[col])

            # Log count of NaN values
            nan_count = X_[col].isna().sum()
            logger.info(f"Column '{col}': {lower_outliers} lower outliers, {upper_outliers} upper outliers, {nan_count} NaN values")

            # Fill with the median learned in fit. Using the transform-time
            # median made inference depend on the batch and yields NaN when a
            # single-row input is an outlier.
            if medians is not None and col in medians:
                median_value = medians[col]
            else:
                median_value = X_[col].median()
            X_[col] = X_[col].fillna(median_value)
            logger.info(f"Column '{col}': Filled NaN values with median {median_value:.4f}")

        logger.info(f"IQROutlierRemover transformation complete, output shape: {X_.shape}")
        logger.info(f"Outlier summary: {outlier_counts}")

        return X_

def log_dataframe_info(df, name):
    """Log shape, columns, dtypes, missing-value counts and summary statistics of df.

    Each item is logged as its own INFO record, prefixed with ``name``.
    """
    # Each entry is evaluated lazily, right before it is logged.
    reports = (
        ("shape: ", lambda: df.shape),
        ("columns: ", lambda: list(df.columns)),
        ("data types:\n", lambda: df.dtypes),
        ("missing values:\n", lambda: df.isna().sum()),
        ("summary statistics:\n", lambda: df.describe().transpose()),
    )
    for label, compute in reports:
        logger.info(f"{name} {label}{compute()}")

def load_feature_store(file_path='./references/feature_store.yaml'):
    """
    Load feature store YAML file and return column names for numeric and categorical features.

    Args:
        file_path: Path of the YAML file. The keys ``numerical_cols`` and
            ``categorical_cols`` are read from its top-level mapping.

    Returns:
        Tuple ``(numeric_cols, categorical_cols)`` of two new lists. A missing
        or null key yields an empty list, and a single scalar value is wrapped
        in a one-element list. If the file cannot be read or parsed, the error
        is logged and two empty lists are returned (best effort).
    """
    logger.info(f"Loading feature store from {file_path}")

    def _as_list(value):
        # Normalize YAML values: null -> [], scalar -> [scalar], sequence -> new list
        if value is None:
            return []
        if isinstance(value, (str, bytes)):
            return [value]
        return list(value)

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            # An empty YAML document parses to None; treat it as an empty mapping.
            feature_store = yaml.safe_load(f) or {}

        # Return fresh lists: callers remove the target column from them.
        numeric_cols = _as_list(feature_store.get('numerical_cols'))
        categorical_cols = _as_list(feature_store.get('categorical_cols'))

        logger.info(f"Loaded numeric columns: {numeric_cols}")
        logger.info(f"Loaded categorical columns: {categorical_cols}")

        return numeric_cols, categorical_cols
    except Exception as e:
        logger.error(f"Error loading feature store: {str(e)}")
        logger.warning("Returning empty column lists")
        return [], []

class LoggingTransformer(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that logs what flows through it.

    ``fit`` returns the transformer and ``transform`` returns its input
    unchanged; both log the shape of the data (plus column names for
    DataFrames, or previously registered feature names for arrays).
    """

    def __init__(self, name):
        self.name = name
        self._feature_names = None

    def _report(self, stage, X):
        """Log shape information about X for the given stage ('fit' or 'transform')."""
        if isinstance(X, pd.DataFrame):
            logger.info(f"{self.name} {stage} - DataFrame shape: {X.shape}, columns: {list(X.columns)}")
            return
        logger.info(f"{self.name} {stage} - array shape: {X.shape}")
        if self._feature_names is not None:
            logger.info(f"{self.name} {stage} - feature names: {self._feature_names}")

    def fit(self, X, y=None):
        self._report("fit", X)
        return self

    def transform(self, X, y=None):
        self._report("transform", X)
        return X

    def set_feature_names(self, names):
        """Register feature names to include when logging array inputs."""
        self._feature_names = names
        logger.info(f"{self.name} - Set feature names: {names}")

# OneHotEncoder subclass that captures and logs the encoded feature names
class OneHotEncoderWithLogging(OneHotEncoder):
    """OneHotEncoder that logs the feature names it creates.

    The constructor is inherited from ``OneHotEncoder`` on purpose. A custom
    ``__init__(self, **kwargs)`` hides every parameter from ``get_params()``,
    so ``clone()`` (used by ``ColumnTransformer`` and ``Pipeline`` during fit)
    would rebuild the encoder with default settings and silently drop options
    such as ``drop='first'`` or ``sparse_output=False``.

    Fitted attributes:
        feature_names_out_: List of ``"<column>_<category>"`` names, one per
            kept category, honouring the categories dropped by the encoder.
    """

    def fit(self, X, y=None):
        """Fit the encoder, then build and log the generated feature names."""
        result = super().fit(X, y)

        # Column labels: real names for DataFrames, positional names otherwise
        if isinstance(X, pd.DataFrame):
            column_names = list(X.columns)
        else:
            column_names = [f"col_{i}" for i in range(len(self.categories_))]

        # drop_idx_ holds, per feature, the index of the dropped category
        # (or None when nothing is dropped for that feature / at all).
        # NOTE(review): infrequent-category grouping (min_frequency /
        # max_categories) is not reflected in these names -- confirm it is unused.
        drop_idx = getattr(self, 'drop_idx_', None)

        self.feature_names_out_ = []
        for i, (col, cats) in enumerate(zip(column_names, self.categories_)):
            dropped = None if drop_idx is None else drop_idx[i]
            for j, cat in enumerate(cats):
                if dropped is not None and j == dropped:
                    continue
                self.feature_names_out_.append(f"{col}_{cat}")

        logger.info(f"OneHotEncoder created the following features:")
        for feature in self.feature_names_out_:
            logger.info(f"  - {feature}")

        return result

    def get_feature_names_out(self, input_features=None):
        """Return the names built during fit, falling back to the parent implementation."""
        names = getattr(self, 'feature_names_out_', None)
        if names is not None:
            return np.array(names)
        return super().get_feature_names_out(input_features)

def build_pipeline(X, target_column='loan_status', random_state=42):
    """
    Assemble the unfitted modelling pipeline: numeric outlier handling and power
    transform, one-hot encoding of categoricals, SMOTE oversampling and a
    CatBoostClassifier, with logging steps in between.

    Args:
        X: Feature DataFrame, used for logging and for auto-detecting column
            types when the feature store provides none.
        target_column: Name removed from the feature lists if present.
        random_state: Seed passed to SMOTE and CatBoost.

    Returns:
        The imblearn pipeline, with an extra ``extract_feature_names`` function
        attached as an attribute (call it with the fitted pipeline).
    """
    logger.info("Starting pipeline construction")
    logger.info(f"Input DataFrame shape: {X.shape}")
    logger.info(f"Input columns: {list(X.columns)}")
    logger.info(f"Target column: {target_column}")

    # Column types come from feature_store.yaml when available
    numeric_cols, categorical_cols = load_feature_store()

    # Fall back to dtype-based detection only when the store yielded nothing
    if not (numeric_cols or categorical_cols):
        logger.info("No columns found in feature store, auto-detecting column types")
        numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
        categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
        logger.info(f"Auto-detected numeric columns: {numeric_cols}")
        logger.info(f"Auto-detected categorical columns: {categorical_cols}")

    # The target must never be treated as a feature
    for kind, columns in (("numeric", numeric_cols), ("categorical", categorical_cols)):
        if target_column in columns:
            columns.remove(target_column)
            logger.info(f"Removed target column '{target_column}' from {kind} columns")

    logger.info(f"Final numeric columns: {numeric_cols}")
    logger.info(f"Final categorical columns: {categorical_cols}")

    # Numeric branch: log -> IQR outlier handling -> Yeo-Johnson -> log
    numeric_steps = [
        ('log_input', LoggingTransformer('Numeric preprocessing - input')),
        ('outlier_remover', IQROutlierRemover()),
        ('power_transformer', PowerTransformer(method='yeo-johnson')),
        ('log_output', LoggingTransformer('Numeric preprocessing - output')),
    ]
    numeric_transformer = Pipeline(steps=numeric_steps)
    logger.info("Created numeric transformation pipeline")

    # Categorical branch: dense one-hot output (sparse_output=False) with the
    # first level of each feature dropped (drop='first') to avoid the dummy
    # variable trap.
    encoder = OneHotEncoderWithLogging(handle_unknown='ignore', sparse_output=False, drop='first')
    categorical_steps = [
        ('log_input', LoggingTransformer('Categorical preprocessing - input')),
        ('onehot', encoder),
        ('log_output', LoggingTransformer('Categorical preprocessing - output')),
    ]
    categorical_transformer = Pipeline(steps=categorical_steps)
    logger.info("Created categorical transformation pipeline with drop_first=True")

    # Route each column group to its branch; unlisted columns are discarded
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categorical_transformer, categorical_cols),
        ],
        remainder='drop',
        verbose_feature_names_out=True,
    )
    logger.info("Created column transformer")

    # CatBoost configuration (verbose=100 prints progress every 100 iterations)
    catboost_params = dict(
        iterations=800,
        learning_rate=0.167,
        depth=6,
        loss_function='Logloss',
        eval_metric='AUC',
        random_seed=random_state,
        verbose=100,
        thread_count=4,
    )
    classifier = CatBoostClassifier(**catboost_params)
    logger.info(f"Configured CatBoostClassifier with parameters: {classifier.get_params()}")

    # Full pipeline: preprocessing -> SMOTE -> classifier, with logging around SMOTE
    pipeline = ImbPipeline(steps=[
        ('preprocessor', preprocessor),
        ('log_before_smote', LoggingTransformer('Before SMOTE')),
        ('smote', SMOTE(random_state=random_state)),
        ('log_after_smote', LoggingTransformer('After SMOTE')),
        ('classifier', classifier),
    ])
    logger.info("Final pipeline created")

    def extract_feature_names(fitted_pipeline):
        """Log the one-hot encoded feature names of a fitted pipeline (best effort)."""
        try:
            fitted_preprocessor = fitted_pipeline.named_steps['preprocessor']
            fitted_encoder = fitted_preprocessor.named_transformers_['cat'].named_steps['onehot']

            logger.info("=== ONE-HOT ENCODED FEATURE NAMES ===")
            if hasattr(fitted_encoder, 'feature_names_out_'):
                for encoded_name in fitted_encoder.feature_names_out_:
                    logger.info(f"  - {encoded_name}")
            elif hasattr(fitted_encoder, 'categories_'):
                # Rebuild the names from the fitted categories, skipping the
                # first level of each feature as it is dropped
                source_columns = fitted_pipeline.named_steps['preprocessor'].transformers_[1][2]
                for column, levels in zip(source_columns, fitted_encoder.categories_):
                    logger.info(f"Feature '{column}' encoded to:")
                    for level in levels[1:]:
                        logger.info(f"  - {column}_{level}")

        except Exception as err:
            logger.warning(f"Error extracting one-hot encoded feature names: {str(err)}")

    # Expose the helper on the pipeline object
    pipeline.extract_feature_names = extract_feature_names

    return pipeline

def evaluate_model(model, X_test, y_test, X_train=None, y_train=None):
    """
    Evaluate the model performance using various classification metrics with detailed logging.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test, y_test: Held-out features and labels.
        X_train, y_train: Optional training data. The overfitting check runs
            only when both are provided.

    Returns:
        Dict with ``accuracy``, ``classification_report`` (dict),
        ``confusion_matrix``, ``roc_auc`` and ``pr_auc``; plus
        ``train_accuracy`` and ``overfit_ratio`` when training data is given.
    """
    has_train_data = X_train is not None and y_train is not None

    logger.info("Starting model evaluation")
    logger.info(f"Test data shape: X_test={X_test.shape}, y_test={y_test.shape}")
    # Guard on both arguments: y_train.shape used to be read whenever X_train
    # was given, which crashed when y_train was omitted.
    if has_train_data:
        logger.info(f"Train data shape: X_train={X_train.shape}, y_train={y_train.shape}")

    # Make predictions
    logger.info("Generating predictions on test data")
    y_pred = model.predict(X_test)
    # Column 1 holds the positive-class probability
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Log class distribution of actual vs predicted
    actual_dist = pd.Series(y_test).value_counts()
    predicted_dist = pd.Series(y_pred).value_counts()
    logger.info(f"Actual class distribution: {actual_dist.to_dict()}")
    logger.info(f"Predicted class distribution: {predicted_dist.to_dict()}")

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    logger.info(f"Test accuracy: {accuracy:.4f}")

    class_report = classification_report(y_test, y_pred, output_dict=True)
    logger.info(f"Classification report:\n{pd.DataFrame(class_report).transpose()}")

    conf_matrix = confusion_matrix(y_test, y_pred)
    logger.info(f"Confusion matrix:\n{conf_matrix}")

    roc_auc = roc_auc_score(y_test, y_pred_proba)
    logger.info(f"ROC AUC: {roc_auc:.4f}")

    # Calculate precision-recall AUC
    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    pr_auc = auc(recall, precision)
    logger.info(f"PR AUC: {pr_auc:.4f}")

    # Compile results
    results = {
        'accuracy': accuracy,
        'classification_report': class_report,
        'confusion_matrix': conf_matrix,
        'roc_auc': roc_auc,
        'pr_auc': pr_auc
    }

    # If training data is provided, check for overfitting
    if has_train_data:
        logger.info("Generating predictions on training data to check for overfitting")
        y_train_pred = model.predict(X_train)
        train_accuracy = accuracy_score(y_train, y_train_pred)
        results['train_accuracy'] = train_accuracy
        # A zero test accuracy would divide by zero; report an infinite ratio instead
        results['overfit_ratio'] = train_accuracy / accuracy if accuracy > 0 else float('inf')

        logger.info(f"Train accuracy: {train_accuracy:.4f}")
        logger.info(f"Overfit ratio (train/test): {results['overfit_ratio']:.4f}")
        if results['overfit_ratio'] > 1.1:
            logger.warning(f"Potential overfitting detected: train/test accuracy ratio = {results['overfit_ratio']:.4f}")

    logger.info("Model evaluation complete")
    return results

# Example usage
def main():
    """Train and evaluate the loan-status pipeline end to end.

    Loads ``./notebooks/loan_data.csv``, makes a stratified 80/20 split, builds
    and fits the pipeline, logs feature importances, and evaluates on the test
    split. Load or fit failures are logged and end the run with an early return.
    """
    logger.info("=== Starting ML Pipeline Execution ===")

    # Load dataset
    logger.info("Loading dataset")
    try:
        df = pd.read_csv('./notebooks/loan_data.csv')
        logger.info(f"Dataset loaded successfully with shape: {df.shape}")
        log_dataframe_info(df, "Original dataset")
    except Exception as e:
        logger.error(f"Error loading dataset: {str(e)}")
        return

    # Check for target column
    target_column = 'loan_status'
    if target_column not in df.columns:
        logger.error(f"Target column '{target_column}' not found in dataset")
        logger.info(f"Available columns: {list(df.columns)}")
        return

    # Log target distribution
    target_distribution = df[target_column].value_counts()
    logger.info(f"Target distribution:\n{target_distribution}")

    # Calculate and format percentage distribution properly
    percentage_distribution = 100 * target_distribution / len(df)
    logger.info(f"Target distribution percentage:\n{percentage_distribution.to_string()}")

    # Split data
    logger.info("Splitting dataset into train and test sets")
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    logger.info(f"X shape after dropping target: {X.shape}")
    logger.info(f"y shape: {y.shape}")

    # Stratify so both splits keep the class balance of the full dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    logger.info(f"Train-test split complete: X_train={X_train.shape}, X_test={X_test.shape}, y_train={y_train.shape}, y_test={y_test.shape}")
    logger.info(f"Train target distribution: {pd.Series(y_train).value_counts().to_dict()}")
    logger.info(f"Test target distribution: {pd.Series(y_test).value_counts().to_dict()}")

    # Build pipeline
    logger.info("Building pipeline")
    pipeline = build_pipeline(
        X_train,
        target_column=target_column,
        random_state=42
    )

    # Fit pipeline
    logger.info("Fitting pipeline")
    try:
        pipeline.fit(X_train, y_train)
        logger.info("Pipeline fitting complete")

        # Extract and log the one-hot encoded feature names
        pipeline.extract_feature_names(pipeline)

        # Get feature importances from CatBoost if possible
        try:
            feature_importances = pipeline.named_steps['classifier'].feature_importances_

            # Try to get feature names from preprocessor
            try:
                preprocessor = pipeline.named_steps['preprocessor']
                cat_transformer = preprocessor.named_transformers_['cat']
                onehotencoder = cat_transformer.named_steps['onehot']
                onehot_names = getattr(onehotencoder, 'feature_names_out_', None)

                if onehot_names is not None:
                    # The classifier sees the numeric columns first, then the
                    # one-hot columns (ColumnTransformer order: 'num', 'cat').
                    # Using only the one-hot names never matched the number of
                    # importances, so the table was silently skipped.
                    numeric_names = list(preprocessor.transformers_[0][2])
                    feature_names = numeric_names + list(onehot_names)

                    if len(feature_names) == len(feature_importances):
                        feature_importance = pd.DataFrame({
                            'Feature': feature_names,
                            'Importance': feature_importances
                        })
                        feature_importance = feature_importance.sort_values('Importance', ascending=False)
                        logger.info(f"Feature importances:\n{feature_importance.head(20)}")
                    else:
                        logger.warning(
                            f"Could not extract feature names for importances: "
                            f"{len(feature_names)} names vs {len(feature_importances)} importances"
                        )
            except Exception as e:
                logger.warning(f"Could not extract feature names for importances: {str(e)}")

        except Exception as e:
            logger.warning(f"Could not extract feature importances: {str(e)}")

    except Exception as e:
        logger.error(f"Error fitting pipeline: {str(e)}")
        return

    # Evaluate model
    logger.info("Evaluating model")
    results = evaluate_model(pipeline, X_test, y_test, X_train, y_train)

    # Print results
    logger.info("\n=== Model Evaluation Results ===")
    logger.info(f"Accuracy: {results['accuracy']:.4f}")
    logger.info(f"ROC AUC: {results['roc_auc']:.4f}")
    logger.info(f"PR AUC: {results['pr_auc']:.4f}")
    logger.info("\nClassification Report:")
    logger.info(f"{classification_report(y_test, pipeline.predict(X_test))}")
    logger.info("\nConfusion Matrix:")
    logger.info(f"{results['confusion_matrix']}")

    # Check for overfitting
    if 'overfit_ratio' in results:
        logger.info(f"\nTrain Accuracy: {results['train_accuracy']:.4f}")
        logger.info(f"Test Accuracy: {results['accuracy']:.4f}")
        logger.info(f"Overfit Ratio (train/test): {results['overfit_ratio']:.4f}")

        if results['overfit_ratio'] > 1.2:
            logger.warning("Model shows signs of overfitting")
        elif results['overfit_ratio'] > 1.1:
            logger.warning("Model shows mild signs of overfitting")
        else:
            logger.info("Model does not show significant signs of overfitting")

    logger.info("=== ML Pipeline Execution Complete ===")

if __name__ == "__main__":
    main()

[✓] 2025-05-06 14:48:41 | __main__        | INFO  | === Starting ML Pipeline Execution ===
[✓] 2025-05-06 14:48:41 | __main__        | INFO  | Loading dataset
[✓] 2025-05-06 14:48:41 | __main__        | INFO  | Dataset loaded successfully with shape: (45000, 14)
[✓] 2025-05-06 14:48:41 | __main__        | INFO  | Original dataset shape: (45000, 14)
[✓] 2025-05-06 14:48:41 | __main__        | INFO  | Original dataset columns: ['person_age', 'person_gender', 'person_education', 'person_income', 'person_emp_exp', 'person_home_ownership', 'loan_amnt', 'loan_intent', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'credit_score', 'previous_loan_defaults_on_file', 'loan_status']
[✓] 2025-05-06 14:48:41 | __main__        | INFO  | Original dataset data types:
person_age                        float64
person_gender                      object
person_education                   object
person_income                     float64
person_emp_exp                      int64
perso

In [15]:
import numpy as np
import pandas as pd
import joblib
import os
from catboost import CatBoostClassifier
from src.data.data_transformation import PreprocessingPipeline

def main(
    pipeline_path="./models/preprocessor/preprocessing_pipeline.pkl",
    model_path="./models/model/model.cbm",
):
    """Smoke-test the saved preprocessing pipeline and CatBoost model on one sample.

    Loads the saved preprocessing pipeline, builds a single hand-written loan
    application, checks that it has every column the pipeline lists as
    required, transforms it, prints the transformed features, then loads the
    CatBoost model and prints its prediction for that sample.

    Args:
        pipeline_path: Path of the joblib-serialized preprocessing pipeline.
            Defaults to the location this cell previously hard-coded.
        model_path: Path of the saved CatBoost model file. Defaults to the
            location this cell previously hard-coded.

    Returns:
        None. Everything is reported through ``print``. The function returns
        early, after printing a message, when the pipeline cannot be loaded
        or when the sample is missing a required column.

    Raises:
        Exception: Any error raised while transforming the sample, loading
            the model, or predicting is printed and then re-raised.
    """
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code; only point pipeline_path at trusted artifacts.
    try:
        preprocessing_pipeline = joblib.load(pipeline_path)
        print("Pipeline loaded successfully")
    except Exception as e:
        # Best effort: report the failure and stop instead of crashing the cell.
        print(f"Error loading pipeline: {e}")
        return

    # One hand-written loan application used as the test input.
    data = {
        "person_age": [20],
        "person_gender": ["male"],
        "person_education": ["Bachelor"],
        "person_income": [10000],
        "person_emp_exp": [2],
        "person_home_ownership": ["OWN"],
        "loan_amnt": [5000],
        "loan_intent": ["EDUCATION"],
        "loan_int_rate": [4.5],
        "loan_percent_income": [0.04],
        "cb_person_cred_hist_length": [3],
        "credit_score": [498],
        "previous_loan_defaults_on_file": ["Yes"]
    }

    # Convert to DataFrame (features only; no target column is added).
    df = pd.DataFrame(data)

    try:
        # Verify required columns exist before calling transform, so a
        # schema mismatch is reported by name rather than as a deep traceback.
        required_columns = preprocessing_pipeline.numeric_cols + preprocessing_pipeline.categorical_cols
        missing_cols = [col for col in required_columns if col not in df.columns]
        if missing_cols:
            print(f"Missing columns in test data: {missing_cols}")
            return

        # Transform data
        X_transformed = preprocessing_pipeline.transform(df)

        # Wrap the result in a DataFrame labelled with the pipeline's output
        # feature names; this labelled frame is what the model receives.
        X_transformed_df = pd.DataFrame(X_transformed, columns=preprocessing_pipeline.get_feature_names_out())

        # Display transformation info
        print("\nTransformed Feature Array Shape:", X_transformed_df.shape)
        print("\nTransformed Feature Names:")
        print(X_transformed_df.columns.tolist())
        print("\nFirst Transformed Sample:")
        print(X_transformed_df.iloc[0])

        # Load trained CatBoost model
        model = CatBoostClassifier()
        model.load_model(model_path)
        print("\nModel loaded successfully")

        # Predict using the model
        prediction = model.predict(X_transformed_df)
        print("\nModel Prediction:", prediction[0])

    except Exception as e:
        # Surface the message in the cell output, then propagate the error.
        print(f"Error during processing: {e}")
        raise

# Entry-point guard: call main() only when this code runs as the top-level
# module (which is the case when the notebook cell is executed directly).
if __name__ == "__main__":
    main()


Pipeline loaded successfully

Transformed Feature Array Shape: (1, 22)

Transformed Feature Names:
['person_age', 'person_income', 'person_emp_exp', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'credit_score', 'person_gender_male', 'person_education_Bachelor', 'person_education_Doctorate', 'person_education_High School', 'person_education_Master', 'person_home_ownership_OTHER', 'person_home_ownership_OWN', 'person_home_ownership_RENT', 'loan_intent_EDUCATION', 'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL', 'loan_intent_PERSONAL', 'loan_intent_VENTURE', 'previous_loan_defaults_on_file_Yes']

First Transformed Sample:
person_age                           -2.472941
person_income                        -3.747876
person_emp_exp                       -0.372963
loan_amnt                            -0.661598
loan_int_rate                         4.500000
loan_percent_income                  -1.411752
cb_person_cred_hist_length           -0.797265
cr

In [None]:
import pandas as pd
import numpy as np
import yaml
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
import os

# Create sample data for testing: two hand-written loan applications.
data = {
    "person_age": [32.0, 24.0],
    "person_gender": ["male", "male"],
    "person_education": ["Associate", "Associate"],
    "person_income": [96865.0, 56838.0],
    "person_emp_exp": [10, 6],
    "person_home_ownership": ["MORTGAGE", "RENT"],
    "loan_amnt": [7500.0, 9000.0],
    "loan_intent": ["EDUCATION", "EDUCATION"],
    "loan_int_rate": [6.04, 11.49],
    "loan_percent_income": [0.08, 0.16],
    "cb_person_cred_hist_length": [10.0, 4.0],
    "credit_score": [601, 647],
    "previous_loan_defaults_on_file": ["No", "Yes"],
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Randomly generate target column for demonstration purposes. A seeded
# generator keeps the cell reproducible across kernel restarts.
rng = np.random.default_rng(42)
df["loan_status"] = rng.integers(0, 2, size=len(df))

# Define numeric and categorical columns

numeric_cols = [
    "person_age",
    "person_income",
    "person_emp_exp",
    "loan_amnt",
    "loan_int_rate",
    "loan_percent_income",
    "cb_person_cred_hist_length",
    "credit_score"
]

# The remaining feature columns of `data` hold strings. This list was
# previously missing, which made the pipeline construction below fail with
# a NameError.
categorical_cols = [
    "person_gender",
    "person_education",
    "person_home_ownership",
    "loan_intent",
    "previous_loan_defaults_on_file",
]

# 1. Create the preprocessing pipeline (make sure this is a class instance, not an array)
# NOTE(review): module path taken from the other cell of this notebook that
# imports PreprocessingPipeline; adjust if the package layout differs.
from src.data.data_transformation import PreprocessingPipeline
# Or use this simplified version if needed:
'''
class PreprocessingPipeline:
    def __init__(self, numeric_cols, categorical_cols):
        self.numeric_cols = numeric_cols
        self.categorical_cols = categorical_cols
        self.feature_names_out_ = None
        self.transformers = {}

    def fit(self, X, y=None):
        # Your fit logic here
        return self

    def transform(self, X):
        # Your transform logic here
        return X_transformed

    def get_feature_names_out(self, input_features=None):
        return np.array(self.feature_names_out_)
'''

# Create the pipeline instance
preprocessing_pipeline = PreprocessingPipeline(numeric_cols, categorical_cols)

# 2. Fit the preprocessing pipeline on the features (target column removed)
preprocessing_pipeline = preprocessing_pipeline.fit(df.drop(columns=['loan_status']))
# NOTE: fit() is expected to return the pipeline itself (as in the skeleton
# above), so assigning the result back keeps this name bound to the pipeline.

# 3. Now use the pipeline to transform your data
X_transformed = preprocessing_pipeline.transform(df.drop(columns=['loan_status']))
# Do NOT rebind `preprocessing_pipeline` to the result of
# get_feature_names_out(): that would replace the pipeline with an array of names.

# 4. If you need feature names, store them in a separate variable
feature_names = preprocessing_pipeline.get_feature_names_out()

# 5. Output transformed result
print("Transformed Feature Array:")
print(X_transformed.shape)
print(feature_names)

In [None]:
import numpy as np
import pandas as pd
import joblib
import os
from demo_src.data.data_transformation import PreprocessingPipeline

# Restore the saved preprocessing pipeline from the demo artifacts folder.
pipeline_path = os.path.join("demo_artifacts", "preprocessing_pipeline.pkl")
preprocessing_pipeline = joblib.load(pipeline_path)

# Two hand-written loan applications used as the transform input.
data = dict(
    person_age=[32.0, 24.0],
    person_gender=["male", "male"],
    person_education=["Associate", "Associate"],
    person_income=[96865.0, 56838.0],
    person_emp_exp=[10, 6],
    person_home_ownership=["MORTGAGE", "RENT"],
    loan_amnt=[7500.0, 9000.0],
    loan_intent=["EDUCATION", "EDUCATION"],
    loan_int_rate=[6.04, 11.49],
    loan_percent_income=[0.08, 0.16],
    cb_person_cred_hist_length=[10.0, 4.0],
    credit_score=[601, 647],
    previous_loan_defaults_on_file=["No", "Yes"],
)

# Build the frame and attach a random 0/1 target column for demonstration;
# the target is dropped again before the features reach the pipeline.
df = pd.DataFrame(data).assign(
    loan_status=lambda frame: np.random.randint(0, 2, size=len(frame))
)

# Run the features (everything except the target) through the pipeline.
X_transformed = preprocessing_pipeline.transform(df.drop("loan_status", axis=1))

# Show the header followed by the transformed result on the next line.
print("Transformed Feature Array:", X_transformed, sep="\n")
