In [1]:
"""
Lead Data Preprocessing Pipeline - Production Version

This module provides a comprehensive data preprocessing pipeline specifically designed 
for lead scoring datasets. It handles missing values, feature encoding, scaling, and 
maintains consistency between training and inference phases.

Key Features:
- Robust handling of categorical and numerical features
- Consistent preprocessing for training and inference
- Automatic feature importance-based processing strategies
- Comprehensive logging and error handling
- Serializable pipeline for production deployment

Author: Data Science Team
Version: 1.0.0
Dependencies: pandas, numpy, scikit-learn, joblib
"""

import pandas as pd
import numpy as np
import joblib
import os
import logging
from datetime import datetime
from typing import Dict, List, Optional, Union, Tuple
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import warnings

# Configure warnings to avoid sklearn deprecation messages in production
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# Configure logging for production monitoring
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('lead_preprocessor.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    """
    Custom Label Encoder that can handle multiple columns simultaneously and 
    gracefully manages unseen categories during inference.
    
    This transformer is designed to be used in scikit-learn pipelines and 
    maintains consistency between training and inference phases.
    
    Design Decisions:
    - Inherits from BaseEstimator and TransformerMixin for sklearn compatibility
    - Handles unseen categories by mapping them to the most frequent training category
    - Stores individual encoders for each column to maintain flexibility
    - Converts all inputs to strings to handle mixed data types consistently
    """
    
    def __init__(self):
        """Initialize the multi-column label encoder."""
        self.encoders: Dict[Union[str, int], LabelEncoder] = {}
        self.column_names: Optional[List[str]] = None
        
    def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Optional[np.ndarray] = None) -> 'MultiColumnLabelEncoder':
        """
        Fit label encoders for each column in the input data.
        
        Parameters:
        -----------
        X : DataFrame or ndarray
            Input data containing categorical features to encode
        y : array-like, optional
            Target values (ignored, present for sklearn compatibility)
            
        Returns:
        --------
        self : MultiColumnLabelEncoder
            Returns self for method chaining
            
        Notes:
        ------
        - Each column gets its own LabelEncoder to maintain independence
        - String conversion ensures consistent handling of mixed data types
        - Stores column names for DataFrame inputs to maintain consistency
        """
        logger.info("Fitting MultiColumnLabelEncoder")
        
        if isinstance(X, pd.DataFrame):
            self.column_names = X.columns.tolist()
            for col in X.columns:
                encoder = LabelEncoder()
                # Convert to string to handle mixed data types and NaN values
                col_data = X[col].astype(str).fillna('missing_value')
                encoder.fit(col_data)
                self.encoders[col] = encoder
                logger.debug(f"Fitted encoder for column {col} with {len(encoder.classes_)} classes")
        else:
            # Handle numpy array input
            self.column_names = None
            for i in range(X.shape[1]):
                encoder = LabelEncoder()
                col_data = pd.Series(X[:, i]).astype(str).fillna('missing_value')
                encoder.fit(col_data)
                self.encoders[i] = encoder
                logger.debug(f"Fitted encoder for column {i} with {len(encoder.classes_)} classes")
                
        return self
    
    def transform(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
        """
        Transform data using fitted encoders with robust handling of unseen categories.
        
        Parameters:
        -----------
        X : DataFrame or ndarray
            Input data to transform
            
        Returns:
        --------
        ndarray
            Transformed data with categorical values encoded as integers
            
        Notes:
        ------
        - Unseen categories are mapped to the most frequent training category
        - This approach maintains model stability during inference
        - Warnings are logged for monitoring data drift
        """
        if not self.encoders:
            raise ValueError("Encoder must be fitted before transform. Call fit() first.")
            
        logger.info("Transforming data with MultiColumnLabelEncoder")
        
        if isinstance(X, pd.DataFrame):
            result = X.copy()
            for col in X.columns:
                if col in self.encoders:
                    result[col] = self._transform_column(X[col], self.encoders[col], col)
            return result.values
        else:
            # Handle numpy array input
            result = X.copy()
            for i in range(X.shape[1]):
                if i in self.encoders:
                    col_series = pd.Series(X[:, i])
                    result[:, i] = self._transform_column(col_series, self.encoders[i], f"column_{i}")
            return result
    
    def _transform_column(self, col_data: pd.Series, encoder: LabelEncoder, col_name: str) -> np.ndarray:
        """
        Transform a single column with robust unseen category handling.
        
        Parameters:
        -----------
        col_data : Series
            Column data to transform
        encoder : LabelEncoder
            Fitted encoder for this column
        col_name : str
            Column name for logging purposes
            
        Returns:
        --------
        ndarray
            Transformed column data
            
        Strategy for unseen categories:
        - Map to the first class in the encoder (most frequent from training)
        - Log warning for monitoring and potential retraining needs
        - This maintains model stability while alerting to data drift
        """
        col_str = col_data.astype(str).fillna('missing_value')
        unique_vals = set(col_str)
        known_vals = set(encoder.classes_)
        unseen_vals = unique_vals - known_vals
        
        if unseen_vals:
            logger.warning(f"Unseen categories in {col_name}: {unseen_vals}")
            # Replace unseen values with the first known class (most frequent from training)
            col_str_processed = col_str.copy()
            for unseen_val in unseen_vals:
                col_str_processed = col_str_processed.replace(unseen_val, encoder.classes_[0])
            return encoder.transform(col_str_processed)
        else:
            return encoder.transform(col_str)
    
    def fit_transform(self, X: Union[pd.DataFrame, np.ndarray], y: Optional[np.ndarray] = None) -> np.ndarray:
        """Fit and transform in one step for convenience."""
        return self.fit(X, y).transform(X)


class LeadDataPreprocessor:
    """
    Production-ready preprocessing pipeline for lead scoring data.
    
    This class implements a comprehensive preprocessing strategy based on 
    feature importance analysis and business requirements. It ensures 
    consistent data transformation between training and inference phases.
    
    Design Philosophy:
    - Feature importance drives processing strategy (one-hot vs label encoding)
    - Robust handling of missing values and unseen categories
    - Minimal information loss while maintaining model performance
    - Production-ready with comprehensive logging and error handling
    - Serializable for deployment consistency
    
    Processing Strategy:
    1. High-importance categorical features: One-hot encoding to preserve all information
    2. Medium-importance categorical features: Label encoding to reduce dimensionality
    3. Numerical features: MinMax scaling for consistent feature ranges
    4. Binary features: Label encoding for Yes/No values
    5. Low-variance features: Automatic removal to reduce noise
    """
    
    def __init__(self, 
                 output_dir: str = 'preprocessed_output',
                 pipeline_name: str = 'lead_preprocessor_pipeline.pkl'):
        """
        Initialize the lead data preprocessor with configuration.
        
        Parameters:
        -----------
        output_dir : str, default='preprocessed_output'
            Directory to save processed files and pipeline artifacts
        pipeline_name : str, default='lead_preprocessor_pipeline.pkl'
            Name of the serialized pipeline file
            
        Design Decisions:
        - Separate output directory for organized file management
        - Configurable pipeline name for version control
        - Feature categorization based on EDA and business importance
        """
        self.output_dir = output_dir
        self.pipeline_name = pipeline_name
        self.pipeline: Optional[ColumnTransformer] = None
        self.feature_names_: Optional[List[str]] = None
        self.is_fitted = False
        
        # Create output directory structure
        os.makedirs(self.output_dir, exist_ok=True)
        os.makedirs(os.path.join(self.output_dir, 'data'), exist_ok=True)
        os.makedirs(os.path.join(self.output_dir, 'models'), exist_ok=True)
        os.makedirs(os.path.join(self.output_dir, 'logs'), exist_ok=True)
        
        # Feature categorization based on EDA and business importance
        self._initialize_feature_categories()
        
        logger.info(f"Initialized LeadDataPreprocessor with output directory: {self.output_dir}")
    
    def _initialize_feature_categories(self):
        """
        Initialize feature categories based on EDA insights and business requirements.
        
        Design Rationale:
        - Numerical features: Direct predictors of engagement and behavior
        - High-importance categorical: Critical for conversion prediction (one-hot encoded)
        - Medium-importance categorical: Useful but not critical (label encoded to save space)
        - Binary features: Simple Yes/No preferences
        - Features to drop: No predictive power or 100% single value
        """
        
        # Numerical features: Direct engagement and behavioral metrics
        # These are continuous variables that benefit from MinMax scaling
        self.numerical_features = [
            'TotalVisits',                    # User engagement frequency
            'Total Time Spent on Website',   # User engagement depth
            'Page Views Per Visit',          # User engagement intensity
            'Asymmetrique Activity Score',   # Proprietary engagement metric
            'Asymmetrique Profile Score'     # Proprietary profile quality metric
        ]
        
        # High-importance categorical features (mutual information > 0.1)
        # These get one-hot encoding to preserve all category information
        # Rationale: High predictive power justifies increased dimensionality
        self.high_importance_categorical = [
            'Tags',                           # Highest importance (0.3746) - lead categorization
            'Lead Quality',                   # Second highest (0.1898) - quality assessment
            'Lead Profile',                   # Third highest (0.1245) - profile type
            'What is your current occupation', # Fourth highest (0.0970) - professional context
            'Last Activity',                  # Recent behavior indicator
            'Last Notable Activity',          # Significant behavior indicator
            'Lead Source',                    # Business critical - marketing channel
            'Lead Origin'                     # Business critical - first touchpoint
        ]
        
        # Medium-importance categorical features
        # These get label encoding to balance information retention with dimensionality
        # Rationale: Useful features but don't justify high dimensionality of one-hot encoding
        self.medium_importance_categorical = [
            'Specialization',                           # Educational preference
            'City',                                     # Geographic segmentation
            'How did you hear about X Education',       # Marketing attribution
            'What matters most to you in choosing a course', # Decision factors
            'Country',                                  # Geographic segmentation
            'Asymmetrique Activity Index',              # Activity level categorization
            'Asymmetrique Profile Index'                # Profile quality categorization
        ]
        
        # Binary features: Simple Yes/No preferences
        # These get label encoding: Yes=1, No=0
        self.binary_features = [
            'Do Not Email',                    # Communication preference
            'Do Not Call',                     # Communication preference
            'A free copy of Mastering The Interview'  # Content engagement indicator
        ]
        
        # System columns to exclude from feature processing
        self.id_columns = ['Prospect ID', 'Lead Number']  # Unique identifiers
        self.target_column = 'Converted'                  # Target variable
        
        # Features to drop based on EDA findings
        # Rationale: >95% single value or no predictive power
        self.features_to_drop = [
            'Magazine',                               # 100% "No" - no variance
            'Receive More Updates About Our Courses', # 100% "No" - no variance
            'Update me on Supply Chain Content',      # 100% "No" - no variance
            'Get updates on DM Content',              # 100% "No" - no variance
            'I agree to pay the amount through cheque', # 100% "No" - no variance
            'Search',                                 # 99.85% "No" - minimal variance
            'Newspaper Article',                      # 99.98% "No" - minimal variance
            'X Education Forums',                     # 99.99% "No" - minimal variance
            'Newspaper',                              # 99.99% "No" - minimal variance
            'Digital Advertisement',                  # 99.96% "No" - minimal variance
            'Through Recommendations'                 # 99.92% "No" - minimal variance
        ]
    
    def _validate_and_filter_features(self, X: pd.DataFrame) -> None:
        """
        Validate and filter feature lists based on available columns in the dataset.
        
        Parameters:
        -----------
        X : DataFrame
            Input dataset to validate against
            
        Purpose:
        - Handles cases where expected features might be missing
        - Provides clear feedback on feature availability
        - Prevents runtime errors from missing columns
        """
        available_features = set(X.columns)
        
        # Filter each feature list to only include existing features
        self.numerical_features = [f for f in self.numerical_features if f in available_features]
        self.high_importance_categorical = [f for f in self.high_importance_categorical if f in available_features]
        self.medium_importance_categorical = [f for f in self.medium_importance_categorical if f in available_features]
        self.binary_features = [f for f in self.binary_features if f in available_features]
        self.features_to_drop = [f for f in self.features_to_drop if f in available_features]
        
        # Log feature availability summary
        logger.info(f"Feature availability summary:")
        logger.info(f"  Numerical features: {len(self.numerical_features)}")
        logger.info(f"  High importance categorical: {len(self.high_importance_categorical)}")
        logger.info(f"  Medium importance categorical: {len(self.medium_importance_categorical)}")
        logger.info(f"  Binary features: {len(self.binary_features)}")
        logger.info(f"  Features to drop: {len(self.features_to_drop)}")
    
    def _prepare_data(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Clean and prepare the dataset by removing non-feature columns.
        
        Parameters:
        -----------
        X : DataFrame
            Raw input dataset
            
        Returns:
        --------
        DataFrame
            Cleaned dataset ready for feature processing
            
        Cleaning Strategy:
        1. Remove ID columns (not predictive, cause overfitting)
        2. Remove target column (for feature preparation)
        3. Remove low-variance features (no predictive power)
        """
        logger.info("Preparing data for preprocessing")
        
        initial_shape = X.shape
        
        # Combine all columns to remove
        columns_to_remove = self.id_columns + [self.target_column] + self.features_to_drop
        
        # Remove only existing columns to avoid KeyError
        existing_columns_to_remove = [col for col in columns_to_remove if col in X.columns]
        X_processed = X.drop(columns=existing_columns_to_remove, errors='ignore')
        
        logger.info(f"Data preparation complete:")
        logger.info(f"  Initial shape: {initial_shape}")
        logger.info(f"  Columns removed: {len(existing_columns_to_remove)}")
        logger.info(f"  Final shape: {X_processed.shape}")
        
        return X_processed
    
    def fit(self, X: pd.DataFrame) -> 'LeadDataPreprocessor':
        """
        Fit the preprocessing pipeline on training data.
        
        Parameters:
        -----------
        X : DataFrame
            Training dataset
            
        Returns:
        --------
        self : LeadDataPreprocessor
            Returns self for method chaining
            
        Process:
        1. Data preparation (remove unwanted columns)
        2. Feature validation and filtering
        3. Pipeline creation for each feature type
        4. Pipeline combination and fitting
        5. Feature name generation
        """
        logger.info("Starting preprocessing pipeline fitting")
        
        # Step 1: Prepare data
        X_processed = self._prepare_data(X)
        
        # Step 2: Validate and filter features
        self._validate_and_filter_features(X_processed)
        
        # Step 3: Create preprocessing pipelines
        preprocessor_steps = []
        
        # Numerical features pipeline
        if self.numerical_features:
            logger.info(f"Creating numerical pipeline for {len(self.numerical_features)} features")
            numerical_pipeline = Pipeline([
                # Use median imputation for robustness against outliers
                ('imputer', SimpleImputer(strategy='median')),
                # MinMax scaling ensures all features are in [0,1] range
                # This prevents features with larger scales from dominating
                ('scaler', MinMaxScaler(feature_range=(0, 1)))
            ])
            preprocessor_steps.append(('numerical', numerical_pipeline, self.numerical_features))
        
        # High-importance categorical features pipeline
        if self.high_importance_categorical:
            logger.info(f"Creating high-importance categorical pipeline for {len(self.high_importance_categorical)} features")
            high_cat_pipeline = Pipeline([
                # Most frequent imputation preserves distribution
                ('imputer', SimpleImputer(strategy='most_frequent')),
                # One-hot encoding preserves all category information
                # drop='first' prevents multicollinearity
                # handle_unknown='ignore' gracefully handles new categories
                ('encoder', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
            ])
            preprocessor_steps.append(('high_categorical', high_cat_pipeline, self.high_importance_categorical))
        
        # Medium-importance categorical features pipeline
        if self.medium_importance_categorical:
            logger.info(f"Creating medium-importance categorical pipeline for {len(self.medium_importance_categorical)} features")
            medium_cat_pipeline = Pipeline([
                # 'Unknown' creates explicit missing category
                ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
                # Label encoding reduces dimensionality while preserving ordinal information
                ('encoder', MultiColumnLabelEncoder())
            ])
            preprocessor_steps.append(('medium_categorical', medium_cat_pipeline, self.medium_importance_categorical))
        
        # Binary features pipeline
        if self.binary_features:
            logger.info(f"Creating binary pipeline for {len(self.binary_features)} features")
            binary_pipeline = Pipeline([
                # Default to 'No' for conservative assumption
                ('imputer', SimpleImputer(strategy='constant', fill_value='No')),
                # Label encoding: Yes=1, No=0
                ('encoder', MultiColumnLabelEncoder())
            ])
            preprocessor_steps.append(('binary', binary_pipeline, self.binary_features))
        
        # Step 4: Combine all pipelines
        logger.info(f"Combining {len(preprocessor_steps)} preprocessing pipelines")
        self.pipeline = ColumnTransformer(
            transformers=preprocessor_steps,
            remainder='drop'  # Drop any remaining unspecified columns
        )
        
        # Step 5: Fit the complete pipeline
        logger.info("Fitting complete preprocessing pipeline")
        self.pipeline.fit(X_processed)
        
        # Step 6: Generate feature names
        self._generate_feature_names()
        
        self.is_fitted = True
        logger.info(f"Pipeline fitting complete. Generated {len(self.feature_names_)} output features")
        
        return self
    
    def _generate_feature_names(self) -> None:
        """
        Generate descriptive feature names for the transformed output.
        
        Purpose:
        - Maintains interpretability of processed features
        - Enables feature importance analysis
        - Supports model debugging and validation
        """
        self.feature_names_ = []
        
        for name, transformer, features in self.pipeline.transformers_:
            if name == 'numerical':
                # Numerical features keep original names
                self.feature_names_.extend(features)
                
            elif name == 'high_categorical':
                # One-hot encoded features get descriptive names
                try:
                    encoder = transformer.named_steps['encoder']
                    if hasattr(encoder, 'get_feature_names_out'):
                        encoded_features = encoder.get_feature_names_out(features)
                        self.feature_names_.extend(encoded_features)
                    else:
                        # Fallback for older sklearn versions
                        self.feature_names_.extend([f"{feat}_onehot" for feat in features])
                except Exception as e:
                    logger.warning(f"Could not generate feature names for high_categorical: {e}")
                    self.feature_names_.extend([f"{feat}_onehot" for feat in features])
                    
            elif name in ['medium_categorical', 'binary']:
                # Label encoded features get descriptive suffix
                self.feature_names_.extend([f"{feat}_encoded" for feat in features])
        
        logger.info(f"Generated {len(self.feature_names_)} feature names")
    
    def transform(self, 
                  X: pd.DataFrame, 
                  save_to_csv: bool = True, 
                  filename: Optional[str] = None) -> pd.DataFrame:
        """
        Transform new data using the fitted pipeline.
        
        Parameters:
        -----------
        X : DataFrame
            Input data to transform
        save_to_csv : bool, default=True
            Whether to save transformed data to CSV
        filename : str, optional
            Custom filename for the output CSV
            
        Returns:
        --------
        DataFrame
            Transformed data ready for machine learning
            
        Process:
        1. Validation checks
        2. Data preparation
        3. Pipeline transformation
        4. Result formatting
        5. Optional saving
        """
        if not self.is_fitted:
            raise ValueError("Pipeline must be fitted before transform. Call fit() first.")
        
        logger.info("Starting data transformation")
        
        # Store original information for reporting
        original_shape = X.shape
        original_index = X.index.copy()
        
        # Step 1: Prepare data (same cleaning as in fit)
        X_processed = self._prepare_data(X)
        
        # Step 2: Apply transformation
        logger.info("Applying preprocessing transformations")
        X_transformed = self.pipeline.transform(X_processed)
        
        # Step 3: Create output DataFrame
        X_final = pd.DataFrame(
            X_transformed,
            columns=self.feature_names_,
            index=original_index
        )
        
        # Step 4: Log transformation summary
        logger.info("Transformation complete:")
        logger.info(f"  Original shape: {original_shape}")
        logger.info(f"  Final shape: {X_final.shape}")
        logger.info(f"  Features generated: {len(self.feature_names_)}")
        
        # Validate numerical feature scaling
        if self.numerical_features:
            num_cols = [col for col in X_final.columns if any(nf in col for nf in self.numerical_features)]
            if num_cols:
                min_val = X_final[num_cols].min().min()
                max_val = X_final[num_cols].max().max()
                logger.info(f"  Numerical features scaled to range: [{min_val:.3f}, {max_val:.3f}]")
        
        # Step 5: Save results if requested
        if save_to_csv:
            if filename is None:
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                filename = f'processed_data_{timestamp}.csv'
            
            filepath = os.path.join(self.output_dir, 'data', filename)
            X_final.to_csv(filepath, index=False)
            logger.info(f"Processed data saved to: {filepath}")
        
        return X_final
    
    def fit_transform(self, 
                      X: pd.DataFrame, 
                      save_to_csv: bool = True, 
                      filename: Optional[str] = None) -> pd.DataFrame:
        """
        Fit the pipeline and transform data in one step.
        
        Typically used for training data processing.
        """
        logger.info("Performing fit_transform operation")
        self.fit(X)
        return self.transform(X, save_to_csv=save_to_csv, filename=filename)
    
    def save_pipeline(self, pipeline_filename: Optional[str] = None) -> str:
        """
        Save the fitted pipeline to disk for production deployment.
        
        Parameters:
        -----------
        pipeline_filename : str, optional
            Custom filename for the pipeline
            
        Returns:
        --------
        str
            Path to the saved pipeline file
            
        Purpose:
        - Enables consistent preprocessing in production
        - Supports model versioning and rollback
        - Ensures training/inference consistency
        """
        if not self.is_fitted:
            raise ValueError("Pipeline must be fitted before saving")
        
        if pipeline_filename is None:
            pipeline_filename = self.pipeline_name
        
        pipeline_path = os.path.join(self.output_dir, 'models', pipeline_filename)
        
        # Save complete preprocessor object
        joblib.dump(self, pipeline_path)
        
        logger.info(f"Pipeline saved to: {pipeline_path}")
        return pipeline_path
    
    @classmethod
    def load_pipeline(cls, pipeline_path: str) -> 'LeadDataPreprocessor':
        """
        Load a previously saved pipeline for inference.
        
        Parameters:
        -----------
        pipeline_path : str
            Path to the saved pipeline file
            
        Returns:
        --------
        LeadDataPreprocessor
            Loaded preprocessor ready for inference
            
        Usage:
        - Production inference
        - Model validation
        - Consistent preprocessing across environments
        """
        logger.info(f"Loading pipeline from: {pipeline_path}")
        
        if not os.path.exists(pipeline_path):
            raise FileNotFoundError(f"Pipeline file not found: {pipeline_path}")
        
        preprocessor = joblib.load(pipeline_path)
        
        if not isinstance(preprocessor, cls):
            raise ValueError(f"Loaded object is not a {cls.__name__}")
        
        logger.info("Pipeline loaded successfully")
        logger.info(f"Ready to process data with {len(preprocessor.feature_names_)} output features")
        
        return preprocessor
    
    def save_feature_documentation(self, filename: str = 'feature_documentation.csv') -> pd.DataFrame:
        """
        Save comprehensive documentation of feature processing for model governance.
        
        Parameters:
        -----------
        filename : str, default='feature_documentation.csv'
            Output filename for documentation
            
        Returns:
        --------
        DataFrame
            Feature documentation DataFrame
            
        Purpose:
        - Model interpretability and debugging
        - Compliance and audit requirements
        - Feature engineering documentation
        """
        if not self.is_fitted:
            raise ValueError("Pipeline must be fitted first")
        
        feature_docs = []
        
        # Document numerical features
        for feat in self.numerical_features:
            feature_docs.append({
                'original_feature': feat,
                'output_feature': feat,
                'feature_type': 'numerical',
                'encoding_method': 'MinMaxScaler',
                'importance_level': 'high',
                'missing_value_strategy': 'median_imputation',
                'scaling_range': '[0, 1]',
                'description': 'Continuous variable scaled to unit range for consistent feature importance'
            })
        
        # Document high-importance categorical features
        for feat in self.high_importance_categorical:
            feature_docs.append({
                'original_feature': feat,
                'output_feature': f"{feat}_*",
                'feature_type': 'categorical',
                'encoding_method': 'OneHotEncoder',
                'importance_level': 'high',
                'missing_value_strategy': 'most_frequent',
                'scaling_range': '[0, 1]',
                'description': 'High-importance categorical converted to binary columns preserving all information'
            })
        
        # Document medium-importance categorical features
        for feat in self.medium_importance_categorical:
            feature_docs.append({
                'original_feature': feat,
                'output_feature': f"{feat}_encoded",
                'feature_type': 'categorical',
                'encoding_method': 'LabelEncoder',
                'importance_level': 'medium',
                'missing_value_strategy': 'unknown_category',
                'scaling_range': '[0, n_classes-1]',
                'description': 'Medium-importance categorical converted to ordinal integers for dimensionality reduction'
            })
        
        # Document binary features
        for feat in self.binary_features:
            feature_docs.append({
                'original_feature': feat,
                'output_feature': f"{feat}_encoded",
                'feature_type': 'binary',
                'encoding_method': 'LabelEncoder',
                'importance_level': 'low',
                'missing_value_strategy': 'default_no',
                'scaling_range': '[0, 1]',
                'description': 'Binary feature encoded as 0/1 with conservative missing value handling'
            })
        
        # Create and save documentation
        df_docs = pd.DataFrame(feature_docs)
        filepath = os.path.join(self.output_dir, 'logs', filename)
        df_docs.to_csv(filepath, index=False)
        
        logger.info(f"Feature documentation saved to: {filepath}")
        return df_docs
    
    def get_processing_summary(self) -> Dict[str, Union[int, str, bool]]:
        """
        Get comprehensive summary of preprocessing configuration.
        
        Returns:
        --------
        dict
            Summary of preprocessing configuration and status
            
        Purpose:
        - Pipeline monitoring and validation
        - Model governance and documentation
        - Performance analysis
        """
        if not self.is_fitted:
            return {"status": "Pipeline not fitted"}
        
        summary = {
            'pipeline_status': 'fitted',
            'total_output_features': len(self.feature_names_),
            'numerical_features_count': len(self.numerical_features),
            'high_importance_categorical_count': len(self.high_importance_categorical),
            'medium_importance_categorical_count': len(self.medium_importance_categorical),
            'binary_features_count': len(self.binary_features),
            'features_dropped_count': len(self.features_to_drop),
            'scaling_method': 'MinMaxScaler [0,1]',
            'high_categorical_encoding': 'OneHotEncoder',
            'medium_categorical_encoding': 'LabelEncoder',
            'binary_encoding': 'LabelEncoder',
            'missing_value_strategies': {
                'numerical': 'median',
                'high_categorical': 'most_frequent',
                'medium_categorical': 'unknown',
                'binary': 'no'
            },
            'output_directory': self.output_dir,
            'pipeline_file': self.pipeline_name,
            'is_production_ready': True
        }
        
        return summary

    def validate_input_data(self, X: pd.DataFrame) -> Tuple[bool, List[str]]:
        """
        Validate input data for preprocessing compatibility.
        
        Parameters:
        -----------
        X : DataFrame
            Input data to validate
            
        Returns:
        --------
        tuple
            (is_valid, list_of_issues)
            
        Purpose:
        - Early detection of data quality issues
        - Prevents pipeline failures in production
        - Provides actionable feedback for data fixes
        """
        issues = []
        
        # Check for empty dataset
        if X.empty:
            issues.append("Input dataset is empty")
        
        # Check for minimum required columns
        required_columns = (self.numerical_features + 
                          self.high_importance_categorical + 
                          self.medium_importance_categorical + 
                          self.binary_features)
        
        missing_columns = [col for col in required_columns if col not in X.columns]
        if missing_columns:
            issues.append(f"Missing required columns: {missing_columns}")
        
        # Check for excessive missing values
        for col in self.numerical_features:
            if col in X.columns:
                missing_pct = X[col].isnull().sum() / len(X)
                if missing_pct > 0.8:  # 80% threshold
                    issues.append(f"Excessive missing values in {col}: {missing_pct:.1%}")
        
        # Check data types
        for col in self.numerical_features:
            if col in X.columns:
                if not pd.api.types.is_numeric_dtype(X[col]):
                    issues.append(f"Non-numeric data in numerical column {col}")
        
        is_valid = len(issues) == 0
        
        if not is_valid:
            logger.warning(f"Input data validation failed with {len(issues)} issues")
            for issue in issues:
                logger.warning(f"  - {issue}")
        else:
            logger.info("Input data validation passed")
        
        return is_valid, issues

    def get_feature_importance_weights(self) -> Dict[str, float]:
        """
        Get feature importance weights based on processing strategy.
        
        Returns:
        --------
        dict
            Feature importance weights for model interpretation
            
        Purpose:
        - Provides weights for feature importance analysis
        - Supports model interpretation and business insights
        - Accounts for encoding-induced feature multiplication
        """
        if not self.is_fitted:
            raise ValueError("Pipeline must be fitted first")
        
        weights = {}
        
        # Numerical features get weight 1.0 (baseline)
        for feat in self.numerical_features:
            weights[feat] = 1.0
        
        # High-importance categorical features get higher weights
        # but distributed across one-hot encoded columns
        for feat in self.high_importance_categorical:
            weights[feat] = 0.8  # Slightly lower due to one-hot distribution
        
        # Medium-importance categorical features get medium weights
        for feat in self.medium_importance_categorical:
            weights[feat] = 0.6
        
        # Binary features get lower weights
        for feat in self.binary_features:
            weights[feat] = 0.4
        
        return weights

    def generate_preprocessing_report(self) -> str:
        """
        Generate comprehensive preprocessing report for documentation.
        
        Returns:
        --------
        str
            Detailed preprocessing report
            
        Purpose:
        - Model governance and compliance
        - Team communication and handoff
        - Performance analysis and optimization
        """
        if not self.is_fitted:
            return "Pipeline not fitted - cannot generate report"
        
        report_lines = []
        report_lines.append("=" * 80)
        report_lines.append("LEAD DATA PREPROCESSING PIPELINE REPORT")
        report_lines.append("=" * 80)
        report_lines.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        report_lines.append(f"Pipeline Status: {'FITTED' if self.is_fitted else 'NOT FITTED'}")
        report_lines.append(f"Output Directory: {self.output_dir}")
        report_lines.append("")
        
        # Feature Processing Summary
        report_lines.append("FEATURE PROCESSING SUMMARY")
        report_lines.append("-" * 40)
        report_lines.append(f"Total Output Features: {len(self.feature_names_)}")
        report_lines.append(f"Numerical Features: {len(self.numerical_features)} -> MinMax Scaled [0,1]")
        report_lines.append(f"High-Importance Categorical: {len(self.high_importance_categorical)} -> One-Hot Encoded")
        report_lines.append(f"Medium-Importance Categorical: {len(self.medium_importance_categorical)} -> Label Encoded")
        report_lines.append(f"Binary Features: {len(self.binary_features)} -> Label Encoded")
        report_lines.append(f"Features Dropped: {len(self.features_to_drop)} (low variance)")
        report_lines.append("")
        
        # Processing Strategy Details
        report_lines.append("PROCESSING STRATEGY DETAILS")
        report_lines.append("-" * 40)
        report_lines.append("1. Numerical Features:")
        report_lines.append("   - Missing Value Strategy: Median imputation (robust to outliers)")
        report_lines.append("   - Scaling: MinMax [0,1] (prevents feature domination)")
        for feat in self.numerical_features:
            report_lines.append(f"   - {feat}")
        report_lines.append("")
        
        report_lines.append("2. High-Importance Categorical Features:")
        report_lines.append("   - Missing Value Strategy: Most frequent category")
        report_lines.append("   - Encoding: One-Hot (preserves all information)")
        report_lines.append("   - Rationale: High predictive power justifies dimensionality increase")
        for feat in self.high_importance_categorical:
            report_lines.append(f"   - {feat}")
        report_lines.append("")
        
        report_lines.append("3. Medium-Importance Categorical Features:")
        report_lines.append("   - Missing Value Strategy: 'Unknown' category")
        report_lines.append("   - Encoding: Label Encoding (dimensionality reduction)")
        report_lines.append("   - Rationale: Balances information retention with efficiency")
        for feat in self.medium_importance_categorical:
            report_lines.append(f"   - {feat}")
        report_lines.append("")
        
        report_lines.append("4. Binary Features:")
        report_lines.append("   - Missing Value Strategy: Default to 'No' (conservative)")
        report_lines.append("   - Encoding: Label Encoding (Yes=1, No=0)")
        for feat in self.binary_features:
            report_lines.append(f"   - {feat}")
        report_lines.append("")
        
        # Dropped Features
        report_lines.append("5. Dropped Features (Low Variance):")
        report_lines.append("   - Rationale: >95% single value, no predictive power")
        for feat in self.features_to_drop:
            report_lines.append(f"   - {feat}")
        report_lines.append("")
        
        # Production Considerations
        report_lines.append("PRODUCTION CONSIDERATIONS")
        report_lines.append("-" * 40)
        report_lines.append("- Pipeline is serializable for consistent deployment")
        report_lines.append("- Handles unseen categories gracefully during inference")
        report_lines.append("- Comprehensive logging for monitoring and debugging")
        report_lines.append("- Validation checks prevent common data quality issues")
        report_lines.append("- Feature documentation supports model governance")
        report_lines.append("- Single processed data file for streamlined workflow")
        report_lines.append("")
        
        # File Structure
        report_lines.append("OUTPUT FILE STRUCTURE")
        report_lines.append("-" * 40)
        report_lines.append("preprocessed_output/")
        report_lines.append("├── data/")
        report_lines.append("│   └── lead_scoring_processed.csv")
        report_lines.append("├── models/")
        report_lines.append("│   └── lead_scoring_pipeline_v1.pkl")
        report_lines.append("└── logs/")
        report_lines.append("    ├── feature_documentation.csv")
        report_lines.append("    └── preprocessing_report.txt")
        report_lines.append("")
        
        report_lines.append("=" * 80)
        
        return "\n".join(report_lines)


# =============================================================================
# PRODUCTION USAGE EXAMPLE AND MAIN EXECUTION
# =============================================================================

def main():
    """
    Main execution function demonstrating production usage of the preprocessing pipeline.
    
    This function shows:
    1. Data loading and validation
    2. Pipeline fitting and training data processing
    3. Pipeline serialization for deployment
    4. New data processing with loaded pipeline
    5. Documentation generation
    """
    
    # Configure logging for main execution
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )
    
    print("=" * 80)
    print("LEAD DATA PREPROCESSING PIPELINE - PRODUCTION VERSION")
    print("=" * 80)
    
    # Step 1: Load and validate data
    try:
        # Update this path to your actual data file
        data_path = r'C:\Users\Minfy.DESKTOP-3E50D5N\Music\customer_lead\data\Lead Scoring.csv'  # Adjust path as needed
        
        if not os.path.exists(data_path):
            logger.error(f"Data file not found: {data_path}")
            print(f"Please ensure '{data_path}' exists in the current directory")
            return
        
        df = pd.read_csv(data_path)
        logger.info(f"Data loaded successfully from {data_path}")
        logger.info(f"Dataset shape: {df.shape}")
        
        # Basic data validation
        if df.empty:
            logger.error("Dataset is empty")
            return
        
        logger.info(f"Dataset columns: {list(df.columns[:10])}...")  # Show first 10
        
    except Exception as e:
        logger.error(f"Failed to load data: {str(e)}")
        return
    
    # Step 2: Initialize and fit preprocessor
    try:
        logger.info("Initializing preprocessing pipeline")
        preprocessor = LeadDataPreprocessor(
            output_dir='preprocessed_output',
            pipeline_name='lead_scoring_pipeline_v1.pkl'
        )
        
        # Validate input data
        is_valid, issues = preprocessor.validate_input_data(df)
        if not is_valid:
            logger.warning("Data validation issues found, but continuing with processing")
        
        # Fit and transform training data
        logger.info("Processing training data")
        X_train_processed = preprocessor.fit_transform(
            df,
            save_to_csv=True,
            filename='lead_scoring_processed.csv'  # Single file for processed data
        )
        
        logger.info("Training data processing completed successfully")
        
    except Exception as e:
        logger.error(f"Failed to process training data: {str(e)}")
        return
    
    # Step 3: Save pipeline and generate documentation
    try:
        # Save pipeline for production deployment
        pipeline_path = preprocessor.save_pipeline()
        logger.info(f"Pipeline saved for production use: {pipeline_path}")
        
        # Generate feature documentation
        feature_docs = preprocessor.save_feature_documentation()
        logger.info("Feature documentation generated")
        
        # Generate preprocessing report
        report = preprocessor.generate_preprocessing_report()
        report_path = os.path.join(preprocessor.output_dir, 'logs', 'preprocessing_report.txt')
        with open(report_path, 'w') as f:
            f.write(report)
        logger.info(f"Preprocessing report saved: {report_path}")
        
    except Exception as e:
        logger.error(f"Failed to save pipeline or documentation: {str(e)}")
        return
    
    # Step 4: Demonstrate pipeline loading capability (without saving additional files)
    try:
        logger.info("Demonstrating pipeline loading capability")
        
        # Load pipeline (simulating production environment)
        loaded_preprocessor = LeadDataPreprocessor.load_pipeline(pipeline_path)
        
        # Validate loaded pipeline is ready for inference
        logger.info("Pipeline loaded successfully and ready for inference")
        logger.info(f"Loaded pipeline can process {len(loaded_preprocessor.feature_names_)} features")
        
        # Test validation on sample data without saving
        new_data_sample = df.sample(n=min(50, len(df)), random_state=42)
        is_valid, issues = loaded_preprocessor.validate_input_data(new_data_sample)
        
        if is_valid:
            logger.info("Sample data validation passed - pipeline ready for production inference")
        else:
            logger.warning(f"Sample data validation found {len(issues)} issues")
        
    except Exception as e:
        logger.error(f"Failed to load or validate pipeline: {str(e)}")
        return
    
    # Step 5: Display final summary
    print("\n" + "=" * 80)
    print("PROCESSING COMPLETE - SUMMARY")
    print("=" * 80)
    
    summary = preprocessor.get_processing_summary()
    print("\nPipeline Configuration:")
    for key, value in summary.items():
        if key != 'missing_value_strategies':
            print(f"  {key.replace('_', ' ').title()}: {value}")
    
    print(f"\nOutput Files Generated:")
    print(f"  Data Files:")
    print(f"    - preprocessed_output/data/lead_scoring_processed.csv")
    print(f"  Model Files:")
    print(f"    - preprocessed_output/models/lead_scoring_pipeline_v1.pkl")
    print(f"  Documentation:")
    print(f"    - preprocessed_output/logs/feature_documentation.csv")
    print(f"    - preprocessed_output/logs/preprocessing_report.txt")
    print(f"    - lead_preprocessor.log")
    
    print(f"\nFinal Dataset Shape: {X_train_processed.shape}")
    print(f"Features Generated: {len(preprocessor.feature_names_)}")
    print(f"Ready for Machine Learning Model Training!")
    
    # Display feature importance weights
    print(f"\nFeature Importance Weights:")
    weights = preprocessor.get_feature_importance_weights()
    for category, weight in [('numerical', 1.0), ('high_categorical', 0.8), 
                           ('medium_categorical', 0.6), ('binary', 0.4)]:
        count = len([f for f, w in weights.items() if abs(w - weight) < 0.1])
        if count > 0:
            print(f"  {category.replace('_', ' ').title()}: {weight} (n={count})")
    
    print("\n" + "=" * 80)


if __name__ == "__main__":
    main()

2025-07-17 14:46:50,139 - __main__ - INFO - Data loaded successfully from C:\Users\Minfy.DESKTOP-3E50D5N\Music\customer_lead\data\Lead Scoring.csv
2025-07-17 14:46:50,140 - __main__ - INFO - Dataset shape: (9240, 37)
2025-07-17 14:46:50,140 - __main__ - INFO - Dataset columns: ['Prospect ID', 'Lead Number', 'Lead Origin', 'Lead Source', 'Do Not Email', 'Do Not Call', 'Converted', 'TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit']...
2025-07-17 14:46:50,141 - __main__ - INFO - Initializing preprocessing pipeline
2025-07-17 14:46:50,144 - __main__ - INFO - Initialized LeadDataPreprocessor with output directory: preprocessed_output
2025-07-17 14:46:50,145 - __main__ - INFO - Input data validation passed
2025-07-17 14:46:50,145 - __main__ - INFO - Processing training data
2025-07-17 14:46:50,146 - __main__ - INFO - Performing fit_transform operation
2025-07-17 14:46:50,146 - __main__ - INFO - Starting preprocessing pipeline fitting
2025-07-17 14:46:50,147 - __main__ - IN

LEAD DATA PREPROCESSING PIPELINE - PRODUCTION VERSION


2025-07-17 14:46:50,311 - __main__ - INFO - Transforming data with MultiColumnLabelEncoder
2025-07-17 14:46:50,336 - __main__ - INFO - Transformation complete:
2025-07-17 14:46:50,337 - __main__ - INFO -   Original shape: (9240, 37)
2025-07-17 14:46:50,337 - __main__ - INFO -   Final shape: (9240, 109)
2025-07-17 14:46:50,338 - __main__ - INFO -   Features generated: 109
2025-07-17 14:46:50,345 - __main__ - INFO -   Numerical features scaled to range: [0.000, 1.000]
2025-07-17 14:46:50,591 - __main__ - INFO - Processed data saved to: preprocessed_output\data\lead_scoring_processed.csv
2025-07-17 14:46:50,593 - __main__ - INFO - Training data processing completed successfully
2025-07-17 14:46:50,597 - __main__ - INFO - Pipeline saved to: preprocessed_output\models\lead_scoring_pipeline_v1.pkl
2025-07-17 14:46:50,598 - __main__ - INFO - Pipeline saved for production use: preprocessed_output\models\lead_scoring_pipeline_v1.pkl
2025-07-17 14:46:50,600 - __main__ - INFO - Feature documentat

# testing the pipeline pkl file for the new data 

In [2]:
"""
Simple test script to verify the saved preprocessing pipeline works correctly.

This script:
1. Creates sample data similar to the original dataset
2. Loads the saved pipeline using joblib directly
3. Tests preprocessing on the sample data
4. Validates the output

Usage: Run this after the main pipeline has been trained and saved.
Note: This script loads the pipeline directly using joblib, no module import needed.
"""

import pandas as pd
import numpy as np
import os
import joblib
from datetime import datetime

def create_sample_data(n_samples=100):
    """
    Create sample data that mimics the structure of the original lead scoring dataset.
    
    Parameters:
    -----------
    n_samples : int, default=100
        Number of sample records to create
        
    Returns:
    --------
    DataFrame
        Sample dataset with similar structure to original data
    """
    np.random.seed(42)  # For reproducible results
    
    # Create sample data with similar structure to original dataset
    sample_data = {
        # ID columns
        'Prospect ID': [f'PROS_{i:05d}' for i in range(n_samples)],
        'Lead Number': [f'LEAD_{i:05d}' for i in range(n_samples)],
        
        # Numerical features
        'TotalVisits': np.random.randint(1, 50, n_samples),
        'Total Time Spent on Website': np.random.randint(0, 3600, n_samples),
        'Page Views Per Visit': np.random.uniform(1, 20, n_samples),
        'Asymmetrique Activity Score': np.random.randint(0, 100, n_samples),
        'Asymmetrique Profile Score': np.random.randint(0, 100, n_samples),
        
        # High importance categorical features
        'Tags': np.random.choice(['Hot Lead', 'Cold Lead', 'Warm Lead', 'Lost Lead'], n_samples),
        'Lead Quality': np.random.choice(['High', 'Medium', 'Low'], n_samples),
        'Lead Profile': np.random.choice(['Potential Lead', 'Sure Lead', 'Not Sure'], n_samples),
        'What is your current occupation': np.random.choice(['Student', 'Working Professional', 'Unemployed', 'Other'], n_samples),
        'Last Activity': np.random.choice(['Email Opened', 'SMS Sent', 'Page Visited', 'Form Submitted'], n_samples),
        'Last Notable Activity': np.random.choice(['Email Opened', 'SMS Sent', 'Page Visited', 'Form Submitted'], n_samples),
        'Lead Source': np.random.choice(['Direct Traffic', 'Google', 'Organic Search', 'Reference'], n_samples),
        'Lead Origin': np.random.choice(['Landing Page Submission', 'API', 'Quick Add Form'], n_samples),
        
        # Medium importance categorical features
        'Specialization': np.random.choice(['Marketing Management', 'Finance Management', 'HR Management', 'Operations Management'], n_samples),
        'City': np.random.choice(['Mumbai', 'Delhi', 'Bangalore', 'Chennai', 'Kolkata'], n_samples),
        'How did you hear about X Education': np.random.choice(['Online Search', 'Social Media', 'Word of Mouth', 'Advertisement'], n_samples),
        'What matters most to you in choosing a course': np.random.choice(['Flexibility', 'Quality', 'Price', 'Brand'], n_samples),
        'Country': np.random.choice(['India', 'USA', 'UK', 'Australia'], n_samples),
        'Asymmetrique Activity Index': np.random.choice(['01.High', '02.Medium', '03.Low'], n_samples),
        'Asymmetrique Profile Index': np.random.choice(['01.High', '02.Medium', '03.Low'], n_samples),
        
        # Binary features
        'Do Not Email': np.random.choice(['Yes', 'No'], n_samples),
        'Do Not Call': np.random.choice(['Yes', 'No'], n_samples),
        'A free copy of Mastering The Interview': np.random.choice(['Yes', 'No'], n_samples),
        
        # Features that will be dropped (low variance)
        'Magazine': ['No'] * n_samples,
        'Search': ['No'] * n_samples,
        'Newspaper': ['No'] * n_samples,
        
        # Target variable
        'Converted': np.random.choice([0, 1], n_samples)
    }
    
    # Add some missing values to test imputation
    df = pd.DataFrame(sample_data)
    
    # Introduce missing values randomly
    missing_columns = ['TotalVisits', 'City', 'Specialization', 'Do Not Email']
    for col in missing_columns:
        if col in df.columns:
            # Make 10% of values missing
            missing_indices = np.random.choice(df.index, size=int(0.1 * len(df)), replace=False)
            df.loc[missing_indices, col] = np.nan
    
    return df

def test_pipeline(pipeline_path, sample_data):
    """
    Test the saved pipeline with sample data using direct joblib loading.
    
    Parameters:
    -----------
    pipeline_path : str
        Path to the saved pipeline file
    sample_data : DataFrame
        Sample data to test with
        
    Returns:
    --------
    tuple
        (success, processed_data, error_message)
    """
    try:
        print(f"Testing pipeline: {pipeline_path}")
        print(f"Sample data shape: {sample_data.shape}")
        
        # Load the saved pipeline directly using joblib
        print("Loading saved pipeline...")
        loaded_preprocessor = joblib.load(pipeline_path)
        
        # Check if the loaded object has the expected attributes
        if not hasattr(loaded_preprocessor, 'pipeline') or not hasattr(loaded_preprocessor, 'is_fitted'):
            raise ValueError("Loaded object doesn't appear to be a valid LeadDataPreprocessor")
        
        if not loaded_preprocessor.is_fitted:
            raise ValueError("Loaded pipeline is not fitted")
        
        print(f"✓ Pipeline loaded successfully")
        print(f"✓ Pipeline is fitted and ready to use")
        print(f"✓ Expected output features: {len(loaded_preprocessor.feature_names_)}")
        
        # Validate input data (basic checks)
        print("Performing basic input validation...")
        
        # Check for empty dataset
        if sample_data.empty:
            raise ValueError("Input dataset is empty")
        
        # Check for minimum required columns (basic check)
        required_columns = (loaded_preprocessor.numerical_features + 
                          loaded_preprocessor.high_importance_categorical + 
                          loaded_preprocessor.medium_importance_categorical + 
                          loaded_preprocessor.binary_features)
        
        available_columns = set(sample_data.columns)
        missing_columns = [col for col in required_columns if col not in available_columns]
        
        if missing_columns:
            print(f"⚠️  Warning: Missing some expected columns: {missing_columns[:5]}...")
            print("    Continuing with available columns...")
        else:
            print("✓ All expected columns are present")
        
        # Transform the sample data
        print("Transforming sample data...")
        processed_data = loaded_preprocessor.transform(
            sample_data, 
            save_to_csv=False  # Don't save during testing
        )
        
        print(f"✓ Processing successful!")
        print(f"✓ Input shape: {sample_data.shape}")
        print(f"✓ Output shape: {processed_data.shape}")
        
        return True, processed_data, None
        
    except Exception as e:
        error_msg = f"Pipeline test failed: {str(e)}"
        print(f"❌ {error_msg}")
        return False, None, error_msg

def validate_output(processed_data, original_data):
    """
    Validate the processed output data.
    
    Parameters:
    -----------
    processed_data : DataFrame
        Processed data from pipeline
    original_data : DataFrame
        Original input data
        
    Returns:
    --------
    bool
        True if validation passes
    """
    try:
        print("\nValidating processed output...")
        
        # Check basic properties
        print(f"✓ Output is DataFrame: {isinstance(processed_data, pd.DataFrame)}")
        print(f"✓ Same number of rows: {len(processed_data) == len(original_data)}")
        print(f"✓ No missing values: {processed_data.isnull().sum().sum() == 0}")
        
        # Check data types
        numeric_cols = processed_data.select_dtypes(include=[np.number]).columns
        print(f"✓ Numeric columns: {len(numeric_cols)}")
        
        # Check value ranges for scaled features
        for col in numeric_cols:
            min_val = processed_data[col].min()
            max_val = processed_data[col].max()
            print(f"  {col}: [{min_val:.3f}, {max_val:.3f}]")
        
        # Check for any infinite or extremely large values
        has_inf = np.isinf(processed_data.select_dtypes(include=[np.number])).any().any()
        print(f"✓ No infinite values: {not has_inf}")
        
        # Check column names
        print(f"✓ Feature names generated: {len(processed_data.columns)} columns")
        
        return True
        
    except Exception as e:
        print(f"Output validation failed: {str(e)}")
        return False

def main():
    """
    Main function to run the pipeline test.
    """
    print("=" * 60)
    print("TESTING SAVED PREPROCESSING PIPELINE")
    print("=" * 60)
    
    # Step 1: Define pipeline path
    pipeline_path = "preprocessed_output/models/lead_scoring_pipeline_v1.pkl"
    
    # Check if pipeline file exists
    if not os.path.exists(pipeline_path):
        print(f"❌ Pipeline file not found: {pipeline_path}")
        print("Please run the main preprocessing script first to create the pipeline.")
        print("Expected file location: preprocessed_output/models/lead_scoring_pipeline_v1.pkl")
        return
    
    print(f"✓ Pipeline file found: {pipeline_path}")
    
    # Check file size to ensure it's not empty
    file_size = os.path.getsize(pipeline_path)
    print(f"✓ Pipeline file size: {file_size / (1024*1024):.2f} MB")
    
    # Step 2: Create sample data
    print("Creating sample data...")
    sample_data = create_sample_data(n_samples=50)
    print(f"✓ Sample data created with {len(sample_data)} records")
    
    # Display sample data info
    print(f"\nSample data info:")
    print(f"  Shape: {sample_data.shape}")
    print(f"  Columns: {len(sample_data.columns)}")
    print(f"  Missing values: {sample_data.isnull().sum().sum()}")
    
    # Step 3: Test the pipeline
    print(f"\n" + "-" * 60)
    success, processed_data, error_msg = test_pipeline(pipeline_path, sample_data)
    
    if not success:
        print(f"❌ Pipeline test failed!")
        print(f"Error: {error_msg}")
        return
    
    # Step 4: Validate output
    print(f"\n" + "-" * 60)
    validation_success = validate_output(processed_data, sample_data)
    
    if not validation_success:
        print("❌ Output validation failed!")
        return
    
    # Step 5: Display results
    print(f"\n" + "=" * 60)
    print("TEST RESULTS SUMMARY")
    print("=" * 60)
    print(f"✅ Pipeline loading: SUCCESS")
    print(f"✅ Data processing: SUCCESS")
    print(f"✅ Output validation: SUCCESS")
    print(f"")
    print(f"Input Data:")
    print(f"  Records: {len(sample_data)}")
    print(f"  Features: {len(sample_data.columns)}")
    print(f"  Missing values: {sample_data.isnull().sum().sum()}")
    print(f"")
    print(f"Output Data:")
    print(f"  Records: {len(processed_data)}")
    print(f"  Features: {len(processed_data.columns)}")
    print(f"  Missing values: {processed_data.isnull().sum().sum()}")
    print(f"")
    print(f"🎉 Pipeline is working correctly and ready for production!")
    
    # Show sample of processed data
    print(f"\nSample of processed data (first 3 rows, first 10 columns):")
    print(processed_data.iloc[:3, :10].round(3))

if __name__ == "__main__":
    main()

2025-07-17 14:46:50,661 - __main__ - INFO - Starting data transformation
2025-07-17 14:46:50,662 - __main__ - INFO - Preparing data for preprocessing
2025-07-17 14:46:50,663 - __main__ - INFO - Data preparation complete:
2025-07-17 14:46:50,664 - __main__ - INFO -   Initial shape: (50, 29)
2025-07-17 14:46:50,666 - __main__ - INFO -   Columns removed: 3
2025-07-17 14:46:50,666 - __main__ - INFO -   Final shape: (50, 26)
2025-07-17 14:46:50,667 - __main__ - INFO - Applying preprocessing transformations
2025-07-17 14:46:50,674 - __main__ - INFO - Transforming data with MultiColumnLabelEncoder
2025-07-17 14:46:50,688 - __main__ - INFO - Transforming data with MultiColumnLabelEncoder
2025-07-17 14:46:50,692 - __main__ - INFO - Transformation complete:
2025-07-17 14:46:50,693 - __main__ - INFO -   Original shape: (50, 29)
2025-07-17 14:46:50,694 - __main__ - INFO -   Final shape: (50, 109)
2025-07-17 14:46:50,694 - __main__ - INFO -   Features generated: 109
2025-07-17 14:46:50,696 - __main

TESTING SAVED PREPROCESSING PIPELINE
✓ Pipeline file found: preprocessed_output/models/lead_scoring_pipeline_v1.pkl
✓ Pipeline file size: 0.02 MB
Creating sample data...
✓ Sample data created with 50 records

Sample data info:
  Shape: (50, 29)
  Columns: 29
  Missing values: 20

------------------------------------------------------------
Testing pipeline: preprocessed_output/models/lead_scoring_pipeline_v1.pkl
Sample data shape: (50, 29)
Loading saved pipeline...
✓ Pipeline loaded successfully
✓ Pipeline is fitted and ready to use
✓ Expected output features: 109
Performing basic input validation...
✓ All expected columns are present
Transforming sample data...
✓ Processing successful!
✓ Input shape: (50, 29)
✓ Output shape: (50, 109)

------------------------------------------------------------

Validating processed output...
✓ Output is DataFrame: True
✓ Same number of rows: True
✓ No missing values: True
✓ Numeric columns: 0
✓ No infinite values: True
✓ Feature names generated: 109