In [29]:
import pickle
import pandas as pd
import numpy as np
import warnings
from pathlib import Path
from typing import Union, Dict, Any, Optional, Tuple
import logging

# Root logging setup: INFO and above, each record rendered as
# "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-scoped logger used by everything defined in this file.
logger = logging.getLogger(__name__)

class LeadConversionPipeline:
    """
    A pipeline for lead conversion prediction.

    This class provides functionality to:
    - Load trained preprocessing pipeline and model
    - Clean and preprocess incoming data
    - Align features as per the training phase
    - Perform predictions
    - Save prediction results to a file

    The model artifact may be either a bare estimator or a dict wrapping the
    estimator (optionally with a ``'feature_names'`` entry); both layouts are
    handled throughout the class.
    """

    # Keys probed, in priority order, when the model artifact is a dict.
    _MODEL_KEYS = ('best_estimator_', 'model', 'estimator', 'best_model', 'classifier')

    # Lower-cased, stripped text values recognised in binary columns.
    # Anything not listed here is encoded as 0.
    _BINARY_MAPPING = {
        'yes': 1, 'y': 1, '1': 1, 'true': 1, 'on': 1,
        'no': 0, 'n': 0, '0': 0, 'false': 0, 'off': 0,
        'unknown': 0, 'nan': 0, 'none': 0
    }

    def __init__(self, pipeline_path: Union[str, Path], metadata_path: Union[str, Path],
                 model_path: Union[str, Path]):
        """
        Initialize the pipeline by loading all required components.

        Args:
            pipeline_path: Path to the preprocessing pipeline (.pkl file)
            metadata_path: Path to the metadata file (.pkl)
            model_path: Path to the trained model (.pkl)

        Raises:
            Exception: Whatever loading raised (missing file, unpickling
                error, or ValueError when no usable model is found).
        """
        self.pipeline_path = pipeline_path
        self.metadata_path = metadata_path
        self.model_path = model_path

        self.preprocessor = None
        self.metadata = None
        self.model = None
        self.model_data = None

        self._load_components()

    @staticmethod
    def _load_pickle(path: Union[str, Path]) -> Any:
        """Unpickle and return the object stored at ``path``."""
        # NOTE(security): pickle.load can execute arbitrary code. Only point
        # this class at artifacts produced by a trusted training run.
        with open(path, 'rb') as f:
            return pickle.load(f)

    def _extract_model(self, model_data: Dict[str, Any]) -> Any:
        """Pick the estimator out of a dict artifact, or return None if absent."""
        model = None
        for key in self._MODEL_KEYS:
            if key in model_data:
                model = model_data[key]
                logger.info(f"Model loaded from key: '{key}'")
                break
        if model is None:
            # No well-known key held a model: fall back to the first value
            # that looks like an estimator.
            model = next(
                (value for value in model_data.values() if hasattr(value, 'predict')),
                None,
            )
        return model

    def _load_components(self) -> None:
        """Loads the preprocessing pipeline, metadata, and trained model from disk.

        Raises:
            ValueError: If the loaded model object has no ``predict`` method.
            Exception: Any I/O or unpickling error is logged and re-raised.
        """
        try:
            self.preprocessor = self._load_pickle(self.pipeline_path)
            logger.info("Preprocessing pipeline loaded successfully")

            self.metadata = self._load_pickle(self.metadata_path)
            logger.info("Metadata loaded successfully")

            self.model_data = self._load_pickle(self.model_path)

            if isinstance(self.model_data, dict):
                self.model = self._extract_model(self.model_data)
            else:
                # The pickle holds the estimator itself.
                self.model = self.model_data

            if not hasattr(self.model, 'predict'):
                raise ValueError("Loaded object is not a valid model")

            logger.info(f"Model type: {type(self.model)}")

        except Exception as e:
            logger.error(f"Error loading components: {e}")
            raise

    def _stored_feature_names(self) -> Optional[Any]:
        """Return ``model_data['feature_names']`` when the artifact is a dict that has it.

        Returns None for a bare-estimator artifact, so callers never run a
        membership test against a non-container object.
        """
        if isinstance(self.model_data, dict):
            return self.model_data.get('feature_names')
        return None

    def _clean_data_types(self, df: pd.DataFrame) -> pd.DataFrame:
        """Cleans the input DataFrame to correct data types and missing values.

        Numerical columns are coerced to numbers (unparseable values become
        missing) and missing entries are filled with the column median, or 0
        when the whole column is missing. Categorical columns are cast to
        text, the literals 'nan'/'None'/'null' become 'Unknown', and
        surrounding whitespace is stripped.

        Args:
            df: Raw input frame; it is not modified.

        Returns:
            A cleaned copy of ``df``.
        """
        df_clean = df.copy()

        for col in self.metadata['numerical_features']:
            if col not in df_clean.columns:
                continue
            numeric = pd.to_numeric(df_clean[col].astype(str), errors='coerce')
            if numeric.isna().any():
                median_val = numeric.median()
                # Assign the filled result back instead of a chained
                # ``fillna(inplace=True)``, which does nothing under pandas
                # Copy-on-Write.
                numeric = numeric.fillna(0 if pd.isna(median_val) else median_val)
            df_clean[col] = numeric

        for col in self.metadata['categorical_features']:
            if col in df_clean.columns:
                df_clean[col] = (
                    df_clean[col].astype(str)
                    .replace(['nan', 'None', 'null'], 'Unknown')
                    .str.strip()
                )

        return df_clean

    def _encode_binary_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Encodes binary categorical values into numerical format.

        For every binary column present, the column is normalised to
        lower-case stripped text and a ``<col>_encoded`` integer column is
        added. Unrecognised values are encoded as 0.

        Args:
            df: Input frame; it is not modified.

        Returns:
            A copy of ``df`` with the extra ``*_encoded`` columns.
        """
        df_encoded = df.copy()

        for col in self.metadata['binary_features']:
            if col in df_encoded.columns:
                df_encoded[col] = df_encoded[col].astype(str).str.strip().str.lower()
                df_encoded[f'{col}_encoded'] = (
                    df_encoded[col].map(self._BINARY_MAPPING).fillna(0).astype(int)
                )

        return df_encoded

    def _align_features(self, X: pd.DataFrame) -> pd.DataFrame:
        """Aligns feature columns to match those used in model training.

        The expected column list comes from the artifact's ``'feature_names'``
        entry or, failing that, from the model's ``feature_names_in_``. When
        neither exists ``X`` is returned unchanged. Expected columns absent
        from ``X`` are filled with 0; columns of ``X`` that are not expected
        are dropped.
        """
        expected_features = self._stored_feature_names()
        if expected_features is None:
            if not hasattr(self.model, 'feature_names_in_'):
                return X
            expected_features = self.model.feature_names_in_

        expected_features = list(expected_features)
        expected_lookup = set(expected_features)

        X_aligned = pd.DataFrame(0, index=X.index, columns=expected_features)
        for col in X.columns:
            if col in expected_lookup:
                X_aligned[col] = X[col]

        return X_aligned

    def preprocess_data(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
        """Runs the entire preprocessing pipeline on the input data.

        Args:
            df: Raw input frame. A ``'Converted'`` column, if present, is
                split off as the ground-truth labels. ``df`` is not modified.

        Returns:
            ``(X_final, actual_labels)``: the model-ready feature frame and
            the labels (None when ``df`` has no ``'Converted'`` column).

        Raises:
            ValueError: If none of the numerical/categorical columns listed
                in the metadata are present in ``df``.
        """
        df_processed = df.copy()
        actual_labels = None

        if 'Converted' in df_processed.columns:
            actual_labels = df_processed.pop('Converted')

        df_processed = self._clean_data_types(df_processed)
        df_processed = self._encode_binary_features(df_processed)

        numerical_cols = self.metadata['numerical_features']
        categorical_cols = self.metadata['categorical_features']
        binary_cols = self.metadata['binary_features']

        available_num_cols = [col for col in numerical_cols if col in df_processed.columns]
        available_cat_cols = [col for col in categorical_cols if col in df_processed.columns]
        feature_cols = available_num_cols + available_cat_cols

        if not feature_cols:
            raise ValueError("No valid numerical or categorical columns found for preprocessing")

        X_transformed = self.preprocessor.transform(df_processed[feature_cols])
        if hasattr(X_transformed, 'toarray'):
            # Some transformers return a scipy sparse matrix, which the
            # DataFrame constructor below cannot consume directly.
            X_transformed = X_transformed.toarray()

        try:
            feature_names = self.preprocessor.get_feature_names_out()
        except Exception:
            # Best effort: preprocessors that cannot report names get
            # positional placeholders. These are unlikely to match the
            # training column names, hence the warning.
            logger.warning("Preprocessor could not report feature names; using generic names")
            feature_names = [f"feature_{i}" for i in range(X_transformed.shape[1])]

        X_final = pd.DataFrame(X_transformed, columns=feature_names, index=df_processed.index)

        for col in binary_cols:
            encoded_col = f"{col}_encoded"
            if col in df.columns and encoded_col in df_processed.columns:
                X_final[encoded_col] = df_processed[encoded_col].values

        X_final = self._align_features(X_final)
        return X_final, actual_labels

    def predict(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Generates predictions for the given input data.

        Args:
            df: Raw input frame (see ``preprocess_data``).

        Returns:
            Dict with ``'predictions'``, ``'probabilities'`` (None when the
            model cannot provide them), ``'actual_labels'``, ``'input_data'``
            (a copy of ``df``) and, when labels were present, ``'accuracy'``
            as a percentage.
        """
        X_final, actual_labels = self.preprocess_data(df)
        predictions = self.model.predict(X_final)

        probabilities = None
        predict_proba = getattr(self.model, 'predict_proba', None)
        if callable(predict_proba):
            try:
                probabilities = predict_proba(X_final)
            except Exception as exc:
                # Probabilities are optional; keep the predictions.
                logger.warning(f"predict_proba failed, continuing without probabilities: {exc}")

        results = {
            'predictions': predictions,
            'probabilities': probabilities,
            'actual_labels': actual_labels,
            'input_data': df.copy()
        }

        if actual_labels is not None:
            results['accuracy'] = (predictions == actual_labels).mean() * 100

        return results

    def predict_from_file(self, file_path: str, output_path: Optional[str] = None) -> Dict[str, Any]:
        """Reads data from a CSV file, makes predictions, and optionally saves results to CSV.

        Args:
            file_path: CSV file to read the input rows from.
            output_path: When given, the per-row results table is written
                there as CSV (without the index).

        Returns:
            The dict from ``predict`` plus ``'results_df'``: the input rows
            with ``Predicted_Converted``, one ``Prob_Class_<i>`` column per
            class (when probabilities exist), and ``Actual_Converted`` /
            ``Prediction_Correct`` (when labels exist).
        """
        df = pd.read_csv(file_path)
        results = self.predict(df)

        results_df = results['input_data'].copy()
        results_df['Predicted_Converted'] = results['predictions']

        probabilities = results['probabilities']
        if probabilities is not None:
            prob_df = pd.DataFrame(
                probabilities,
                columns=[f'Prob_Class_{i}' for i in range(probabilities.shape[1])],
                index=results_df.index,
            )
            results_df = pd.concat([results_df, prob_df], axis=1)

        if results['actual_labels'] is not None:
            results_df['Actual_Converted'] = results['actual_labels']
            results_df['Prediction_Correct'] = (
                results_df['Predicted_Converted'] == results_df['Actual_Converted']
            )

        if output_path:
            results_df.to_csv(output_path, index=False)

        results['results_df'] = results_df
        return results

    def get_model_info(self) -> Dict[str, Any]:
        """Returns metadata and details about the loaded model.

        Returns:
            Dict with ``'model_type'`` and the three feature lists from the
            metadata. ``'total_features'`` and ``'feature_names'`` are added
            only when the model artifact is a dict carrying ``'feature_names'``.
        """
        info = {
            'model_type': type(self.model).__name__,
            'numerical_features': self.metadata['numerical_features'],
            'categorical_features': self.metadata['categorical_features'],
            'binary_features': self.metadata['binary_features']
        }

        feature_names = self._stored_feature_names()
        if feature_names is not None:
            info['total_features'] = len(feature_names)
            info['feature_names'] = feature_names

        return info

# Utility Functions
def create_pipeline(pipeline_path: str, metadata_path: str, model_path: str) -> LeadConversionPipeline:
    """Build a LeadConversionPipeline from the three artifact paths and return it."""
    pipeline = LeadConversionPipeline(
        pipeline_path=pipeline_path,
        metadata_path=metadata_path,
        model_path=model_path,
    )
    return pipeline

def predict_from_csv(pipeline_path: str, metadata_path: str, model_path: str,
                     input_csv: str, output_csv: Optional[str] = None) -> Dict[str, Any]:
    """Convenience function to predict using input CSV file.

    Builds a pipeline from the three artifact paths, then runs its
    ``predict_from_file`` on ``input_csv``.

    Args:
        pipeline_path: Path to the preprocessing pipeline pickle.
        metadata_path: Path to the metadata pickle.
        model_path: Path to the trained model pickle.
        input_csv: CSV file holding the rows to score.
        output_csv: Optional destination passed through as the output path;
            None (the default) means no output path is supplied.

    Returns:
        The results dict returned by ``predict_from_file``.
    """
    pipeline = create_pipeline(pipeline_path, metadata_path, model_path)
    return pipeline.predict_from_file(input_csv, output_csv)

if __name__ == "__main__":
    # Demo run: silence library warnings so the summary stays readable.
    warnings.filterwarnings('ignore')

    # Artifact and data locations on the author's machine.
    PIPELINE_PATH = r"C:\Users\Minfy.DESKTOP-3E50D5N\Desktop\final_capstone\preprocess\preprocessed_output\processed\pipeline_model.pkl"
    METADATA_PATH = r"C:\Users\Minfy.DESKTOP-3E50D5N\Desktop\final_capstone\preprocess\preprocessed_output\processed\metadata.pkl"
    MODEL_PATH = r"C:\Users\Minfy.DESKTOP-3E50D5N\Desktop\final_capstone\modal_development\model\best_model_xgboost.pkl"
    INPUT_CSV = r"C:\Users\Minfy.DESKTOP-3E50D5N\Desktop\final_capstone\raw_data\check.csv"
    OUTPUT_CSV = r"C:\Users\Minfy.DESKTOP-3E50D5N\Desktop\final_capstone\prediction_results.csv"

    # Option 1: one-shot helper that loads the artifacts and scores the CSV.
    results = predict_from_csv(
        pipeline_path=PIPELINE_PATH,
        metadata_path=METADATA_PATH,
        model_path=MODEL_PATH,
        input_csv=INPUT_CSV,
        output_csv=OUTPUT_CSV,
    )

    # Option 2: build the pipeline object explicitly and reuse it.
    pipeline = create_pipeline(
        pipeline_path=PIPELINE_PATH,
        metadata_path=METADATA_PATH,
        model_path=MODEL_PATH,
    )
    model_info = pipeline.get_model_info()
    total_features = model_info.get('total_features', 'Unknown')

    print(f"Model type: {model_info['model_type']}")
    print(f"Total features: {total_features}")

    results = pipeline.predict_from_file(INPUT_CSV, OUTPUT_CSV)
    prediction_counts = pd.Series(results['predictions']).value_counts().to_dict()

    print("Prediction Summary:")
    print(f"  - Total samples: {len(results['predictions'])}")
    print(f"  - Predictions: {prediction_counts}")
    if 'accuracy' in results:
        print(f"  - Accuracy: {results['accuracy']:.2f}%")

    print("Prediction pipeline completed successfully.")


2025-07-17 21:18:31,036 - INFO - Preprocessing pipeline loaded successfully
2025-07-17 21:18:31,040 - INFO - Metadata loaded successfully
2025-07-17 21:18:31,078 - INFO - Model loaded from key: 'model'
2025-07-17 21:18:31,080 - INFO - Model type: <class 'xgboost.sklearn.XGBClassifier'>
2025-07-17 21:18:31,374 - INFO - Preprocessing pipeline loaded successfully
2025-07-17 21:18:31,376 - INFO - Metadata loaded successfully
2025-07-17 21:18:31,386 - INFO - Model loaded from key: 'model'
2025-07-17 21:18:31,387 - INFO - Model type: <class 'xgboost.sklearn.XGBClassifier'>


Model type: XGBClassifier
Total features: 179
Prediction Summary:
  - Total samples: 5
  - Predictions: {0: 5}
  - Accuracy: 60.00%
Prediction pipeline completed successfully.
