<h1 align=center>Data Ingestion</h1>

In [1]:
import os

# Step up one directory so that project-relative paths and imports resolve
# from the parent folder. NOTE: the path is relative, so re-running this
# cell moves the working directory up one more level each time.
os.chdir("../")
os.getcwd()

'c:\\Users\\44787\\Desktop\\mlops-pro-project'

In [2]:
import os
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
from src.logger import logger
from src.exception import CustomException

In [3]:
@dataclass
class DataIngestionConfig:
    """Settings consumed by the data ingestion step.

    The three paths are required; the split parameters fall back to a
    20% test share and a fixed seed of 42 when not supplied.
    """
    # Location of the raw CSV that is read before splitting.
    raw_data_path: str
    # Destination CSV for the training split.
    train_data_path: str
    # Destination CSV for the test split.
    test_data_path: str
    # Forwarded to train_test_split as `test_size`.
    test_size: float = 0.2
    # Forwarded to train_test_split as `random_state`.
    random_state: int = 42

In [4]:
class DataIngestion:
    """
    Handles data loading and splitting into train/test sets.

    Reads the raw CSV named in the config, writes the train and test splits
    as CSV files, and can report summary statistics about the raw data.
    """

    # Column used for stratified splitting when it is present in the data.
    TARGET_COLUMN = 'Churn'

    def __init__(self, config: DataIngestionConfig):
        """
        Initialize DataIngestion component.

        Args:
            config: DataIngestionConfig object with paths and parameters
        """
        self.config = config
        logger.info("Data Ingestion component initialized")

    def initiate_data_ingestion(self) -> tuple:
        """
        Load data and split into train/test sets.

        The split is stratified on the ``Churn`` column when the dataset
        contains it; otherwise a plain random split is performed.

        Returns:
            Tuple of (train_data_path, test_data_path)

        Raises:
            CustomException: Wraps any error raised while reading,
                splitting or writing the data.
        """
        logger.info("Starting data ingestion process")

        try:
            # Read the dataset
            logger.info(f"Reading dataset from {self.config.raw_data_path}")
            df = pd.read_csv(self.config.raw_data_path)
            logger.info(f"Dataset loaded successfully. Shape: {df.shape}")

            # Basic info logging
            logger.info(f"Columns: {list(df.columns)}")
            logger.info(f"Missing values: {df.isnull().sum().sum()}")
            logger.info(f"Duplicates: {df.duplicated().sum()}")

            # Create the output directory of BOTH files; they may differ.
            # A bare filename has an empty dirname, which os.makedirs rejects.
            for output_path in (self.config.train_data_path, self.config.test_data_path):
                output_dir = os.path.dirname(output_path)
                if output_dir:
                    os.makedirs(output_dir, exist_ok=True)

            # Stratify on the target column itself rather than on whichever
            # column happens to be last in the file.
            stratify_on = (
                df[self.TARGET_COLUMN] if self.TARGET_COLUMN in df.columns else None
            )

            # Split the data
            logger.info(f"Splitting data with test_size={self.config.test_size}")
            train_set, test_set = train_test_split(
                df,
                test_size=self.config.test_size,
                random_state=self.config.random_state,
                stratify=stratify_on
            )

            logger.info(f"Train set shape: {train_set.shape}")
            logger.info(f"Test set shape: {test_set.shape}")

            # Save train and test sets
            train_set.to_csv(self.config.train_data_path, index=False, header=True)
            test_set.to_csv(self.config.test_data_path, index=False, header=True)

            logger.info("Data ingestion completed successfully")
            logger.info(f"Train data saved to: {self.config.train_data_path}")
            logger.info(f"Test data saved to: {self.config.test_data_path}")

            return (
                self.config.train_data_path,
                self.config.test_data_path
            )

        except Exception as e:
            logger.error("Error in data ingestion")
            # Chain the original error so its traceback is preserved.
            raise CustomException(e, sys) from e

    def get_data_info(self) -> dict:
        """
        Get information about the ingested data.

        Re-reads the raw CSV from ``config.raw_data_path`` on every call.

        Returns:
            Dictionary with data statistics: row/column counts, column
            names, per-column missing counts, duplicate row count and
            per-column dtype names.

        Raises:
            CustomException: Wraps any error raised while reading the file.
        """
        try:
            df = pd.read_csv(self.config.raw_data_path)

            return {
                'total_rows': len(df),
                'total_columns': len(df.columns),
                'columns': list(df.columns),
                'missing_values': df.isnull().sum().to_dict(),
                'duplicates': int(df.duplicated().sum()),
                'dtypes': df.dtypes.astype(str).to_dict()
            }

        except Exception as e:
            raise CustomException(e, sys) from e

In [None]:
def create_data_ingestion_config(config_dict: dict) -> DataIngestionConfig:
    """
    Create DataIngestionConfig from the data-ingestion config section.

    Args:
        config_dict: The data-ingestion section of the configuration. Its
            values are read with attribute access, so it must expose
            ``raw_data_path``, ``train_data_path``, ``test_data_path``,
            ``test_size`` and ``random_state`` as attributes.

    Returns:
        DataIngestionConfig object
    """
    # The caller already passes the ingestion section, so every setting is
    # read directly from it (no nested lookup).
    return DataIngestionConfig(
        raw_data_path=config_dict.raw_data_path,
        train_data_path=config_dict.train_data_path,
        test_data_path=config_dict.test_data_path,
        test_size=config_dict.test_size,
        random_state=config_dict.random_state,
    )

In [None]:
from pathlib import Path
from yaml import safe_load
from src.utils.common import read_yaml

# Run the ingestion stage: load the YAML config, build the typed ingestion
# config from its `data_ingestion` section, then split the raw dataset.
try:
    config_dict = read_yaml(Path("configs/config.yaml"))
    config = create_data_ingestion_config(config_dict.data_ingestion)
    data_ingestion = DataIngestion(config)
    # Returns the paths of the CSV files that were just written.
    train_data_path, test_data_path = data_ingestion.initiate_data_ingestion()
except Exception as e:
    logger.error(f"Data ingestion failed: {e}")
    # NOTE(review): initiate_data_ingestion already raises CustomException,
    # so such errors end up wrapped a second time here.
    raise CustomException(e, sys)

[2025-11-06 18:25:30,342] INFO - ChurnPrediction - yaml file: configs\config.yaml loaded successfully
[2025-11-06 18:25:30,356] INFO - ChurnPrediction - Data Ingestion component initialized
[2025-11-06 18:25:30,357] INFO - ChurnPrediction - Starting data ingestion process
[2025-11-06 18:25:30,358] INFO - ChurnPrediction - Reading dataset from data/raw/churn_data.csv
[2025-11-06 18:25:30,426] INFO - ChurnPrediction - Dataset loaded successfully. Shape: (7043, 21)


[2025-11-06 18:25:30,429] INFO - ChurnPrediction - Columns: ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']
[2025-11-06 18:25:30,442] INFO - ChurnPrediction - Missing values: 0
[2025-11-06 18:25:30,476] INFO - ChurnPrediction - Duplicates: 0
[2025-11-06 18:25:30,479] INFO - ChurnPrediction - Splitting data with test_size=0.2
[2025-11-06 18:25:30,504] INFO - ChurnPrediction - Train set shape: (5634, 21)
[2025-11-06 18:25:30,506] INFO - ChurnPrediction - Test set shape: (1409, 21)
[2025-11-06 18:25:30,585] INFO - ChurnPrediction - Data ingestion completed successfully
[2025-11-06 18:25:30,588] INFO - ChurnPrediction - Train data saved to: data/processed/train.csv
[2025-11-06 18:25:30,589] INFO - ChurnPrediction - Tes

<h1 align=center>Data Validation</h1>

In [14]:
import os
import sys
from dataclasses import dataclass
from typing import Any, Dict, List

import pandas as pd

from src.logger import logger
from src.exception import CustomException
# from src.utils.common import save_json

import json
from pathlib import Path

def save_json(path, data):
    """
    Write ``data`` to ``path`` as indented JSON.

    Missing parent directories are created first, so a report can be saved
    into a folder that does not exist yet.

    Args:
        path: Destination file path (string or Path).
        data: JSON-serializable object to write.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding keeps the output independent of the platform default.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

In [15]:
@dataclass
class DataValidationConfig:
    """Settings consumed by the data validation step."""
    # File the validation report is written to.
    report_path: str

In [22]:
import pandas as pd

# Load the raw dataset and preview its first rows for a quick visual check.
df = pd.read_csv(Path("data/raw/churn_data.csv"))
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [31]:
# Print column names, non-null counts and dtypes of the raw dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [28]:
class DataValidation:
    """
    Validates data quality and schema compliance.

    Schema checks compare a DataFrame against the expected churn columns.
    Quality checks summarise missing values, duplicates, numerical ranges
    and the target distribution. Both are combined into a JSON report.
    """

    # Expected schema for churn dataset
    EXPECTED_COLUMNS = [
        'customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
        'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
        'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
        'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
        'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'
    ]

    # TotalCharges is deliberately left out: its dtype is object, so it
    # would fail the numeric dtype check.
    NUMERICAL_COLUMNS = ['tenure', 'MonthlyCharges']

    CATEGORICAL_COLUMNS = [
        'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
        'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
        'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
        'Contract', 'PaperlessBilling', 'PaymentMethod'
    ]
    TARGET_COLUMN = 'Churn'

    def __init__(self, config: DataValidationConfig):
        """
        Initialize DataValidation component.

        Args:
            config: DataValidationConfig object
        """
        self.config = config
        logger.info("Data Validation component initialized")

    def validate_schema(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Validate if dataframe matches expected schema.

        Args:
            df: DataFrame to validate

        Returns:
            Dictionary with validation results: whether all expected
            columns are present, the missing and extra column names, and
            whether each present numerical column has a numeric dtype.

        Raises:
            CustomException: Wraps any error raised during the checks.
        """
        try:
            expected = set(self.EXPECTED_COLUMNS)
            present = set(df.columns)

            # Filtering the ordered lists (instead of subtracting sets)
            # keeps the reported column order deterministic.
            missing_columns = [
                col for col in self.EXPECTED_COLUMNS if col not in present
            ]
            extra_columns = [col for col in df.columns if col not in expected]

            # Check data types for numerical columns that are present
            numerical_dtype_check = {
                col: pd.api.types.is_numeric_dtype(df[col])
                for col in self.NUMERICAL_COLUMNS
                if col in present
            }

            validation_results = {
                'all_columns_present': not missing_columns,
                'missing_columns': missing_columns,
                'extra_columns': extra_columns,
                'numerical_dtypes_correct': all(numerical_dtype_check.values()),
                'numerical_dtype_details': numerical_dtype_check,
            }

            logger.info(f"Schema validation completed: {validation_results}")
            return validation_results

        except Exception as e:
            raise CustomException(e, sys) from e

    def validate_data_quality(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Validate data quality checks.

        Args:
            df: DataFrame to validate

        Returns:
            Dictionary with quality check results: missing-value counts and
            percentage, duplicate-row count and percentage, statistics for
            the numerical columns, and the target distribution.

        Raises:
            CustomException: Wraps any error raised during the checks.
        """
        try:
            quality_report = {}

            # Check for missing values
            missing_per_column = df.isnull().sum()
            total_missing = int(missing_per_column.sum())
            total_cells = df.shape[0] * df.shape[1]
            quality_report['missing_values'] = (
                missing_per_column[missing_per_column > 0].to_dict()
            )
            quality_report['total_missing'] = total_missing
            # An empty frame has no cells; report 0% instead of dividing by 0.
            quality_report['missing_percentage'] = (
                round((total_missing / total_cells) * 100, 2) if total_cells else 0.0
            )

            # Check for duplicates
            duplicate_rows = int(df.duplicated().sum())
            row_count = len(df)
            quality_report['duplicate_rows'] = duplicate_rows
            quality_report['duplicate_percentage'] = (
                round((duplicate_rows / row_count) * 100, 2) if row_count else 0.0
            )

            # Check for data ranges (numerical columns)
            numerical_stats = {}
            for col in self.NUMERICAL_COLUMNS:
                if col not in df.columns:
                    continue
                if not pd.api.types.is_numeric_dtype(df[col]):
                    # The schema check reports the dtype problem; skipping
                    # here lets the report still be produced.
                    logger.warning(
                        f"Skipping statistics for non-numeric column: {col}"
                    )
                    continue
                numerical_stats[col] = {
                    'min': float(df[col].min()),
                    'max': float(df[col].max()),
                    'mean': float(df[col].mean()),
                    'std': float(df[col].std()),
                    'negative_values': int((df[col] < 0).sum())
                }
            quality_report['numerical_statistics'] = numerical_stats

            # Check target distribution
            if self.TARGET_COLUMN in df.columns:
                target_dist = df[self.TARGET_COLUMN].value_counts()
                quality_report['target_distribution'] = target_dist.to_dict()
                # No ratio can be computed when the target has no values.
                quality_report['target_balance_ratio'] = (
                    round(float(target_dist.min() / target_dist.max()), 2)
                    if not target_dist.empty else None
                )

            logger.info("Data quality validation completed")
            return quality_report

        except Exception as e:
            raise CustomException(e, sys) from e

    def initiate_data_validation(self, train_path: str, test_path: str) -> bool:
        """
        Perform complete data validation on train and test sets.

        Runs the schema and quality checks on both files and writes the
        combined report to ``config.report_path``.

        Args:
            train_path: Path to training data
            test_path: Path to test data

        Returns:
            True when both datasets contain every expected column and their
            numerical columns have numeric dtypes, False otherwise.

        Raises:
            CustomException: Wraps any error raised while loading,
                validating or saving the report.
        """
        logger.info("Starting data validation process")

        try:
            # Load data
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)

            logger.info(f"Loaded train data: {train_df.shape}")
            logger.info(f"Loaded test data: {test_df.shape}")

            # Validate schema
            train_schema = self.validate_schema(train_df)
            test_schema = self.validate_schema(test_df)

            # Validate quality
            train_quality = self.validate_data_quality(train_df)
            test_quality = self.validate_data_quality(test_df)

            # A dataset passes only if its columns AND numeric dtypes match.
            validation_passed = all(
                schema['all_columns_present'] and schema['numerical_dtypes_correct']
                for schema in (train_schema, test_schema)
            )

            # Compile validation report
            validation_report = {
                'train_data': {
                    'shape': train_df.shape,
                    'schema_validation': train_schema,
                    'quality_validation': train_quality
                },
                'test_data': {
                    'shape': test_df.shape,
                    'schema_validation': test_schema,
                    'quality_validation': test_quality
                },
                'validation_passed': validation_passed
            }

            # Make sure the report directory exists before writing into it.
            report_dir = os.path.dirname(self.config.report_path)
            if report_dir:
                os.makedirs(report_dir, exist_ok=True)

            # Save validation report
            save_json(self.config.report_path, validation_report)
            logger.info(f"Validation report saved to: {self.config.report_path}")

            # Log critical issues
            if not validation_passed:
                logger.warning("Data validation failed! Check the validation report.")
            else:
                logger.info("Data validation passed successfully!")

            return validation_passed

        except Exception as e:
            logger.error("Error in data validation")
            raise CustomException(e, sys) from e

In [29]:
def create_data_validation_config(config_dict: dict) -> DataValidationConfig:
    """
    Build the validation settings object from its config section.

    Args:
        config_dict: Config section exposing ``report_path`` as an attribute

    Returns:
        DataValidationConfig populated with the report path
    """
    report_location = config_dict.report_path
    return DataValidationConfig(report_path=report_location)

In [30]:
# Run the validation stage against the train/test files whose paths are
# listed in the `data_ingestion` section of the config.
try:
    config_dict = read_yaml(Path("configs/config.yaml"))
    data_validation_config = create_data_validation_config(config_dict.data_validation)
    data_validation = DataValidation(data_validation_config)
    # Boolean outcome; the detailed findings go to the saved report.
    validation_passed = data_validation.initiate_data_validation(
        config_dict.data_ingestion.train_data_path,
        config_dict.data_ingestion.test_data_path
    )
except Exception as e:
    logger.error(f"Data validation failed: {e}")
    # NOTE(review): initiate_data_validation already raises CustomException,
    # so such errors end up wrapped a second time here.
    raise CustomException(e, sys)

[2025-11-06 18:39:38,973] INFO - ChurnPrediction - yaml file: configs\config.yaml loaded successfully
[2025-11-06 18:39:38,975] INFO - ChurnPrediction - Data Validation component initialized
[2025-11-06 18:39:38,977] INFO - ChurnPrediction - Starting data validation process
[2025-11-06 18:39:39,016] INFO - ChurnPrediction - Loaded train data: (5634, 21)
[2025-11-06 18:39:39,018] INFO - ChurnPrediction - Loaded test data: (1409, 21)
[2025-11-06 18:39:39,020] INFO - ChurnPrediction - Schema validation completed: {'all_columns_present': True, 'missing_columns': [], 'extra_columns': [], 'numerical_dtypes_correct': True, 'numerical_dtype_details': {'tenure': True, 'MonthlyCharges': True}}
[2025-11-06 18:39:39,020] INFO - ChurnPrediction - Schema validation completed: {'all_columns_present': True, 'missing_columns': [], 'extra_columns': [], 'numerical_dtypes_correct': True, 'numerical_dtype_details': {'tenure': True, 'MonthlyCharges': True}}
[2025-11-06 18:39:39,069] INFO - ChurnPrediction -

<h1 align=center>Data Preprocessing</h1>

In [32]:

import os
import sys
import pandas as pd
import numpy as np
from dataclasses import dataclass
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from src.logger import logger
from src.exception import CustomException
# from src.utils.common import save_object

In [34]:
import os
import pickle

def save_object(file_path: str, obj):
    """
    Pickle ``obj`` into the file at ``file_path``.

    The containing directory is created when the path has one and it does
    not exist yet.

    Args:
        file_path (str): Destination file for the pickled object.
        obj: Python object to serialize.
    """
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        # A bare filename has no directory part, so there is nothing to create.
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'wb') as handle:
        pickle.dump(obj, handle)

In [33]:
@dataclass
class DataPreprocessingConfig:
    """Settings consumed by the data preprocessing step."""
    # File the fitted preprocessor is pickled to.
    preprocessor_path: str
    # Column names routed through the numerical pipeline.
    numerical_features: list
    # Column names routed through the categorical pipeline.
    categorical_features: list
    # Name of the label column separated from the features.
    target_column: str

In [35]:
class DataPreprocessing:
    """
    Handles feature engineering and data transformation.

    Cleans the train/test CSV files, encodes the target, fits a
    ColumnTransformer on the training features and pickles both the fitted
    preprocessor and the label encoder.
    """

    def __init__(self, config: DataPreprocessingConfig):
        """
        Initialize DataPreprocessing component.

        Args:
            config: DataPreprocessingConfig object
        """
        self.config = config
        # Set by initiate_data_preprocessing once the pipeline is built.
        self.preprocessor = None
        logger.info("Data Preprocessing component initialized")

    @staticmethod
    def _label_encoder_path(preprocessor_path: str) -> str:
        """Derive the label-encoder file path from the preprocessor path."""
        # Split off the real extension instead of string-replacing '.pkl':
        # a path without '.pkl' would otherwise come back unchanged and the
        # label encoder would overwrite the saved preprocessor.
        root, ext = os.path.splitext(preprocessor_path)
        return f"{root}_label_encoder{ext or '.pkl'}"

    def get_preprocessor(self) -> ColumnTransformer:
        """
        Create preprocessing pipeline for numerical and categorical features.

        Numerical features are median-imputed and standardized; categorical
        features are imputed with the most frequent value and one-hot
        encoded. Columns in neither list are dropped.

        Returns:
            ColumnTransformer object with preprocessing pipelines

        Raises:
            CustomException: Wraps any error raised while building it.
        """
        try:
            # Numerical pipeline
            numerical_pipeline = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', StandardScaler())
                ]
            )

            # Categorical pipeline
            categorical_pipeline = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    ('encoder', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
                ]
            )

            # Combine pipelines
            preprocessor = ColumnTransformer(
                transformers=[
                    ('num', numerical_pipeline, self.config.numerical_features),
                    ('cat', categorical_pipeline, self.config.categorical_features)
                ],
                remainder='drop'
            )

            logger.info("Preprocessing pipeline created successfully")
            logger.info(f"Numerical features: {self.config.numerical_features}")
            logger.info(f"Categorical features: {self.config.categorical_features}")

            return preprocessor

        except Exception as e:
            raise CustomException(e, sys) from e

    def clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Perform data cleaning operations.

        Works on a copy: drops ``customerID``, converts ``TotalCharges`` to
        numeric (unparseable values become NaN), casts ``SeniorCitizen`` to
        string and removes duplicate rows.

        Args:
            df: Input DataFrame

        Returns:
            Cleaned DataFrame

        Raises:
            CustomException: Wraps any error raised during cleaning.
        """
        try:
            logger.info("Starting data cleaning")
            df_clean = df.copy()

            # Remove customerID column (not useful for modeling)
            if 'customerID' in df_clean.columns:
                df_clean = df_clean.drop('customerID', axis=1)
                logger.info("Dropped customerID column")

            # Handle TotalCharges - convert to numeric
            if 'TotalCharges' in df_clean.columns:
                df_clean['TotalCharges'] = pd.to_numeric(
                    df_clean['TotalCharges'],
                    errors='coerce'
                )
                logger.info("Converted TotalCharges to numeric")

            # Convert SeniorCitizen to object for categorical encoding
            if 'SeniorCitizen' in df_clean.columns:
                df_clean['SeniorCitizen'] = df_clean['SeniorCitizen'].astype(str)

            # Remove duplicates
            initial_rows = len(df_clean)
            df_clean = df_clean.drop_duplicates()
            removed_duplicates = initial_rows - len(df_clean)
            if removed_duplicates > 0:
                logger.info(f"Removed {removed_duplicates} duplicate rows")

            logger.info(f"Data cleaning completed. Final shape: {df_clean.shape}")
            return df_clean

        except Exception as e:
            raise CustomException(e, sys) from e

    def initiate_data_preprocessing(self, train_path: str, test_path: str) -> tuple:
        """
        Perform complete data preprocessing.

        The preprocessor and label encoder are fitted on the training data
        only and then applied to the test data.

        Args:
            train_path: Path to training data
            test_path: Path to test data

        Returns:
            Tuple of (train_features, test_features, train_target, test_target, preprocessor_path)

        Raises:
            CustomException: Wraps any error raised while loading,
                transforming or saving.
        """
        logger.info("Starting data preprocessing process")

        try:
            # Load data
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)

            logger.info(f"Loaded train data: {train_df.shape}")
            logger.info(f"Loaded test data: {test_df.shape}")

            # Clean data
            train_df = self.clean_data(train_df)
            test_df = self.clean_data(test_df)

            # Separate features and target
            target_column = self.config.target_column

            X_train = train_df.drop(columns=[target_column])
            y_train = train_df[target_column]

            X_test = test_df.drop(columns=[target_column])
            y_test = test_df[target_column]

            logger.info("Separated features and target")
            logger.info(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
            logger.info(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

            # Encode target variable
            label_encoder = LabelEncoder()
            y_train_encoded = label_encoder.fit_transform(y_train)
            y_test_encoded = label_encoder.transform(y_test)

            logger.info(f"Target encoding: {dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))}")

            # Get preprocessing pipeline
            self.preprocessor = self.get_preprocessor()

            # Fit and transform training data
            logger.info("Fitting preprocessor on training data")
            X_train_transformed = self.preprocessor.fit_transform(X_train)

            # Transform test data
            logger.info("Transforming test data")
            X_test_transformed = self.preprocessor.transform(X_test)

            logger.info(f"Transformed X_train shape: {X_train_transformed.shape}")
            logger.info(f"Transformed X_test shape: {X_test_transformed.shape}")

            # Save preprocessor
            save_object(self.config.preprocessor_path, self.preprocessor)
            logger.info(f"Preprocessor saved to: {self.config.preprocessor_path}")

            # Also save label encoder, next to the preprocessor file
            label_encoder_path = self._label_encoder_path(self.config.preprocessor_path)
            save_object(label_encoder_path, label_encoder)
            logger.info(f"Label encoder saved to: {label_encoder_path}")

            logger.info("Data preprocessing completed successfully")

            return (
                X_train_transformed,
                X_test_transformed,
                y_train_encoded,
                y_test_encoded,
                self.config.preprocessor_path
            )

        except Exception as e:
            logger.error("Error in data preprocessing")
            raise CustomException(e, sys) from e

In [38]:

def create_data_preprocessing_config(config_dict: dict) -> DataPreprocessingConfig:
    """
    Build the preprocessing settings object from its config section.

    Args:
        config_dict: Config section exposing the four preprocessing
            settings as attributes

    Returns:
        DataPreprocessingConfig populated from the section
    """
    setting_names = (
        'preprocessor_path',
        'numerical_features',
        'categorical_features',
        'target_column',
    )
    # Each dataclass field is filled from the same-named attribute.
    settings = {name: getattr(config_dict, name) for name in setting_names}
    return DataPreprocessingConfig(**settings)

In [39]:
# Run the preprocessing stage on the train/test files whose paths are
# listed in the `data_ingestion` section of the config.
try:
    config_dict = read_yaml(Path("configs/config.yaml"))
    data_preprocessing_config = create_data_preprocessing_config(config_dict.data_preprocessing)
    data_preprocessing = DataPreprocessing(data_preprocessing_config)
    # Transformed features, encoded targets and the saved preprocessor path.
    X_train, X_test, y_train, y_test, preprocessor_path = data_preprocessing.initiate_data_preprocessing(
        config_dict.data_ingestion.train_data_path,
        config_dict.data_ingestion.test_data_path
    )

except Exception as e:
    logger.error(f"Data preprocessing failed: {e}")
    # NOTE(review): initiate_data_preprocessing already raises
    # CustomException, so such errors end up wrapped a second time here.
    raise CustomException(e, sys)

[2025-11-07 17:24:47,967] INFO - ChurnPrediction - yaml file: configs\config.yaml loaded successfully
[2025-11-07 17:24:47,979] INFO - ChurnPrediction - Data Preprocessing component initialized
[2025-11-07 17:24:47,980] INFO - ChurnPrediction - Starting data preprocessing process
[2025-11-07 17:24:48,097] INFO - ChurnPrediction - Loaded train data: (5634, 21)
[2025-11-07 17:24:48,097] INFO - ChurnPrediction - Loaded test data: (1409, 21)
[2025-11-07 17:24:48,098] INFO - ChurnPrediction - Starting data cleaning
[2025-11-07 17:24:48,119] INFO - ChurnPrediction - Dropped customerID column
[2025-11-07 17:24:48,126] INFO - ChurnPrediction - Converted TotalCharges to numeric
[2025-11-07 17:24:48,149] INFO - ChurnPrediction - Removed 15 duplicate rows
[2025-11-07 17:24:48,150] INFO - ChurnPrediction - Data cleaning completed. Final shape: (5619, 20)
[2025-11-07 17:24:48,151] INFO - ChurnPrediction - Starting data cleaning
[2025-11-07 17:24:48,154] INFO - ChurnPrediction - Dropped customerID c

<h1 align=center>Model Training</h1>

In [42]:
import os
import sys
import numpy as np
import mlflow
import mlflow.sklearn
from dataclasses import dataclass
from typing import Dict, Any, Tuple
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from src.logger import logger
from src.exception import CustomException
# from src.utils.common import save_object

In [41]:
import os

# Display the current working directory.
os.getcwd()

'c:\\Users\\44787\\Desktop\\mlops-pro-project'

In [43]:
@dataclass
class ModelTrainerConfig:
    """Settings consumed by the model training step."""
    # Directory setting for models.
    models_dir: str
    # Names of the models to build (e.g. 'random_forest').
    models: list
    # Passed to mlflow.set_tracking_uri.
    mlflow_tracking_uri: str
    # Passed to mlflow.set_experiment.
    mlflow_experiment_name: str

In [65]:
class ModelTrainer:
    """
    Trains multiple ML models and tracks experiments with MLflow.

    Each configured model is fitted inside its own MLflow run, scored with a
    handful of headline metrics, logged to MLflow and written to
    ``<config.models_dir>/<model_name>.pkl`` through ``save_object``.
    """

    def __init__(self, config: ModelTrainerConfig, model_params: Dict[str, Dict]):
        """
        Initialize ModelTrainer component.

        Points MLflow at the configured tracking URI and experiment as a
        side effect.

        Args:
            config: ModelTrainerConfig object
            model_params: Dictionary of model hyperparameters, keyed by model name
        """
        self.config = config
        self.model_params = model_params
        self.models = {}
        self.trained_models = {}

        # Setup MLflow
        mlflow.set_tracking_uri(config.mlflow_tracking_uri)
        mlflow.set_experiment(config.mlflow_experiment_name)

        logger.info("Model Trainer component initialized")
        logger.info(f"MLflow tracking URI: {config.mlflow_tracking_uri}")
        logger.info(f"MLflow experiment: {config.mlflow_experiment_name}")

    def get_models(self) -> Dict[str, Any]:
        """
        Initialize models with their hyperparameters.

        Only models named in ``config.models`` are built. Names that are not
        in the registry below are ignored.

        Returns:
            Dictionary of model instances, keyed by model name

        Raises:
            CustomException: If an estimator cannot be constructed
        """
        try:
            # Estimator class plus built-in defaults. Defaults are merged
            # *under* the configured hyperparameters, so a config that sets
            # e.g. ``verbose`` for LightGBM overrides the quiet default
            # instead of raising "got multiple values for keyword argument".
            registry = {
                'logistic_regression': (LogisticRegression, {}),
                'random_forest': (RandomForestClassifier, {}),
                'xgboost': (XGBClassifier, {}),
                'lightgbm': (LGBMClassifier, {'verbose': -1}),
            }

            models = {}
            for name, (estimator_cls, defaults) in registry.items():
                if name not in self.config.models:
                    continue
                configured = self.model_params.get(name, {})
                models[name] = estimator_cls(**{**defaults, **configured})

            logger.info(f"Initialized {len(models)} models: {list(models.keys())}")
            return models

        except Exception as e:
            raise CustomException(e, sys)

    def train_model(
        self,
        model_name: str,
        model: Any,
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_test: np.ndarray,
        y_test: np.ndarray
    ) -> Tuple[Any, str]:
        """
        Train a single model and log to MLflow.

        Args:
            model_name: Name of the model
            model: Model instance
            X_train: Training features
            y_train: Training target
            X_test: Test features
            y_test: Test target

        Returns:
            Tuple of (trained_model, model_path)

        Raises:
            CustomException: If fitting, scoring, logging or saving fails
        """
        try:
            # Basic metrics only (detailed evaluation in model_evaluation.py).
            # Imported locally because this section has no module-level
            # sklearn.metrics import.
            from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

            logger.info(f"Training {model_name}...")

            with mlflow.start_run(run_name=f"{model_name}_run"):
                # Log model parameters
                mlflow.log_params(self.model_params.get(model_name, {}))

                # Train model
                model.fit(X_train, y_train)
                logger.info(f"{model_name} training completed")

                # Make predictions. Class probabilities are not computed
                # here: nothing in this method consumes them.
                y_train_pred = model.predict(X_train)
                y_test_pred = model.predict(X_test)

                test_accuracy = accuracy_score(y_test, y_test_pred)
                test_f1 = f1_score(y_test, y_test_pred, average='binary')
                metrics = {
                    "train_accuracy": accuracy_score(y_train, y_train_pred),
                    "test_accuracy": test_accuracy,
                    "test_precision": precision_score(y_test, y_test_pred, average='binary'),
                    "test_recall": recall_score(y_test, y_test_pred, average='binary'),
                    "test_f1_score": test_f1,
                }

                # Log metrics
                for metric_name, metric_value in metrics.items():
                    mlflow.log_metric(metric_name, metric_value)

                logger.info(f"{model_name} - Test Accuracy: {test_accuracy:.4f}, F1: {test_f1:.4f}")

                # Log model to MLflow
                mlflow.sklearn.log_model(model, f"{model_name}_model")

                # Save model locally. Create the target directory first so
                # the save does not depend on save_object doing it.
                if self.config.models_dir:
                    os.makedirs(self.config.models_dir, exist_ok=True)
                model_path = os.path.join(self.config.models_dir, f"{model_name}.pkl")
                # NOTE(review): save_object is not imported in this section
                # (the import above is commented out); it must already be
                # defined in the session. Confirm where it comes from.
                save_object(model_path, model)
                logger.info(f"{model_name} saved to: {model_path}")

                # Log artifact path
                mlflow.log_param("model_path", model_path)

                return model, model_path

        except Exception as e:
            logger.error(f"Error training {model_name}")
            raise CustomException(e, sys)

    def initiate_model_training(
        self,
        X_train: np.ndarray,
        X_test: np.ndarray,
        y_train: np.ndarray,
        y_test: np.ndarray
    ) -> Dict[str, Any]:
        """
        Train all configured models.

        Args:
            X_train: Training features
            X_test: Test features
            y_train: Training target
            y_test: Test target

        Returns:
            Dictionary mapping each model name to
            ``{'model': trained_model, 'model_path': path}``

        Raises:
            CustomException: If model construction or any training run fails
        """
        logger.info("Starting model training process")

        try:
            # Get models
            self.models = self.get_models()

            logger.info(f"Training {len(self.models)} models")
            logger.info(f"Training data shape: {X_train.shape}")
            logger.info(f"Test data shape: {X_test.shape}")

            # Train each model
            results = {}

            for model_name, model in self.models.items():
                logger.info(f"\n{'='*50}")
                logger.info(f"Training {model_name}")
                logger.info(f"{'='*50}")

                trained_model, model_path = self.train_model(
                    model_name=model_name,
                    model=model,
                    X_train=X_train,
                    y_train=y_train,
                    X_test=X_test,
                    y_test=y_test
                )

                results[model_name] = {
                    'model': trained_model,
                    'model_path': model_path
                }

                # Also kept on the instance for later access
                self.trained_models[model_name] = trained_model

            logger.info(f"\n{'='*50}")
            logger.info("Model training completed for all models")
            logger.info(f"{'='*50}")

            return results

        except Exception as e:
            logger.error("Error in model training")
            raise CustomException(e, sys)

In [66]:
def create_model_trainer_config(config_dict: dict, mlflow_config: dict) -> ModelTrainerConfig:
    """
    Assemble a ModelTrainerConfig from the training and MLflow config sections.

    Both arguments are read through attribute access (e.g.
    ``config_dict.models_dir``), so they must expose their keys as attributes.

    Args:
        config_dict: Model training section (``models_dir``, ``models``)
        mlflow_config: MLflow section (``tracking_uri``, ``experiment_name``)

    Returns:
        ModelTrainerConfig populated from the two sections
    """
    trainer_fields = {
        'models_dir': config_dict.models_dir,
        'models': config_dict.models,
        'mlflow_tracking_uri': mlflow_config.tracking_uri,
        'mlflow_experiment_name': mlflow_config.experiment_name,
    }
    return ModelTrainerConfig(**trainer_fields)

In [67]:
# Scratch cell: collect only the logistic-regression hyperparameters from the
# model config and display them.
params = {}
model_config_params = read_yaml(Path("configs/model_config.yaml"))
params.update(logistic_regression=model_config_params.logistic_regression)
params

[2025-11-07 17:58:41,705] INFO - ChurnPrediction - yaml file: configs\model_config.yaml loaded successfully


{'logistic_regression': ConfigBox({'C': 1.0, 'max_iter': 1000, 'random_state': 42, 'class_weight': 'balanced', 'solver': 'lbfgs', 'penalty': 'l2'})}

In [68]:
# Pipeline configuration first, then the per-model hyperparameters
config_dict = read_yaml(Path("configs/config.yaml"))
model_config_params = read_yaml(Path("configs/model_config.yaml"))

# Keep hyperparameters only for the models listed under model_training.models
models_params = dict()
for model_name in config_dict.model_training.models:
    models_params.update({model_name: model_config_params[model_name]})

[2025-11-07 17:58:41,873] INFO - ChurnPrediction - yaml file: configs\config.yaml loaded successfully
[2025-11-07 17:58:41,877] INFO - ChurnPrediction - yaml file: configs\model_config.yaml loaded successfully


In [77]:
# Driver cell: build the trainer configuration, train every configured model
# and keep the per-model results in `trained_models`.
try:
    # Pipeline configuration and per-model hyperparameters
    config_dict = read_yaml(Path("configs/config.yaml"))
    model_config_params = read_yaml(Path("configs/model_config.yaml"))
    
    model_training_config = create_model_trainer_config(
        config_dict.model_training,
        config_dict.mlflow)
    
    # Get model parameters
    model_params = {}
    for model_name in config_dict.model_training.models:
        model_params[model_name] = model_config_params[model_name]
    # X_train / X_test / y_train / y_test are notebook globals produced by the
    # earlier preprocessing cell.
    trained_models = ModelTrainer(
        config=model_training_config,
        model_params=model_params
    ).initiate_model_training(
        X_train,
        X_test,
        y_train,
        y_test
    )
    
except Exception as e:
    # Log the failure, then re-raise wrapped in the project's exception type
    logger.error(f"Model training failed: {e}")
    raise CustomException(e, sys)

[2025-11-07 18:13:53,590] INFO - ChurnPrediction - yaml file: configs\config.yaml loaded successfully
[2025-11-07 18:13:53,599] INFO - ChurnPrediction - yaml file: configs\model_config.yaml loaded successfully
[2025-11-07 18:13:53,616] INFO - ChurnPrediction - Model Trainer component initialized
[2025-11-07 18:13:53,618] INFO - ChurnPrediction - MLflow tracking URI: mlruns
[2025-11-07 18:13:53,619] INFO - ChurnPrediction - MLflow experiment: churn_prediction
[2025-11-07 18:13:53,620] INFO - ChurnPrediction - Starting model training process
[2025-11-07 18:13:53,626] INFO - ChurnPrediction - Initialized 4 models: ['logistic_regression', 'random_forest', 'xgboost', 'lightgbm']
[2025-11-07 18:13:53,629] INFO - ChurnPrediction - Training 4 models
[2025-11-07 18:13:53,630] INFO - ChurnPrediction - Training data shape: (5619, 30)
[2025-11-07 18:13:53,632] INFO - ChurnPrediction - Test data shape: (1409, 30)
[2025-11-07 18:13:53,633] INFO - ChurnPrediction - 
[2025-11-07 18:13:53,634] INFO - C

<h1 align=center>Model Evaluation</h1>

In [72]:

import os
import sys
import numpy as np
import pandas as pd
import mlflow
from dataclasses import dataclass
from typing import Dict, Any
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    roc_curve, precision_recall_curve
)
from src.logger import logger
from src.exception import CustomException
# from src.utils.common import save_json
import matplotlib.pyplot as plt
import seaborn as sns

In [73]:
def save_json(path: str, data: dict):
    """
    Save a dictionary as a JSON file.

    Missing parent directories are created first. A bare filename (no
    directory component) is written to the current working directory.

    Args:
        path: Destination file path
        data: JSON-serialisable dictionary to write

    Raises:
        TypeError: If ``data`` contains values json cannot serialise
        OSError: If the directory or file cannot be created
    """
    # Local import: this notebook section does not import json at the top.
    import json

    # os.path.dirname returns '' for a bare filename, and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when there is one.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(path, "w") as f:
        json.dump(data, f, indent=4)

In [74]:
@dataclass
class ModelEvaluationConfig:
    """Configuration for model evaluation component.

    Read by ModelEvaluation when scoring models and writing the report.
    """
    # Directory in which 'evaluation_report.json' is written
    metrics_path: str
    # Decision threshold; copied into each model's evaluation results
    threshold: float = 0.5
    # Minimum test-set F1 score for a model to be flagged as meeting requirements
    min_f1_score: float = 0.75
    # Minimum test-set ROC AUC for a model to be flagged as meeting requirements
    min_roc_auc: float = 0.85

In [75]:
class ModelEvaluation:
    """
    Evaluates trained models and generates comprehensive metrics.

    Each model is scored on the train and test sets and checked against the
    configured minimum F1 / ROC AUC. The models are then ranked by test F1
    and the combined report is written to
    ``<config.metrics_path>/evaluation_report.json``.
    """

    def __init__(self, config: ModelEvaluationConfig):
        """
        Initialize ModelEvaluation component.

        Args:
            config: ModelEvaluationConfig object
        """
        self.config = config
        logger.info("Model Evaluation component initialized")

    def _predict_labels(self, model: Any, X: np.ndarray, proba: np.ndarray = None) -> np.ndarray:
        """
        Predict class labels, honouring the configured decision threshold.

        At the default threshold of 0.5, or when the model exposes no
        probabilities, this defers to ``model.predict`` so results are
        unchanged. For any other threshold, the positive-class probabilities
        are cut at ``config.threshold``, which is the value recorded next to
        the metrics in the evaluation results.

        Args:
            model: Trained model instance
            X: Features to predict on
            proba: Positive-class probabilities for ``X`` (optional)

        Returns:
            Array of predicted labels
        """
        threshold = self.config.threshold
        if proba is None or np.isclose(threshold, 0.5):
            return model.predict(X)

        # Column 1 of predict_proba corresponds to classes_[1]. Fall back to
        # 0/1 labels when the model does not expose a two-element classes_.
        classes = getattr(model, 'classes_', None)
        if classes is not None and len(classes) == 2:
            negative, positive = classes[0], classes[1]
        else:
            negative, positive = 0, 1
        return np.where(proba >= threshold, positive, negative)

    def calculate_metrics(
        self,
        y_true: np.ndarray,
        y_pred: np.ndarray,
        y_pred_proba: np.ndarray = None
    ) -> Dict[str, Any]:
        """
        Calculate comprehensive evaluation metrics.

        Args:
            y_true: True labels
            y_pred: Predicted labels
            y_pred_proba: Prediction probabilities (optional)

        Returns:
            Dictionary of metrics. ``roc_auc`` is present only when
            probabilities are given and ``y_true`` contains both classes.

        Raises:
            CustomException: If any metric computation fails
        """
        try:
            metrics = {}

            # Classification metrics
            metrics['accuracy'] = float(accuracy_score(y_true, y_pred))
            metrics['precision'] = float(precision_score(y_true, y_pred, average='binary'))
            metrics['recall'] = float(recall_score(y_true, y_pred, average='binary'))
            metrics['f1_score'] = float(f1_score(y_true, y_pred, average='binary'))

            # Confusion matrix. When only one class is observed the matrix
            # comes back 1x1, so pin the label set to get a 2x2 matrix.
            # This assumes 0/1 labels (TODO confirm); the 'binary' averages
            # above already treat 1 as the positive class.
            cm = confusion_matrix(y_true, y_pred)
            if cm.shape != (2, 2):
                cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
            tn, fp, fn, tp = (int(count) for count in cm.ravel())
            metrics['confusion_matrix'] = {'tn': tn, 'fp': fp, 'fn': fn, 'tp': tp}

            # Derived metrics from confusion matrix
            metrics['specificity'] = tn / (tn + fp) if (tn + fp) > 0 else 0.0
            metrics['sensitivity'] = tp / (tp + fn) if (tp + fn) > 0 else 0.0

            # ROC AUC if probabilities are available. It is undefined (and
            # roc_auc_score raises) when y_true holds a single class.
            if y_pred_proba is not None:
                if len(np.unique(y_true)) > 1:
                    metrics['roc_auc'] = float(roc_auc_score(y_true, y_pred_proba))
                else:
                    logger.warning("ROC AUC is undefined for a single-class target; skipping")

            # Classification report
            report = classification_report(y_true, y_pred, output_dict=True)
            metrics['classification_report'] = report

            return metrics

        except Exception as e:
            raise CustomException(e, sys)

    def evaluate_model(
        self,
        model_name: str,
        model: Any,
        X_train: np.ndarray,
        X_test: np.ndarray,
        y_train: np.ndarray,
        y_test: np.ndarray
    ) -> Dict[str, Any]:
        """
        Evaluate a single model on train and test data.

        Args:
            model_name: Name of the model
            model: Trained model instance
            X_train: Training features
            X_test: Test features
            y_train: Training target
            y_test: Test target

        Returns:
            Dictionary with ``model_name``, ``train_metrics``,
            ``test_metrics``, ``threshold`` and ``meets_requirements``

        Raises:
            CustomException: If prediction or metric computation fails
        """
        try:
            logger.info(f"Evaluating {model_name}...")

            # Prediction probabilities (positive-class column)
            y_train_pred_proba = None
            y_test_pred_proba = None

            if hasattr(model, 'predict_proba'):
                y_train_pred_proba = model.predict_proba(X_train)[:, 1]
                y_test_pred_proba = model.predict_proba(X_test)[:, 1]

            # Predictions, cut at the configured threshold
            y_train_pred = self._predict_labels(model, X_train, y_train_pred_proba)
            y_test_pred = self._predict_labels(model, X_test, y_test_pred_proba)

            # Calculate metrics
            train_metrics = self.calculate_metrics(y_train, y_train_pred, y_train_pred_proba)
            test_metrics = self.calculate_metrics(y_test, y_test_pred, y_test_pred_proba)

            evaluation_results = {
                'model_name': model_name,
                'train_metrics': train_metrics,
                'test_metrics': test_metrics,
                'threshold': self.config.threshold
            }

            # Check if model meets minimum requirements. A missing ROC AUC
            # counts as 0 and therefore fails the check.
            meets_requirements = (
                test_metrics['f1_score'] >= self.config.min_f1_score and
                test_metrics.get('roc_auc', 0) >= self.config.min_roc_auc
            )

            evaluation_results['meets_requirements'] = meets_requirements

            # Log summary
            logger.info(f"\n{model_name} Evaluation Results:")
            logger.info(f"  Train Accuracy: {train_metrics['accuracy']:.4f}")
            logger.info(f"  Test Accuracy:  {test_metrics['accuracy']:.4f}")
            logger.info(f"  Test Precision: {test_metrics['precision']:.4f}")
            logger.info(f"  Test Recall:    {test_metrics['recall']:.4f}")
            logger.info(f"  Test F1-Score:  {test_metrics['f1_score']:.4f}")
            if 'roc_auc' in test_metrics:
                logger.info(f"  Test ROC-AUC:   {test_metrics['roc_auc']:.4f}")
            logger.info(f"  Meets Requirements: {meets_requirements}")

            return evaluation_results

        except Exception as e:
            logger.error(f"Error evaluating {model_name}")
            raise CustomException(e, sys)

    def compare_models(self, evaluation_results: Dict[str, Dict]) -> Dict[str, Any]:
        """
        Compare all evaluated models and select the best one.

        The best model is the one with the highest test F1 score. Ties keep
        the order in which the models were evaluated.

        Args:
            evaluation_results: Dictionary of evaluation results for all models

        Returns:
            Dictionary with comparison results and best model info

        Raises:
            CustomException: If ``evaluation_results`` is empty or malformed
        """
        try:
            logger.info("\nComparing models...")

            # Fail with a clear message rather than a KeyError from sorting
            # an empty frame.
            if not evaluation_results:
                raise ValueError("No evaluation results to compare")

            comparison = []

            for model_name, results in evaluation_results.items():
                test_metrics = results['test_metrics']

                comparison.append({
                    'model_name': model_name,
                    'accuracy': test_metrics['accuracy'],
                    'precision': test_metrics['precision'],
                    'recall': test_metrics['recall'],
                    'f1_score': test_metrics['f1_score'],
                    'roc_auc': test_metrics.get('roc_auc', 0),
                    'meets_requirements': results['meets_requirements']
                })

            # Create comparison DataFrame; mergesort is stable, so equal F1
            # scores are ranked deterministically.
            comparison_df = pd.DataFrame(comparison)
            comparison_df = comparison_df.sort_values(
                'f1_score', ascending=False, kind='mergesort'
            )

            # Select best model based on F1 score
            best_model = comparison_df.iloc[0].to_dict()

            logger.info("\nModel Comparison (sorted by F1-Score):")
            logger.info("\n" + comparison_df.to_string(index=False))
            logger.info(f"\nBest Model: {best_model['model_name']}")
            logger.info(f"  F1-Score: {best_model['f1_score']:.4f}")
            logger.info(f"  ROC-AUC:  {best_model['roc_auc']:.4f}")

            return {
                'comparison_table': comparison_df.to_dict('records'),
                'best_model': best_model
            }

        except Exception as e:
            raise CustomException(e, sys)

    def initiate_model_evaluation(
        self,
        trained_models: Dict[str, Any],
        X_train: np.ndarray,
        X_test: np.ndarray,
        y_train: np.ndarray,
        y_test: np.ndarray
    ) -> Dict[str, Any]:
        """
        Evaluate all trained models and compare them.

        Args:
            trained_models: Dictionary mapping model name to a dict that
                holds the fitted estimator under the ``'model'`` key
            X_train: Training features
            X_test: Test features
            y_train: Training target
            y_test: Test target

        Returns:
            Dictionary with ``individual_evaluations``, ``comparison`` and
            ``best_model_name``

        Raises:
            CustomException: If evaluation, comparison or saving fails
        """
        logger.info("Starting model evaluation process")

        try:
            evaluation_results = {}

            # Evaluate each model
            for model_name, model_info in trained_models.items():
                evaluation_results[model_name] = self.evaluate_model(
                    model_name=model_name,
                    model=model_info['model'],
                    X_train=X_train,
                    X_test=X_test,
                    y_train=y_train,
                    y_test=y_test
                )

            # Compare models
            comparison_results = self.compare_models(evaluation_results)

            # Compile final report
            final_report = {
                'individual_evaluations': evaluation_results,
                'comparison': comparison_results,
                'best_model_name': comparison_results['best_model']['model_name']
            }

            # Save evaluation report
            os.makedirs(self.config.metrics_path, exist_ok=True)
            report_path = os.path.join(self.config.metrics_path, 'evaluation_report.json')
            save_json(report_path, final_report)

            logger.info(f"\nEvaluation report saved to: {report_path}")
            logger.info("Model evaluation completed successfully")

            return final_report

        except Exception as e:
            logger.error("Error in model evaluation")
            raise CustomException(e, sys)

In [76]:
def create_model_evaluation_config(config_dict: dict) -> ModelEvaluationConfig:
    """
    Assemble a ModelEvaluationConfig from the model-evaluation config section.

    The section is read through attribute access (e.g.
    ``config_dict.metrics_path``), so it must expose its keys as attributes.

    Args:
        config_dict: Section providing ``metrics_path``, ``threshold``,
            ``min_f1_score`` and ``min_roc_auc``

    Returns:
        ModelEvaluationConfig populated from the section
    """
    evaluation_fields = {
        'metrics_path': config_dict.metrics_path,
        'threshold': config_dict.threshold,
        'min_f1_score': config_dict.min_f1_score,
        'min_roc_auc': config_dict.min_roc_auc,
    }
    return ModelEvaluationConfig(**evaluation_fields)

In [83]:
# Inspect one training result entry: a dict holding the fitted estimator
# under 'model' and its saved location under 'model_path'.
trained_models['logistic_regression']

{'model': LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
 'model_path': 'artifacts/models\\logistic_regression.pkl'}

In [96]:
# Driver cell: evaluate every trained model and keep the combined report in
# `evaluation_report`.
try:
    config_dict = read_yaml(Path("configs/config.yaml"))
    config = create_model_evaluation_config(config_dict.model_evaluation)
    # trained_models comes from the training cell; the feature/target arrays
    # are the notebook globals produced by the preprocessing cell.
    evaluation_report = ModelEvaluation(config).initiate_model_evaluation(
        trained_models=trained_models,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test
    )
except Exception as e:
    # Log the failure, then re-raise wrapped in the project's exception type
    logger.error(f"Model evaluation failed: {e}")
    raise CustomException(e, sys)

[2025-11-07 18:29:02,911] INFO - ChurnPrediction - yaml file: configs\config.yaml loaded successfully
[2025-11-07 18:29:02,913] INFO - ChurnPrediction - Model Evaluation component initialized
[2025-11-07 18:29:02,914] INFO - ChurnPrediction - Starting model evaluation process
[2025-11-07 18:29:02,916] INFO - ChurnPrediction - Evaluating logistic_regression...
[2025-11-07 18:29:02,976] INFO - ChurnPrediction - 
logistic_regression Evaluation Results:
[2025-11-07 18:29:02,976] INFO - ChurnPrediction -   Train Accuracy: 0.7533
[2025-11-07 18:29:02,982] INFO - ChurnPrediction -   Test Accuracy:  0.7388
[2025-11-07 18:29:02,982] INFO - ChurnPrediction -   Test Precision: 0.5052
[2025-11-07 18:29:02,982] INFO - ChurnPrediction -   Test Recall:    0.7861
[2025-11-07 18:29:02,982] INFO - ChurnPrediction -   Test F1-Score:  0.6151
[2025-11-07 18:29:02,982] INFO - ChurnPrediction -   Test ROC-AUC:   0.8416
[2025-11-07 18:29:02,982] INFO - ChurnPrediction -   Meets Requirements: False
[2025-11-07