In [1]:
import os

In [2]:
%pwd

'c:\\Users\\ambig\\jupiter_notebook\\Projects\\Credit-Risk-Model\\research'

In [3]:
# Step up one level from the notebook's folder to the project root, so that the
# `src.` package imports and the relative config paths used below resolve.
# NOTE: the path is relative, so re-running this cell moves up one more level each
# time; restart the kernel before running the notebook again.
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\ambig\\jupiter_notebook\\Projects\\Credit-Risk-Model'

In [20]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataPreprocessingConfig:
    """Filesystem locations used by the data-preprocessing stage.

    Attributes:
        root_dir: Working directory of the preprocessing stage.
        input_data: Path of the dataset that preprocessing reads.
        output_data: Path where the preprocessed dataset is written.
    """

    root_dir: Path
    input_data: Path
    output_data: Path
    

In [21]:
from src.Credit_Risk_Model.constants import *
from src.Credit_Risk_Model.utils.common import read_yaml,create_directories,load_df,save_df
from  src.Credit_Risk_Model.logger import logger


In [22]:
class ConfigurationManager:
    """Reads the project's YAML settings and builds per-stage config objects.

    The three YAML files (config, params, schema) are loaded once at construction
    time and kept on the instance as ``config``, ``params`` and ``schema``.
    """

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH,
                 schema_filepath = SCHEMA_FILE_PATH):
        """Load the YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Create the top-level artifacts directory named in the config file.
        create_directories([self.config.artifacts_root])

    def get_data_preprocessing_config(self) -> DataPreprocessingConfig:
        """Build the configuration for the data-preprocessing stage.

        Reads the ``data_preprocessing`` section of the loaded config, creates
        that stage's ``root_dir`` and wraps the three configured locations in
        ``Path`` objects.

        Returns:
            DataPreprocessingConfig: root, input and output paths of the stage.
        """
        section = self.config.data_preprocessing
        create_directories([section.root_dir])

        return DataPreprocessingConfig(
            root_dir=Path(section.root_dir),
            input_data=Path(section.input_data),
            output_data=Path(section.output_data),
        )

    def get_data_ingestion_config(self) -> DataPreprocessingConfig:
        """Backward-compatible alias of ``get_data_preprocessing_config``.

        The name is historical: this method has always returned the
        *preprocessing* configuration, not an ingestion one. It is kept so that
        existing callers continue to work.
        """
        return self.get_data_preprocessing_config()

In [31]:
import os
import sys
import pandas as pd
from src.Credit_Risk_Model.logger import logger
from src.Credit_Risk_Model.exception import CustomException
import warnings
# NOTE(review): this silences every warning for the rest of the kernel session,
# including library deprecation warnings; consider limiting it to specific categories.
warnings.filterwarnings('ignore')



class DataPreprocessing:
    """Cleans the input dataset and writes the cleaned copy to disk.

    The cleaning performed by ``data_preprocessing`` is: cast the ``default``
    column to ``int``, impute missing values (mode for ``object`` columns, mean
    for every other dtype), drop duplicate rows, then save the result.
    """

    def __init__(self, config :DataPreprocessingConfig):
        """Keep the input and output locations taken from ``config``."""
        self.input_data = config.input_data
        self.output_data = config.output_data

    def data_preprocessing(self):
        """Load, clean and save the dataset.

        Reads the frame from ``self.input_data`` via ``load_df`` and writes the
        cleaned frame to ``self.output_data`` via ``save_df``. Progress is
        reported through ``logger``.

        Raises:
            CustomException: When a ``CustomException`` is raised inside the
                step; it is logged and re-wrapped.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        try:
            df = load_df(self.input_data)

            # Make the 'default' column into int
            df['default'] = df['default'].astype(int)
            logger.info(f"The Valure counts for 'default' column: {df['default'].value_counts()}")

            ## Check for missing values
            for col in df.columns:
                if not df[col].isnull().any():
                    logger.info(f"No missing values found in column '{col}'")
                    continue

                if df[col].dtype == 'object':
                    # Categorical column: fill the gaps with the most frequent value.
                    fill_value = df[col].mode()[0]
                    logger.warning(f"Missing values found in column '{col}', replacing with mode")
                    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
                    # the chained in-place form acts on a temporary Series and is not
                    # guaranteed to update `df` (it does not under pandas Copy-on-Write).
                    df[col] = df[col].fillna(fill_value)
                    logger.info(f"Missing values replaced in column '{col}' with mode '{fill_value}'")
                else:
                    # Any non-object column: fill the gaps with the column mean.
                    fill_value = df[col].mean()
                    logger.warning(f"Missing values found in column '{col}', replacing with mean")
                    df[col] = df[col].fillna(fill_value)
                    logger.info(f"Missing values replaced in column '{col}' with mean '{fill_value}'")

            ## Check the Duplicate rows
            if df.duplicated().any():
                logger.warning("Duplicate rows found, dropping them")
                df = df.drop_duplicates()
                logger.info("Duplicate rows dropped")
            else:
                logger.info("No duplicate rows found")

            logger.info(df.columns)

            # Save the processed data
            save_df(df=df, file_path=self.output_data)
            logger.info(f"Processed data saved to {self.output_data}")

        except CustomException as e:
            logger.error(f"An error occurred during data preprocessing: {str(e)}")
            raise CustomException(e,sys) from e
        except Exception as e:
            # Errors raised by pandas or file I/O are not CustomException instances;
            # log them too, then re-raise unchanged so callers see the same type.
            logger.error(f"An error occurred during data preprocessing: {str(e)}")
            raise
            
    
        
        

In [32]:
# Load the YAML configuration using the default file paths.
config = ConfigurationManager()
# NOTE: despite its name, this getter returns the DataPreprocessingConfig.
data_preprocessing_config = config.get_data_ingestion_config()

# Run the preprocessing step: load the input data, clean it and save the result.
preprocessing = DataPreprocessing(data_preprocessing_config)
preprocessing.data_preprocessing()

[2024-11-06 11:26:53,461] INFO: common : 32] Successfully loaded yaml file: config\config.yaml
[2024-11-06 11:26:53,473] INFO: common : 32] Successfully loaded yaml file: params.yaml
[2024-11-06 11:26:53,476] INFO: common : 32] Successfully loaded yaml file: schema.yaml
[2024-11-06 11:26:53,477] INFO: common : 55] Created directory: artifacts
[2024-11-06 11:26:53,479] INFO: common : 55] Created directory: artifacts/data_preprocessing
[2024-11-06 11:26:53,673] INFO: common : 79] Successfully loaded DataFrame from: artifacts\data_ingestion\data.csv
[2024-11-06 11:26:53,673] INFO: 4147190236 : 22] The Valure counts for 'default' column: default
0    45703
1     4297
Name: count, dtype: int64
[2024-11-06 11:26:53,673] INFO: 4147190236 : 38] No missing values found in column 'cust_id'
[2024-11-06 11:26:53,688] INFO: 4147190236 : 38] No missing values found in column 'age'
[2024-11-06 11:26:53,691] INFO: 4147190236 : 38] No missing values found in column 'gender'
[2024-11-06 11:26:53,697] IN