In [1]:
import os

In [2]:
%pwd

'c:\\Users\\ambig\\jupiter_notebook\\Projects\\Credit-Risk-Model\\research'

In [3]:
# Step up one directory; the surrounding %pwd outputs show this moves the
# working directory from .../Credit-Risk-Model/research to .../Credit-Risk-Model.
# NOTE(review): a relative chdir is not idempotent - re-running this cell
# without restarting the kernel moves up one more level each time.
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\ambig\\jupiter_notebook\\Projects\\Credit-Risk-Model'

In [12]:
from dataclasses import dataclass
from pathlib import  Path
from typing import List

@dataclass
class DataCleaningConfig:
    """Settings consumed by the data-cleaning stage.

    Instances are built by ``ConfigurationManager.get_data_cleaning_config``
    from the ``data_cleaning`` sections of the config and params YAML files.
    """

    # Directory belonging to this stage (created when the config is built).
    root_dir: Path
    # File the stage loads its input DataFrame from.
    input_filepath: Path
    # Destination for the test split.
    test_path: Path
    # Destination for the train split.
    train_path: Path
    # Column names taken from the ``columns_to_have`` entry of the stage params.
    columns_to_have: List[str]
    # The whole ``data_cleaning`` params section.
    params: dict
    


In [13]:
from src.Credit_Risk_Model.utils.common import read_yaml,create_directories,load_df,save_df
from src.Credit_Risk_Model.constants import *
from src.Credit_Risk_Model.logger import logger

In [14]:
class ConfigurationManager:
    """Load the project's YAML files and build per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the config, params and schema YAML files.

        Args:
            config_filepath: Path of the main config YAML.
            params_filepath: Path of the params YAML.
            schema_filepath: Path of the schema YAML.

        The ``artifacts_root`` entry of the config is handed to
        ``create_directories`` once all three files have been read.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_cleaning_config(self) -> DataCleaningConfig:
        """Build the ``DataCleaningConfig`` from the ``data_cleaning`` sections.

        The stage's ``root_dir`` is passed to ``create_directories`` before the
        config object is constructed.

        Returns:
            DataCleaningConfig: Paths wrapped in ``Path``; ``columns_to_have``
            and ``params`` taken from the ``data_cleaning`` params section.
        """
        stage_cfg = self.config['data_cleaning']
        stage_params = self.params['data_cleaning']

        create_directories([stage_cfg.root_dir])

        return DataCleaningConfig(
            root_dir=Path(stage_cfg.root_dir),
            input_filepath=Path(stage_cfg.input_filepath),
            test_path=Path(stage_cfg.test_path),
            train_path=Path(stage_cfg.train_path),
            columns_to_have=stage_params.columns_to_have,
            params=stage_params,
        )

In [21]:
import os
import sys
import pandas as pd
from src.Credit_Risk_Model.logger import logger
from src.Credit_Risk_Model.exception import CustomException
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
import warnings
from sklearn.preprocessing import MinMaxScaler
import numpy as np
warnings.filterwarnings('ignore')


class DataCleaning:
    """Split the input data into train/test sets, clean both and save them.

    The pipeline (see ``data_cleaning``) loads the input file, performs an
    80/20 split, removes processing-fee outliers, logs two business-rule
    checks, adds derived ratio features, drops identifier and configured
    columns, and writes both frames with ``save_df``.
    """

    # Rows whose processing_fee / loan_amount exceeds this ratio are outliers.
    MAX_PROCESSING_FEE_RATIO = 0.03

    def __init__(self, config: DataCleaningConfig):
        """Keep the paths and params of ``config`` that the pipeline reads."""
        self.input_filepath = config.input_filepath
        self.test_path = config.test_path
        self.train_path = config.train_path
        self.params = config.params

    def initiate_train_test_split(self, df):
        """Split ``df`` into train and test frames (80/20, ``random_state=42``).

        Args:
            df: DataFrame to split.

        Returns:
            tuple: ``(train_df, test_df)``.

        Raises:
            CustomException: Wraps any error raised while splitting.
        """
        try:
            logger.info("Initiating train test split...")
            train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
            logger.info("Train test split completed successfully.")
            return train_df, test_df
        # Catch Exception (not only CustomException) so real failures are
        # actually logged and wrapped; nothing in the try raises CustomException.
        except Exception as e:
            logger.error(f"Error occurred while initiating train test split: {str(e)}")
            raise CustomException(e, sys) from e

    def columns_removal(self, df):
        """Return ``df`` without the columns listed in ``params.columns_to_have``.

        NOTE(review): despite its name, the ``columns_to_have`` list is the set
        of columns that gets DROPPED here - confirm against params.yaml.

        Args:
            df: DataFrame to remove the columns from.

        Returns:
            A new DataFrame without those columns.

        Raises:
            CustomException: Wraps any error (e.g. a listed column is missing).
        """
        try:
            logger.info("Removing columns...")
            columns_to_have = self.params.columns_to_have
            df = df.drop(columns_to_have, axis=1)
            logger.info("Columns removed successfully.")
            return df
        except Exception as e:
            logger.error(f"Error occurred while removing columns: {str(e)}")
            raise CustomException(e, sys) from e

    def _remove_processing_fee_outliers(self, df):
        """Keep only rows whose processing fee is at most 3% of the loan amount.

        Returns an independent copy so later column assignments do not write
        into a view of the original frame.
        """
        within_limit = (df['processing_fee'] / df['loan_amount']) <= self.MAX_PROCESSING_FEE_RATIO
        return df.loc[within_limit].copy()

    @staticmethod
    def _add_derived_features(df):
        """Return a copy of ``df`` with the typo fix and derived ratio columns.

        * ``loan_purpose``: 'Personaal' is replaced by 'Personal'.
        * ``loan_to_income``: loan_amount / income, rounded to 2 decimals.
        * ``delinquency_ratio``: delinquent_months * 100 / total_loan_months,
          rounded to 1 decimal.
        * ``avg_dpd_per_delinquency``: total_dpd / delinquent_months rounded to
          1 decimal, or 0 where delinquent_months is 0.
        """
        has_delinquency = df['delinquent_months'] != 0
        return df.assign(
            loan_purpose=df['loan_purpose'].replace('Personaal', 'Personal'),
            loan_to_income=(df['loan_amount'] / df['income']).round(2),
            delinquency_ratio=(df['delinquent_months'] * 100 / df['total_loan_months']).round(1),
            # The division is evaluated for every row; np.where then substitutes
            # 0 for rows where the divisor is 0.
            avg_dpd_per_delinquency=np.where(
                has_delinquency,
                (df['total_dpd'] / df['delinquent_months']).round(1),
                0,
            ),
        )

    def data_cleaning(self):
        """Run the full cleaning pipeline and save the train/test frames.

        Loads ``self.input_filepath``, splits it, cleans both splits the same
        way and writes them to ``self.train_path`` / ``self.test_path``.
        Nothing is returned; the results are the saved files.

        Raises:
            CustomException: Wraps any error raised during the pipeline.
        """
        try:
            df = load_df(self.input_filepath)
            train_df, test_df = self.initiate_train_test_split(df)
            logger.info(f"Train_df :{train_df.columns}")

            # Outlier Removal: Processing Fee
            # (the filtered frames must be assigned back, otherwise no row is removed)
            train_df = self._remove_processing_fee_outliers(train_df)
            test_df = self._remove_processing_fee_outliers(test_df)

            logger.info(f" After outlier removal train_df :{train_df.shape}")
            logger.info(f" After outlier removal test_df :{test_df.shape}")

            # Use other business rules for data validation (logged only, no rows removed)
            # Rule 1: GST should not be more than 20%
            gst_violations = train_df[(train_df.gst / train_df.loan_amount) > 0.2].shape
            logger.info(f"Rule 1: GST should not be more than 20% :{gst_violations}")

            # Rule 2: Net disbursement should not be higher than loan_amount
            net_violations = train_df[train_df.net_disbursement > train_df.loan_amount].shape
            logger.info(f"Rule 2: Net disbursement should not be higher than loan_amount :{net_violations}")

            # Fix loan_purpose and generate LTI, delinquency ratio and avg DPD
            train_df = self._add_derived_features(train_df)
            test_df = self._add_derived_features(test_df)
            logger.info(f"Rule 3 : Fixed loan_purpose column :{train_df['loan_purpose'].value_counts()}")
            logger.info("Rule 4 : Generated loan_to_income column(LTI)")
            logger.info(f"Generated delinquency_ratio column :{train_df['delinquency_ratio'].describe()}")

            # Identifier columns are dropped from both splits
            train_df = train_df.drop(['cust_id', 'loan_id'], axis="columns")
            test_df = test_df.drop(['cust_id', 'loan_id'], axis="columns")

            # Remove columns that business contact person asked us to remove
            train_df = self.columns_removal(train_df)
            test_df = self.columns_removal(test_df)

            save_df(file_path=self.train_path, df=train_df)
            save_df(file_path=self.test_path, df=test_df)

        except Exception as e:
            logger.error(f"Error occurred while data cleaning: {str(e)}")
            raise CustomException(e, sys) from e


In [None]:
# Run the data-cleaning stage end to end: read the YAML files, build the stage
# config, and execute the cleaning pipeline.
config = ConfigurationManager()
data_cleaning = config.get_data_cleaning_config()
data_cleaner = DataCleaning(data_cleaning)
# data_cleaning() has no return statement, so `cleaned_data` is None; the
# cleaned train/test frames are handed to save_df with the configured paths.
cleaned_data = data_cleaner.data_cleaning()

[2024-11-10 10:47:39,664] INFO: common : 32] Successfully loaded yaml file: config\config.yaml
[2024-11-10 10:47:39,665] INFO: common : 32] Successfully loaded yaml file: params.yaml
[2024-11-10 10:47:39,665] INFO: common : 32] Successfully loaded yaml file: schema.yaml
[2024-11-10 10:47:39,665] INFO: common : 55] Created directory: artifacts
[2024-11-10 10:47:39,674] INFO: common : 55] Created directory: artifacts/data_cleaning
[2024-11-10 10:47:39,850] INFO: common : 79] Successfully loaded DataFrame from: artifacts\data_preprocessing\preprocessed_data.csv
[2024-11-10 10:47:39,857] INFO: 4094972786 : 24] Initiating train test split...
[2024-11-10 10:47:39,873] INFO: 4094972786 : 26] Train test split completed successfully.
[2024-11-10 10:47:39,876] INFO: 4094972786 : 48] Train_df :Index(['cust_id', 'age', 'gender', 'marital_status', 'employment_status',
       'income', 'number_of_dependants', 'residence_type',
       'years_at_current_address', 'city', 'state', 'zipcode', 'loan_id',