In [1]:
%pwd

'd:\\ML\\LiveProject\\ML-CreditCardDefaulter\\research'

In [2]:
import os

In [3]:
# Step up one directory from the notebook's starting folder (the %pwd output
# above shows .../research, the one below shows the project root).
# NOTE(review): the move is relative to the current directory, so re-running
# this cell without restarting the kernel climbs one more level each time.
os.chdir('../')

In [4]:
%pwd

'd:\\ML\\LiveProject\\ML-CreditCardDefaulter'

In [13]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Read-only bundle of filesystem locations for the data-transformation stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Output directory of the stage; the transformed CSV is written inside it.
    root_dir: Path
    # Location of the input CSV that the stage reads.
    data_path: Path
    # Destination CSV for the per-column missing-value report.
    null_val_path: Path

In [25]:
from ml_creditcard_defaulter.constants import *
from ml_creditcard_defaulter.utils.common import read_yaml, create_directories, save_bin

In [29]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Load the three YAML documents and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # Loaded in the same order as before: config, then params, then schema.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Create the stage directory and return the data-transformation settings.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` section of the loaded configuration.
        """
        stage_settings = self.config.data_transformation

        # The stage's output directory has to exist before anything writes to it.
        create_directories([stage_settings.root_dir])

        return DataTransformationConfig(
            root_dir=stage_settings.root_dir,
            data_path=stage_settings.data_path,
            null_val_path=stage_settings.null_val_path,
        )

In [10]:
import os
import urllib.request as request
import zipfile
from ml_creditcard_defaulter import logger
from ml_creditcard_defaulter.utils.common import get_size
from sklearn.model_selection import train_test_split

In [24]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from kneed import KneeLocator

In [26]:
class DataTransformation:
    """Preprocessing steps used by the data-transformation stage.

    Every method logs its outcome through the module-level ``logger``. When a
    step fails, the failure is logged and the original exception is re-raised
    unchanged, so callers keep its type, message and traceback.
    """

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage configuration (``root_dir``, ``data_path``, ``null_val_path``)."""
        self.config = config

    def remove_unwanted_spaces(self, data=None):
        """Strip leading/trailing whitespace from every object-dtype column.

        Args:
            data: DataFrame to clean. When ``None``, the CSV at
                ``self.config.data_path`` is loaded and cleaned instead.

        Returns:
            A new DataFrame with object columns stripped; other columns are
            passed through untouched.
        """
        # The argument used to be overwritten unconditionally by a fresh
        # read of the CSV, so whatever the caller passed was silently ignored.
        if data is None:
            data = pd.read_csv(self.config.data_path)

        try:
            df_without_spaces = data.apply(
                lambda col: col.str.strip() if col.dtype == "object" else col
            )
            logger.info('Unwanted spaces removal Successful.Exited the remove_unwanted_spaces method of the Preprocessor class')
            return df_without_spaces
        except Exception as e:
            logger.info('Exception occured in remove_unwanted_spaces method of the Preprocessor class. Exception message:  ' + str(e))
            logger.info('unwanted space removal Unsuccessful. Exited the remove_unwanted_spaces method of the Preprocessor class')
            raise

    def remove_columns(self, data, columns):
        """Return ``data`` without the given column label(s).

        Raises:
            KeyError: if a requested label is not a column of ``data``.
        """
        try:
            useful_data = data.drop(labels=columns, axis=1)
            logger.info('Column removal Successful.Exited the remove_columns method of the Preprocessor class')
            return useful_data
        except Exception as e:
            logger.info('Exception occured in remove_columns method of the Preprocessor class. Exception message:  '+str(e))
            logger.info('Column removal Unsuccessful. Exited the remove_columns method of the Preprocessor class')
            raise

    def separate_label_feature(self, data, label_column_name):
        """Split ``data`` into features and label.

        Returns:
            ``(X, y)`` where ``X`` is ``data`` without ``label_column_name``
            and ``y`` is ``data[label_column_name]``.
        """
        try:
            X = data.drop(labels=label_column_name, axis=1)
            y = data[label_column_name]
            logger.info('Label Separation Successful. Exited the separate_label_feature method of the Preprocessor class')
            return X, y
        except Exception as e:
            logger.info('Exception occured in separate_label_feature method of the Preprocessor class. Exception message:  ' + str(e))
            logger.info('Label Separation Unsuccessful. Exited the separate_label_feature method of the Preprocessor class')
            raise

    def is_null_present(self, data):
        """Report which columns of ``data`` contain missing values.

        When at least one column has missing values, a two-column report
        (``columns``, ``missing values count``) covering every column is
        written to ``self.config.null_val_path``.

        Returns:
            ``(null_present, cols_with_missing_values)``: a bool and the list
            of column labels that have at least one missing value.
        """
        try:
            null_counts = data.isna().sum()  # missing-value count per column
            # Boolean masking replaces integer-position lookups on a
            # label-indexed Series, which pandas has deprecated.
            cols_with_missing_values = null_counts[null_counts > 0].index.tolist()
            null_present = bool(cols_with_missing_values)
            if null_present:
                dataframe_with_null = pd.DataFrame()
                dataframe_with_null['columns'] = data.columns
                dataframe_with_null['missing values count'] = np.asarray(null_counts)
                dataframe_with_null.to_csv(self.config.null_val_path)
            logger.info('Finding missing values is a success.Data written to the null values file. Exited the is_null_present method of the Preprocessor class')
            return null_present, cols_with_missing_values
        except Exception as e:
            logger.info('Exception occured in is_null_present method of the Preprocessor class. Exception message:  ' + str(e))
            logger.info('Finding missing values failed. Exited the is_null_present method of the Preprocessor class')
            raise

    def impute_missing_values(self, data, cols_with_missing_values):
        """Fill missing values in the listed columns with each column's mode.

        Args:
            data: DataFrame to impute. It is not modified; a copy is returned.
            cols_with_missing_values: Column labels to impute. An empty
                collection returns an unchanged copy.

        Returns:
            A copy of ``data`` with the listed columns imputed.
        """
        logger.info('Entered the impute_missing_values method of the Preprocessor class')

        try:
            imputed = data.copy()
            imputer = SimpleImputer(strategy="most_frequent")
            for col in list(cols_with_missing_values):
                # SimpleImputer only accepts 2-D input, so select the column
                # as a one-column frame and flatten the result for assignment.
                filled = imputer.fit_transform(imputed[[col]])
                imputed[col] = np.asarray(filled).ravel()
            logger.info('Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class')
            return imputed
        except Exception as e:
            logger.info('Exception occured in impute_missing_values method of the Preprocessor class. Exception message:  ' + str(e))
            logger.info('Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class')
            raise

    def scale_numerical_columns(self, data):
        """Standard-scale the ``int64`` columns of ``data``.

        Only columns whose dtype is ``int64`` are selected; all other columns
        are left out of the result.

        Returns:
            A DataFrame of the scaled columns that keeps the row index of
            ``data`` so it can be joined back onto the other columns.
        """
        logger.info('Entered the scale_numerical_columns method of the Preprocessor class')

        try:
            num_df = data.select_dtypes(include=['int64']).copy()
            scaler = StandardScaler()
            scaled_data = scaler.fit_transform(num_df)
            # Carry the source index over; a fresh RangeIndex would misalign
            # rows whenever ``data`` is not indexed 0..n-1.
            scaled_num_df = pd.DataFrame(
                data=scaled_data, columns=num_df.columns, index=num_df.index
            )

            logger.info( 'scaling for numerical values successful. Exited the scale_numerical_columns method of the Preprocessor class')
            return scaled_num_df

        except Exception as e:
            logger.info('Exception occured in scale_numerical_columns method of the Preprocessor class. Exception message:  ' + str(e))
            logger.info( 'scaling for numerical columns Failed. Exited the scale_numerical_columns method of the Preprocessor class')
            raise

    def encode_categorical_columns(self, data):
        """Dummy-encode the object-dtype columns of ``data``.

        The first level of every column is dropped (``drop_first=True``).

        Returns:
            A DataFrame holding only the generated dummy columns.
        """
        logger.info( 'Entered the encode_categorical_columns method of the Preprocessor class')
        try:
            cat_df = data.select_dtypes(include=['object']).copy()
            # Encode one column at a time; a frame without object columns
            # simply passes through with no columns.
            for col in cat_df.columns:
                cat_df = pd.get_dummies(cat_df, columns=[col], prefix=[col], drop_first=True)

            logger.info('encoding for categorical values successful. Exited the encode_categorical_columns method of the Preprocessor class')
            return cat_df

        except Exception as e:
            logger.info('Exception occured in encode_categorical_columns method of the Preprocessor class. Exception message:  ' + str(e))
            logger.info('encoding for categorical columns Failed. Exited the encode_categorical_columns method of the Preprocessor class')
            raise

    def handle_imbalanced_dataset(self, x, y, random_state=None):
        """Balance the classes by random over-sampling.

        Args:
            x: Feature matrix.
            y: Target labels.
            random_state: Optional seed forwarded to ``RandomOverSampler``;
                the default ``None`` keeps the previous unseeded behaviour.

        Returns:
            ``(x_sampled, y_sampled)`` after over-sampling.
        """
        logger.info('Entered the handle_imbalanced_dataset method of the Preprocessor class')

        try:
            rdsmple = RandomOverSampler(random_state=random_state)
            # ``fit_sample`` was removed from imbalanced-learn; ``fit_resample``
            # is the supported name for the same operation.
            x_sampled, y_sampled = rdsmple.fit_resample(x, y)
            logger.info('dataset balancing successful. Exited the handle_imbalanced_dataset method of the Preprocessor class')
            return x_sampled, y_sampled

        except Exception as e:
            logger.info('Exception occured in handle_imbalanced_dataset method of the Preprocessor class. Exception message:  ' + str(e))
            logger.info('dataset balancing Failed. Exited the handle_imbalanced_dataset method of the Preprocessor class')
            raise

    def elbow_plot(self, data):
        """Fit KMeans for k = 1..10, save the WCSS curve and return the knee.

        The plot is saved to ``preprocessing_data/K-Means_Elbow.PNG``; the
        directory is created when missing.

        Returns:
            ``kn.knee`` from ``KneeLocator`` (convex, decreasing curve).
        """
        logger.info( 'Entered the elbow_plot method of the KMeansClustering class')
        wcss = []  # within-cluster sum of squares for each k
        try:
            cluster_counts = range(1, 11)
            for n_clusters in cluster_counts:
                kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
                kmeans.fit(data)
                wcss.append(kmeans.inertia_)

            # Draw on a dedicated figure and close it afterwards so repeated
            # calls do not stack curves on the shared pyplot figure.
            fig, ax = plt.subplots()
            try:
                ax.plot(cluster_counts, wcss)
                ax.set_title('The Elbow Method')
                ax.set_xlabel('Number of clusters')
                ax.set_ylabel('WCSS')
                plot_path = 'preprocessing_data/K-Means_Elbow.PNG'
                # savefig fails if the target directory does not exist.
                os.makedirs(os.path.dirname(plot_path), exist_ok=True)
                fig.savefig(plot_path)
            finally:
                plt.close(fig)

            # Locate the optimum cluster count programmatically.
            kn = KneeLocator(cluster_counts, wcss, curve='convex', direction='decreasing')
            logger.info( 'The optimum number of clusters is: '+str(kn.knee)+' . Exited the elbow_plot method of the KMeansClustering class')
            return kn.knee

        except Exception as e:
            logger.info('Exception occured in elbow_plot method of the KMeansClustering class. Exception message:  ' + str(e))
            logger.info('Finding the number of clusters failed. Exited the elbow_plot method of the KMeansClustering class')
            raise

    def train_test_spliting(self, data):
        """Write ``data`` to ``train.csv`` inside ``self.config.root_dir``.

        NOTE(review): despite the method name, no train/test split happens
        here; the whole frame is saved as ``train.csv``. Confirm that this is
        intended before relying on a held-out test file.
        """
        data.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)

        logger.info("train data saved")
        logger.info(data.shape)

        print(data.shape)

        

In [37]:
# Driver cell: load the configuration, split features from the label, impute
# missing values when any are reported, then persist the recombined frame.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    # Read the raw CSV from the path given by the stage configuration.
    data = pd.read_csv(data_transformation_config.data_path)
    X, y = data_transformation.separate_label_feature(data,label_column_name='default payment next month')
    # NOTE: this rebinds the name `is_null_present` to the returned boolean flag.
    is_null_present,cols_with_missing_values=data_transformation.is_null_present(X)
    if(is_null_present):
        X=data_transformation.impute_missing_values(X,cols_with_missing_values)
    
    # Re-attach the target under the column name 'Labels' before saving.
    X['Labels']=y
    # Writes X to train.csv under the stage's root_dir and logs/prints its shape.
    data_transformation.train_test_spliting(X)
except Exception as e:
    # NOTE(review): the handler re-raises the same exception without adding
    # any handling of its own.
    raise e

[2025-01-24 17:52:45,278: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-01-24 17:52:45,282: INFO: common: yaml file: params.yaml loaded successfully]
[2025-01-24 17:52:45,289: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-01-24 17:52:45,289: INFO: common: created directory at: artifacts]
[2025-01-24 17:52:45,294: INFO: common: created directory at: artifacts/data_transformation]
[2025-01-24 17:52:45,382: INFO: 2505344833: Label Separation Successful. Exited the separate_label_feature method of the Preprocessor class]
[2025-01-24 17:52:45,385: INFO: 2505344833: Finding missing values is a success.Data written to the null values file. Exited the is_null_present method of the Preprocessor class]


  if null_counts[i]>0:


[2025-01-24 17:52:45,661: INFO: 2505344833: Splited data into training and test sets]
[2025-01-24 17:52:45,662: INFO: 2505344833: (22500, 24)]
[2025-01-24 17:52:45,663: INFO: 2505344833: (7500, 24)]
(22500, 24)
(7500, 24)


In [33]:
# Scratch copy of the feature frame, so the column added in the next cell does not touch X.
i = X.copy()

In [35]:
# Attach the target series to the scratch copy as a 'Labels' column.
i['Labels']=y

In [36]:
i

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,Labels
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,220000,1,3,1,39,0,0,0,0,0,...,88004,31237,15980,8500,20000,5003,3047,5000,1000,0
29996,150000,1,3,2,43,-1,-1,-1,-1,0,...,8979,5190,0,1837,3526,8998,129,0,0,0
29997,30000,1,2,2,37,4,3,2,-1,0,...,20878,20582,19357,0,0,22000,4200,2000,3100,1
29998,80000,1,3,1,41,1,-1,0,0,0,...,52774,11855,48944,85900,3409,1178,1926,52964,1804,1
