In [1]:
import os

In [2]:
os.chdir('../')

In [3]:
from pathlib import Path
from dataclasses import dataclass
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable set of filesystem locations used by the data-transformation stage."""

    # Top-level folder of the stage (created by ConfigurationManager).
    root_dir: Path
    # Folder that receives the pickled train/val/test arrays.
    split_dir: Path
    # Folder that receives the pickled, fitted preprocessing pipeline.
    preprocess_obj: Path
    # Folder the stage reads its input CSV files (X.csv, y.csv) from.
    data_dir: Path
    


In [4]:
from src.Loan_defaulter.constants import *
from src.Loan_defaulter.utils.common import read_yaml,create_directories

In [5]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage configuration objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Parse the config and params YAML files found at the given paths."""
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Create the stage's output directories and return its configuration.

        Returns:
            DataTransformationConfig: the three ``data_transformation`` locations
            plus the ``data_ingestion.data_dir`` the stage reads its input from.
        """
        stage = self.config.data_transformation

        # One create_directories call per location, in the original order:
        # stage root, split output, preprocessing-object output.
        for key in ("root_dir", "split_dir", "preprocess_obj"):
            create_directories([getattr(stage, key)])

        return DataTransformationConfig(
            root_dir=stage.root_dir,
            split_dir=stage.split_dir,
            preprocess_obj=stage.preprocess_obj,
            data_dir=self.config.data_ingestion.data_dir,
        )

In [6]:
import os
import urllib.request as request 
import gdown
from src.Loan_defaulter import logger
import zipfile
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler
from sklearn.impute import SimpleImputer
from src.Loan_defaulter.utils.common import save_object
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [7]:
class DataTransformation:
    """Builds the preprocessing pipeline and writes preprocessed train/val/test splits.

    Input is ``X.csv`` / ``y.csv`` under ``config.data_dir``. The fitted
    ``ColumnTransformer`` is pickled under ``config.preprocess_obj`` and the three
    transformed splits (features with the target columns appended on the right)
    are pickled under ``config.split_dir``.
    """

    # Column that is ordinal-encoded instead of one-hot encoded, and the
    # ascending order of its bands.
    _AGE_COLUMN = 'age'
    _AGE_BANDS = ['<25', '25-34', '35-44', '45-54', '55-64', '65-74', '>74']
    # Non-object columns with fewer distinct values than this count as categorical.
    _CATEGORICAL_MAX_UNIQUE = 30

    def __init__(self,config:DataTransformationConfig):
        self.config = config

    def _read_csv(self, file_name):
        """Read ``file_name`` from the configured data directory."""
        return pd.read_csv(os.path.join(self.config.data_dir, file_name), index_col=False)

    def Transformation(self, X=None):
        """Build the (unfitted) preprocessing ``ColumnTransformer``.

        Args:
            X: Optional feature frame used to decide which columns are numerical
               and which are categorical. When omitted, ``X.csv`` is read from
               ``config.data_dir`` (the original behaviour).

        Returns:
            ColumnTransformer that median-imputes and scales numerical columns,
            mode-imputes and one-hot encodes categorical columns, and
            mode-imputes and ordinal-encodes the age-band column.

        Raises:
            ValueError: if the age-band column is not present in ``X``.
        """
        logger.info('Creating Pipeline')
        if X is None:
            X = self._read_csv('X.csv')

        age_column = self._AGE_COLUMN
        if age_column not in X.columns:
            # Fail with a clear message instead of the opaque
            # "list.remove(x): x not in list" the old code produced.
            raise ValueError(f"Required column '{age_column}' is missing from the feature data.")

        numerical = []
        categorical = []
        for column in X.columns:
            if column == age_column:
                # Handled by its own ordinal pipeline below.
                continue
            is_text = X[column].dtype == 'object'
            if is_text or len(X[column].unique()) < self._CATEGORICAL_MAX_UNIQUE:
                categorical.append(column)
            else:
                numerical.append(column)

        num_pipeline = Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ])

        cat_pipeline = Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("one_hot_encoder", OneHotEncoder(handle_unknown='ignore'))
        ])

        age_pipeline = Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ('Ordinal_encode', OrdinalEncoder(categories=[self._AGE_BANDS]))
        ])

        preprocessing = ColumnTransformer(
            [
                ('num_pipeline', num_pipeline, numerical),
                ('cat_pipeline', cat_pipeline, categorical),
                ('age_oe', age_pipeline, [age_column])
            ],
            # Always return a dense ndarray: split_data_transform stacks the
            # result with the target via np.c_, which cannot take a SciPy
            # sparse matrix.
            sparse_threshold=0
        )
        logger.info('Pipeline Created')
        return preprocessing

    def split_data_transform(self):
        """Split the data 70/15/15, fit the pipeline on train, and save everything.

        The preprocessing pipeline is fitted on the training split only and then
        applied to the validation and test splits. Writes ``pipeline.pkl`` to
        ``config.preprocess_obj`` and ``train.pkl`` / ``val.pkl`` / ``test.pkl``
        to ``config.split_dir`` through ``save_object``.
        """
        logger.info('Executing the pipeline on data')
        X = self._read_csv('X.csv')
        # NOTE(review): index_col=False keeps every CSV column as data. If y.csv
        # was written together with its index, that column is appended next to
        # the label in the saved arrays — confirm the file layout.
        y = self._read_csv('y.csv')

        # 70% train, then the remaining 30% is halved into validation and test.
        X_train, X_temp, y_train, y_temp = train_test_split(X, y, random_state=42, test_size=0.3)
        X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, random_state=42, test_size=0.5)

        # Reuse the frame already in memory instead of reading X.csv a second time.
        preprocessing_obj = self.Transformation(X)
        X_train_preprocessed = preprocessing_obj.fit_transform(X_train)
        X_test_preprocessed = preprocessing_obj.transform(X_test)
        X_val_preprocessed = preprocessing_obj.transform(X_val)

        save_object(file_path=os.path.join(self.config.preprocess_obj,'pipeline.pkl'),
                    obj=preprocessing_obj
        )

        # Target columns go last so each saved array is self-contained.
        train = np.c_[X_train_preprocessed, y_train.to_numpy()]
        test = np.c_[X_test_preprocessed, y_test.to_numpy()]
        val = np.c_[X_val_preprocessed, y_val.to_numpy()]

        save_object(file_path=os.path.join(self.config.split_dir,'train.pkl'),obj=train)
        save_object(file_path=os.path.join(self.config.split_dir,'val.pkl'),obj=val)
        save_object(file_path=os.path.join(self.config.split_dir,'test.pkl'),obj=test)
        logger.info(f'Executed the pipeline on data and data stored in {self.config.split_dir}')

In [8]:

# Run the data-transformation stage end to end: load the configuration, build the
# stage object, then split, preprocess and persist the data. Any failure
# propagates unchanged; the former `except Exception as e: raise e` wrapper did
# nothing except add a frame to the traceback.
config=ConfigurationManager()
data_transformation_config=config.get_data_transformation_config()
data_transformation=DataTransformation(config=data_transformation_config)
data_transformation.split_data_transform()

[2024-12-27 18:49:48,696:INFO:common:yaml file: config\config.yaml loaded successfully]
[2024-12-27 18:49:48,698:INFO:common:yaml file: params.yaml loaded successfully]
[2024-12-27 18:49:48,700:INFO:common:created directory at: artifacts/data_transformation]
[2024-12-27 18:49:48,701:INFO:common:created directory at: artifacts/data_transformation/split_data]
[2024-12-27 18:49:48,703:INFO:common:created directory at: artifacts/data_transformation/preprocess_obj]
[2024-12-27 18:49:48,703:INFO:1438372825:Executing the pipeline on data]
[2024-12-27 18:49:49,233:INFO:1438372825:Creating Pipeline]
[2024-12-27 18:49:49,652:INFO:1438372825:Pipeline Created]
[2024-12-27 18:49:51,055:INFO:1438372825:Executed the pipeline on data and data stored in artifacts/data_transformation/split_data]


In [19]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

# Scratch check: rebuild the preprocessing transformer by hand and fit it on the
# full feature table from the ingestion artifacts.
X = pd.read_csv('artifacts/data_ingestion/data/X.csv', index_col=False)

# Text columns and numeric columns with fewer than 30 distinct values are treated
# as categorical; every other column is numerical.
categorical = [
    col for col in X.columns
    if X[col].dtype == 'object' or len(X[col].unique()) < 30
]
numerical = [col for col in X.columns if col not in categorical]

# 'age' gets its own ordinal pipeline, so keep it out of the one-hot group.
age = ['age']
try:
    categorical.remove('age')
except ValueError:
    pass

assert all(col in X.columns for col in age), "Age column missing in X."

# Numerical features: median imputation followed by standardisation.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: mode imputation followed by one-hot encoding.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("one_hot_encoder", OneHotEncoder(handle_unknown='ignore')),
])

# Age bands: mode imputation followed by ordinal encoding in ascending band order.
Age = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ('Ordinal_encode', OrdinalEncoder(
        categories=[['<25', '25-34', '35-44', '45-54', '55-64', '65-74', '>74']]
    )),
])

preprocessing = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, numerical),
    ('cat_pipeline', cat_pipeline, categorical),
    ('age_oe', Age, age),
])

# Report the outcome instead of raising, so the cell always finishes.
try:
    transformed_data = preprocessing.fit_transform(X)
except ValueError as e:
    print(f"Error during transformation: {e}")
else:
    print("Transformation successful!")


Transformation successful!


In [18]:
# ColumnTransformer returns a SciPy sparse matrix only when the stacked output is
# sparse enough (see its `sparse_threshold`); otherwise it already returns a dense
# ndarray, which has no `.toarray()`. Densify only when needed.
transformed = preprocessing.fit_transform(X)
transformed.toarray() if hasattr(transformed, 'toarray') else transformed

ValueError: Found unknown categories [nan] in column 0 during fit

In [11]:
# Forward slashes work on every OS and avoid the invalid escape sequences
# ("\d", "\X") that the backslash-separated literal relied on.
X=pd.read_csv('artifacts/data_ingestion/data/X.csv',index_col=False)
# Show the raw loan_amount column (`.t` is not a Series attribute).
X['loan_amount']

0         116500
1         206500
2         406500
3         456500
4         696500
           ...  
148665    436500
148666    586500
148667    446500
148668    196500
148669    406500
Name: loan_amount, Length: 148670, dtype: int64

In [23]:
# Forward slashes work on every OS and avoid the invalid escape sequences
# ("\d", "\y") that the backslash-separated literal relied on.
# index_col=0 uses the first CSV column as the index rather than as data.
y=pd.read_csv('artifacts/data_ingestion/data/y.csv',index_col=0)
y

Unnamed: 0,Status
0,1
1,1
2,0
3,0
4,0
...,...
148665,0
148666,0
148667,0
148668,0
