In [1]:
import os
# Move the working directory one level up (presumably to the project root so
# that the `src...` imports and relative config paths below resolve — confirm).
# NOTE: the path is relative, so every re-run of this cell moves up one more
# level; run it exactly once per kernel session.
os.chdir("../")

In [30]:
from src.Credit_card_project.utils.common import read_yaml, create_directories
from pathlib import Path
from dataclasses import dataclass
from src.Credit_card_project import logger

In [31]:
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    from the ``data_transformation`` section of the config file and the
    ``TARGET`` section of the schema file.
    """

    # Output directory of this stage; created by the configuration manager.
    root_dir: Path
    # Raw input CSV file (read with ``pd.read_csv`` despite the ``_dir`` suffix).
    data_dir: Path
    # Destination of the unscaled train split.
    train_data_path: Path
    # Destination of the unscaled test split.
    test_data_path: Path
    # Forwarded as ``test_size`` to ``train_test_split``.
    test_percentage: float
    # Taken from ``schema.TARGET``: a mapping whose keys are read as the target
    # column name(s), so it is a dict-like object rather than a plain string.
    target_column: dict
    # Destination of the scaled train features plus target.
    train_scaled: Path
    # Destination of the scaled test features plus target.
    test_scaled: Path
    

In [32]:
from src.Credit_card_project.constant import *

In [33]:
class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects."""

    def __init__(self, config_file_path=CONFIG_FILE_PATH,
                 params_file_path=PARAMS_FILE_PATH,
                 schema_file_path=SCHEMA_FILE_PATH):
        """Read the config, params and schema YAML files and create the artifacts root.

        Args:
            config_file_path: Location of the main configuration YAML.
            params_file_path: Location of the parameters YAML.
            schema_file_path: Location of the schema YAML.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)
        # ``sparams`` was the original attribute name (it looks like a typo for
        # ``params``); keep it as an alias so existing references still work.
        self.sparams = self.params
        self.schema = read_yaml(schema_file_path)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings and create that stage's root directory.

        Returns:
            DataTransformationConfig: Values copied from the ``data_transformation``
            section of the config file, with ``target_column`` taken from the
            ``TARGET`` section of the schema file.
        """
        stage = self.config.data_transformation
        target = self.schema.TARGET
        create_directories([stage.root_dir])

        return DataTransformationConfig(
            root_dir=stage.root_dir,
            data_dir=stage.data_dir,
            train_data_path=stage.train_data_path,
            test_data_path=stage.test_data_path,
            test_percentage=stage.test_percentage,
            target_column=target,
            train_scaled=stage.train_scaled,
            test_scaled=stage.test_scaled,
        )

In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [37]:
class DataTransformation:
    """Split the raw CSV into train/test sets and standard-scale the numeric features.

    Every path and the split ratio are read from the ``DataTransformationConfig``
    handed to the constructor.
    """

    # Columns removed from the raw data when the caller supplies no explicit list.
    DEFAULT_DROP_COLUMNS = ('SEX', 'EDUCATION', 'MARRIAGE', 'AGE')

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration and log the configured test fraction."""
        self.config = config
        logger.info(f"test percentage {config.test_percentage}")

    def _target_column_names(self):
        """Return the configured target column name(s) as a plain list.

        The configuration may hold a single name or a mapping/sequence of
        names; iterating a mapping yields its keys. A real list is returned so
        pandas never has to interpret a ``dict_keys`` view or a mapping as a
        column selector.
        """
        target = self.config.target_column
        if isinstance(target, str):
            return [target]
        return list(target)

    def train_test_data_split(self, data: pd.DataFrame):
        """Split ``data`` and write both parts to the configured CSV paths.

        Args:
            data: Frame to split. ``config.test_percentage`` is used as the test
                fraction and the random seed is fixed at 42.
        """
        train_data, test_data = train_test_split(
            data,
            test_size=self.config.test_percentage,
            random_state=42,
        )
        logger.info("train test data split")
        train_data.to_csv(self.config.train_data_path, index=False, header=True)
        logger.info(f"Train data saved : {train_data.shape}")
        test_data.to_csv(self.config.test_data_path, index=False, header=True)
        logger.info(f"test data saved {test_data.shape}")

    def get_data_preprocessor(self, num_col):
        """Build the preprocessing transformer for the given numeric columns.

        Args:
            num_col: Names of the columns to mean-impute and standard-scale.

        Returns:
            ColumnTransformer: Unfitted transformer applying the numeric
            pipeline to ``num_col`` only.
        """
        num_pipeline = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
        ])
        return ColumnTransformer([('num_pipeline', num_pipeline, num_col)])

    def data_transformation_initiate(self, drop_columns=None):
        """Run the whole stage: load, drop columns, split, scale and save.

        Args:
            drop_columns: Column names to remove from the raw data before
                splitting. Defaults to ``DEFAULT_DROP_COLUMNS``.

        Raises:
            KeyError: If a column to drop or a target column is missing.
        """
        if drop_columns is None:
            drop_columns = self.DEFAULT_DROP_COLUMNS

        raw_df = pd.read_csv(self.config.data_dir)
        data = raw_df.drop(columns=list(drop_columns))

        # Train test split (writes both parts to disk).
        self.train_test_data_split(data)

        # Re-read the saved splits: this yields a fresh 0..n-1 index on each
        # frame, which the column-wise concat further down relies on.
        train_df = pd.read_csv(self.config.train_data_path)
        test_df = pd.read_csv(self.config.test_data_path)
        logger.info("train and test data loaded from data transformation file.")

        target_column = self._target_column_names()
        logger.info(f"Target columns: {target_column}")

        # Separate the input features from the target in both splits.
        input_feature_train_df = train_df.drop(columns=target_column)
        target_feature_train_df = train_df[target_column]
        logger.info(f"Train data Input:  {input_feature_train_df.shape} OUTput {target_feature_train_df.shape}")

        input_feature_test_df = test_df.drop(columns=target_column)
        target_feature_test_df = test_df[target_column]
        logger.info(f"Test data Input:  {input_feature_test_df.shape} OUTput {target_feature_test_df.shape}")

        # Only numeric (and boolean) columns go through the scaling pipeline;
        # the transformer leaves every other column out of its output.
        num_col = input_feature_train_df.select_dtypes(include=["number", "bool"]).columns.tolist()
        logger.info(f"NUmerical col: {num_col}")

        preprocessor = self.get_data_preprocessor(num_col)
        logger.info(f"Preprocessor part {preprocessor}")

        # Fit on the train split only, then reuse the fitted statistics on test.
        x_train_scaled = preprocessor.fit_transform(input_feature_train_df)
        feature_names = preprocessor.get_feature_names_out()
        x_train_scaled_df = pd.DataFrame(x_train_scaled, columns=feature_names)
        logger.info(f"x_train sclead data frame{ x_train_scaled_df.head()}")

        x_test_scaled = preprocessor.transform(input_feature_test_df)
        x_test_scaled_df = pd.DataFrame(x_test_scaled, columns=feature_names)
        logger.info(f"x_test scaled { x_test_scaled_df.head()}")

        # Both sides carry the default 0..n-1 index, so rows line up one to one.
        train_scaled = pd.concat([x_train_scaled_df, target_feature_train_df], axis=1)
        test_scaled = pd.concat([x_test_scaled_df, target_feature_test_df], axis=1)

        train_scaled.to_csv(self.config.train_scaled, index=False, header=True)
        test_scaled.to_csv(self.config.test_scaled, index=False, header=True)

        logger.info("Sucessfully done")

        

In [38]:
# Driver cell: load the YAML-backed configuration, build the transformation
# stage from it, and run the full split-and-scale pipeline. The outputs are
# written to the CSV paths held in the configuration.
configmanager=ConfigurationManager()
data_transformation_config=configmanager.get_data_transformation_config()
data_transformation=DataTransformation(data_transformation_config)
data_transformation.data_transformation_initiate()

[2024-02-03 18:50:30,714: INFO, common : Yaml file read config\config.yaml successfully]
[2024-02-03 18:50:30,859: INFO, common : Yaml file read params.yaml successfully]
[2024-02-03 18:50:30,876: INFO, common : Yaml file read schema.yaml successfully]
[2024-02-03 18:50:30,883: INFO, common : Directories created ['artifacts']]
[2024-02-03 18:50:30,890: INFO, common : Directories created ['artifacts/data_transformation']]
[2024-02-03 18:50:30,894: INFO, 1665113560 : test percentage 0.25]
[2024-02-03 18:50:30,932: INFO, 1665113560 : train test data split]
[2024-02-03 18:50:30,959: INFO, 1665113560 : Train data saved : (750, 20)]
[2024-02-03 18:50:31,002: INFO, 1665113560 : test data saved (251, 20)]
[2024-02-03 18:50:31,103: INFO, 1665113560 : train and test data loaded from data transformation file.]
[2024-02-03 18:50:31,106: INFO, 1665113560 : Target columns: dict_keys(['default payment next month'])]
[2024-02-03 18:50:31,116: INFO, 1665113560 : Independent Variable Input:  (750, 19) O