In [4]:
import os
# Move the working directory up one level. Presumably the notebook lives in a
# sub-folder and the relative paths used below (config files, artifacts) are
# meant to resolve from the project root -- confirm against the repo layout.
# NOTE: re-running this cell moves up one more directory each time.
os.chdir("../")

In [5]:
# os.getcwd()

In [6]:
from pathlib import Path
from src.Customer_segementation.constant import *
from dataclasses import dataclass

In [7]:
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # Working directory of the stage.
    root_dir: Path
    # Location of the input CSV (despite the name it is read as a file).
    data_dir: Path
    # Destinations for scaled train/test data. The double underscore in
    # ``train_data__scaled_path`` is kept because renaming the field would
    # break existing keyword callers.
    train_data__scaled_path: Path
    test_data_scaled_path: Path
    # Forwarded to ``train_test_split``; that function accepts either a
    # fraction (float) or an absolute row count (int), so the annotation is
    # ``float``, which also admits ints.
    test_size: float
    # Seed forwarded to ``train_test_split``.
    random_state: int
    # CSV destinations for the train and test partitions.
    train_data_path: Path
    test_data_path: Path
    # CSV destination for the fully transformed dataset.
    preprocess_data_path: Path
    # Presumably where the fitted preprocessor object is serialized -- confirm
    # against the code that persists it.
    preprocess_path: Path
    

In [8]:
from src.Customer_segementation.utils.common import read_yaml, create_directories

In [9]:
class configurationManager:
    """Load the project's YAML settings and build per-stage config objects."""

    def __init__(
        self,
        config_file_path=CONFIG_FILE_PATH,
        schema_file_path=SCHEMA_FILE_PATH,
        params_file_path=PARAMS_FILE_PATH,
    ):
        """Read the config, schema and params files and create the artifacts root.

        Args:
            config_file_path: Path of the main configuration YAML.
            schema_file_path: Path of the schema YAML.
            params_file_path: Path of the parameters YAML.
        """
        self.config = read_yaml(config_file_path)
        self.schema = read_yaml(schema_file_path)
        self.params = read_yaml(params_file_path)

        # The loaded config is accessed by attribute; its ``artifacts_root``
        # entry names the top-level output directory.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the settings object for the data-transformation stage.

        Creates the stage's root directory as a side effect.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` sections of the config and params files.
        """
        stage_cfg = self.config.data_transformation
        stage_params = self.params.data_transformation
        create_directories([stage_cfg.root_dir])

        # Note the key names in the YAML differ from the field names for the
        # last two entries (``preprocess_Data`` / ``preprocessor_path``).
        return DataTransformationConfig(
            root_dir=stage_cfg.root_dir,
            data_dir=stage_cfg.data_dir,
            train_data__scaled_path=stage_cfg.train_data_scaled_path,
            test_data_scaled_path=stage_cfg.test_data_scaled_path,
            test_size=stage_params.test_size,
            random_state=stage_params.random_state,
            train_data_path=stage_cfg.train_data_path,
            test_data_path=stage_cfg.test_data_path,
            preprocess_data_path=stage_cfg.preprocess_Data,
            preprocess_path=stage_cfg.preprocessor_path,
        )


In [10]:
import pandas as pd
from src.Customer_segementation.logger import logger
from sklearn.model_selection import train_test_split

In [11]:
from src.Customer_segementation.utils.common import save_object

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
import pandas as pd

In [13]:
class DataTransformation:
    """Preprocess the customer dataset and write a train/test split of it.

    The stage reads the input CSV, imputes and scales the numeric columns,
    imputes and ordinal-encodes the categorical columns, writes the
    transformed table to CSV, and then writes train and test partitions.
    """

    # Ordered levels of every ordinal-encoded column. A level's position in
    # its list is the integer code ``OrdinalEncoder`` assigns to it. Keeping
    # the mapping keyed by column name lets the encoder follow whatever
    # column order the caller passes instead of a fixed positional order.
    _CATEGORY_LEVELS = {
        'Gender': ['Male', 'Female'],
        'Ever_Married': ['No', 'Yes'],
        'Graduated': ['No', 'Yes'],
        'Profession': ['Healthcare', 'Engineer', 'Lawyer', 'Entertainment',
                       'Artist', 'Executive', 'Doctor', 'Homemaker',
                       'Marketing'],
        'Spending_Score': ['Low', 'Average', 'High'],
    }

    # NOTE(review): the default below is the config *class*, not an instance;
    # it looks like a type annotation was intended. The default is kept so
    # the constructor signature stays unchanged -- always pass an instance.
    def __init__(self, config=DataTransformationConfig) -> None:
        """Store the stage settings.

        Args:
            config: A ``DataTransformationConfig`` instance.
        """
        self.config = config

    def train_test_data_split(self, data: pd.DataFrame):
        """Split ``data`` into train and test partitions and save both as CSV.

        The split uses ``test_size`` and ``random_state`` from the config;
        the partitions are written to ``train_data_path`` and
        ``test_data_path`` without the index column.

        Args:
            data: The table to split.
        """
        train_data, test_data = train_test_split(
            data,
            test_size=self.config.test_size,
            random_state=self.config.random_state,
        )

        train_data.to_csv(self.config.train_data_path, index=False, header=True)
        test_data.to_csv(self.config.test_data_path, index=False, header=True)

    def _categories_for(self, cat_data):
        """Return one ordered level list per entry of ``cat_data``."""
        levels = self._CATEGORY_LEVELS
        try:
            if all(column in levels for column in cat_data):
                return [levels[column] for column in cat_data]
        except TypeError:
            # ``cat_data`` is not an iterable of hashable column names
            # (for example a callable column selector).
            pass
        # Unknown selector: fall back to the fixed five-column order.
        return list(levels.values())

    def get_data_preprocessing(self, cat_data: list, num_data: list):
        """Build the (unfitted) feature-engineering transformer.

        Numeric columns are imputed with ``SimpleImputer``'s default strategy
        and standardized. Categorical columns are imputed with the most
        frequent value and ordinal-encoded using the class-level level lists,
        matched to ``cat_data`` by column name.

        Args:
            cat_data: Names of the categorical columns.
            num_data: Names of the numeric columns.

        Returns:
            A ``ColumnTransformer`` with a ``num_pipeline`` and a
            ``cat_pipeline`` entry.
        """
        num_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer()),
                ("scaler", StandardScaler()),
            ]
        )
        cat_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            # The step is an OrdinalEncoder; the historical step name
            # 'onehot' is kept so existing parameter paths remain valid.
            ('onehot', OrdinalEncoder(categories=self._categories_for(cat_data))),
        ])

        preprocessor = ColumnTransformer([
            ("num_pipeline", num_pipeline, num_data),
            ("cat_pipeline", cat_pipeline, cat_data),
        ])

        return preprocessor

    def initate_data_transformation(self):
        """Run the stage: load, preprocess, save, and split the dataset.

        Reads the CSV at ``config.data_dir``, drops the ``ID`` column, fits
        and applies the preprocessor, writes the transformed table to
        ``config.preprocess_data_path`` and finally delegates to
        ``train_test_data_split``.

        Raises:
            KeyError: If the input has no ``ID`` column.
        """
        data = pd.read_csv(self.config.data_dir)

        # The identifier column is not a feature.
        data = data.drop(columns='ID')

        categorical_data = list(self._CATEGORY_LEVELS)
        numerical_data = [
            feature for feature in data.columns if data[feature].dtypes != 'O'
        ]

        logger.info(f"categorical_data : {categorical_data}")
        logger.info(f"Numerical data: {numerical_data}")

        preprocessing = self.get_data_preprocessing(
            cat_data=categorical_data, num_data=numerical_data
        )

        # NOTE(review): the preprocessor is fitted on the full dataset before
        # the train/test split, so imputation and scaling statistics include
        # the test rows. The fitted object is also not persisted here even
        # though the config carries ``preprocess_path``.
        transformed = preprocessing.fit_transform(data)
        preprocess_data = pd.DataFrame(
            transformed, columns=preprocessing.get_feature_names_out()
        )

        preprocess_data.to_csv(
            self.config.preprocess_data_path, index=False, header=True
        )

        self.train_test_data_split(preprocess_data)


In [14]:
# Run the data-transformation stage end to end: load the configuration, build
# the stage settings, then execute the transformation. There is deliberately
# no try/except wrapper: any failure propagates with its original traceback.
configmanager = configurationManager()
transformation_config = configmanager.get_data_transformation_config()
data_transformation = DataTransformation(transformation_config)
data_transformation.initate_data_transformation()

[2024-07-01 17:39:30,846 : INFO : common : Yaml file read config/config.yaml successfully]
[2024-07-01 17:39:30,849 : INFO : common : Yaml file read schema.yaml successfully]
[2024-07-01 17:39:30,851 : INFO : common : Yaml file read params.yaml successfully]
[2024-07-01 17:39:30,851 : INFO : common : Directories created ['artifacts']]
[2024-07-01 17:39:30,852 : INFO : common : Directories created ['artifacts/data_transformation']]
[2024-07-01 17:39:30,864 : INFO : 223563250 : categorical_data : ['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score']]
[2024-07-01 17:39:30,865 : INFO : 223563250 : Numerical data: ['Age', 'Work_Experience', 'Family_Size']]
[2024-07-01 17:39:30,888 : INFO : common : Object save at: <_io.BufferedWriter name='artifacts/data_transformation/preprocessor.pkl'>]
