In [None]:
import os
import sys
from dataclasses import dataclass

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

from src.exception import CustomException
from src.logger import logging
from src.utils import save_object

In [3]:
@dataclass
class DataTransformationConfig:
    """Configuration for the data-transformation step."""

    # Path where the fitted preprocessor object is saved.
    # The type annotation is required: without it ``@dataclass`` treats the
    # name as a plain class attribute, so it cannot be overridden through the
    # constructor and is left out of the generated ``__repr__``/``__eq__``.
    preprocessor_obj_file_path: str = os.path.join('artifacts', 'preprocessor.pkl')

class DataTransformation:
    """Build, fit and persist the preprocessing pipeline for the train/test CSVs."""

    def __init__(self) -> None:
        self.data_transformation_config = DataTransformationConfig()

    def get_data_transformation_objects(self):
        """Return an unfitted ``ColumnTransformer`` for the input features.

        Non-object columns are median-imputed and standard-scaled. Object
        columns are imputed with the most frequent value, ordinal-encoded
        with the explicit rankings defined below, and then standard-scaled.

        Returns:
            The unfitted ``ColumnTransformer``.

        Raises:
            CustomException: wraps any exception raised while building the
                pipelines.
        """
        try:
            logging.info('Data Transformation initiated')

            # Columns are chosen by dtype when the transformer is fitted, so
            # this method needs no reference to the data (the previous code
            # read an undefined name ``X`` here and always failed).
            categorical_columns = make_column_selector(dtype_include=object)
            numerical_columns = make_column_selector(dtype_exclude=object)

            # Custom ranking (lowest to highest) for each ordinal variable.
            cut_categories = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
            color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
            clarity_categories = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

            logging.info('Pipeline Initiated')

            numerical_pipeline = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', StandardScaler()),
                ]
            )

            # NOTE(review): OrdinalEncoder matches ``categories[i]`` to the
            # i-th selected column, so the object-dtype columns must appear in
            # the data in the order cut, color, clarity -- confirm against the
            # dataset.
            categorical_pipeline = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    (
                        'ordinalencoder',
                        OrdinalEncoder(
                            categories=[
                                cut_categories,
                                color_categories,
                                clarity_categories,
                            ]
                        ),
                    ),
                    ('scaler', StandardScaler()),
                ]
            )

            preprocessor = ColumnTransformer([
                ('num_pipeline', numerical_pipeline, numerical_columns),
                ('cat_pipeline', categorical_pipeline, categorical_columns),
            ])

            # Log before returning; this line used to sit after ``return``
            # and was never executed.
            logging.info('pipeline completed')

            return preprocessor

        except Exception as e:
            logging.info("Error in Data Transformation")
            raise CustomException(e, sys)

    def initiate_data_transformation(self, train_path, test_path):
        """Fit the preprocessor on the train CSV and transform both splits.

        Args:
            train_path: path of the training CSV, passed to ``pd.read_csv``.
            test_path: path of the test CSV, passed to ``pd.read_csv``.

        Returns:
            A tuple ``(train_arr, test_arr, preprocessor_path)``. Each array
            holds the transformed input features with the ``price`` target
            appended as the last column. ``preprocessor_path`` is the path
            the fitted preprocessor was saved to.

        Raises:
            CustomException: wraps any exception raised while reading,
                transforming or saving.
        """
        try:
            # Reading train and test data.
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)

            logging.info('Read train and test data completed')
            logging.info(f'Train Dateframe Head: \n{train_df.head().to_string()}')
            logging.info(f'Test Dataframe Head : \n{test_df.head().to_string()}')

            logging.info('Obtaining preprocessing object')

            preprocessing_obj = self.get_data_transformation_objects()

            target_column_name = 'price'
            drop_columns = [target_column_name, 'id']

            # ``columns=`` already implies the column axis, so ``axis=1`` is
            # not passed.
            input_feature_train_df = train_df.drop(columns=drop_columns)
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=drop_columns)
            target_feature_test_df = test_df[target_column_name]

            # Fit on the training features only, then apply the same fitted
            # transformation to the test features.
            input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

            logging.info('Applying preprocessing object on training and test datasets')

            # Append the target as the last column of each feature array.
            train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            save_object(
                file_path=self.data_transformation_config.preprocessor_obj_file_path,
                obj=preprocessing_obj,
            )
            logging.info('preprocessor pickle file saved')

            return (
                train_arr,
                test_arr,
                self.data_transformation_config.preprocessor_obj_file_path,
            )

        except Exception as e:
            logging.info('Exception occured in the initiate_datatransformation')
            raise CustomException(e, sys)

