In [1]:
import os
import sys
from pathlib import Path
from utils.common import create_directories
from dataclasses import dataclass
from exception.exception import customexception
from logger.logger import logging
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [2]:
# before start creating this flow update the config/config.yaml file 
# conifg_entity --> configuration_manager --> component file --> pipeline_file --> main_file

In [15]:
# config entity
@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    unzip_data_files: Path
    preprocessed_data_files: Path
    transform_data_files: Path

In [16]:
# configuration manager
from constants import CONFIG_FILE_PATH
from utils.common import create_directories,read_yaml,read_csv

class ConfigurationManager:
    def __init__(self,config_filepath=CONFIG_FILE_PATH):
        self.config = read_yaml(config_filepath)
        create_directories([self.config.artifacts_root])
        
    def get_datatransformation_config(self)->DataTransformationConfig:
        config = self.config.data_transformation
        create_directories([config.root_dir])
        data_transformation_config = DataTransformationConfig(
            root_dir = config.root_dir,
            unzip_data_files = config.unzip_data_files,
            preprocessed_data_files = config.preprocessed_data_files,
            transform_data_files = config.transform_data_files
        )
        
        return data_transformation_config

In [22]:
# component creation
class DataTransformation:
    def __init__(self,config=DataTransformationConfig):
        self.config = config
        
    def data_preprocessing(self):
        try:
            # reading data
            dataset_path = self.config.unzip_data_files
            dataframe = read_csv(dataset_path)
            
            logging.info("Data preprocessing started")
            
            # preprocessing on data
            dataframe.drop("id",axis=1,inplace=True)
            dataframe.drop(dataframe[(dataframe['price'] >= 10000) & (dataframe['carat'] > 0)  & (dataframe['carat'] < 0.9)].index,inplace=True)
            dataframe.drop(dataframe[(dataframe['x']<0.2) | (dataframe['y']<0.2) | (dataframe['z']<0.2)].index,inplace=True)
            
            # saving file in the data_transformation artifact
            preprocessed_data_path = self.config.preprocessed_data_files
            create_directories([preprocessed_data_path])
            
            file_path = os.path.join(preprocessed_data_path,"preprocessed_data.csv")
            dataframe.to_csv(file_path,index=False)
            
            
            # dependent and independent feature separation
            X = dataframe.drop('price',axis=1)
            y = dataframe['price'] 
            
            # preprocessing pipeline creation
            cat_cols = X.select_dtypes(include="object").columns
            nume_cols = X.select_dtypes(exclude="object").columns
            
            cut_categories = ["Fair","Good","Very Good","Premium","Ideal"]
            color_categories = ["D","E","F","G","H","I","J"]
            clarity_categories = ["I1","SI1","SI2","VS2","VS1","VVS2","VVS1","IF"]
            
            numeric_pipeline = Pipeline(
                steps=[
                    ('imputer',SimpleImputer(strategy='most_frequent')),
                    ('scaler',StandardScaler())
                ]
            )
            categorical_pipline = Pipeline(
                steps=[
                    ('imputer',SimpleImputer(strategy='most_frequent')),
                    ('ordinalencoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories]))
                ]
            )
            
            # use column transformer to join the two pipelines
            # (name, transformer, columns)
            preprocessor = ColumnTransformer([
                ("Numeric Pipeline",numeric_pipeline,nume_cols),
                ("Categorical Pipeline",categorical_pipline,cat_cols)
            ])
            logging.info("Data preprocessing completed, preprocessing object return successfully")
            
            return preprocessor
        except Exception as e:
            raise customexception(e,sys)
    
    def data_transformation(self):
        try: 
            preprocessor_obj = self.data_preprocessing()
            
            logging.info("Data transformation started")
            
            create_directories([self.config.transform_data_files])
            
            dataset_path = os.path.join(self.config.preprocessed_data_files,"preprocessed_data.csv")
            dataframe = read_csv(dataset_path)
            X = dataframe.drop('price',axis=1)
            y = dataframe['price']
            
            X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)
            
            X_train = preprocessor_obj.fit_transform(X_train)
            X_test = preprocessor_obj.transform(X_test)
            
            #dataframe creation
            # X_train = pd.DataFrame(data=X_train,columns=preprocessor_obj.get_feature_names_out())
            # y_train = pd.DataFrame(data=y_train,columns=['price'])
            # train_df = pd.concat([X_train,y_train],axis=1)
            # train_df_path = os.path.join(self.config.transform_data_files,"transform_train.csv")
            # train_df.to_csv(train_df_path,index=False)
            
            columns = list(dataframe.columns)
            print(columns)
            
            train_arr = np.c_[X_train,np.array(y_train)]
            train_df = pd.DataFrame(train_arr,columns=columns)
            train_df_path = os.path.join(self.config.transform_data_files,"transform_train.csv")
            train_df.to_csv(train_df_path,index=False)
            
            # X_test = pd.DataFrame(data=X_test,columns=preprocessor_obj.get_feature_names_out())
            # y_test = pd.DataFrame(data=y_test,columns=['price'])
            # test_df = pd.concat([X_test,y_test],axis=1)
            # test_df_path = os.path.join(self.config.transform_data_files,"transform_test.csv")
            # test_df.to_csv(test_df_path,index=False)
            test_arr = np.c_[X_test,np.array(y_test)]
            test_df = pd.DataFrame(test_arr,columns=columns)
            test_df_path = os.path.join(self.config.transform_data_files,"transform_test.csv")
            test_df.to_csv(test_df_path,index=False)
            

            logging.info("Data transformation completed")
        except Exception as e:
            raise customexception(e,sys)
        
        
        

In [23]:
#cd ..

In [24]:
# pwd

In [25]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_datatransformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.data_transformation()
except Exception as e:
    raise customexception(e,sys)

['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'price']


In [None]:
df = pd.read_csv("Dataset/train.csv")

In [None]:

columns = list(df.columns)
columns.remove('id')
columns

['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'price']