In [2]:
import os
from pathlib import Path

In [3]:
%pwd

'c:\\Users\\Hp\\Videos\\classification implementation - machine learning with MLFlow\\research'

In [4]:
# Move from the notebook's `research/` folder up to the project root so that
# the relative paths used further down resolve from the project root.
# The guard makes this cell idempotent: re-running it does not keep climbing
# one directory higher each time. NOTE(review): assumes the notebook folder is
# named "research" — adjust if it is renamed.
if Path.cwd().name == "research":
    os.chdir('../')

In [5]:
%pwd

'c:\\Users\\Hp\\Videos\\classification implementation - machine learning with MLFlow'

In [7]:
# Data transformation configuration entity
from dataclasses import dataclass
from pathlib import Path 

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # Directory that receives the stage's output files.
    root_dir: Path
    # Location of the input CSV read by the stage.
    data_path: Path

In [8]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [9]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage configuration objects.

    The config, params and schema files are each read with ``read_yaml`` and
    kept on the instance as ``config``, ``params`` and ``schema``.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Make sure the top-level artifacts directory exists before any stage runs.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the configuration for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            DataTransformationConfig whose fields are ``pathlib.Path`` objects.
        """
        config = self.config.data_transformation

        create_directories([config.root_dir])

        # Dataclasses do not enforce annotations, so coerce the raw YAML values
        # to Path to actually honour the declared field types.
        data_transformation_config = DataTransformationConfig(
            root_dir=Path(config.root_dir),
            data_path=Path(config.data_path),
        )

        return data_transformation_config

In [12]:
import os
from mlProject import logger
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

In [13]:
class DataTransformation:
    """Encodes the ordinal categorical columns and writes train/test CSV splits."""

    # Explicit category -> rank mappings for the two ordinal columns.
    OCCUPATION_MAPPING = {
        'service and sales': 0,
        'skilled trades and technical': 1,
        'manufacturing and production': 2,
        'professional and managerial': 3}

    TYPE_OF_APARTMENT_MAPPING = {
        'studio apartment': 0,
        'one-bedroom apartment': 1,
        'two or multi-bedroom apartment': 2}

    def __init__(self, config: DataTransformationConfig) -> None:
        """Store the stage configuration (input CSV path and output directory)."""
        self.config = config

    @staticmethod
    def _encode_ordinal(column: pd.Series, mapping: dict) -> pd.Series:
        """Map every value of ``column`` through ``mapping`` and return integer codes.

        Raises:
            KeyError: if the column holds a value (including a missing value)
                that has no entry in ``mapping``.
        """
        encoded = column.map(mapping)
        # Series.map yields NaN for values absent from the mapping; fail loudly
        # instead of silently writing NaN codes to the output files.
        unknown = column[encoded.isna()].unique().tolist()
        if unknown:
            raise KeyError(
                f"Unknown categories in column {column.name!r}: {unknown}"
            )
        return encoded.astype("int64")

    def train_test_splitting(self, test_size: float = 0.20, random_state: int = 42) -> None:
        """Encode the ordinal columns, split the data and save both parts as CSV.

        Reads the CSV at ``config.data_path``, replaces ``type_of_apartment``
        and ``occupation`` with their integer codes, then writes ``train.csv``
        and ``test.csv`` into ``config.root_dir``.

        Args:
            test_size: Passed to ``train_test_split``; defaults to 0.20.
            random_state: Seed passed to ``train_test_split``; defaults to 42.

        Raises:
            KeyError: if either ordinal column contains an unmapped category.
        """
        data = pd.read_csv(self.config.data_path)

        # Feature engineering for ordinal categorical data using the custom mappings.
        data['type_of_apartment'] = self._encode_ordinal(
            data['type_of_apartment'], self.TYPE_OF_APARTMENT_MAPPING)
        data['occupation'] = self._encode_ordinal(
            data['occupation'], self.OCCUPATION_MAPPING)

        train, test = train_test_split(data, test_size=test_size, random_state=random_state)

        # Save train and test files as CSVs.
        train.to_csv(os.path.join(self.config.root_dir,'train.csv'),index=False)
        test.to_csv(os.path.join(self.config.root_dir,'test.csv'),index=False)

        # Log information
        logger.info("Splitted data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

        print(train.shape)
        print(test.shape)





In [14]:
# Run the data-transformation stage end to end: load the configuration,
# build the stage config, then encode and split the dataset.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.train_test_splitting()
except Exception:
    # Record the failure with its traceback in the project log, then re-raise
    # the original exception unchanged (bare `raise` adds no extra frame).
    logger.exception("Data transformation stage failed")
    raise

[2023-09-22 23:58:43,526: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-09-22 23:58:43,532: INFO: common: yaml file: params.yaml loaded successfully]
[2023-09-22 23:58:43,538: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-09-22 23:58:43,542: INFO: common: created directory at: artifacts]
[2023-09-22 23:58:43,547: INFO: common: created directory at: artifacts/data_transformation]
[2023-09-22 23:58:43,686: INFO: 3153830424: Splitted data into training and test sets]
[2023-09-22 23:58:43,687: INFO: 3153830424: (800, 21)]
[2023-09-22 23:58:43,688: INFO: 3153830424: (200, 21)]
(800, 21)
(200, 21)
