In [46]:
import os

In [47]:
%pwd

'd:\\projects\\git-hub'

In [49]:
# Switch the working directory to the project folder; later cells load files
# through relative paths (the log output shows config\config.yaml, artifacts/...).
# A forward-slash path avoids the invalid "\E" escape sequence of the original
# literal and resolves on Windows as well as POSIX systems.
# NOTE: this cell is not idempotent - re-running it without restarting the
# kernel fails because the folder is looked up relative to the current directory.
os.chdir("./End to End Mlops with MLFlow")

In [50]:
%pwd

'd:\\projects\\git-hub\\End to End Mlops with MLFlow'

In [51]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage."""

    # Directory the stage writes its output files into.
    root_dir: Path
    # Location of the input CSV the stage reads.
    data_path: Path

In [52]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [53]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Load the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Every stage writes below this directory, so create it up front.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the configuration for the data-transformation stage.

        Creates the stage's output directory as a side effect.

        Returns:
            DataTransformationConfig whose fields are ``pathlib.Path`` objects.
        """
        stage_cfg = self.config.data_transformation

        create_directories([stage_cfg.root_dir])

        # Dataclasses do not validate or coerce field types, so wrap the raw
        # config values explicitly to honour the declared ``Path`` annotations.
        # (Presumably the YAML values are plain strings - Path() accepts both
        # strings and existing Path objects, so this is safe either way.)
        return DataTransformationConfig(
            root_dir=Path(stage_cfg.root_dir),
            data_path=Path(stage_cfg.data_path),
        )

In [54]:
import os
from mlProject import logger
from sklearn.model_selection import train_test_split
import pandas as pd

In [55]:
class DataTransformation:
    """Data-transformation stage; currently it only performs the train/test split.

    Note: other transformation techniques (scalers, PCA, ...) and any kind of
    EDA can be added here before the data is passed to the model. Only the
    train/test split is implemented because this dataset is already cleaned up.
    """

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration (input CSV path and output directory)."""
        self.config = config

    def train_test_spliting(self, test_size=0.25, random_state=42):
        """Split the input CSV into train and test sets and write both to disk.

        Reads ``config.data_path``, splits the rows, and writes ``train.csv``
        and ``test.csv`` (without the index column) into ``config.root_dir``.
        The shapes of both sets are logged and printed.

        Args:
            test_size: Share (or absolute number) of rows for the test set,
                forwarded to ``train_test_split``. The default of 0.25 keeps
                the 75/25 split the method has always produced.
            random_state: Seed forwarded to ``train_test_split`` so repeated
                runs write the same split. Pass ``None`` to get a different
                random split on every run.
        """
        data = pd.read_csv(self.config.data_path)

        # A fixed seed makes the split reproducible across kernel restarts.
        train, test = train_test_split(
            data, test_size=test_size, random_state=random_state
        )

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info("Splited data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

        print(train.shape)
        print(test.shape)
        

In [56]:
# Run the data-transformation stage end to end: load the configuration,
# build the stage object, and write the train/test CSV files.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.train_test_spliting()
except Exception:
    # Record the failure (with traceback) in the project log, then re-raise
    # with a bare ``raise`` so the original traceback is left untouched.
    logger.exception("Data transformation stage failed")
    raise

[2024-03-09 23:30:36,999: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-03-09 23:30:37,005: INFO: common: yaml file: params.yaml loaded successfully]
[2024-03-09 23:30:37,015: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-03-09 23:30:37,019: INFO: common: created directory at: artifacts]
[2024-03-09 23:30:37,024: INFO: common: created directory at: artifacts/data_transformation]
[2024-03-09 23:30:37,081: INFO: 842736296: Splited data into training and test sets]
[2024-03-09 23:30:37,083: INFO: 842736296: (1199, 12)]
[2024-03-09 23:30:37,086: INFO: 842736296: (400, 12)]
(1199, 12)
(400, 12)
