In [17]:

import os

In [18]:

# os.chdir("../")

In [19]:
from dataclasses import dataclass
from pathlib import Path
import pandas as pd



@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # Directory the stage writes its output files into.
    root_dir: Path
    # Location of the input dataset read by the stage.
    data_path: Path

In [20]:
from src.utils.utlis import read_yaml,create_dir
from src.constants import *

In [21]:
class ConfigManager:
    """Reads the project's config, params and schema files and builds stage configs.

    On construction the three YAML files are loaded via ``read_yaml`` and
    ``create_dir`` is called for the configured ``artifacts_root``.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Load the three settings files.

        Args:
            config_filepath: Path of the main config file.
            params_filepath: Path of the params file.
            schema_filepath: Path of the schema file.
        """
        # NOTE(review): read_yaml presumably returns an attribute-accessible
        # mapping (attribute access is used below) -- confirm in src.utils.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_dir([self.config.artifacts_root])

    def data_transformation_config(self) -> DataTransformationConfig:
        """Build the config for the data-transformation stage.

        Calls ``create_dir`` for the stage's ``root_dir`` before returning.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` section of the main config.
        """
        section = self.config.data_transformation
        create_dir([section.root_dir])

        # Values are passed through exactly as read from the config file.
        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
        )

In [22]:
import os
from src.logger.custom_logging import logger
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [23]:
class DataTransformation:
    """Splits the configured dataset into train/test sets and standardises the features.

    Only a train/test split and feature scaling are performed here, on the
    assumption that the incoming data is already cleaned. Other transformation
    steps (different scalers, PCA, EDA-driven feature work) can be added to
    this class before the data is handed to the model.
    """

    def __init__(self, config: DataTransformationConfig):
        """Store the stage config and create the scaler used for the features.

        Args:
            config: Supplies ``data_path`` (input CSV) and ``root_dir``
                (output directory).
        """
        self.config = config
        self.scaler = StandardScaler()

    def train_test_spliting_and_scaling(
        self,
        test_size: float = 0.25,
        random_state: "int | None" = 42,
    ):
        """Split the dataset, scale the feature columns and write both sets to CSV.

        The last column of the CSV is treated as the target; every other column
        is treated as a feature. The scaler is fitted on the training features
        only and then applied to the test features. Results are written to
        ``train.csv`` and ``test.csv`` inside ``config.root_dir``.

        Args:
            test_size: Fraction of rows placed in the test set. The default of
                0.25 matches scikit-learn's default split.
            random_state: Seed for the shuffle so repeated runs give the same
                split. Pass ``None`` for a different split on every call.

        Returns:
            None. The outputs are the two CSV files plus log/print messages.
        """
        data = pd.read_csv(self.config.data_path)

        # Seeded split: without a fixed random_state every run would write a
        # different train/test partition.
        train, test = train_test_split(
            data, test_size=test_size, random_state=random_state
        )

        # Features are all columns except the last one, which is the target.
        X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
        X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

        # Fit on the training features only, then reuse the fitted scaler on
        # the test features so no test statistics leak into the scaling.
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        # Rebuild DataFrames from the scaled arrays and re-attach the target
        # positionally (.values), since the new frames have a fresh index.
        # NOTE(review): the target is stored under the fixed name 'target', so
        # its original column name is not kept -- confirm downstream stages
        # expect this name.
        train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
        train_scaled['target'] = y_train.values

        test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)
        test_scaled['target'] = y_test.values

        # Only the scaled sets are persisted; writing the unscaled split to the
        # same paths first would just be overwritten here.
        train_scaled.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test_scaled.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info("Data split into training and test sets")
        logger.info(f"Train shape: {train_scaled.shape}")
        logger.info(f"Test shape: {test_scaled.shape}")

        # Also echo the shapes to the cell output for quick inspection.
        print(f"Train shape: {train_scaled.shape}")
        print(f"Test shape: {test_scaled.shape}")

        

In [24]:
# Run the data-transformation stage end to end: load settings, build the stage
# config, then split/scale the data and write the train/test CSV files.
try:
    config = ConfigManager()
    data_transformation_config = config.data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.train_test_spliting_and_scaling()
except Exception:
    # Record the failure (with traceback) in the project log, then re-raise
    # with a bare `raise` so the original traceback reaches the notebook.
    logger.exception("Data transformation stage failed")
    raise

Train shape: (1199, 12)
Test shape: (400, 12)
