In [1]:
#config.yaml
'''
data_transformation:
  root_dir: artifacts/data_transformation
  raw_data_file: raw_data.csv

'''

'\ndata_transformation:\n  root_dir: artifacts/data_transformation\n  raw_data_file: raw_data.csv\n\n'

In [2]:
## finance_ml.entity.config_entity 

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    The fields mirror the keys of the ``data_transformation`` section shown
    in config.yaml. ``frozen=True`` makes instances read-only after creation.
    """
    # Directory that holds this stage's output artifacts.
    root_dir: Path
    # Location of the raw CSV input, exactly as written in the configuration.
    raw_data_file: str


In [2]:
# src/finance_ml/config/configuration.py

from finance_ml.constants import *
from finance_ml.utils.common import read_yaml, create_directories
# from finance_ml.entity.config_entity import DataTransformationConfig
import os

class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects.

    The config, params and schema files are read once at construction time,
    and the top-level artifacts directory is created as a side effect.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH
    ):
        """Read the three YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config entity for the data-transformation stage.

        Creates the stage's ``root_dir`` if needed, then packages the
        ``data_transformation`` section of the main config.

        Returns:
            DataTransformationConfig: ``root_dir`` as a ``Path`` and
            ``raw_data_file`` exactly as it appears in the configuration.
        """
        # Imported locally so this method does not rely on Path being
        # re-exported by the star import at the top of the cell.
        from pathlib import Path

        section = self.config.data_transformation

        create_directories([section.root_dir])

        return DataTransformationConfig(
            # The entity annotates root_dir as Path, so coerce whatever
            # read_yaml returned; Path(Path(...)) is a no-op.
            root_dir=Path(section.root_dir),
            raw_data_file=section.raw_data_file,
        )

NameError: name 'DataTransformationConfig' is not defined

In [3]:
# from finance_ml.entity.config_entity import DataTransformationConfig
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import os
import traceback
from joblib import dump # Import dump to save the scaler
from finance_ml import logger # Import logger for logging within the component

class DataTransformation:
    """Turns the raw CSV into scaled, windowed train/test arrays on disk."""

    def __init__(self, config: DataTransformationConfig):
        """Keep the stage configuration.

        Args:
            config: Supplies ``raw_data_file`` (the CSV to read) and
                ``root_dir`` (the directory the outputs are written to).
        """
        self.config = config

    @staticmethod
    def _create_sequences(series, lookback):
        """Cut an (n, 1) array into sliding windows and next-step targets.

        Returns:
            tuple: ``X`` with shape (n - lookback, lookback, 1) and ``y``
            with shape (n - lookback,).
        """
        positions = range(lookback, len(series))
        windows = [series[i - lookback:i, 0] for i in positions]
        targets = [series[i, 0] for i in positions]
        # Trailing axis of size 1 = one feature per time step.
        return np.array(windows).reshape(-1, lookback, 1), np.array(targets)

    def transform_and_save_data(self, feature="Close", lookback=60, split_ratio=0.95):
        """
        Loads raw data, transforms it, and saves the transformed data and scaler.

        The scaler is fitted on the training rows only and then applied to the
        whole series, so statistics of the held-out rows never influence the
        training inputs. Outputs written to ``config.root_dir``:
        ``X_train.npy``, ``y_train.npy``, ``X_test.npy``, ``y_test.npy`` and
        ``scaler.joblib``.

        Args:
            feature (str): The feature column to use for transformation (default 'Close').
            lookback (int): Number of previous time steps to use for prediction (default 60).
            split_ratio (float): Ratio for splitting data into training and testing sets (default 0.95).

        Raises:
            ValueError: If ``lookback`` is not positive, or the split leaves
                too few rows to build at least one training and one test
                sequence.
            Exception: Any error raised while writing the outputs is logged
                and re-raised.
        """
        raw_data_path = self.config.raw_data_file

        df = pd.read_csv(raw_data_path)
        logger.info(f"Raw data loaded from: {raw_data_path}")

        # Parsed up front so a missing or malformed timestamp column fails
        # here; the parsed values are not used by the steps below.
        df['Datetime'] = pd.to_datetime(df['Datetime'])
        values = df[[feature]].to_numpy()

        training_data_len = int(len(values) * split_ratio)

        if lookback < 1:
            raise ValueError(f"lookback must be a positive integer, got {lookback}")
        if training_data_len <= lookback:
            # Without this guard the negative slice start used for the test
            # window would silently wrap around to the end of the series.
            raise ValueError(
                f"Not enough training rows ({training_data_len}) to build a "
                f"sequence with lookback={lookback}"
            )
        if training_data_len >= len(values):
            raise ValueError(
                f"split_ratio={split_ratio} leaves no rows for the test set "
                f"({len(values)} rows in total)"
            )

        # Fit on the training slice only to avoid leaking test statistics.
        scaler = StandardScaler()
        scaler.fit(values[:training_data_len])
        scaled_data = scaler.transform(values)

        train_data = scaled_data[:training_data_len]
        # Start `lookback` rows early so the first test target is the first
        # row after the split point.
        test_data = scaled_data[training_data_len - lookback:]

        X_train, y_train = self._create_sequences(train_data, lookback)
        X_test, y_test = self._create_sequences(test_data, lookback)

        out_dir = self.config.root_dir
        try:
            os.makedirs(out_dir, exist_ok=True)
            # Save transformed data and scaler
            np.save(os.path.join(out_dir, 'X_train.npy'), X_train)
            np.save(os.path.join(out_dir, 'y_train.npy'), y_train)
            np.save(os.path.join(out_dir, 'X_test.npy'), X_test)
            np.save(os.path.join(out_dir, 'y_test.npy'), y_test)
            dump(scaler, os.path.join(out_dir, 'scaler.joblib'))

            logger.info("Transformed data and scaler saved.")
        except Exception as e:
            logger.error(f"Error saving transformed data: {e}")
            logger.error(traceback.format_exc())
            # Re-raise so the stage is not reported as successful when its
            # artifacts were not written.
            raise

NameError: name 'DataTransformationConfig' is not defined

In [None]:
#src/finance_ml/pipeline/stage_04_data_transformation.py

from finance_ml.config.configuration import ConfigurationManager
# from finance_ml.component.data_transformation import DataTransformation
from finance_ml import logger
import os # Keep os for potentially joining paths if needed within the component

STAGE_NAME = "Data Transformation stage"

class DataTransformationTrainingPipeline:
    """Pipeline stage that runs data transformation once validation has passed."""

    def __init__(self):
        pass

    def main(self):
        """Run the transformation if the validation status file says "True".

        The last whitespace-separated token of
        ``artifacts/data_validation/status.txt`` is taken as the status, so a
        trailing newline in the file does not cause a false rejection.

        Raises:
            FileNotFoundError: If the status file does not exist.
            Exception: With message "Data schema is not valid" when the
                status is anything other than "True". Errors from the
                transformation itself also propagate, so the caller can log
                them and the stage is not reported as completed on failure.
        """
        status_file = Path("artifacts/data_validation/status.txt")
        with open(status_file, "r") as f:
            tokens = f.read().split()
        # An empty status file counts as "not valid" rather than crashing.
        status = tokens[-1] if tokens else ""

        if status != "True":
            raise Exception("Data schema is not valid")

        config = ConfigurationManager()
        data_transformation_config = config.get_data_transformation_config()
        data_transformation = DataTransformation(config=data_transformation_config)
        data_transformation.transform_and_save_data()

# Entry point: run this stage on its own, logging start and completion.
if __name__ == '__main__':
    try:
        logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")

        obj = DataTransformationTrainingPipeline()
        obj.main()

        logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
    except Exception as err:
        # Record the traceback in the log, then let the failure propagate.
        logger.exception(err)
        raise err