In [2]:
import os 

In [3]:
%pwd

'c:\\Users\\abhis\\OneDrive\\Desktop\\Oil_Retail\\research'

In [4]:
# Step from the notebook folder (research/) up to its parent directory;
# presumably so project-relative config paths resolve from the project root.
# Guarded on the current folder name so that re-running this cell is a no-op
# instead of climbing one more directory on every execution.
if os.path.basename(os.getcwd()).lower() == 'research':
    os.chdir('../')

In [5]:
%pwd

'c:\\Users\\abhis\\OneDrive\\Desktop\\Oil_Retail'

In [6]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable bundle of the paths used by the data-transformation stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.

    Attributes:
        root_dir: Directory for this stage (created by
            ``ConfigurationManager.get_data_transformation_config``).
        data_path: CSV file read by ``DataTransformation.transform``.
        processed_data: CSV file the engineered dataset is written to.
        train_data: CSV file the training split is written to.
        test_data: CSV file the test split is written to.
    """

    # Field order is part of the interface (positional construction).
    root_dir: Path
    data_path: Path
    processed_data: Path
    train_data: Path
    test_data: Path


In [7]:
from oil_retail.constants import *
from oil_retail.utils.common import read_yaml, create_directories

In [8]:
class ConfigurationManager:
    """Loads the project YAML files and builds per-stage config objects.

    The constructor and the stage accessor live in ONE class definition.
    Declaring ``class ConfigurationManager`` a second time would rebind the
    name and drop ``__init__``, leaving instances without ``self.config``.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the config, params and schema files and create the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Make sure the top-level artifacts directory exists before any stage
        # tries to create its own sub-directory.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config for the data-transformation stage.

        Reads the ``data_transformation`` section of the loaded configuration,
        creates that stage's ``root_dir`` and returns the section's paths
        wrapped in a ``DataTransformationConfig``.

        Returns:
            DataTransformationConfig populated from ``config.data_transformation``.
        """
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            processed_data=config.processed_data,
            train_data=config.train_data,
            test_data=config.test_data
        )

        return data_transformation_config

In [9]:
from oil_retail import logger 
from sklearn.model_selection import train_test_split
import pandas as pd 

In [10]:
import os
import pandas as pd
from oil_retail import logger
from oil_retail.entity.config_entity import DataTransformationConfig
from sklearn.model_selection import train_test_split

class DataTransformation:
    """Feature engineering and chronological train/test split for the dataset.

    ``transform`` reads the raw CSV, derives price-gap, lag and moving-average
    features and writes the processed CSV. ``split_data`` reads that processed
    CSV back and writes the leading rows as train and the trailing rows as test.
    """

    # Annotation is quoted so defining the class does not require the config
    # type to be resolvable at definition time.
    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage configuration (paths are read from its attributes)."""
        self.config = config

    @staticmethod
    def _ensure_parent_dir(path) -> None:
        """Create the directory that will contain ``path`` if it is missing."""
        parent = os.path.dirname(os.fspath(path))
        # A bare filename has no parent component; it lands in the cwd.
        if parent:
            os.makedirs(parent, exist_ok=True)

    def transform(self):
        """Engineer features from ``config.data_path`` and save them.

        Expects the columns ``date``, ``price``, ``comp1_price``,
        ``comp2_price``, ``comp3_price`` and ``volume``. Lag and rolling
        features are positional, so rows are assumed to already be in
        chronological order (NOTE(review): confirm for the source file).

        Returns:
            pandas.DataFrame: the processed frame that was written to
            ``config.processed_data``.
        """
        df = pd.read_csv(self.config.data_path)

        # Convert date
        df['date'] = pd.to_datetime(df['date'])

        # Price gaps: own price minus each competitor's price
        df['gap_comp1'] = df['price'] - df['comp1_price']
        df['gap_comp2'] = df['price'] - df['comp2_price']
        df['gap_comp3'] = df['price'] - df['comp3_price']

        # Lag features: previous row's value
        df['price_lag_1'] = df['price'].shift(1)
        df['volume_lag_1'] = df['volume'].shift(1)

        # Moving averages over the trailing 7 and 30 rows
        df['volume_ma_7'] = df['volume'].rolling(window=7).mean()
        df['volume_ma_30'] = df['volume'].rolling(window=30).mean()

        # Drop rows with any missing value; this removes at least the first
        # 29 rows, whose 30-row moving average is undefined.
        df = df.dropna()

        if df.empty:
            # Fewer than 30 usable rows leaves nothing after dropna; make that
            # visible instead of silently writing a header-only file.
            logger.warning(
                "No rows left after feature engineering; at least 30 complete rows are required"
            )

        # Save processed dataset
        self._ensure_parent_dir(self.config.processed_data)
        df.to_csv(self.config.processed_data, index=False)
        logger.info(f"Processed data saved at {self.config.processed_data}")

        return df

    def split_data(self, train_fraction: float = 0.8):
        """Split the processed dataset by row position and save both parts.

        The first ``int(len(df) * train_fraction)`` rows become the training
        set and the remaining rows the test set; rows are not shuffled.

        Args:
            train_fraction: Share of rows assigned to the training set; must be
                strictly between 0 and 1. Defaults to 0.8.

        Returns:
            tuple[pandas.DataFrame, pandas.DataFrame]: ``(train, test)``.

        Raises:
            ValueError: if ``train_fraction`` is not strictly between 0 and 1.
        """
        if not 0 < train_fraction < 1:
            raise ValueError(
                f"train_fraction must be strictly between 0 and 1, got {train_fraction}"
            )

        df = pd.read_csv(self.config.processed_data)

        train_size = int(len(df) * train_fraction)

        train = df.iloc[:train_size]
        test = df.iloc[train_size:]

        if train.empty or test.empty:
            logger.warning(
                f"Split of {len(df)} rows produced an empty train or test set"
            )

        self._ensure_parent_dir(self.config.train_data)
        self._ensure_parent_dir(self.config.test_data)
        train.to_csv(self.config.train_data, index=False)
        test.to_csv(self.config.test_data, index=False)

        logger.info(f"Train shape: {train.shape}")
        logger.info(f"Test shape: {test.shape}")

        return train, test