In [1]:
import os 
# Move the working directory up one level so the relative paths used by the
# cells below resolve (presumably this lands on the project root — confirm for
# your checkout). NOTE: the path is relative to the current directory, so
# re-running this cell without restarting the kernel climbs one more level.
os.chdir('../')

In [2]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class PrepareDataConfig:
    """Immutable settings for the prepare-data stage.

    Instances are built by ``ConfigurationManager.get_prepare_data_config``
    and consumed by ``PrepareData``. ``frozen=True`` blocks attribute
    reassignment after construction.
    """

    # Working directory of the stage; created by ConfigurationManager.
    root_dir: Path
    # CSV file that PrepareData.get_data reads with pandas.
    raw_data_path: Path
    # Destination CSV for the selected and scaled data.
    prepared_data_path: Path
    # Destination file for the fitted scaler (written with joblib).
    base_scaler_path: Path
    # Names of the columns that are selected and scaled.
    feature_columns: list
    # Name of the column kept alongside the features; it is not scaled.
    target_column: str

In [3]:
from src.MatchAnalysis.constants import *
from src.MatchAnalysis.utils.common import read_yaml, create_directories

In [4]:
class ConfigurationManager:
    """Reads the YAML config and params files and builds stage configurations.

    The loaded config is accessed by attribute (e.g. ``config.artifacts_root``),
    so ``read_yaml`` is expected to return an attribute-style mapping.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Load both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the params YAML. It is loaded into
                ``self.params`` but not read by the methods of this class.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_prepare_data_config(self):
        """Build the ``PrepareDataConfig`` for the prepare-data stage.

        Creates the stage's root directory as a side effect.

        Returns:
            A ``PrepareDataConfig`` whose raw-data path comes from the
            ``data_ingestion.data_file`` entry and whose remaining fields come
            from the ``prepare_data`` section of the config.
        """
        stage = self.config.prepare_data
        raw_data_file = self.config.data_ingestion.data_file

        stage_root = Path(stage.root_dir)
        create_directories([stage_root])

        return PrepareDataConfig(
            root_dir=stage_root,
            raw_data_path=Path(raw_data_file),
            prepared_data_path=Path(stage.prepared_data_path),
            base_scaler_path=Path(stage.base_scaler_path),
            feature_columns=stage.feature_columns,
            target_column=stage.target_column,
        )

In [5]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

class PrepareData:
    """Select the configured columns from the raw CSV, scale the features and
    persist both the prepared data and the fitted scaler.

    Typical use is ``get_data()``, ``get_scaler()``, then ``update_data()``.
    If the first two are skipped, the data and scaler are loaded on demand.
    """

    def __init__(self, config: "PrepareDataConfig"):
        """Keep the stage configuration; nothing is read from disk yet.

        Args:
            config: Paths and column names for this stage.
        """
        self.config = config
        # Populated by get_data() / get_scaler(), or lazily by _prepare_data().
        self.data = None
        self.scaler = None

    def _prepare_data(self):
        """Return a new frame with the feature and target columns, features scaled.

        The scaler is fitted on the feature columns of the raw data and then
        applied to those same columns. ``self.data`` itself is left unmodified.

        Returns:
            A DataFrame holding the feature columns (scaled) followed by the
            target column (untouched).
        """
        # Load on demand so calling update_data() first does not fail with an
        # AttributeError on a missing attribute.
        if self.data is None:
            self.get_data()
        if self.scaler is None:
            self.get_scaler()

        feature_columns = list(self.config.feature_columns)
        selected_columns = [*feature_columns, self.config.target_column]

        # Explicit copy: assigning into a column-subset of another frame is
        # what triggers pandas' SettingWithCopyWarning.
        clean_data = self.data[selected_columns].copy()

        self.scaler.fit(self.data[feature_columns])
        clean_data[feature_columns] = self.scaler.transform(
            clean_data[feature_columns]
        )

        return clean_data

    def get_data(self):
        """Read the raw CSV at ``config.raw_data_path`` into ``self.data``."""
        self.data = pd.read_csv(self.config.raw_data_path)

    def get_scaler(self):
        """Create a fresh, unfitted ``MinMaxScaler`` in ``self.scaler``."""
        self.scaler = MinMaxScaler()

    def update_data(self):
        """Prepare the data, then write the prepared CSV and the fitted scaler.

        The result is also kept on ``self.normalized_data``.
        """
        self.normalized_data = self._prepare_data()

        self.save_data(
            path=self.config.prepared_data_path,
            data=self.normalized_data
        )

        self.save_scaler(
            path=self.config.base_scaler_path,
            scaler=self.scaler
        )

    @staticmethod
    def save_data(path: Path, data: "pd.DataFrame"):
        """Write ``data`` to ``path`` as CSV without the index column."""
        data.to_csv(path, index=False)

    @staticmethod
    def save_scaler(path: Path, scaler: "MinMaxScaler"):
        """Serialize ``scaler`` to ``path`` with joblib."""
        import joblib
        joblib.dump(scaler, path)

In [6]:
# Run the prepare-data stage end to end: load the configuration, read the raw
# CSV, create the scaler, then write the prepared data and the fitted scaler.
# Any exception propagates unchanged; a handler that only re-raises adds nothing.
config = ConfigurationManager()
prepare_data_config = config.get_prepare_data_config()
prepare_data = PrepareData(prepare_data_config)
prepare_data.get_data()
prepare_data.get_scaler()
prepare_data.update_data()

[2024-02-05 10:39:46,849: INFO: common] yaml file: config/config.yaml loaded successfully
[2024-02-05 10:39:46,850: INFO: common] yaml file: params.yaml loaded successfully
[2024-02-05 10:39:46,851: INFO: common] Creating directory: artifacts
[2024-02-05 10:39:46,852: INFO: common] Creating directory: artifacts/prepare_data
