In [None]:
import os
# Query the working directory the notebook was started in.
os.getcwd()
# Move one level up; presumably this makes the project root the working
# directory so relative paths such as config/ and artifacts/ resolve -- confirm.
os.chdir("../")
# Last expression of the cell: the notebook displays the new working directory.
os.getcwd()

'c:\\Users\\Marina\\Desktop\\ML Operations\\0 - KrishNaik Course\\18 - Getting Started With Your First End To End Data Science Project With\\my_project'

## Creating the Dataclass

**Todos os elementos da seção `data_transformation` do config.yaml devem estar nesta classe**

In [None]:
from dataclasses import dataclass
from pathlib import Path

# from yaml
# data_transformation:
#   root_dir: artifacts/data_transformation
#   data_path: artifacts/data_ingestion/winequality-red.csv

@dataclass
class DataTransformationConfig:
    """Settings consumed by the data-transformation stage.

    The fields mirror the keys of the ``data_transformation`` section of
    the YAML configuration, one attribute per key.
    """

    # Directory where this stage writes its outputs.
    root_dir: Path
    # Location of the input CSV file read by this stage.
    data_path: Path


## Update the Config Manager

In [None]:
from src.datascience.constants import CONFIG_FILEPATH, PARAMS_FILEPATH, SCHEMA_FILEPATH
from src.datascience.utils.commons import read_yaml, create_directories

class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILEPATH,
        params_filepath=PARAMS_FILEPATH,
        schema_filepath=SCHEMA_FILEPATH,
    ):
        """Read the configuration files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML file.
            params_filepath: Path of the parameters YAML file.
            schema_filepath: Path of the schema YAML file.
        """
        # Each file is parsed once and kept on the instance so that the
        # getter methods can read their own section from it.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Make sure the top-level artifacts directory exists.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the configuration object for the data-transformation stage.

        Reads the ``data_transformation`` section of the loaded config and
        creates that stage's output directory as a side effect.

        Returns:
            A ``DataTransformationConfig`` whose fields are ``Path`` objects.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        # The YAML values are plain strings; convert them so the fields
        # match the ``Path`` types declared on the dataclass.
        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
        )

## Update the Components

In [None]:
import urllib.request as request
import pandas as pd
from src.datascience.utils import logger
from sklearn.model_selection import train_test_split

class DataTransformation:
    """Split the input dataset into train/test sets and save them as CSV files."""

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration.

        Args:
            config: A ``DataTransformationConfig`` instance providing
                ``data_path`` (input CSV) and ``root_dir`` (output directory).
        """
        self.config = config

    def split_data(self) -> "tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]":
        """Read the CSV at ``config.data_path`` and split it into train and test sets.

        The last column is used as the target and all preceding columns as
        features. The split holds out 20% of the rows for testing and uses a
        fixed random seed.

        Returns:
            The tuple ``(X_train, X_test, y_train, y_test)``.

        Raises:
            Exception: Any error raised while reading or splitting is logged
                and re-raised unchanged.
        """
        try:
            data = pd.read_csv(self.config.data_path)

            features = data.iloc[:, :-1]  # every column except the last
            target = data.iloc[:, -1]     # last column

            X_train, X_test, y_train, y_test = train_test_split(
                features, target, random_state=42, test_size=0.2,
            )

            logger.info(f"Sucesfully splitted the data \nX_train dimensions:{X_train.shape}\nX_test dimensions:{X_test.shape}\ny_train dimensions:{y_train.shape}\ny_test dimensions:{y_test.shape}   ")

            return X_train, X_test, y_train, y_test

        except Exception as e:
            logger.exception(f"Exception during data_splitting \n Exception:{e}")
            # Bare raise re-raises the active exception without rebinding it.
            raise

    def save_splits(self, X_train, X_test, y_train, y_test):
        """Write the four splits as CSV files into ``config.root_dir``.

        The files are named ``X_train.csv``, ``X_test.csv``, ``y_train.csv``
        and ``y_test.csv``; the index is not written.

        Args:
            X_train: Training features.
            X_test: Test features.
            y_train: Training target.
            y_test: Test target.

        Raises:
            Exception: Any error raised while writing is logged and re-raised
                unchanged.
        """
        try:
            # Written in the same order as before: X_train, X_test, y_train, y_test.
            splits = {
                "X_train.csv": X_train,
                "X_test.csv": X_test,
                "y_train.csv": y_train,
                "y_test.csv": y_test,
            }
            for filename, split in splits.items():
                split.to_csv(os.path.join(self.config.root_dir, filename), index=False)

            logger.info(f"Sucesfully Saved the splits to: {self.config.root_dir}")

        except Exception as e:
            # Name the failing step correctly (the message previously said
            # "data_splitting", copied from split_data).
            logger.exception(f"Exception during save_splits \n Exception:{e}")
            raise

## Testing

In [None]:
from src.datascience.utils import logger

# Run the data-transformation stage end to end: build the configuration,
# split the dataset, and write the splits to disk.
try:
    # Load the YAML files and extract the settings for this stage.
    data_transformation_config = ConfigurationManager().get_data_transformation_config()

    # Instantiate the component that performs the transformation.
    data_transformation_object = DataTransformation(data_transformation_config)

    # Split and save.
    X_train, X_test, y_train, y_test = data_transformation_object.split_data()
    data_transformation_object.save_splits(X_train, X_test, y_train, y_test)

except Exception as e:
    # The message names this stage (it previously said "data_ingestion").
    logger.error(f"Error during data_transformation. Error {e}")
    # Bare raise re-raises the active exception unchanged.
    raise

[2024-11-04 14:13:20,123: INFO: commons: yaml file: config\config.yaml loaded successfully ]
[2024-11-04 14:13:20,129: INFO: commons: yaml file: params.yaml loaded successfully ]
[2024-11-04 14:13:20,149: INFO: commons: yaml file: schema.yaml loaded successfully ]
[2024-11-04 14:13:20,153: INFO: commons: created directory at: artifacts ]
[2024-11-04 14:13:20,156: INFO: commons: created directory at: artifacts/data_transformation ]
[2024-11-04 14:13:20,226: INFO: 4182609033: Sucesfully splitted the data 
X_train dimensions:(1279, 11)
X_test dimensions:(320, 11)
y_train dimensions:(1279,)
y_test dimensions:(320,)    ]
[2024-11-04 14:13:20,264: INFO: 4182609033: Sucesfully Saved the splits to: artifacts/data_transformation ]


## Agora iremos converter para código modular

Começamos do passo 4 em diante (do workflow em notes.ipynb)

O código inteiro da seção `Creating the Dataclass` vai para `src/entity/config_entity.py`

O código inteiro da seção `Update the Config Manager` vai para `src/config/configuration.py`

O código inteiro da seção `Update the Components` vai para `src/components/nome_da_step.py`

O código inteiro da seção `Testing` vai para `src/pipeline/nome_da_step_pipeline.py`, com implementação OOP (ver exemplo)

O código anterior vai para `main.py`, podendo chamar diretamente a classe; basta ver o exemplo