2- Data Validation

In [1]:
import os

In [2]:
%pwd

'c:\\Users\\Orçamento\\Desktop\\Bike Sales\\VeloAnalytics\\notebooks'

In [3]:
os.chdir('../')

In [4]:
%pwd

'c:\\Users\\Orçamento\\Desktop\\Bike Sales\\VeloAnalytics'

In [5]:
from dataclasses import dataclass
from pathlib import Path

In [6]:
# --- Data Validation Configuration Entity ---
# Typed container for the settings consumed by the data validation stage.
@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of the filesystem paths used by data validation.

    Instances are frozen: attributes cannot be reassigned after construction.
    """

    # Directory that holds this stage's artifacts.
    root_dir: Path
    # Directory containing the unzipped data files that get checked.
    unzip_data_dir: Path
    # File the validation outcome is written to.
    status_file: Path

In [7]:
from src.utils import read_yaml, create_directories
from pathlib import Path

In [8]:
class ConfigurationManager:
    """Loads the main YAML configuration and hands out per-stage config entities."""

    def __init__(self, config_filepath = Path("config.yaml")):
        """
        Read the main configuration file and ensure the top-level artifacts
        directory exists.

        Args:
            config_filepath: Path to the YAML configuration file. Defaults to
                ``config.yaml``, resolved against the current working directory.
        """
        self.config = read_yaml(config_filepath)

        artifacts_root = Path(self.config.artifacts_root)
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """
        Build the data validation settings from the ``data_validation``
        section of the loaded configuration.

        The section's ``root_dir`` is passed to ``create_directories`` before
        the entity is assembled, so the stage's artifact directory is in place
        by the time the config is returned.

        Returns:
            DataValidationConfig: ``root_dir``, ``unzip_data_dir`` and
            ``status_file`` from the section, each wrapped in ``Path``.
        """
        section = self.config.data_validation

        # Make sure the stage directory exists before handing out paths into it.
        create_directories([Path(section.root_dir)])

        return DataValidationConfig(
            root_dir=Path(section.root_dir),
            unzip_data_dir=Path(section.unzip_data_dir),
            status_file=Path(section.status_file),
        )

In [9]:
from src.logging import logger

class DataValidation:
    """Checks that the unzipped data directory holds the expected CSV files.

    The outcome is logged and persisted to ``config.status_file`` as a single
    line of the form ``Validation status: <True|False>``.
    """

    def __init__(self, config: "DataValidationConfig"):
        """
        Initializes the DataValidation component with its configuration.

        Args:
            config: Supplies ``unzip_data_dir`` (the directory to inspect) and
                ``status_file`` (where the validation result is written).
        """
        self.config = config

    def validate_all_files_exist(self, expected_file_count: int = 9) -> bool:
        """
        Validates that the unzipped data directory contains the expected
        number of CSV files and records the result in the status file.

        Only regular files whose extension is ``.csv`` (case-insensitive) are
        counted, so subdirectories and stray non-data files left behind by
        unzipping do not distort the check.

        Args:
            expected_file_count: Number of CSV files that must be present.
                Defaults to 9, the count expected for this project.

        Returns:
            bool: True when exactly ``expected_file_count`` CSV files were
            found and the status file was written; False otherwise, including
            when the directory cannot be read or the status file cannot be
            written. Errors are logged rather than raised.
        """
        validation_status = False
        try:
            data_dir = Path(self.config.unzip_data_dir)
            csv_files = [
                entry
                for entry in data_dir.iterdir()
                if entry.is_file() and entry.suffix.lower() == ".csv"
            ]

            validation_status = len(csv_files) == expected_file_count
            if validation_status:
                logger.info(f"File count validation successful. Found {len(csv_files)} files.")
            else:
                logger.warning(f"File count validation failed. Expected {expected_file_count} files, but found {len(csv_files)}.")
        except Exception as e:
            # Any failure while inspecting the directory counts as a failed validation.
            logger.error(f"An error occurred during file validation: {e}")
            validation_status = False

        # Persist the outcome exactly once; an unwritable status file also
        # yields False instead of raising out of the validator.
        if not self._write_status(validation_status):
            return False
        return validation_status

    def _write_status(self, validation_status: bool) -> bool:
        """Write the status line to ``config.status_file``; return True on success."""
        try:
            with open(self.config.status_file, "w", encoding="utf-8") as f:
                f.write(f"Validation status: {validation_status}")
        except OSError as e:
            logger.error(f"Could not write validation status to {self.config.status_file}: {e}")
            return False
        return True

In [None]:
# --- STAGE 2: DATA VALIDATION ---
# Runs the validation stage end to end: load the configuration, build the
# validator, run the file check, and log the outcome. Unexpected errors are
# logged with their traceback and then re-raised so the cell visibly fails.
STAGE_NAME = "Data Validation stage"
try:
    logger.info(f">>>>>> Stage '{STAGE_NAME}' started <<<<<<")

    # Initialize the configuration manager
    config = ConfigurationManager()

    # Get the specific configuration for data validation
    data_validation_config = config.get_data_validation_config()

    # Initialize the data validation component with the configuration
    data_validation = DataValidation(config=data_validation_config)

    # Run the validation process and keep its verdict; the validator reports
    # failure through its return value, not by raising.
    validation_passed = data_validation.validate_all_files_exist()

    if validation_passed:
        logger.info(f">>>>>> Stage '{STAGE_NAME}' completed successfully <<<<<<\n\nx==========x")
    else:
        # Do not claim success when the file check failed.
        logger.warning(f">>>>>> Stage '{STAGE_NAME}' finished, but validation FAILED <<<<<<\n\nx==========x")
except Exception as e:
    logger.exception(e)
    # Bare raise re-raises the active exception with its original traceback.
    raise

[2025-08-27 16:00:26,072: INFO: 566605: >>>>>> Stage 'Data Validation stage' started <<<<<<]
[2025-08-27 16:00:26,077: INFO: utils: YAML file loaded successfully: config.yaml]
[2025-08-27 16:00:26,080: INFO: utils: Directory created or already exists: artifacts]
[2025-08-27 16:00:26,082: ERROR: 566605: 'ConfigurationManager' object has no attribute 'get_data_transformation_config']
Traceback (most recent call last):
  File "C:\Users\Orçamento\AppData\Local\Temp\ipykernel_27000\566605.py", line 10, in <module>
    data_transformation_config = config.get_data_transformation_config()
                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'ConfigurationManager' object has no attribute 'get_data_transformation_config'


AttributeError: 'ConfigurationManager' object has no attribute 'get_data_transformation_config'