In [2]:
import os

In [3]:
%pwd

'c:\\Users\\Orçamento\\Desktop\\Bike Sales\\VeloAnalytics\\notebooks'

In [4]:
# Move from notebooks/ up to the project root so that relative paths such as
# config.yaml and schema.yaml resolve. The guard makes the cell idempotent:
# re-running it no longer climbs one directory higher on every execution.
if not os.path.exists('config.yaml'):
    os.chdir('../')

In [5]:
%pwd

'c:\\Users\\Orçamento\\Desktop\\Bike Sales\\VeloAnalytics'

In [6]:
from dataclasses import dataclass
from pathlib import Path
from src.logging import logger
import pandas as pd

In [7]:
# --- Data Transformation Configuration Entity ---
# This defines the structure for the data transformation configuration.
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable (frozen) bundle of the paths used by the data transformation stage."""
    # Stage directory; created by ConfigurationManager.get_data_transformation_config.
    root_dir: Path
    # Directory that DataTransformation lists to find the raw ``.csv`` files.
    data_path: Path
    # Directory into which DataTransformation writes the ``.parquet`` files.
    output_path: Path

In [8]:
from src.utils import read_yaml, create_directories

In [9]:
class ConfigurationManager:
    """Loads the main YAML configuration and builds per-stage configuration entities."""

    def __init__(self, config_filepath=Path("config.yaml")):
        """
        Read the main configuration file and ensure the artifacts root exists.

        Args:
            config_filepath: Path of the main YAML configuration file. The
                default, ``config.yaml``, is resolved against the current
                working directory.
        """
        self.config = read_yaml(config_filepath)
        artifacts_root = Path(self.config.artifacts_root)
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """
        Build the configuration entity for the data transformation stage.

        Reads the ``data_transformation`` section of the loaded configuration,
        makes sure its ``root_dir`` and ``output_path`` directories exist and
        wraps the three paths in a ``DataTransformationConfig``.

        Returns:
            DataTransformationConfig: ``root_dir``, ``data_path`` and
            ``output_path`` as ``Path`` objects.
        """
        section = self.config.data_transformation
        stage_root = Path(section.root_dir)
        processed_dir = Path(section.output_path)
        create_directories([stage_root, processed_dir])

        return DataTransformationConfig(
            root_dir=stage_root,
            data_path=Path(section.data_path),
            output_path=processed_dir,
        )

In [12]:
import os
import pandas as pd
from pathlib import Path
from src.logging import logger
from src.utils import read_yaml

class DataTransformation:
    """
    Validates raw CSV files against the column schema loaded from
    ``schema.yaml``, cleans them and stores the result as Parquet files.
    """

    def __init__(self, config: "DataTransformationConfig"):
        """
        Initializes the DataTransformation component with its configuration.

        Args:
            config: Stage settings; ``data_path`` is scanned for CSV files and
                ``output_path`` receives the Parquet files.
        """
        self.config = config
        self.schema = read_yaml(Path("schema.yaml"))

    @staticmethod
    def _is_datetime_column(col, dtype) -> bool:
        """
        Decide whether a schema column should be parsed as a datetime.

        A plain substring test such as ``'at' in name`` also matches columns
        like ``CREATEDBY``, ``LATITUDE`` or ``STATUS`` and would coerce their
        values to ``NaT``/bogus timestamps, so the match is restricted to:

        * a dtype declared as some ``datetime`` type in the schema,
        * a name containing ``date`` (ignoring the word ``update``),
        * a name ending in ``_at`` or ``edat`` (``created_at``, ``CREATEDAT``).

        Args:
            col: Column name.
            dtype: Dtype declared for the column in the schema.

        Returns:
            True when the column should go through ``pd.to_datetime``.
        """
        name = str(col).lower()
        if 'datetime' in str(dtype).lower():
            return True
        # "update" / "updated_by" contain the letters "date" without being dates.
        if 'date' in name.replace('update', ''):
            return True
        return name.endswith(('_at', 'edat'))

    @staticmethod
    def _read_csv(csv_path) -> pd.DataFrame:
        """
        Load a CSV file, preferring UTF-8 (with or without BOM) over Latin-1.

        Decoding a UTF-8 file as Latin-1 never fails but turns the BOM into
        'ï»¿' and every non-ASCII character into mojibake. Latin-1 is therefore
        only used as a fallback when the bytes are not valid UTF-8.

        Args:
            csv_path: Path of the CSV file to load.

        Returns:
            The parsed DataFrame.
        """
        try:
            return pd.read_csv(csv_path, encoding='utf-8-sig')
        except UnicodeDecodeError:
            return pd.read_csv(csv_path, encoding='latin1')

    def _clean_and_transform(self, df: pd.DataFrame, schema: dict) -> pd.DataFrame:
        """
        Private helper method to apply cleaning and transformations to a dataframe.

        The input frame is left untouched; all changes are made on a copy.

        Args:
            df: Raw dataframe as read from the CSV file.
            schema: Mapping of column name to the dtype it should have.

        Returns:
            A new dataframe without "unnamed" columns, with dtypes enforced
            and missing values filled (0 for numeric, 'N/A' for object columns).
        """
        # --- 1. Drop Corrupted "Unnamed" Columns ---
        unnamed_cols = [col for col in df.columns if 'unnamed' in str(col).lower()]
        if unnamed_cols:
            df = df.drop(columns=unnamed_cols)
            logger.info(f"Dropped unnamed columns: {unnamed_cols}")
        else:
            # Work on a copy so the caller's dataframe is never modified.
            df = df.copy()

        # --- 2. Enforce Data Types ---
        for col, dtype in schema.items():
            if col not in df.columns:
                continue
            if self._is_datetime_column(col, dtype):
                # NOTE(review): pd.to_datetime interprets integers as epoch
                # offsets; if the raw files store dates as integers such as
                # 20181003 an explicit format is required -- confirm.
                df[col] = pd.to_datetime(df[col], errors='coerce')
            else:
                # Best effort: values that cannot be cast keep their dtype.
                df[col] = df[col].astype(dtype, errors='ignore')

        # --- 3. Handle Missing Values (assignment instead of 'inplace=True') ---
        for col in df.select_dtypes(include=['number']).columns:
            df[col] = df[col].fillna(0)
        for col in df.select_dtypes(include=['object']).columns:
            df[col] = df[col].fillna('N/A')

        return df

    def validate_and_transform_data(self):
        """
        Reads all raw CSV files, validates them against the defined schema,
        applies transformations, and saves them as processed Parquet files.

        Files without a schema entry, or with schema columns missing, are
        logged and skipped.

        Raises:
            Exception: Any error raised while listing, reading, transforming
                or writing is logged and re-raised unchanged.
        """
        try:
            # Sorted so the processing order does not depend on the file system.
            csv_files = sorted(
                f for f in os.listdir(self.config.data_path) if f.endswith('.csv')
            )
            logger.info(f"Found {len(csv_files)} CSV files to transform.")

            for csv_file in csv_files:
                file_name = Path(csv_file).stem

                if file_name not in self.schema.COLUMNS:
                    logger.warning(f"Schema not defined for {csv_file}. Skipping this file.")
                    continue

                logger.info(f"Processing and validating file: {csv_file}")

                file_schema = self.schema.COLUMNS[file_name]
                df = self._read_csv(os.path.join(self.config.data_path, csv_file))

                # Safety net for the Latin-1 fallback: a UTF-8 BOM decoded as
                # Latin-1 shows up as 'ï»¿' in front of the first column name.
                df.columns = df.columns.str.replace('ï»¿', '', regex=False).str.strip()

                # Validate columns exist
                validation_errors = [col for col in file_schema.keys() if col not in df.columns]
                if validation_errors:
                    logger.error(f"Schema validation failed for {csv_file}. Missing columns: {validation_errors}")
                    continue

                # Apply cleaning and transformations
                df_transformed = self._clean_and_transform(df, file_schema)

                # Save the processed dataframe as a parquet file
                output_file_path = os.path.join(self.config.output_path, f"{file_name}.parquet")
                df_transformed.to_parquet(output_file_path, index=False)
                logger.info(f"Successfully transformed and saved {csv_file} to {output_file_path}")

        except Exception as e:
            logger.exception(f"An error occurred during data transformation: {e}")
            raise


In [None]:
# --- STAGE 3: DATA TRANSFORMATION ---
# Driver cell: runs the whole transformation stage and logs its start and end.
STAGE_NAME = "Data Transformation stage"
try:
    logger.info(f">>>>>> Stage '{STAGE_NAME}' started <<<<<<")
            
    # Initialize the configuration manager (default: config.yaml in the working directory)
    config = ConfigurationManager()
            
    # Get the specific configuration for data transformation
    data_transformation_config = config.get_data_transformation_config()
            
    # Initialize the data transformation component (this also loads schema.yaml)
    data_transformation = DataTransformation(config=data_transformation_config)
            
    # Run the transformation process: CSV files in data_path -> Parquet files in output_path
    data_transformation.validate_and_transform_data()
            
    logger.info(f">>>>>> Stage '{STAGE_NAME}' completed successfully <<<<<<\n\nx==========x")
except Exception as e:
    # Log the failure with its traceback, then re-raise so the cell visibly fails.
    logger.exception(e)
    raise e

[2025-08-27 17:23:44,860: INFO: 4195453137: >>>>>> Stage 'Data Validation stage' started <<<<<<]
[2025-08-27 17:23:44,863: INFO: utils: YAML file loaded successfully: config.yaml]
[2025-08-27 17:23:44,865: INFO: utils: Directory created or already exists: artifacts]
[2025-08-27 17:23:44,867: INFO: utils: Directory created or already exists: artifacts\data_transformation]
[2025-08-27 17:23:44,869: INFO: utils: Directory created or already exists: data\02_processed]
[2025-08-27 17:23:44,878: INFO: utils: YAML file loaded successfully: schema.yaml]
[2025-08-27 17:23:44,880: INFO: 2255547456: Found 9 CSV files to transform.]
[2025-08-27 17:23:44,881: INFO: 2255547456: Processing and validating file: Addresses.csv]
[2025-08-27 17:23:44,904: INFO: 2255547456: Successfully transformed and saved Addresses.csv to data\02_processed\Addresses.parquet]
[2025-08-27 17:23:44,905: INFO: 2255547456: Processing and validating file: BusinessPartners.csv]
[2025-08-27 17:23:44,921: INFO: 2255547456: Succe

  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')


                # (Optional) Further transformation logic can be added here.
                # Already implemented in DataTransformation._clean_and_transform:
                # - Enforce data types from schema
                # - Handle missing values
                # Still open:
                # - Create new features