In [2]:
import os

In [1]:
%pwd

'c:\\Users\\Orçamento\\Desktop\\Bike Sales\\VeloAnalytics\\notebooks'

In [3]:
# Move up to the project root only when config.yaml is not already in the
# working directory. ConfigurationManager reads "config.yaml" relative to the
# current directory, so an unconditional chdir would climb one level further on
# every re-run of this cell and break the cells below.
if not os.path.exists('config.yaml'):
    os.chdir('../')

In [4]:
%pwd

'c:\\Users\\Orçamento\\Desktop\\Bike Sales\\VeloAnalytics'

In [None]:
from dataclasses import dataclass
from pathlib import Path
from src.logging import logger
import pandas as pd

In [None]:
# --- Data Modelling Configuration Entity ---
# Immutable (frozen) container for the paths used by the data modelling stage.
@dataclass(frozen=True)
class DataModellingConfig:
    """Paths consumed by the data modelling stage.

    The dataclass is frozen, so fields cannot be reassigned after construction.
    """
    # Stage working directory; handed to create_directories by
    # ConfigurationManager.get_data_modelling_config.
    root_dir: Path
    # Directory scanned by DataModelling for the processed ``*.parquet`` tables.
    processed_data_path: Path
    # Directory where DataModelling writes the dimension and fact tables.
    presentation_path: Path

In [None]:
from src.utils import read_yaml, create_directories

In [None]:
class ConfigurationManager:
    """Loads the main configuration file and builds per-stage config entities."""

    def __init__(self, config_filepath=Path("config.yaml")):
        """
        Read the main configuration and register the artifacts root directory.

        Args:
            config_filepath: Path of the YAML configuration file, forwarded to
                ``read_yaml``. Defaults to ``config.yaml``.
        """
        self.config = read_yaml(config_filepath)
        artifacts_root = Path(self.config.artifacts_root)
        create_directories([artifacts_root])

    def get_data_modelling_config(self) -> DataModellingConfig:
        """
        Build the configuration entity for the data modelling stage.

        Reads the ``data_modelling`` section of the loaded configuration and
        passes the stage root and presentation directories to
        ``create_directories`` before returning.

        Returns:
            DataModellingConfig holding the section's three paths as ``Path`` objects.
        """
        section = self.config.data_modelling
        root_dir = Path(section.root_dir)
        presentation_path = Path(section.presentation_path)
        create_directories([root_dir, presentation_path])

        return DataModellingConfig(
            root_dir=root_dir,
            processed_data_path=Path(section.processed_data_path),
            presentation_path=presentation_path,
        )

In [None]:
class DataModelling:
    """Builds the star-schema presentation tables from the processed Parquet tables."""

    # Tables read by build_star_schema; each must be present as
    # ``<name>.parquet`` inside ``config.processed_data_path``.
    _REQUIRED_TABLES = (
        'BusinessPartners',
        'Addresses',
        'Products',
        'ProductCategoryText',
        'ProductTexts',
        'SalesOrders',
        'SalesOrderItems',
    )

    def __init__(self, config: DataModellingConfig):
        """
        Initializes the DataModelling component with its configuration.

        Args:
            config: Supplies ``processed_data_path`` (input directory) and
                ``presentation_path`` (output directory).
        """
        self.config = config

    def _load_processed_data(self) -> dict:
        """
        Loads all processed Parquet files into a dictionary of pandas DataFrames.

        Returns:
            dict mapping each file's stem (name without ``.parquet``) to its DataFrame.
        """
        processed_dir = Path(self.config.processed_data_path)
        # Sorted so the load order does not depend on the filesystem's listing order.
        dataframes = {
            entry.stem: pd.read_parquet(entry)
            for entry in sorted(processed_dir.iterdir())
            if entry.name.endswith('.parquet')
        }
        logger.info(f"Loaded {len(dataframes)} processed tables.")
        return dataframes

    def _require_tables(self, df_dict: dict) -> None:
        """Raise a KeyError naming every required table that was not loaded."""
        missing = [name for name in self._REQUIRED_TABLES if name not in df_dict]
        if missing:
            raise KeyError(
                f"Missing processed tables in '{self.config.processed_data_path}': {missing}"
            )

    @staticmethod
    def _build_dim_customer(df_dict: dict) -> pd.DataFrame:
        """Left-join business partners to their address on ADDRESSID."""
        return pd.merge(
            df_dict['BusinessPartners'],
            df_dict['Addresses'],
            on='ADDRESSID',
            how='left',
        )

    @staticmethod
    def _build_dim_product(df_dict: dict) -> pd.DataFrame:
        """Left-join products to their English category text and English product text."""
        category_text = df_dict['ProductCategoryText']
        product_text = df_dict['ProductTexts']

        # Keep only English descriptions for consistency.
        category_text_en = category_text[category_text['LANGUAGE'] == 'EN']
        product_text_en = product_text[product_text['LANGUAGE'] == 'EN']

        products_with_category = pd.merge(
            df_dict['Products'], category_text_en, on='PRODCATEGORYID', how='left'
        )
        # NOTE(review): both text tables carry a LANGUAGE column, so the result
        # contains columns disambiguated with pandas' default _x/_y suffixes.
        # Left unchanged because renaming would alter the published schema;
        # confirm whether downstream consumers want explicit names.
        return pd.merge(products_with_category, product_text_en, on='PRODUCTID', how='left')

    @staticmethod
    def _build_dim_date(sales_orders: pd.DataFrame) -> pd.DataFrame:
        """
        Build one calendar row per day between the first and last order date.

        Args:
            sales_orders: Sales orders whose ``CREATEDAT`` column is already datetime.

        Raises:
            ValueError: If ``CREATEDAT`` holds no valid date.
        """
        created_at = sales_orders['CREATEDAT']
        min_date = created_at.min()
        max_date = created_at.max()
        if pd.isna(min_date) or pd.isna(max_date):
            raise ValueError(
                "Cannot build dim_date: 'SalesOrders.CREATEDAT' contains no valid dates."
            )

        # Normalize to midnight: a time-of-day on the first order would otherwise
        # be carried into every calendar row and could exclude the last day.
        calendar = pd.date_range(min_date.normalize(), max_date.normalize(), freq='D')
        dim_date = pd.DataFrame({'Date': calendar})
        dim_date['Year'] = dim_date['Date'].dt.year
        dim_date['Month'] = dim_date['Date'].dt.month
        dim_date['Day'] = dim_date['Date'].dt.day
        dim_date['Quarter'] = dim_date['Date'].dt.quarter
        dim_date['DayOfWeek'] = dim_date['Date'].dt.dayofweek  # Monday=0, Sunday=6
        return dim_date

    @staticmethod
    def _build_fact_sales(sales_items: pd.DataFrame, sales_orders: pd.DataFrame) -> pd.DataFrame:
        """Attach PARTNERID and the order date (as ``OrderDate``) to each order item."""
        # The fact table is based on the items, as it's the lowest grain.
        order_header = sales_orders[['SALESORDERID', 'PARTNERID', 'CREATEDAT']]
        fact_sales = pd.merge(sales_items, order_header, on='SALESORDERID', how='left')
        return fact_sales.rename(columns={'CREATEDAT': 'OrderDate'})

    def build_star_schema(self):
        """
        Builds the fact and dimension tables for the star schema.

        Writes ``dim_customer``, ``dim_product``, ``dim_date`` and ``fact_sales``
        as Parquet files into ``config.presentation_path``.

        Raises:
            KeyError: If a required processed table (or join column) is missing.
            ValueError: If no valid order date is available for ``dim_date``.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        try:
            logger.info("Starting the data modelling process to build the star schema.")
            df_dict = self._load_processed_data()
            self._require_tables(df_dict)

            # --- 1. Build dim_customer ---
            dim_customer = self._build_dim_customer(df_dict)

            # --- 2. Build dim_product ---
            dim_product = self._build_dim_product(df_dict)

            # --- 3. Build dim_date ---
            # .assign returns a new frame, so the loaded table is not modified.
            # NOTE(review): if CREATEDAT is stored as an integer such as YYYYMMDD,
            # pd.to_datetime reads it as an epoch offset - confirm the processed dtype.
            sales_orders = df_dict['SalesOrders'].assign(
                CREATEDAT=lambda orders: pd.to_datetime(orders['CREATEDAT'])
            )
            dim_date = self._build_dim_date(sales_orders)

            # --- 4. Build fact_sales ---
            fact_sales = self._build_fact_sales(df_dict['SalesOrderItems'], sales_orders)

            # --- 5. Save Presentation Tables ---
            presentation_path = self.config.presentation_path
            output_dir = Path(presentation_path)
            # Do not rely on the caller having created the output directory.
            output_dir.mkdir(parents=True, exist_ok=True)
            presentation_tables = {
                'dim_customer': dim_customer,
                'dim_product': dim_product,
                'dim_date': dim_date,
                'fact_sales': fact_sales,
            }
            for table_name, table in presentation_tables.items():
                table.to_parquet(output_dir / f"{table_name}.parquet", index=False)

            logger.info(f"Successfully built and saved star schema tables to '{presentation_path}'")

        except Exception as e:
            logger.error(f"An error occurred during data modelling: {e}")
            # Bare raise re-raises the active exception as-is.
            raise

In [None]:
# --- STAGE 4: DATA MODELLING ---
# Runs the stage end to end: load the configuration, build the component and
# write the star-schema tables. Any failure is logged and re-raised.
STAGE_NAME = "Data Modelling stage"
try:
    logger.info(f">>>>>> Stage '{STAGE_NAME}' started <<<<<<")

    # Initialize the configuration manager (reads config.yaml from the working directory)
    config = ConfigurationManager()

    # Get the specific configuration for data modelling
    data_modelling_config = config.get_data_modelling_config()

    # Initialize the data modelling component with the configuration
    data_modelling = DataModelling(config=data_modelling_config)

    # Run the star schema building process
    data_modelling.build_star_schema()

    logger.info(f">>>>>> Stage '{STAGE_NAME}' completed successfully <<<<<<\n\nx==========x")
except Exception as e:
    # Log with traceback, then re-raise the active exception unchanged so the
    # failure stays visible in the notebook.
    logger.exception(e)
    raise