# In [0]:
# jobs/transformer

from pyspark.sql.functions import col, to_date, year, round as spark_round
from pyspark.sql.types import DoubleType
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession


class Transformer:
    """Type-normalizes raw order data and joins it with customer and product data."""

    def __init__(self, spark: SparkSession):
        # Stored for callers; no method visible in this class reads it.
        self.spark = spark

    @staticmethod
    def _missing_columns(df: DataFrame, required) -> list:
        """Return the names in `required` that are absent from `df.columns`, in order.

        The comparison is an exact (case-sensitive) string match.
        """
        present = set(df.columns)
        return [name for name in required if name not in present]

    def transform_orders(self, orders_df: DataFrame) -> DataFrame:
        """
        Transforms the orders DataFrame by:
        - Converting 'order_date' and 'ship_date' to date type, parsed with
          the 'd/M/yyyy' pattern
        - Casting 'profit' to double type

        Parameters:
            orders_df (DataFrame): Input DataFrame with raw order data. Must
                contain the columns order_date, ship_date and profit
                (exact, lowercase names).

        Returns:
            DataFrame: Transformed DataFrame with updated schema

        Raises:
            ValueError: If DataFrame is invalid or required columns are missing
            Exception: For unexpected transformation errors (the original
                error is chained as the cause)
        """
        try:
            # Validate input DataFrame
            if orders_df is None or not isinstance(orders_df, DataFrame):
                raise ValueError("Invalid DataFrame provided.")

            missing_cols = self._missing_columns(
                orders_df, ["order_date", "ship_date", "profit"]
            )
            if missing_cols:
                raise ValueError(f"Missing required columns: {', '.join(missing_cols)}")

            # Apply transformations
            return (
                orders_df
                .withColumn("order_date", to_date(col("order_date"), "d/M/yyyy"))
                .withColumn("ship_date", to_date(col("ship_date"), "d/M/yyyy"))
                .withColumn("profit", col("profit").cast(DoubleType()))
            )

        except ValueError:
            # Validation errors propagate unchanged, as documented, so callers
            # can catch ValueError specifically (it is still an Exception).
            raise
        except Exception as e:
            raise Exception(f"Unexpected error during order transformation: {e}") from e

    def enrich_orders(self, orders: DataFrame, customers: DataFrame, products: DataFrame) -> DataFrame:
        """
        Enriches orders by inner-joining with customers and products datasets.

        Orders without a matching customer_id in `customers` or product_id in
        `products` are dropped by the inner joins. The result carries
        'profit' rounded to 2 decimals and a 'year' column derived from
        'order_date'.

        Parameters:
            orders (DataFrame): Orders DataFrame (must contain order_id, order_date, ship_date, customer_id, product_id, profit)
            customers (DataFrame): Customers DataFrame (must contain customer_id, customer_name, country)
            products (DataFrame): Products DataFrame (must contain product_id, category, sub_category)

        Returns:
            DataFrame: Enriched DataFrame with customer and product information.

        Raises:
            ValueError: If any input DataFrame is invalid or required columns are missing.
            Exception: For unexpected join or transformation errors (the
                original error is chained as the cause).
        """
        try:
            inputs = [(orders, "orders"), (customers, "customers"), (products, "products")]

            # Validate input DataFrames. All type checks run before any column
            # check so an invalid DataFrame is always reported first.
            for df, name in inputs:
                if df is None or not isinstance(df, DataFrame):
                    raise ValueError(f"Invalid DataFrame provided for {name}.")

            # Required columns for each DataFrame
            required_cols = {
                "orders": ["order_id", "order_date", "ship_date", "customer_id", "product_id", "profit"],
                "customers": ["customer_id", "customer_name", "country"],
                "products": ["product_id", "category", "sub_category"],
            }

            # Check missing columns
            for df, name in inputs:
                missing_cols = self._missing_columns(df, required_cols[name])
                if missing_cols:
                    raise ValueError(f"Missing required columns in {name} DataFrame: {', '.join(missing_cols)}")

            # Perform joins and select required fields. Every column is
            # qualified with its source alias so a same-named column in
            # another input cannot make the reference ambiguous.
            return (
                orders.alias("o")
                .join(customers.alias("c"), col("o.customer_id") == col("c.customer_id"), "inner")
                .join(products.alias("p"), col("o.product_id") == col("p.product_id"), "inner")
                .select(
                    col("o.order_id"),
                    col("o.order_date"),
                    col("o.ship_date"),
                    col("c.customer_id"),
                    col("c.customer_name"),
                    col("c.country"),
                    col("o.product_id"),
                    col("p.category"),
                    col("p.sub_category"),
                    spark_round(col("o.profit"), 2).alias("profit"),
                    year(col("o.order_date")).alias("year"),
                )
            )

        except ValueError:
            # Validation errors (missing columns, invalid DataFrame) are
            # raised as-is so the caller can handle them explicitly.
            raise
        except Exception as e:
            # Any unexpected Spark or join errors are wrapped with context for debugging.
            raise Exception(f"Unexpected error during order enrichment: {e}") from e
