In [0]:
"""
write_utils.py

Contains helper functions for writing Spark DataFrames to Delta Lake with standardized configurations.
Promotes consistent use of overwrite behavior, schema merging, and partitioning across the pipeline.
"""

def write_to_delta(df, path, mode="overwrite", partition_by=None, merge_schema=True):
    """
    Writes a Spark DataFrame to Delta Lake format.

    Parameters:
    ----------
    df : pyspark.sql.DataFrame
        The DataFrame to write.
    path : str
        The target Delta Lake path.
    mode : str, optional
        Save mode, default is 'overwrite'.
    partition_by : str or list[str], optional
        Column(s) to partition by. A single column name may be given as a
        plain string; None or an empty list means no partitioning.
    merge_schema : bool, optional
        Whether to merge schema during write (sets the ``mergeSchema``
        writer option).

    Returns:
    -------
    None
    """
    writer = df.write.format("delta").mode(mode)

    if merge_schema:
        writer = writer.option("mergeSchema", "true")

    if partition_by:
        # Accept both a single column name and a collection of names, as the
        # docstring promises; a bare string must not be iterated per character.
        if isinstance(partition_by, str):
            columns = [partition_by]
        else:
            columns = list(partition_by)
        writer = writer.partitionBy(*columns)

    writer.save(path)


In [0]:
# /utils/write_utils

from pyspark.sql import DataFrame, SparkSession
from typing import Optional, List

def write_df_to_delta(
    df: DataFrame,
    path: str,
    mode: str = "overwrite",
    merge_schema: bool = True,
    register_table: bool = True,
    partition_by: Optional[List[str]] = None,
    dry_run: bool = False,
    verbose: bool = False,
) -> Optional[str]:
    """
    Writes a Spark DataFrame to Delta Lake with optional schema merging, table registration, and partitioning.

    The table name used for registration is the last segment of `path`
    (trailing slashes ignored).

    Parameters:
    - df (DataFrame): Spark DataFrame to write
    - path (str): Destination path (e.g., /mnt/delta/bronze/table_name)
    - mode (str): Write mode ('overwrite', 'append', etc.)
    - merge_schema (bool): Enable mergeSchema (default True)
    - register_table (bool): Register as Hive/Unity Catalog table (default True)
    - partition_by (list[str], optional): List of columns to partition by
      (a single column name given as a string is also accepted)
    - dry_run (bool): If True, does not actually write or register anything —
      just prints what would happen
    - verbose (bool): If True, prints schema, path, and action details

    Returns:
    - str: Table name if registered, otherwise None (always None on a dry run)

    Raises:
    - ValueError: if register_table is True but no table name can be derived
      from `path`
    - Exception: any error raised by Spark is printed and re-raised unchanged
    """
    try:
        table_name = path.rstrip("/").split("/")[-1]

        # Fail before touching storage: discovering an unusable table name
        # only after the write would leave data on disk with no table.
        if register_table and not table_name:
            raise ValueError(f"Cannot derive a table name from path {path!r}")

        if verbose:
            print(f"📁 Write Path: {path}")
            print(f"📝 Write Mode: {mode}")
            print(f"🔀 Partitioning: {partition_by}")
            print("📊 Schema:")
            df.printSchema()

        # The writer is built even for a dry run so that its configuration
        # calls are still exercised; nothing is persisted until save().
        writer = df.write.format("delta").mode(mode)

        if merge_schema:
            writer = writer.option("mergeSchema", "true")

        if partition_by:
            if isinstance(partition_by, str):
                partition_cols = [partition_by]
            else:
                partition_cols = list(partition_by)
            writer = writer.partitionBy(*partition_cols)

        if dry_run:
            # A dry run must not have side effects: skip both the write and
            # the DROP/CREATE TABLE statements below.
            planned = f"write to {path} (mode={mode})"
            if register_table:
                planned += f" and register table {table_name}"
            print(f"🧪 Dry run: would {planned}")
            return None

        writer.save(path)
        if verbose:
            print(f"✅ Data written to {path}")

        if not register_table:
            return None

        spark = SparkSession.builder.getOrCreate()
        # Drop and re-create so the catalog entry always points at this path.
        # NOTE(review): if a *managed* table with the same name already
        # exists, DROP TABLE may remove its data — confirm this is acceptable.
        spark.sql(f"DROP TABLE IF EXISTS {table_name}")
        spark.sql(f"""
                CREATE TABLE {table_name}
                USING DELTA
                LOCATION '{path}'
            """)
        if verbose:
            print(f"📚 Table registered: {table_name}")

        return table_name

    except Exception as e:
        print(f"❌ Error writing to Delta: {e}")
        raise
