In [0]:
# Databricks notebook source
# ============================================================================
# SECTION 1: NOTEBOOK CONFIGURATION AND SETUP
# ============================================================================
# This section initializes the notebook environment and sets up necessary
# configurations for the ETL pipeline

# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, trim, upper, lower, when, count, sum, avg, 
    max, min, round, to_date, year, month, current_timestamp,
    regexp_replace, coalesce, isnan, lit, countDistinct
)
from pyspark.sql.types import (
    StructType, StructField, StringType, IntegerType, 
    DoubleType, DateType, TimestampType
)
from pyspark.sql.window import Window
from datetime import datetime

start_notebook_datetime = datetime.now()

# Display notebook configuration
print("=" * 80)
print("ETL PIPELINE - RAW DATA TO UNITY CATALOG")
print("=" * 80)
print(f"Spark Version: {spark.version}")
print(f"Start Execution Time: {start_notebook_datetime}")

ETL PIPELINE - RAW DATA TO UNITY CATALOG
Spark Version: 4.0.0
Start Execution Time: 2026-02-02 19:19:16.526047


In [0]:
# ============================================================================
# SECTION 2: UNITY CATALOG CONFIGURATION
# ============================================================================
# Define the three-level namespace for Unity Catalog: catalog.schema.table
# Unity Catalog provides centralized governance for all data assets

# Define Notebook Parameters
dbutils.widgets.text("CATALOG_NAME", "workspace")
dbutils.widgets.text("SCHEMA_NAME", "portfolio_project")
dbutils.widgets.text("TABLE_NAME", "sales_summary")

# Define Unity Catalog namespace variables
CATALOG_NAME = dbutils.widgets.get("CATALOG_NAME")
SCHEMA_NAME = dbutils.widgets.get("SCHEMA_NAME")
TABLE_NAME = dbutils.widgets.get("TABLE_NAME")
FULL_TABLE_PATH = f"{CATALOG_NAME}.{SCHEMA_NAME}.{TABLE_NAME}"

# Raw data paths
RAW_CSV_PATH = "/Volumes/workspace/portfolio_projects/volume_portfolio_projects/simple_etl_project_raw_data_csv/"
RAW_PARQUET_PATH = "/Volumes/workspace/portfolio_projects/volume_portfolio_projects/simple_etl_project_raw_data_parquet/"

print(f"Unity Catalog Configuration:")
print(f"  - Catalog: {CATALOG_NAME}")
print(f"  - Schema: {SCHEMA_NAME}")
print(f"  - Table: {TABLE_NAME}")
print(f"  - Full Path: {FULL_TABLE_PATH}")

Unity Catalog Configuration:
  - Catalog: workspace
  - Schema: portfolio_projects
  - Table: sales_summary
  - Full Path: workspace.portfolio_projects.sales_summary


In [0]:
# ============================================================================
# SECTION 3: CREATE CATALOG AND SCHEMA
# ============================================================================
# Set up the Unity Catalog infrastructure if it doesn't exist
# This ensures proper organization and governance of data assets

# Activate the target catalog
spark.sql(f"USE CATALOG {CATALOG_NAME}")
print(f"‚úì Using catalog: {CATALOG_NAME}")

# Create schema if it doesn't exist
spark.sql(f"""
    CREATE SCHEMA IF NOT EXISTS {CATALOG_NAME}.{SCHEMA_NAME}
    COMMENT 'Portfolio project schema for ETL demonstration'
""")
print(f"‚úì Schema created/verified: {SCHEMA_NAME}")

# Verify the schema was created successfully
schemas_df = spark.sql(f"SHOW SCHEMAS IN {CATALOG_NAME}")
display(schemas_df)

‚úì Using catalog: workspace
‚úì Schema created/verified: portfolio_projects


databaseName
default
information_schema
mongo_db
mybudget
portfolio_projects


In [0]:
# ============================================================================
# SECTION 4: EXTRACT - LOAD RAW DATA
# ============================================================================
# Extract raw data from various file formats
# Databricks supports CSV, Parquet, JSON, Delta, and more

# Option 1: Load data from CSV
print("Loading data from CSV...")
df_raw = spark.read \
    .format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .option("dateFormat", "yyyy-MM-dd") \
    .option("nullValue", "NULL") \
    .option("mode", "PERMISSIVE") \
    .load(RAW_CSV_PATH)

# Option 2: Load data from Parquet (more efficient for large datasets)
# Uncomment the following lines to use Parquet instead
# print("Loading data from Parquet...")
# df_raw = spark.read \
#     .format("parquet") \
#     .load(RAW_PARQUET_PATH)

# Option 3: Load from Delta Lake (recommended for Databricks)
# df_raw = spark.read.format("delta").load("/path/to/delta/table")

print(f"‚úì Raw data loaded successfully")
print(f"  - Total rows: {df_raw.count():,}")
print(f"  - Total columns: {len(df_raw.columns)}")

# Display sample of raw data
print("\nSample of raw data (first 10 rows):")
display(df_raw.limit(10))

Loading data from CSV...
‚úì Raw data loaded successfully
  - Total rows: 408,000
  - Total columns: 12

Sample of raw data (first 10 rows):


transaction_id,transaction_date,customer_id,customer_name,product_id,product_name,category,quantity,unit_price,discount,region,sales_person
TXN00066292,2025-07-25,CUST000272,James Gonzalez,PROD00741,Building Set Basic 387,Toys,7,83.66,0.2,Africa,Henry Taylor
TXN00011411,2025-03-28,CUST000274,David Jones,PROD00551,Smartphone Standard 594,Electronics,9,1562.22,0.0,Latin America,Carol Davis
TXN00013653,2024-02-25,CUST003813,David Jones,PROD00070,Biography Standard 560,Books,3,17.53,0.1,Europe,Bob Smith
TXN00017090,2025-10-27,CUST000838,Michael Lopez,PROD00527,Shoes Plus 930,Clothing,7,184.83,0.2,Middle East,Eve Wilson
TXN00007861,2024-05-08,CUST002964,Patricia Rodriguez,PROD00076,Soccer Ball Pro 487,Sports,10,220.74,0.2,Latin America,Frank Miller
TXN00014185,2025-01-18,CUST001092,David Taylor,PROD00899,Novel Standard 754,,7,37.79,0.0,Europe,Frank Miller
TXN00055601,2024-11-28,CUST000223,Sarah Lopez,PROD00333,Plant Standard 419,Home & Garden,8,82.17,0.1,North America,Bob Smith
TXN00044684,2024-09-05,CUST001520,Jessica Brown,PROD00856,Lotion Basic 257,Health & Beauty,9,20.67,0.0,North America,Carol Davis
TXN00039910,2024-05-08,CUST003034,Daniel Thomas,PROD00116,Dress Elite 302,Clothing,5,79.87,0.15,Africa,Eve Wilson
TXN00093983,2024-11-20,CUST003496,Daniel Taylor,PROD00388,Wiper Basic 754,Automotive,5,422.98,0.2,,Frank Miller


In [0]:
# ============================================================================
# SECTION 5: DATA PROFILING AND QUALITY ASSESSMENT
# ============================================================================
# Before cleaning, it's crucial to understand data quality issues
# This section provides comprehensive data profiling

print("=" * 80)
print("DATA QUALITY ASSESSMENT")
print("=" * 80)

# Check for null values in each column
print("\n1. NULL VALUE ANALYSIS:")
null_counts = df_raw.select([
    count(when(col(c).isNull(), c)).alias(c) 
    for c in df_raw.columns
])
display(null_counts)

# Check for duplicate records
duplicate_count = df_raw.count() - df_raw.dropDuplicates().count()
print(f"\n2. DUPLICATE RECORDS: {duplicate_count:,}")

# Basic statistical summary
print("\n3. STATISTICAL SUMMARY:")
display(df_raw.describe())

# Check data types
print("\n4. CURRENT DATA TYPES:")
df_raw.printSchema()

DATA QUALITY ASSESSMENT

1. NULL VALUE ANALYSIS:


transaction_id,transaction_date,customer_id,customer_name,product_id,product_name,category,quantity,unit_price,discount,region,sales_person
0,0,0,5038,0,0,5031,0,0,0,5208,5014



2. DUPLICATE RECORDS: 128,000

3. STATISTICAL SUMMARY:


summary,transaction_id,customer_id,customer_name,product_id,product_name,category,quantity,unit_price,discount,region,sales_person
count,408000,408000,402962,408000,408000,402969,408000.0,408000.0,408000.0,402792,402986
mean,,,,,,,5.406877450980392,246.34731129902016,0.0937318627450976,,
stddev,,,,,,,2.989053081544615,372.78288000674223,0.0916419297820889,,
min,TXN00000001,CUST000001,charles anderson,PROD00001,Action Figure Basic 100,Automotive,-5.0,5.0,0.0,Africa,Alice Johnson
max,TXN00280000,CUST005000,William Wilson,PROD01000,Yoga Mat Standard 999,Toys,10.0,1999.97,0.25,North America,Henry Taylor



4. CURRENT DATA TYPES:
root
 |-- transaction_id: string (nullable = true)
 |-- transaction_date: date (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- discount: double (nullable = true)
 |-- region: string (nullable = true)
 |-- sales_person: string (nullable = true)



In [0]:
# ============================================================================
# SECTION 6: TRANSFORM - DATA CLEANING
# ============================================================================
# Apply comprehensive data cleaning operations
# Each step is explained and can be customized based on your data

print("Starting data cleaning process...")

# Step 1: Remove exact duplicate rows
df_cleaned = df_raw.dropDuplicates()
print(f"‚úì Step 1: Removed {df_raw.count() - df_cleaned.count()} duplicate rows")

# Step 2: Handle missing values with business logic
# Different strategies for different columns
df_cleaned = df_cleaned \
    .na.fill({
        "quantity": 0,          # Fill missing quantities with 0
        "unit_price": 0.0,      # Fill missing prices with 0
        "discount": 0.0,        # Assume no discount if missing
        "category": "Unknown",  # Default category
        "region": "Unspecified" # Default region
    })
print("‚úì Step 2: Filled missing values with appropriate defaults")

# Step 3: Drop rows where critical fields are null
# Transaction ID and Date are mandatory for our analysis
df_cleaned = df_cleaned \
    .filter(col("transaction_id").isNotNull()) \
    .filter(col("transaction_date").isNotNull()) \
    .filter(col("customer_id").isNotNull())
print("‚úì Step 3: Removed rows with null critical fields")

# Step 4: Standardize text fields (trim whitespace, consistent casing)
df_cleaned = df_cleaned \
    .withColumn("customer_name", trim(upper(col("customer_name")))) \
    .withColumn("product_name", trim(upper(col("product_name")))) \
    .withColumn("category", trim(upper(col("category")))) \
    .withColumn("region", trim(upper(col("region")))) \
    .withColumn("sales_person", trim(upper(col("sales_person"))))
print("‚úì Step 4: Standardized text fields (uppercase and trimmed)")

# Step 5: Convert date strings to proper date type
df_cleaned = df_cleaned \
    .withColumn("transaction_date", to_date(col("transaction_date"), "yyyy-MM-dd"))
print("‚úì Step 5: Converted transaction_date to proper DateType")

# Step 6: Add derived columns for better analysis
df_cleaned = df_cleaned \
    .withColumn("year", year(col("transaction_date"))) \
    .withColumn("month", month(col("transaction_date"))) \
    .withColumn("total_amount", 
                round((col("quantity") * col("unit_price")) * (1 - col("discount")), 2))
print("‚úì Step 6: Added derived columns (year, month, total_amount)")

# Step 7: Data validation - remove invalid records
# Ensure quantity and price are positive
df_cleaned = df_cleaned \
    .filter(col("quantity") >= 0) \
    .filter(col("unit_price") >= 0) \
    .filter(col("discount") >= 0) \
    .filter(col("discount") <= 1)  # Discount should be between 0 and 1
print("‚úì Step 7: Filtered out invalid records (negative values, invalid discounts)")

# Step 8: Remove outliers (optional - adjust thresholds based on your data)
# For example, remove transactions with unrealistic quantities
df_cleaned = df_cleaned.filter(col("quantity") <= 1000)
print("‚úì Step 8: Removed outliers (quantity > 1000)")

print(f"\nCleaning complete:")
print(f"  - Original rows: {df_raw.count():,}")
print(f"  - Cleaned rows: {df_cleaned.count():,}")
print(f"  - Rows removed: {df_raw.count() - df_cleaned.count():,}")
print(f"  - Data retention rate: {(df_cleaned.count() / df_raw.count() * 100):.2f}%")

# Display sample of cleaned data
print("\nSample of cleaned data:")
display(df_cleaned.limit(10))

Starting data cleaning process...
‚úì Step 1: Removed 128000 duplicate rows
‚úì Step 2: Filled missing values with appropriate defaults
‚úì Step 3: Removed rows with null critical fields
‚úì Step 4: Standardized text fields (uppercase and trimmed)
‚úì Step 5: Converted transaction_date to proper DateType
‚úì Step 6: Added derived columns (year, month, total_amount)
‚úì Step 7: Filtered out invalid records (negative values, invalid discounts)
‚úì Step 8: Removed outliers (quantity > 1000)

Cleaning complete:
  - Original rows: 408,000
  - Cleaned rows: 277,182
  - Rows removed: 130,818
  - Data retention rate: 67.94%

Sample of cleaned data:


transaction_id,transaction_date,customer_id,customer_name,product_id,product_name,category,quantity,unit_price,discount,region,sales_person,year,month,total_amount
TXN00096437,2025-10-05,CUST003451,EMMA WILSON,PROD00046,LOTION PLUS 451,HEALTH & BEAUTY,1,149.97,0.15,LATIN AMERICA,DAN BROWN,2025,10,127.47
TXN00045979,2024-08-03,CUST001623,JENNIFER THOMAS,PROD00455,PUZZLE STANDARD 494,TOYS,10,82.99,0.25,EUROPE,GRACE LEE,2024,8,622.43
TXN00039352,2025-04-22,CUST002668,ROBERT WILLIAMS,PROD00948,DUMBBELL PRO 184,SPORTS,1,78.65,0.15,LATIN AMERICA,BOB SMITH,2025,4,66.85
TXN00045840,2025-01-23,CUST002021,JENNIFER TAYLOR,PROD00407,NOVEL BASIC 905,BOOKS,2,29.85,0.25,EUROPE,DAN BROWN,2025,1,44.78
TXN00031545,2024-11-15,CUST004245,PATRICIA ANDERSON,PROD00369,COOKBOOK BASIC 441,BOOKS,10,22.96,0.1,ASIA PACIFIC,HENRY TAYLOR,2024,11,206.64
TXN00005422,2026-01-14,CUST003964,ROBERT SMITH,PROD00746,LAMP PLUS 708,HOME & GARDEN,7,327.28,0.25,LATIN AMERICA,EVE WILSON,2026,1,1718.22
TXN00014514,2024-07-23,CUST002228,DANIEL SMITH,PROD00421,TIRE ELITE 457,AUTOMOTIVE,1,278.26,0.0,EUROPE,GRACE LEE,2024,7,278.26
TXN00015573,2024-03-03,CUST000448,MARIA BROWN,PROD00662,SHAMPOO BASIC 236,HEALTH & BEAUTY,7,16.17,0.0,ASIA PACIFIC,BOB SMITH,2024,3,113.19
TXN00091671,2024-02-04,CUST002216,SUSAN LOPEZ,PROD00797,JACKET PLUS 971,CLOTHING,3,87.54,0.2,LATIN AMERICA,GRACE LEE,2024,2,210.1
TXN00044846,2025-03-16,CUST001562,ELIZABETH MILLER,PROD00865,SHAMPOO PLUS 655,HEALTH & BEAUTY,7,109.34,0.0,MIDDLE EAST,CAROL DAVIS,2025,3,765.38


In [0]:
# ============================================================================
# SECTION 7: TRANSFORM - DATA AGGREGATION
# ============================================================================
# Create aggregated views of the data for analytical purposes
# This demonstrates various aggregation techniques in PySpark

print("Creating aggregated dataset...")

# Aggregation 1: Monthly sales summary by category and region
df_aggregated = df_cleaned.groupBy(
    "year",
    "month",
    "category",
    "region"
).agg(
    count("transaction_id").alias("total_transactions"),
    sum("quantity").alias("total_quantity_sold"),
    sum("total_amount").alias("total_revenue"),
    avg("total_amount").alias("avg_transaction_value"),
    max("total_amount").alias("max_transaction_value"),
    min("total_amount").alias("min_transaction_value"),
    countDistinct("customer_id").alias("unique_customers"),
    countDistinct("product_id").alias("unique_products")
).orderBy("year", "month", "category", "region")

# Round decimal values for better readability
df_aggregated = df_aggregated \
    .withColumn("total_revenue", round(col("total_revenue"), 2)) \
    .withColumn("avg_transaction_value", round(col("avg_transaction_value"), 2)) \
    .withColumn("max_transaction_value", round(col("max_transaction_value"), 2)) \
    .withColumn("min_transaction_value", round(col("min_transaction_value"), 2))

# Add metadata columns for tracking
df_aggregated = df_aggregated \
    .withColumn("created_at", current_timestamp()) \
    .withColumn("data_source", lit("sales_etl_pipeline"))

print(f"‚úì Aggregation complete")
print(f"  - Aggregated rows: {df_aggregated.count():,}")
print(f"  - Aggregation dimensions: year, month, category, region")
print(f"  - Metrics calculated: 8 business metrics")

# Display aggregated data
print("\nSample of aggregated data:")
display(df_aggregated.limit(20))

# Show aggregation summary
print("\nAggregation summary by category:")
display(
    df_aggregated.groupBy("category")
    .agg(
        sum("total_revenue").alias("category_revenue"),
        sum("total_transactions").alias("category_transactions")
    )
    .orderBy(col("category_revenue").desc())
)

Creating aggregated dataset...
‚úì Aggregation complete
  - Aggregated rows: 1,488
  - Aggregation dimensions: year, month, category, region
  - Metrics calculated: 8 business metrics

Sample of aggregated data:


year,month,category,region,total_transactions,total_quantity_sold,total_revenue,avg_transaction_value,max_transaction_value,min_transaction_value,unique_customers,unique_products,created_at,data_source
2024,2,AUTOMOTIVE,AFRICA,220,1207,273375.98,1242.62,4980.2,50.57,215,195,2026-02-02T19:19:31.854Z,sales_etl_pipeline
2024,2,AUTOMOTIVE,ASIA PACIFIC,213,1128,265254.2,1245.32,4935.2,41.49,208,196,2026-02-02T19:19:31.854Z,sales_etl_pipeline
2024,2,AUTOMOTIVE,EUROPE,280,1612,388037.47,1385.85,4863.8,24.68,270,247,2026-02-02T19:19:31.854Z,sales_etl_pipeline
2024,2,AUTOMOTIVE,LATIN AMERICA,209,1135,268887.21,1286.54,4371.12,53.8,205,188,2026-02-02T19:19:31.854Z,sales_etl_pipeline
2024,2,AUTOMOTIVE,MIDDLE EAST,219,1266,307004.3,1401.85,4671.6,38.39,215,193,2026-02-02T19:19:31.854Z,sales_etl_pipeline
2024,2,AUTOMOTIVE,NORTH AMERICA,211,1229,292057.61,1384.16,4582.3,45.58,205,196,2026-02-02T19:19:31.854Z,sales_etl_pipeline
2024,2,AUTOMOTIVE,UNSPECIFIED,15,85,16975.65,1131.71,2915.25,121.25,15,15,2026-02-02T19:19:31.854Z,sales_etl_pipeline
2024,2,BOOKS,AFRICA,219,1213,29574.45,135.04,475.9,6.41,217,196,2026-02-02T19:19:31.854Z,sales_etl_pipeline
2024,2,BOOKS,ASIA PACIFIC,243,1297,33134.53,136.36,481.7,5.49,239,207,2026-02-02T19:19:31.854Z,sales_etl_pipeline
2024,2,BOOKS,EUROPE,224,1194,29561.58,131.97,431.55,8.06,220,205,2026-02-02T19:19:31.854Z,sales_etl_pipeline



Aggregation summary by category:


category,category_revenue,category_transactions
ELECTRONICS,173425829.61000004,34173
AUTOMOTIVE,44911629.8,34382
HOME & GARDEN,44357286.48000002,34248
SPORTS,26594903.18,34120
CLOTHING,18314848.83000001,34327
HEALTH & BEAUTY,13263567.500000004,34386
TOYS,9342228.250000004,34146
BOOKS,4606240.689999998,33961
UNKNOWN,4345065.369999999,3439


In [0]:
# ============================================================================
# SECTION 8: LOAD - WRITE TO UNITY CATALOG WITH SMART PARTITIONING
# ============================================================================
# Save the cleaned and aggregated data to Unity Catalog
# Using Delta Lake format with automatic partition detection and selective overwrite

from datetime import datetime

print("=" * 80)
print("WRITING TO UNITY CATALOG - PARTITIONED TABLE")
print("=" * 80)
print(f"Target: {FULL_TABLE_PATH}")

# ============================================================================
# STEP 1: ANALYZE PARTITIONS IN INPUT DATA
# ============================================================================

# Automatically detect which partitions are present in the incoming data
partitions_df = df_aggregated.select("year", "month").distinct().orderBy("year", "month")
partitions_list = partitions_df.collect()

print(f"\nüìä Partition Analysis:")
print(f"  - Total partitions in input data: {len(partitions_list)}")
print(f"  - Partition columns: year, month")
print(f"\n  Partitions to be overwritten:")

# Build the replaceWhere clause dynamically
partition_conditions = []
for partition in partitions_list:
    year_val = partition.year
    month_val = partition.month
    print(f"    ‚Ä¢ year={year_val}, month={month_val}")
    partition_conditions.append(f"(year = {year_val} AND month = {month_val})")

# Combine all partition conditions with OR
replace_where_clause = " OR ".join(partition_conditions)
print(f"\n  Generated replaceWhere clause:")
print(f"    {replace_where_clause[:150]}{'...' if len(replace_where_clause) > 150 else ''}")

# ============================================================================
# STEP 2: COLLECT PRE-WRITE STATISTICS (if table exists)
# ============================================================================

# Check if table already exists to collect statistics
table_exists = spark.catalog.tableExists(FULL_TABLE_PATH)

if table_exists:
    print(f"\nüìà Pre-Write Statistics:")
    
    # Total row count before
    pre_total_count = spark.sql(f"SELECT COUNT(*) as cnt FROM {FULL_TABLE_PATH}").collect()[0].cnt
    print(f"  - Total rows before: {pre_total_count:,}")
    
    # Count rows in partitions that will be overwritten
    pre_partition_count = spark.sql(f"""
        SELECT COUNT(*) as cnt 
        FROM {FULL_TABLE_PATH} 
        WHERE {replace_where_clause}
    """).collect()[0].cnt
    print(f"  - Rows in target partitions before: {pre_partition_count:,}")
    
    # Partition distribution before
    print(f"\n  Current partition distribution:")
    pre_partition_dist = spark.sql(f"""
        SELECT year, month, COUNT(*) as row_count
        FROM {FULL_TABLE_PATH}
        GROUP BY year, month
        ORDER BY year DESC, month DESC
    """)
    display(pre_partition_dist)
    
    # Table size estimation
    table_details = spark.sql(f"DESCRIBE DETAIL {FULL_TABLE_PATH}").collect()[0]
    pre_size_bytes = table_details.sizeInBytes if hasattr(table_details, 'sizeInBytes') else 0
    pre_num_files = table_details.numFiles if hasattr(table_details, 'numFiles') else 0
    print(f"\n  Table storage metrics:")
    print(f"    - Size: {pre_size_bytes / (1024**2):.2f} MB")
    print(f"    - Number of files: {pre_num_files}")
else:
    print(f"\nüìä Table does not exist yet - will be created")
    pre_total_count = 0
    pre_partition_count = 0
    pre_size_bytes = 0
    pre_num_files = 0

# ============================================================================
# STEP 3: WRITE DATA WITH SELECTIVE PARTITION OVERWRITE
# ============================================================================

write_start_time = datetime.now()
print(f"\n‚è≥ Starting write operation at {write_start_time.strftime('%H:%M:%S')}")

# Count rows being written
rows_to_write = df_aggregated.count()
print(f"  - Rows to write: {rows_to_write:,}")

# Write with replaceWhere for selective partition overwrite
df_aggregated.write \
    .format("delta") \
    .mode("overwrite") \
    .partitionBy("year", "month") \
    .option("replaceWhere", replace_where_clause) \
    .option("overwriteSchema", "false") \
    .option("delta.columnMapping.mode", "name") \
    .saveAsTable(FULL_TABLE_PATH)

write_end_time = datetime.now()
write_duration = (write_end_time - write_start_time).total_seconds()

print(f"‚úì Data successfully written to Unity Catalog")
print(f"  - Write duration: {write_duration:.2f} seconds")
print(f"  - Throughput: {rows_to_write/write_duration:,.0f} rows/second")

# ============================================================================
# STEP 4: POST-WRITE VALIDATION AND STATISTICS
# ============================================================================

print(f"\nüìä Post-Write Statistics:")

# Total row count after
post_total_count = spark.sql(f"SELECT COUNT(*) as cnt FROM {FULL_TABLE_PATH}").collect()[0].cnt
print(f"  - Total rows after: {post_total_count:,}")

# Count rows in affected partitions
post_partition_count = spark.sql(f"""
    SELECT COUNT(*) as cnt 
    FROM {FULL_TABLE_PATH} 
    WHERE {replace_where_clause}
""").collect()[0].cnt
print(f"  - Rows in target partitions after: {post_partition_count:,}")

# Calculate changes
if table_exists:
    rows_added = post_total_count - pre_total_count
    rows_replaced = pre_partition_count
    print(f"\n  Changes:")
    print(f"    ‚Ä¢ Rows in affected partitions replaced: {rows_replaced:,}")
    print(f"    ‚Ä¢ Net change in total rows: {rows_added:+,}")
    print(f"    ‚Ä¢ Rows in unaffected partitions: {post_total_count - post_partition_count:,} (unchanged)")

# Partition distribution after
print(f"\n  Updated partition distribution:")
post_partition_dist = spark.sql(f"""
    SELECT 
        year, 
        month, 
        COUNT(*) as row_count,
        ROUND(SUM(total_revenue), 2) as partition_revenue,
        COUNT(DISTINCT category) as distinct_categories,
        COUNT(DISTINCT region) as distinct_regions
    FROM {FULL_TABLE_PATH}
    GROUP BY year, month
    ORDER BY year DESC, month DESC
""")
display(post_partition_dist)

# ============================================================================
# STEP 5: TABLE METADATA AND PROPERTIES
# ============================================================================

print(f"\n‚öôÔ∏è  Configuring table properties...")

# Add table comment with detailed metadata
spark.sql(f"""
    COMMENT ON TABLE {FULL_TABLE_PATH} IS 
    'Aggregated sales data by month, category, and region.
    
    Partitioning: year, month
    Granularity: Monthly aggregations
    Source: ETL pipeline from raw sales data
    Last Updated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
    Rows: {post_total_count:,}
    Partitions: {len(partitions_list)}'
""")

# Enable auto-optimization features
spark.sql(f"""
    ALTER TABLE {FULL_TABLE_PATH} SET TBLPROPERTIES (
        'delta.autoOptimize.optimizeWrite' = 'true',
        'delta.autoOptimize.autoCompact' = 'true',
        'delta.logRetentionDuration' = '30 days',
        'delta.deletedFileRetentionDuration' = '7 days'
    )
""")
print("‚úì Auto-optimize and retention policies configured")

# ============================================================================
# STEP 6: OPTIMIZE TABLE FOR QUERY PERFORMANCE
# ============================================================================

print(f"\nüöÄ Optimizing table for query performance...")

optimize_start_time = datetime.now()

# Optimize with Z-ORDER on frequently queried columns
spark.sql(f"""
    OPTIMIZE {FULL_TABLE_PATH} 
    ZORDER BY (category, region)
""")

optimize_end_time = datetime.now()
optimize_duration = (optimize_end_time - optimize_start_time).total_seconds()

print(f"‚úì OPTIMIZE completed in {optimize_duration:.2f} seconds")
print(f"  - Z-ORDER applied on: category, region")
print(f"  - Benefit: Faster filtering on these columns")

# ============================================================================
# STEP 7: COMPUTE TABLE STATISTICS
# ============================================================================

print(f"\nüìä Computing statistics for query optimizer...")

stats_start_time = datetime.now()

# Compute comprehensive statistics for all columns
spark.sql(f"ANALYZE TABLE {FULL_TABLE_PATH} COMPUTE STATISTICS FOR ALL COLUMNS")

stats_end_time = datetime.now()
stats_duration = (stats_end_time - stats_start_time).total_seconds()

print(f"‚úì Statistics computed in {stats_duration:.2f} seconds")
print(f"  - Benefit: Better query execution plans")

# ============================================================================
# STEP 8: DETAILED TABLE INSIGHTS
# ============================================================================

print("\n" + "=" * 80)
print("TABLE INSIGHTS AND PERFORMANCE METRICS")
print("=" * 80)

# Get detailed table information
table_details = spark.sql(f"DESCRIBE DETAIL {FULL_TABLE_PATH}").collect()[0]

post_size_bytes = table_details.sizeInBytes if hasattr(table_details, 'sizeInBytes') else 0
post_num_files = table_details.numFiles if hasattr(table_details, 'numFiles') else 0

print(f"\nüì¶ Storage Metrics:")
print(f"  - Table size: {post_size_bytes / (1024**2):.2f} MB")
print(f"  - Number of files: {post_num_files}")
print(f"  - Average file size: {post_size_bytes / post_num_files / (1024**2):.2f} MB" if post_num_files > 0 else "  - Average file size: N/A")
print(f"  - Format: Delta Lake (Parquet + transaction log)")

if table_exists:
    size_change = post_size_bytes - pre_size_bytes
    file_change = post_num_files - pre_num_files
    print(f"\n  Storage changes:")
    print(f"    ‚Ä¢ Size change: {size_change / (1024**2):+.2f} MB")
    print(f"    ‚Ä¢ File count change: {file_change:+d}")

# Show partition file distribution
print(f"\nüìÅ Partition File Distribution:")
partition_details = spark.sql(f"""
    SELECT year, month, COUNT(*) as record_count
    FROM {FULL_TABLE_PATH}
    GROUP BY year, month
    ORDER BY year DESC, month DESC
""")
display(partition_details)

# Show physical partition paths
print(f"\nüóÇÔ∏è  Physical Partitions:")
physical_partitions = spark.sql(f"SHOW PARTITIONS {FULL_TABLE_PATH}").limit(10)
display(physical_partitions)

WRITING TO UNITY CATALOG - PARTITIONED TABLE
Target: workspace.portfolio_projects.sales_summary

üìä Partition Analysis:
  - Total partitions in input data: 24
  - Partition columns: year, month

  Partitions to be overwritten:
    ‚Ä¢ year=2024, month=2
    ‚Ä¢ year=2024, month=3
    ‚Ä¢ year=2024, month=4
    ‚Ä¢ year=2024, month=5
    ‚Ä¢ year=2024, month=6
    ‚Ä¢ year=2024, month=7
    ‚Ä¢ year=2024, month=8
    ‚Ä¢ year=2024, month=9
    ‚Ä¢ year=2024, month=10
    ‚Ä¢ year=2024, month=11
    ‚Ä¢ year=2024, month=12
    ‚Ä¢ year=2025, month=1
    ‚Ä¢ year=2025, month=2
    ‚Ä¢ year=2025, month=3
    ‚Ä¢ year=2025, month=4
    ‚Ä¢ year=2025, month=5
    ‚Ä¢ year=2025, month=6
    ‚Ä¢ year=2025, month=7
    ‚Ä¢ year=2025, month=8
    ‚Ä¢ year=2025, month=9
    ‚Ä¢ year=2025, month=10
    ‚Ä¢ year=2025, month=11
    ‚Ä¢ year=2025, month=12
    ‚Ä¢ year=2026, month=1

  Generated replaceWhere clause:
    (year = 2024 AND month = 2) OR (year = 2024 AND month = 3) OR (year = 2024 AND 

year,month,row_count,partition_revenue,distinct_categories,distinct_regions
2026,1,62,14587575.83,9,7
2025,12,62,13925808.42,9,7
2025,11,62,14154550.79,9,7
2025,10,62,14490210.96,9,7
2025,9,62,13846241.91,9,7
2025,8,62,14067380.3,9,7
2025,7,62,14428670.22,9,7
2025,6,62,14093866.68,9,7
2025,5,62,14476644.76,9,7
2025,4,62,14210068.54,9,7



‚öôÔ∏è  Configuring table properties...
‚úì Auto-optimize and retention policies configured

üöÄ Optimizing table for query performance...
‚úì OPTIMIZE completed in 1.66 seconds
  - Z-ORDER applied on: category, region
  - Benefit: Faster filtering on these columns

üìä Computing statistics for query optimizer...
‚úì Statistics computed in 1.15 seconds
  - Benefit: Better query execution plans

TABLE INSIGHTS AND PERFORMANCE METRICS

üì¶ Storage Metrics:
  - Table size: 0.22 MB
  - Number of files: 24
  - Average file size: 0.01 MB
  - Format: Delta Lake (Parquet + transaction log)

üìÅ Partition File Distribution:


year,month,record_count
2026,1,62
2025,12,62
2025,11,62
2025,10,62
2025,9,62
2025,8,62
2025,7,62
2025,6,62
2025,5,62
2025,4,62



üóÇÔ∏è  Physical Partitions:


year,month
2026,1
2025,3
2025,8
2024,12
2025,6
2025,11
2024,3
2024,10
2025,12
2024,6


In [0]:
# ============================================================================
# STEP 9: QUERY PERFORMANCE TESTING
# ============================================================================

print(f"\n‚ö° Query Performance Testing:")

# Test 1: Full table scan
test1_start = datetime.now()
full_scan_count = spark.sql(f"SELECT COUNT(*) FROM {FULL_TABLE_PATH}").collect()[0][0]
test1_duration = (datetime.now() - test1_start).total_seconds()
print(f"  Test 1 - Full table scan: {test1_duration:.3f}s ({full_scan_count:,} rows)")

# Test 2: Single partition query (partition pruning)
if len(partitions_list) > 0:
    test_year = partitions_list[0].year
    test_month = partitions_list[0].month
    test2_start = datetime.now()
    partition_count = spark.sql(f"""
        SELECT COUNT(*) FROM {FULL_TABLE_PATH} 
        WHERE year = {test_year} AND month = {test_month}
    """).collect()[0][0]
    test2_duration = (datetime.now() - test2_start).total_seconds()
    print(f"  Test 2 - Single partition (year={test_year}, month={test_month}): {test2_duration:.3f}s ({partition_count:,} rows)")
    
    if test2_duration > 0:
        print(f"    ‚Üí Speedup from partition pruning: {test1_duration/test2_duration:.1f}x faster")

# Test 3: Z-ORDER column filter
test3_start = datetime.now()
category_count = spark.sql(f"""
    SELECT COUNT(*) FROM {FULL_TABLE_PATH} 
    WHERE category = 'ELECTRONICS'
""").collect()[0][0]
test3_duration = (datetime.now() - test3_start).total_seconds()
print(f"  Test 3 - Category filter (Z-ORDERed): {test3_duration:.3f}s ({category_count:,} rows)")


‚ö° Query Performance Testing:
  Test 1 - Full table scan: 0.445s (1,488 rows)
  Test 2 - Single partition (year=2024, month=2): 0.469s (62 rows)
    ‚Üí Speedup from partition pruning: 0.9x faster
  Test 3 - Category filter (Z-ORDERed): 0.541s (168 rows)


In [0]:
# ============================================================================
# STEP 10: TABLE HISTORY AND VERSION CONTROL
# ============================================================================

print(f"\nüìú Table History (Delta Time Travel):")

# Show last 5 operations
history_df = spark.sql(f"DESCRIBE HISTORY {FULL_TABLE_PATH}").limit(5)
display(history_df.select(
    "version", 
    "timestamp", 
    "operation", 
    "operationMetrics",
    "userName"
))

# Get current version
current_version = spark.sql(f"DESCRIBE HISTORY {FULL_TABLE_PATH}").first().version
print(f"\n  Current table version: {current_version}")
print(f"  You can time travel to any previous version using:")
print(f"    SELECT * FROM {FULL_TABLE_PATH} VERSION AS OF <version>")


üìú Table History (Delta Time Travel):


version,timestamp,operation,operationMetrics,userName
2,2026-02-02T19:19:46.000Z,SET TBLPROPERTIES,Map(),christiandelprete01@gmail.com
1,2026-02-02T19:19:45.000Z,SET TBLPROPERTIES,Map(),christiandelprete01@gmail.com
0,2026-02-02T19:19:42.000Z,WRITE,"Map(numFiles -> 24, numCopiedRows -> 0, numAddedChangeFiles -> 0, numOutputRows -> 1488, numOutputBytes -> 227720)",christiandelprete01@gmail.com



  Current table version: 2
  You can time travel to any previous version using:
    SELECT * FROM workspace.portfolio_projects.sales_summary VERSION AS OF <version>


In [0]:
# ============================================================================
# STEP 11: DATA QUALITY VALIDATION
# ============================================================================

print(f"\n‚úÖ Data Quality Checks:")

# Check 1: No null values in partition columns
null_partitions = spark.sql(f"""
    SELECT COUNT(*) as null_count
    FROM {FULL_TABLE_PATH}
    WHERE year IS NULL OR month IS NULL
""").collect()[0].null_count

print(f"  ‚úì Null partition keys: {null_partitions} (should be 0)")

# Check 2: Validate aggregation logic
print(f"\n  Business metrics validation:")
validation_df = spark.sql(f"""
    SELECT 
        COUNT(DISTINCT year) as distinct_years,
        COUNT(DISTINCT month) as distinct_months,
        COUNT(DISTINCT category) as distinct_categories,
        COUNT(DISTINCT region) as distinct_regions,
        MIN(total_transactions) as min_transactions,
        MAX(total_transactions) as max_transactions,
        ROUND(SUM(total_revenue), 2) as grand_total_revenue
    FROM {FULL_TABLE_PATH}
""")
display(validation_df)

# Check 3: Top performers
print(f"\n  Top 5 performing segments:")
top_performers = spark.sql(f"""
    SELECT 
        year, month, category, region,
        ROUND(total_revenue, 2) as revenue,
        total_transactions as transactions
    FROM {FULL_TABLE_PATH}
    ORDER BY total_revenue DESC
    LIMIT 5
""")
display(top_performers)


‚úÖ Data Quality Checks:
  ‚úì Null partition keys: 0 (should be 0)

  Business metrics validation:


distinct_years,distinct_months,distinct_categories,distinct_regions,min_transactions,max_transactions,grand_total_revenue
3,12,9,7,10,287,339161599.71



  Top 5 performing segments:


year,month,category,region,revenue,transactions
2025,3,ELECTRONICS,EUROPE,1462315.6,260
2024,5,ELECTRONICS,AFRICA,1436852.44,254
2024,3,ELECTRONICS,EUROPE,1434735.08,251
2024,3,ELECTRONICS,AFRICA,1405253.37,251
2024,10,ELECTRONICS,EUROPE,1395092.43,270


In [0]:
# Databricks notebook source
# ============================================================================
# SECTION 12: ADD TAGS, COMMENTS AND METADATA TO TABLE AND COLUMNS
# ============================================================================
# Enhance data discoverability and governance by adding comprehensive metadata
# Tags and comments help data consumers understand the data and its usage

from datetime import datetime

print("=" * 80)
print("ADDING METADATA: TAGS, COMMENTS AND DESCRIPTIONS")
print("=" * 80)
print(f"Target: {FULL_TABLE_PATH}")

# ============================================================================
# STEP 1: ADD COLUMN COMMENTS (DESCRIPTIONS)
# ============================================================================

print("\nüìù Adding column descriptions...")

# Define column comments for data documentation
# These descriptions appear in Catalog Explorer and DESCRIBE commands
column_comments = {
    "year": "Transaction year - Used for partitioning",
    "month": "Transaction month (1-12) - Used for partitioning",
    "category": "Product category - Z-ORDERed for query optimization",
    "region": "Geographic region where the sale occurred - Z-ORDERed for query optimization",
    "total_transactions": "Total number of individual transactions in this segment",
    "total_quantity_sold": "Sum of all quantities sold across all transactions",
    "total_revenue": "Total revenue generated (quantity * unit_price * (1-discount))",
    "avg_transaction_value": "Average revenue per transaction in this segment",
    "max_transaction_value": "Maximum single transaction value in this segment",
    "min_transaction_value": "Minimum single transaction value in this segment",
    "unique_customers": "Count of distinct customers who made purchases",
    "unique_products": "Count of distinct products sold in this segment",
    "created_at": "Timestamp when this record was created in the table",
    "data_source": "Source system or pipeline that created this record"
}

# Apply comments to all columns at once (supports multiple columns in one command)
alter_column_comments = []
for column, comment in column_comments.items():
    # Escape single quotes in comments to prevent SQL injection
    escaped_comment = comment.replace("'", "''")
    alter_column_comments.append(f"{column} COMMENT '{escaped_comment}'")

# Execute ALTER TABLE with all column comments
alter_statement = f"""
    ALTER TABLE {FULL_TABLE_PATH} ALTER COLUMN
    {',\n    '.join(alter_column_comments)}
"""

spark.sql(alter_statement)
print(f"‚úì Added descriptions to {len(column_comments)} columns")

# Verify comments were added
print("\nColumn descriptions preview:")
describe_df = spark.sql(f"DESCRIBE TABLE {FULL_TABLE_PATH}")
display(describe_df.select("col_name", "data_type", "comment").limit(10))

# ============================================================================
# STEP 2: ADD COLUMN TAGS
# ============================================================================

print("\nüè∑Ô∏è  Adding column tags for governance...")

# Define tags for each column
# Tags help with data classification, security, and compliance
# Common tag categories: PII, Sensitivity, Data Quality, Business Domain

column_tags = {
    "year": {"category": "partition_key", "data_classification": "public"},
    "month": {"category": "partition_key", "data_classification": "public"},
    "category": {"category": "business_dimension", "data_classification": "public", "indexed": "z_order"},
    "region": {"category": "business_dimension", "data_classification": "public", "indexed": "z_order"},
    "total_transactions": {"category": "kpi", "metric_type": "count", "data_classification": "internal"},
    "total_quantity_sold": {"category": "kpi", "metric_type": "sum", "data_classification": "internal"},
    "total_revenue": {"category": "kpi", "metric_type": "currency", "data_classification": "confidential", "pii": "false"},
    "avg_transaction_value": {"category": "kpi", "metric_type": "average", "data_classification": "internal"},
    "max_transaction_value": {"category": "kpi", "metric_type": "max", "data_classification": "internal"},
    "min_transaction_value": {"category": "kpi", "metric_type": "min", "data_classification": "internal"},
    "unique_customers": {"category": "kpi", "metric_type": "distinct_count", "data_classification": "internal"},
    "unique_products": {"category": "kpi", "metric_type": "distinct_count", "data_classification": "internal"},
    "created_at": {"category": "metadata", "data_classification": "public"},
    "data_source": {"category": "metadata", "data_classification": "public"}
}

# Apply tags to each column (must be done separately per column)
tag_count = 0
for column, tags in column_tags.items():
    # Convert tags dict to proper format: ('key' = 'value', 'key2' = 'value2')
    tag_pairs = [f"'{k}' = '{v}'" for k, v in tags.items()]
    tags_string = ", ".join(tag_pairs)
    
    # Apply tags to column
    spark.sql(f"""
        ALTER TABLE {FULL_TABLE_PATH} 
        ALTER COLUMN {column} 
        SET TAGS ({tags_string})
    """)
    tag_count += len(tags)
    print(f"  ‚úì Tagged column '{column}' with {len(tags)} tags")

print(f"\n‚úì Applied {tag_count} total tags across {len(column_tags)} columns")

# ============================================================================
# STEP 3: ADD TABLE-LEVEL TAGS
# ============================================================================

print("\nüè∑Ô∏è  Adding table-level tags...")

# Define table-level tags for governance and classification
table_tags = {
    "data_domain": "sales_analytics",
    "data_owner": "data_engineering_team",
    "refresh_frequency": "daily",
    "data_classification": "internal",
    "contains_pii": "false",
    "quality_tier": "gold",
    "business_criticality": "high",
    "retention_policy": "3_years",
    "compliance": "gdpr_compliant",
    "sla": "24_hours",
    "table_type": "aggregated",
    "partition_strategy": "year_month",
    "optimization": "z_order_enabled"
}

# Convert to proper format
table_tag_pairs = [f"'{k}' = '{v}'" for k, v in table_tags.items()]
table_tags_string = ", ".join(table_tag_pairs)

# Apply tags to table
spark.sql(f"""
    ALTER TABLE {FULL_TABLE_PATH} 
    SET TAGS ({table_tags_string})
""")

print(f"‚úì Applied {len(table_tags)} tags to table")
print("\nTable tags:")
for key, value in table_tags.items():
    print(f"  ‚Ä¢ {key}: {value}")

# ============================================================================
# STEP 4: UPDATE TABLE DESCRIPTION (EXTENDED COMMENT)
# ============================================================================

print("\nüìÑ Updating table description...")

# Create comprehensive table description with Markdown support
table_description = f"""
# Sales Summary Analytics Table

## Overview
Aggregated sales data providing monthly insights by category and region. 
This table serves as the primary source for sales performance dashboards and reporting.

## Data Characteristics
- **Granularity**: Monthly aggregations
- **Dimensions**: Year, Month, Category, Region
- **Metrics**: Revenue, Transactions, Quantities, Customer counts
- **Partitioning**: Year and Month for optimal time-based queries
- **Optimization**: Z-ORDER on Category and Region

## Business Use Cases
1. Monthly sales performance tracking
2. Category and regional comparison analysis
3. Customer behavior insights
4. Revenue forecasting and planning

## Technical Details
- **Source**: Raw transactional sales data (CSV/Parquet)
- **ETL Pipeline**: Automated data cleaning and aggregation
- **Update Frequency**: Daily incremental loads
- **Data Quality**: Validated and cleaned
- **Time Travel**: Enabled (30 days retention)

## Data Lineage
Raw Sales Data ‚Üí Data Cleaning ‚Üí Aggregation ‚Üí Unity Catalog Table

## Owner & Contact
- **Team**: Data Engineering
- **Contact**: data-eng@company.com
- **Last Updated**: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
- **Version**: 1.0

## Important Notes
- Partitioned by year/month - always include these in WHERE clauses for best performance
- Z-ORDER optimized for category and region filters
- Revenue excludes returns and cancellations
- Historical data available via Delta Lake time travel
"""

# Apply comprehensive comment to table (supports Markdown)
escaped_description = table_description.replace("'", "''")
spark.sql(f"""
    COMMENT ON TABLE {FULL_TABLE_PATH} IS '{escaped_description}'
""")

print("‚úì Table description updated with comprehensive documentation")

# ============================================================================
# STEP 5: VERIFY ALL METADATA
# ============================================================================

print("\n" + "=" * 80)
print("METADATA VERIFICATION")
print("=" * 80)

# Show table details with comments
print("\n1. Table Information:")
table_info = spark.sql(f"DESCRIBE EXTENDED {FULL_TABLE_PATH}")
display(table_info)

# Show column details with comments and data types
print("\n2. Column Details with Descriptions:")
column_details = spark.sql(f"DESCRIBE TABLE {FULL_TABLE_PATH}")
display(column_details)

# Query table tags from INFORMATION_SCHEMA
print("\n3. Table Tags:")
table_tags_query = spark.sql(f"""
    SELECT 
        tag_name,
        tag_value
    FROM {CATALOG_NAME}.INFORMATION_SCHEMA.TABLE_TAGS
    WHERE catalog_name = '{CATALOG_NAME}'
    AND schema_name = '{SCHEMA_NAME}'
    AND table_name = '{TABLE_NAME}'
    ORDER BY tag_name
""")
display(table_tags_query)

# Query column tags from INFORMATION_SCHEMA
print("\n4. Column Tags:")
column_tags_query = spark.sql(f"""
    SELECT 
        column_name,
        tag_name,
        tag_value
    FROM {CATALOG_NAME}.INFORMATION_SCHEMA.COLUMN_TAGS
    WHERE catalog_name = '{CATALOG_NAME}'
    AND schema_name = '{SCHEMA_NAME}'
    AND table_name = '{TABLE_NAME}'
    ORDER BY column_name, tag_name
""")
display(column_tags_query)

# ============================================================================
# STEP 6: CREATE METADATA SUMMARY REPORT
# ============================================================================

print("\nüìä Metadata Summary Report:")

# Count total metadata elements
total_column_comments = len(column_comments)
total_column_tags = tag_count
total_table_tags = len(table_tags)

print(f"\n  Documentation Metrics:")
print(f"    ‚Ä¢ Columns documented: {total_column_comments}/{len(df_aggregated.columns)}")
print(f"    ‚Ä¢ Column tags applied: {total_column_tags}")
print(f"    ‚Ä¢ Table tags applied: {total_table_tags}")
print(f"    ‚Ä¢ Table description: ‚úì Comprehensive Markdown documentation")

# Create a summary DataFrame
metadata_summary = spark.sql(f"""
    SELECT 
        '{FULL_TABLE_PATH}' as table_path,
        COUNT(DISTINCT c.column_name) as documented_columns,
        COUNT(DISTINCT ct.column_name) as tagged_columns,
        COUNT(DISTINCT ct.tag_name) as total_column_tags,
        '{len(table_tags)}' as table_level_tags
    FROM {CATALOG_NAME}.INFORMATION_SCHEMA.COLUMNS c
    LEFT JOIN {CATALOG_NAME}.INFORMATION_SCHEMA.COLUMN_TAGS ct
        ON c.table_catalog = ct.catalog_name
        AND c.table_schema = ct.schema_name
        AND c.table_name = ct.table_name
        AND c.column_name = ct.column_name
    WHERE c.table_catalog = '{CATALOG_NAME}'
    AND c.table_schema = '{SCHEMA_NAME}'
    AND c.table_name = '{TABLE_NAME}'
""")

display(metadata_summary)

# ============================================================================
# STEP 7: METADATA BEST PRACTICES VALIDATION
# ============================================================================

print("\n‚úÖ Metadata Best Practices Check:")

# Check 1: All columns have comments
columns_without_comments = spark.sql(f"""
    SELECT column_name
    FROM {CATALOG_NAME}.INFORMATION_SCHEMA.COLUMNS
    WHERE table_catalog = '{CATALOG_NAME}'
    AND table_schema = '{SCHEMA_NAME}'
    AND table_name = '{TABLE_NAME}'
    AND (comment IS NULL OR comment = '')
""").count()

if columns_without_comments == 0:
    print("  ‚úì All columns have descriptions")
else:
    print(f"  ‚ö†Ô∏è  {columns_without_comments} columns missing descriptions")

# Check 2: Critical columns have tags
critical_columns = ['total_revenue', 'unique_customers']
tagged_critical = spark.sql(f"""
    SELECT DISTINCT column_name
    FROM {CATALOG_NAME}.INFORMATION_SCHEMA.COLUMN_TAGS
    WHERE catalog_name = '{CATALOG_NAME}'
    AND schema_name = '{SCHEMA_NAME}'
    AND table_name = '{TABLE_NAME}'
    AND column_name IN ('total_revenue', 'unique_customers')
""").count()

if tagged_critical == len(critical_columns):
    print(f"  ‚úì All critical columns are tagged")
else:
    print(f"  ‚ö†Ô∏è  Some critical columns need tags")

# Check 3: Table has governance tags
governance_tags = ['data_classification', 'data_owner', 'contains_pii']
has_governance = spark.sql(f"""
    SELECT COUNT(DISTINCT tag_name) as gov_tag_count
    FROM {CATALOG_NAME}.INFORMATION_SCHEMA.TABLE_TAGS
    WHERE catalog_name = '{CATALOG_NAME}'
    AND schema_name = '{SCHEMA_NAME}'
    AND table_name = '{TABLE_NAME}'
    AND tag_name IN ('data_classification', 'data_owner', 'contains_pii')
""").collect()[0].gov_tag_count

if has_governance >= 3:
    print(f"  ‚úì Table has all required governance tags")
else:
    print(f"  ‚ö†Ô∏è  Table missing some governance tags")

# ============================================================================
# FINAL SUMMARY
# ============================================================================

print("\n" + "=" * 80)
print("‚úÖ METADATA ENRICHMENT COMPLETED")
print("=" * 80)

print(f"\nüìã Summary:")
print(f"  ‚Ä¢ Table: {FULL_TABLE_PATH}")
print(f"  ‚Ä¢ Column descriptions: {total_column_comments} added")
print(f"  ‚Ä¢ Column tags: {total_column_tags} applied")
print(f"  ‚Ä¢ Table tags: {total_table_tags} applied")
print(f"  ‚Ä¢ Table documentation: ‚úì Comprehensive Markdown description")

print(f"\nüéØ Benefits:")
print(f"  ‚úì Improved data discoverability in Catalog Explorer")
print(f"  ‚úì Enhanced data governance and compliance tracking")
print(f"  ‚úì Better data lineage and ownership transparency")
print(f"  ‚úì Easier onboarding for new data consumers")
print(f"  ‚úì Automated data classification for security policies")

print(f"\nüìñ View Metadata:")
print(f"  ‚Ä¢ Catalog Explorer: Browse to {FULL_TABLE_PATH}")
print(f"  ‚Ä¢ SQL: DESCRIBE EXTENDED {FULL_TABLE_PATH}")
print(f"  ‚Ä¢ SQL: SELECT * FROM {CATALOG_NAME}.INFORMATION_SCHEMA.TABLE_TAGS")
print(f"  ‚Ä¢ SQL: SELECT * FROM {CATALOG_NAME}.INFORMATION_SCHEMA.COLUMN_TAGS")

print("\n" + "=" * 80)


ADDING METADATA: TAGS, COMMENTS AND DESCRIPTIONS
Target: workspace.portfolio_projects.sales_summary

üìù Adding column descriptions...
‚úì Added descriptions to 14 columns

Column descriptions preview:


col_name,data_type,comment
year,int,Transaction year - Used for partitioning
month,int,Transaction month (1-12) - Used for partitioning
category,string,Product category - Z-ORDERed for query optimization
region,string,Geographic region where the sale occurred - Z-ORDERed for query optimization
total_transactions,bigint,Total number of individual transactions in this segment
total_quantity_sold,bigint,Sum of all quantities sold across all transactions
total_revenue,double,Total revenue generated (quantity * unit_price * (1-discount))
avg_transaction_value,double,Average revenue per transaction in this segment
max_transaction_value,double,Maximum single transaction value in this segment
min_transaction_value,double,Minimum single transaction value in this segment



üè∑Ô∏è  Adding column tags for governance...
  ‚úì Tagged column 'year' with 2 tags
  ‚úì Tagged column 'month' with 2 tags
  ‚úì Tagged column 'category' with 3 tags
  ‚úì Tagged column 'region' with 3 tags
  ‚úì Tagged column 'total_transactions' with 3 tags
  ‚úì Tagged column 'total_quantity_sold' with 3 tags
  ‚úì Tagged column 'total_revenue' with 4 tags
  ‚úì Tagged column 'avg_transaction_value' with 3 tags
  ‚úì Tagged column 'max_transaction_value' with 3 tags
  ‚úì Tagged column 'min_transaction_value' with 3 tags
  ‚úì Tagged column 'unique_customers' with 3 tags
  ‚úì Tagged column 'unique_products' with 3 tags
  ‚úì Tagged column 'created_at' with 2 tags
  ‚úì Tagged column 'data_source' with 2 tags

‚úì Applied 39 total tags across 14 columns

üè∑Ô∏è  Adding table-level tags...
‚úì Applied 13 tags to table

Table tags:
  ‚Ä¢ data_domain: sales_analytics
  ‚Ä¢ data_owner: data_engineering_team
  ‚Ä¢ refresh_frequency: daily
  ‚Ä¢ data_classification: internal
  ‚Ä¢ con

col_name,data_type,comment
year,int,Transaction year - Used for partitioning
month,int,Transaction month (1-12) - Used for partitioning
category,string,Product category - Z-ORDERed for query optimization
region,string,Geographic region where the sale occurred - Z-ORDERed for query optimization
total_transactions,bigint,Total number of individual transactions in this segment
total_quantity_sold,bigint,Sum of all quantities sold across all transactions
total_revenue,double,Total revenue generated (quantity * unit_price * (1-discount))
avg_transaction_value,double,Average revenue per transaction in this segment
max_transaction_value,double,Maximum single transaction value in this segment
min_transaction_value,double,Minimum single transaction value in this segment



2. Column Details with Descriptions:


col_name,data_type,comment
year,int,Transaction year - Used for partitioning
month,int,Transaction month (1-12) - Used for partitioning
category,string,Product category - Z-ORDERed for query optimization
region,string,Geographic region where the sale occurred - Z-ORDERed for query optimization
total_transactions,bigint,Total number of individual transactions in this segment
total_quantity_sold,bigint,Sum of all quantities sold across all transactions
total_revenue,double,Total revenue generated (quantity * unit_price * (1-discount))
avg_transaction_value,double,Average revenue per transaction in this segment
max_transaction_value,double,Maximum single transaction value in this segment
min_transaction_value,double,Minimum single transaction value in this segment



3. Table Tags:


tag_name,tag_value
business_criticality,high
compliance,gdpr_compliant
contains_pii,false
data_classification,internal
data_domain,sales_analytics
data_owner,data_engineering_team
optimization,z_order_enabled
partition_strategy,year_month
quality_tier,gold
refresh_frequency,daily



4. Column Tags:


column_name,tag_name,tag_value
avg_transaction_value,category,kpi
avg_transaction_value,data_classification,internal
avg_transaction_value,metric_type,average
category,category,business_dimension
category,data_classification,public
category,indexed,z_order
created_at,category,metadata
created_at,data_classification,public
data_source,category,metadata
data_source,data_classification,public



üìä Metadata Summary Report:

  Documentation Metrics:
    ‚Ä¢ Columns documented: 14/14
    ‚Ä¢ Column tags applied: 39
    ‚Ä¢ Table tags applied: 13
    ‚Ä¢ Table description: ‚úì Comprehensive Markdown documentation


table_path,documented_columns,tagged_columns,total_column_tags,table_level_tags
workspace.portfolio_projects.sales_summary,14,14,5,13



‚úÖ Metadata Best Practices Check:
  ‚úì All columns have descriptions
  ‚úì All critical columns are tagged
  ‚úì Table has all required governance tags

‚úÖ METADATA ENRICHMENT COMPLETED

üìã Summary:
  ‚Ä¢ Table: workspace.portfolio_projects.sales_summary
  ‚Ä¢ Column descriptions: 14 added
  ‚Ä¢ Column tags: 39 applied
  ‚Ä¢ Table tags: 13 applied
  ‚Ä¢ Table documentation: ‚úì Comprehensive Markdown description

üéØ Benefits:
  ‚úì Improved data discoverability in Catalog Explorer
  ‚úì Enhanced data governance and compliance tracking
  ‚úì Better data lineage and ownership transparency
  ‚úì Easier onboarding for new data consumers
  ‚úì Automated data classification for security policies

üìñ View Metadata:
  ‚Ä¢ Catalog Explorer: Browse to workspace.portfolio_projects.sales_summary
  ‚Ä¢ SQL: DESCRIBE EXTENDED workspace.portfolio_projects.sales_summary
  ‚Ä¢ SQL: SELECT * FROM workspace.INFORMATION_SCHEMA.TABLE_TAGS
  ‚Ä¢ SQL: SELECT * FROM workspace.INFORMATION_SCHEMA.COLU

In [0]:
# ============================================================================
# FINAL SUMMARY
# ============================================================================

print("=" * 80)
print("ETL PIPELINE COMPLETED SUCCESSFULLY")
print("=" * 80)
print(f"\nüìä Summary:")
print(f"  ‚Ä¢ Source: Raw CSV/Parquet files")
print(f"  ‚Ä¢ Destination: {FULL_TABLE_PATH}")
print(f"  ‚Ä¢ Records processed: {df_raw.count():,}")
print(f"  ‚Ä¢ Records loaded: {table_count:,}")
print(f"  ‚Ä¢ Data quality: ‚úì Cleaned and validated")
print(f"  ‚Ä¢ Format: Delta Lake")
print(f"  ‚Ä¢ Governance: Unity Catalog")
print(f"  ‚Ä¢ Start Execution DateTime: {start_notebook_datetime}")
print(f"  ‚Ä¢ End Execution DateTime: {end_notebook_datetime}")
print(f"  ‚Ä¢ Execution Time: {int(hours)}h {int(minutes)}m {seconds:.2f}s")
print(f"\n‚úÖ Data is now available for analytics and reporting")
print(f"\nNext steps:")
print(f"  1. Query the table: SELECT * FROM {FULL_TABLE_PATH}")
print(f"  2. Create dashboards using Databricks SQL")
print(f"  3. Set up automated jobs for regular updates")
print(f"  4. Configure alerts for data quality monitoring")

ETL PIPELINE COMPLETED SUCCESSFULLY

üìä Summary:
  ‚Ä¢ Source: Raw CSV/Parquet files
  ‚Ä¢ Destination: workspace.portfolio_projects.sales_summary
  ‚Ä¢ Records processed: 408,000
  ‚Ä¢ Records loaded: 1,488
  ‚Ä¢ Data quality: ‚úì Cleaned and validated
  ‚Ä¢ Format: Delta Lake
  ‚Ä¢ Governance: Unity Catalog
  ‚Ä¢ Start Execution DateTime: 2026-02-02 19:19:16.526047
  ‚Ä¢ End Execution DateTime: 2026-02-02 19:03:40.188565
  ‚Ä¢ Execution Time: 0h 0m 29.09s

‚úÖ Data is now available for analytics and reporting

Next steps:
  1. Query the table: SELECT * FROM workspace.portfolio_projects.sales_summary
  2. Create dashboards using Databricks SQL
  3. Set up automated jobs for regular updates
  4. Configure alerts for data quality monitoring
