In [0]:
%python
# COMMAND ----------
# DBTITLE 1, Table Size Report - Plain Text for Excel

# ----------------------------------------------------------------------------
# INPUT: tables to include in the size report.
# Add or remove fully qualified names here; entries are processed in the
# order listed.
# ----------------------------------------------------------------------------
table_list = [
    "gap_catalog.ads_owner.case_phase_properties",
    "gap_catalog.ads_src_20250901.dlk_ads_sma_monitor_events_full",
    "gap_catalog.ads_owner.SMA_CASE_PHASE_PROPERTIES",
    "gap_catalog.ads_etl_owner.STG_SMA_CASE_PHASE_PROPERTIES_HUMANTASK",
    "gap_catalog.ads_etl_owner.STG_SMA_CASE_PHASE_PROPERTIES_PARSE",
    "gap_catalog.ads_etl_owner.STG_SMA_CASE_PHASE_PROPERTIES",
    "gap_catalog.ads_owner.CASES",
    "gap_catalog.ads_owner.CASE_PHASES",
    "gap_catalog.ads_owner.CASE_PHASE_PROPERTY_TYPES",
    "gap_catalog.ads_OWNER.case_types",
    "gap_catalog.ADS_ETL_OWNER.STG_SMA_PROCESS_EVENTS_PARSE",
    "gap_catalog.dwh_owner.parties",
    "gap_catalog.ads_src_20250901.event_types",
    "gap_catalog.ads_src_20250901.status_reason",
    "gap_catalog.ads_src_20250901.PROD_INST_CASES",
    "gap_catalog.ads_owner.PROCESS_EVENTS",
    "gap_catalog.ads_etl_owner.XC_SMA_CASE_PHASE_PROPERTIES_HUMANTASK",
    "gap_catalog.ads_etl_owner.XC_SMA_CASE_PHASE_PROPERTIES_MAIN",
    "gap_catalog.ads_etl_owner.XC_SMA_CASE_PHASE_PROPERTIES_POT_OWN",
    "gap_catalog.ads_etl_owner.XC_SMA_PROCESS_EVENTS_MAIN",
    "gap_catalog.ads_etl_owner.XC_STG_SMA_CASE_PHASE_PROPERTIES_MAIN",
    "gap_catalog.ads_etl_owner.XC_STG_SMA_CASE_PHASE_PROPERTIES_UPDATE",
]

# COMMAND ----------
# DBTITLE 1, Get Table Sizes and Row Counts

from pyspark.sql.functions import col

# Collect one report record (a dict) per entry of `table_list`.
# Each record has the keys: Schema, Table, Storage_GB, Row_count, type,
# num_files, format. A table that cannot be analyzed still gets a record,
# marked with "ERROR" and zeroed metrics, so the report keeps one row per table.
results = []
total_tables = len(table_list)

print(f"Analyzing {total_tables} tables...")
print("=" * 100)

for i, table_name in enumerate(table_list, 1):
    # Split the qualified name once, up front, so the success and error paths
    # report the same Schema/Table values: everything before the last dot is
    # the schema part ("catalog.schema" or "schema"); an unqualified name
    # yields an empty schema part.
    full_schema, _sep, table = table_name.rpartition(".")

    print(f"[{i}/{total_tables}] Processing {table_name}...", end=" ")

    try:
        # Table metadata (format, file count, size in bytes).
        detail = spark.sql(f"DESCRIBE DETAIL {table_name}").first()

        # Exact row count; this runs a count job over the table.
        row_count = spark.table(table_name).count()

        # NOTE(review): sizeInBytes / numFiles are presumably null for some
        # table kinds - verify. A missing value is treated as 0 so the row
        # count already computed above is not thrown away by a TypeError.
        size_gb = round((detail.sizeInBytes or 0) / (1024**3), 2)
        num_files = detail.numFiles or 0

        record = {
            "Schema": full_schema,
            "Table": table,
            "Storage_GB": size_gb,
            "Row_count": row_count,
            "type": detail.format,
            "num_files": num_files,
            "format": detail.format,
        }
        print(f"✅ {size_gb} GB, {row_count} rows")

    except Exception as e:
        # Best-effort report: log the failure and keep going with the
        # remaining tables instead of aborting the whole run.
        print(f"❌ ERROR: {str(e)}")
        record = {
            "Schema": full_schema,
            "Table": table,
            # Must be a float like the success rows: mixing int 0 with float
            # sizes can break Spark's schema inference over these dicts.
            "Storage_GB": 0.0,
            "Row_count": 0,
            "type": "ERROR",
            "num_files": 0,
            "format": "ERROR",
        }

    results.append(record)

print("=" * 100)
print("✅ Analysis complete")

# COMMAND ----------
# DBTITLE 1, Display Results - Excel Ready Format

# Build the report DataFrame against an explicit schema instead of letting
# Spark infer one from the list of dicts. Inference cannot handle an empty
# `results` list and can fail to merge an integer 0 with float sizes in the
# Storage_GB column; an explicit schema also fixes the column order, so no
# reordering select is needed afterwards.
from pyspark.sql.types import (
    DoubleType,
    LongType,
    StringType,
    StructField,
    StructType,
)

result_schema = StructType([
    StructField("Schema", StringType(), True),
    StructField("Table", StringType(), True),
    StructField("Storage_GB", DoubleType(), True),
    StructField("Row_count", LongType(), True),
    StructField("type", StringType(), True),
    StructField("num_files", LongType(), True),
    StructField("format", StringType(), True),
])


def _cast_or_none(cast, value):
    """Return cast(value), passing None through unchanged."""
    return None if value is None else cast(value)


# Coerce numeric fields so every row matches the declared column types
# (e.g. an integer 0 in Storage_GB becomes 0.0); None stays None (null).
result_rows = [
    (
        rec["Schema"],
        rec["Table"],
        _cast_or_none(float, rec["Storage_GB"]),
        _cast_or_none(int, rec["Row_count"]),
        rec["type"],
        _cast_or_none(int, rec["num_files"]),
        rec["format"],
    )
    for rec in results
]

result_df = spark.createDataFrame(result_rows, schema=result_schema)

# Largest tables first.
result_df_sorted = result_df.orderBy(col("Storage_GB").desc())

# Render the sorted report in the notebook output.
display(result_df_sorted)

# COMMAND ----------
# DBTITLE 1, Print Summary

from pyspark.sql.functions import sum as spark_sum, count as spark_count

# Aggregate the per-table report into a single row of totals.
summary_columns = [
    spark_count("Table").alias("total_tables"),
    spark_sum("Storage_GB").alias("total_size_gb"),
    spark_sum("Row_count").alias("total_rows"),
    spark_sum("num_files").alias("total_files"),
]
summary = result_df.select(*summary_columns).first()

# Print the totals as a fixed-width text banner.
banner = "=" * 80
print(
    banner,
    "SUMMARY",
    banner,
    f"Total Tables:  {summary.total_tables}",
    f"Total Size:    {summary.total_size_gb} GB",
    f"Total Rows:    {summary.total_rows}",
    f"Total Files:   {summary.total_files}",
    banner,
    sep="\n",
)
