In [0]:
%sql
-- Copy the eight samples.tpch tables into the catalog_cp.bronze schema.
-- Every source and target object is referenced by its fully qualified
-- three-part name, so none of the statements below depends on the session's
-- current catalog or schema.

-- Kept so that the session's current catalog is still catalog_cp once this
-- cell has finished.
USE CATALOG catalog_cp;
CREATE SCHEMA IF NOT EXISTS catalog_cp.bronze;

-- CREATE OR REPLACE makes the cell re-runnable: each execution rebuilds the
-- target table from the current contents of the source table.
CREATE OR REPLACE TABLE catalog_cp.bronze.customer AS SELECT * FROM samples.tpch.customer;
CREATE OR REPLACE TABLE catalog_cp.bronze.orders AS SELECT * FROM samples.tpch.orders;
CREATE OR REPLACE TABLE catalog_cp.bronze.lineitem AS SELECT * FROM samples.tpch.lineitem;
CREATE OR REPLACE TABLE catalog_cp.bronze.nation AS SELECT * FROM samples.tpch.nation;
CREATE OR REPLACE TABLE catalog_cp.bronze.part AS SELECT * FROM samples.tpch.part;
CREATE OR REPLACE TABLE catalog_cp.bronze.partsupp AS SELECT * FROM samples.tpch.partsupp;
CREATE OR REPLACE TABLE catalog_cp.bronze.supplier AS SELECT * FROM samples.tpch.supplier;
CREATE OR REPLACE TABLE catalog_cp.bronze.region AS SELECT * FROM samples.tpch.region;



In [0]:
# Row-count validation: for every copied table, compare COUNT(*) in the source
# (samples.tpch) with COUNT(*) in the target (catalog_cp.bronze) and label the
# pair MATCH or MISMATCH. The result is displayed only; nothing is raised when
# a mismatch is found.
tables = ['customer', 'orders', 'lineitem', 'nation', 'part', 'partsupp', 'supplier', 'region']

source_catalog = "samples"
source_schema = "tpch"
target_catalog = "catalog_cp"
target_schema = "bronze"

# One single-row SELECT per table; the two counts are scalar subqueries so all
# tables can be checked in a single Spark job via UNION ALL.
union_queries = []
for table in tables:
    union_queries.append(f"""
        SELECT 
            '{table}' AS table_name,
            (SELECT COUNT(*) FROM {source_catalog}.{source_schema}.{table}) AS source_count,
            (SELECT COUNT(*) FROM {target_catalog}.{target_schema}.{table}) AS target_count
    """)

final_query = " UNION ALL ".join(union_queries)

# The result has one row per entry in `tables`, so collecting it to pandas is cheap.
validation_df = spark.sql(final_query).toPandas()

# Vectorised column comparison instead of a row-wise apply(): same labels,
# no per-row Python lambda.
counts_match = validation_df['source_count'] == validation_df['target_count']
validation_df['validation_result'] = counts_match.map({True: 'MATCH', False: 'MISMATCH'})

display(validation_df)
