In [0]:
%sql
-- Snapshot the sample tables from samples.tpch into catalog_cp.bronze.
-- Every object below is referenced by its fully qualified name, so the
-- statements do not depend on the session's current catalog or schema.

-- Leave the session pointed at catalog_cp (USE CATALOG also resets the
-- current schema to the catalog default).
USE CATALOG catalog_cp;
CREATE SCHEMA IF NOT EXISTS catalog_cp.bronze;

-- CREATE OR REPLACE makes the cell safe to re-run: each table is rebuilt
-- from the current contents of its source table.
CREATE OR REPLACE TABLE catalog_cp.bronze.customer AS SELECT * FROM samples.tpch.customer;
CREATE OR REPLACE TABLE catalog_cp.bronze.orders AS SELECT * FROM samples.tpch.orders;
CREATE OR REPLACE TABLE catalog_cp.bronze.lineitem AS SELECT * FROM samples.tpch.lineitem;
CREATE OR REPLACE TABLE catalog_cp.bronze.nation AS SELECT * FROM samples.tpch.nation;
CREATE OR REPLACE TABLE catalog_cp.bronze.part AS SELECT * FROM samples.tpch.part;
CREATE OR REPLACE TABLE catalog_cp.bronze.partsupp AS SELECT * FROM samples.tpch.partsupp;
CREATE OR REPLACE TABLE catalog_cp.bronze.supplier AS SELECT * FROM samples.tpch.supplier;
CREATE OR REPLACE TABLE catalog_cp.bronze.region AS SELECT * FROM samples.tpch.region;

-- Quick sanity check that the copy produced rows.
SELECT COUNT(*) FROM catalog_cp.bronze.customer;



In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat_ws, sha2, struct, to_json

# Obtain the SparkSession used by every table read below; getOrCreate()
# returns the already-active session when one exists instead of building a new one.
spark = SparkSession.builder.getOrCreate()

# Names of the tables to validate; each is looked up in both the source
# and the target location defined below.
tables = [
    'customer',
    'orders',
    'lineitem',
    'nation',
    'part',
    'partsupp',
    'supplier',
    'region',
]

# Reference data (source) and the copy under validation (target).
source_catalog, source_schema = "samples", "tpch"
target_catalog, target_schema = "catalog_cp", "bronze"

def get_row_hashes_and_count(catalog, schema, table):
    """Return one SHA-256 fingerprint per row of a table, plus its row count.

    Each row is serialized with ``to_json(struct(...))`` before hashing.
    The JSON form keeps field names and escapes values, so a NULL in one
    column cannot be confused with a NULL in another and column values
    cannot collide with a separator. (Joining the columns with
    ``concat_ws`` would silently skip NULL values and make such rows
    indistinguishable.)

    Args:
        catalog: Catalog that contains the table.
        schema: Schema (database) inside ``catalog``.
        table: Table name inside ``schema``.

    Returns:
        A ``(row_hashes, count)`` tuple: ``row_hashes`` is a DataFrame with a
        single string column ``row_hash`` (one entry per input row), and
        ``count`` is the number of rows in the table.
    """
    df = spark.table(f"{catalog}.{schema}.{table}")
    # Serialize the whole row first so the hash covers column identity as
    # well as column values.
    row_json = to_json(struct(*df.columns))
    row_hashes = df.select(sha2(row_json, 256).alias("row_hash"))
    return row_hashes, df.count()

# Compare every table in `tables` between the source and target locations
# and print a per-table verdict followed by an overall summary.
all_match = True

for table in tables:
    source_hashes, source_count = get_row_hashes_and_count(source_catalog, source_schema, table)
    target_hashes, target_count = get_row_hashes_and_count(target_catalog, target_schema, table)

    row_count_match = (source_count == target_count)

    # exceptAll preserves duplicates (multiset difference). subtract() is
    # EXCEPT DISTINCT and would report a match when the two sides contain the
    # same distinct rows but in different multiplicities.
    missing_in_target = source_hashes.exceptAll(target_hashes)
    missing_in_source = target_hashes.exceptAll(source_hashes)

    # Every count() launches a Spark job, so evaluate each difference once
    # and reuse the numbers for both the verdict and the report.
    missing_in_target_count = missing_in_target.count()
    missing_in_source_count = missing_in_source.count()

    hash_match = (missing_in_target_count == 0 and missing_in_source_count == 0)

    if row_count_match and hash_match:
        print(f"Table '{table}': MATCH (rows: {source_count})")
    else:
        print(f"Table '{table}': MISMATCH")
        if not row_count_match:
            print(f"   - Row count differs: source={source_count}, target={target_count}")
        if missing_in_target_count > 0:
            print(f"   - {missing_in_target_count} rows missing in target")
        if missing_in_source_count > 0:
            print(f"   - {missing_in_source_count} rows missing in source")
        all_match = False

if all_match:
    print("\nAll tables match.")
else:
    print("\n One or more tables do not match.")
