In [0]:
from pyspark.sql.functions import * 
import json

In [0]:
%sql
-- Schema registry: each row records one schema version captured for a source table.
CREATE TABLE IF NOT EXISTS inventory_project.metadata.schema_registry (
  source         STRING,
  schema_version INT,
  schema_json    STRING,      -- schema serialized as a JSON string
  effective_from TIMESTAMP,
  status         STRING       -- lifecycle flag; publish_schema in this notebook writes 'ACTIVE' (e.g. also 'deprecated')
)

In [0]:
def publish_schema(source_name):
  """Register a new schema version for a bronze table when its schema changed.

  Reads the current schema of ``inventory_project.bronze.<source_name>`` and
  compares it with the highest-versioned row for that source in
  ``inventory_project.metadata.schema_registry``. When columns were added,
  dropped, or changed type (or no schema is registered yet), a new row with
  ``schema_version = latest + 1`` and status ``ACTIVE`` is appended to the
  registry. The outcome is printed either way.

  Args:
    source_name: Bronze table name; also stored as the ``source`` key in the
      registry.

  Returns:
    None.
  """
  registry_table = "inventory_project.metadata.schema_registry"

  def compare_schema(current_schema, latest_schema):
    """Diff two ``StructType.json()`` strings by top-level field name and type."""
    if not latest_schema:
      # Nothing registered yet: every column counts as new.
      return {"new_cols": "ALL", "drop_cols" : [], "type_change" : {}}
    # Only name and type are compared; other field attributes (e.g. nullable) are ignored.
    current_fields = {f["name"]: f["type"] for f in json.loads(current_schema)["fields"]}
    latest_fields = {f["name"] : f["type"] for f in json.loads(latest_schema)["fields"]}

    new_cols = set(current_fields) - set(latest_fields)
    drop_cols = set(latest_fields) - set(current_fields)
    # Maps column -> (registered type, current type) for columns present in both.
    type_change = {
      c: (latest_fields[c], current_fields[c])
      for c in current_fields
      if c in latest_fields and latest_fields[c] != current_fields[c]
    }
    # Sorted so the printed diff is deterministic from run to run.
    return {"new_cols": sorted(new_cols), "drop_cols": sorted(drop_cols), "type_change": type_change}

  bronze_df = spark.read.table(f"inventory_project.bronze.{source_name}")
  current_schema = bronze_df.schema.json()

  try:
    latest_schema_row = (spark.table(registry_table)
                              .filter(col("source") == source_name)
                              .orderBy(col("schema_version").desc())
                              .first())
    latest_schema = latest_schema_row['schema_json'] if latest_schema_row else None
    latest_version = latest_schema_row['schema_version'] if latest_schema_row else 0
  except Exception as exc:
    # Best-effort fallback (e.g. the registry cannot be read): treat the source
    # as unregistered, but say why instead of hiding the failure.
    print(f"⚠️ Could not read schema registry for {source_name}; treating as unregistered: {exc}")
    latest_schema = None
    latest_version = 0

  schema_diff = compare_schema(current_schema, latest_schema)
  print(schema_diff)
  if schema_diff["new_cols"] or schema_diff["drop_cols"] or schema_diff["type_change"]:
    print(f"Schema change detected for {source_name}: {schema_diff}")
    new_version = latest_version + 1
    df_new_schema = spark.createDataFrame(
      [(source_name,new_version,current_schema)],
      ["source","schema_version","schema_json"]
    ).withColumn("effective_from", current_timestamp())\
      .withColumn("status", lit("ACTIVE"))\
      .withColumn("schema_version", col("schema_version").cast("int") )  # Python ints are inferred as bigint; registry column is INT
    df_new_schema.write.mode("append").saveAsTable(registry_table)
    print(f"✅ Registered new schema version {new_version} for {source_name}")
  else:
    print(f"✅ No schema changes for {source_name}")


In [0]:
# Select the bronze-layer tables whose lookup description starts with "WMS"
# and publish the schema of each one.
fileLookup_df = (
    spark.read.table("inventory_project.metadata.filelookup")
         .filter((col("schema") == "bronze") & (col("description").startswith("WMS")))
)
# Collected to the driver: the loop below iterates plain Row objects.
fileLookup_df = fileLookup_df.collect()
for row in fileLookup_df:
  table_name = row['table_name']
  try:
    print(f"Started processing {table_name}")
    publish_schema(table_name)
  except Exception as e:
    # The exception must be bound to a name to be reported; a bare `except:`
    # left `e` undefined and the handler itself failed with NameError.
    print(f"Error processing {table_name}: {e}")
    # Bare raise stops the run and keeps the original traceback.
    raise
  

In [0]:
%sql
-- Inspect the full contents of the schema registry.
SELECT *
FROM inventory_project.metadata.schema_registry

In [0]:
# Scratch check: look up the latest registered schema for a single source.
source_name = "wms_inventory_snapshot_raw"
bronze_df = spark.read.table(f"inventory_project.bronze.{source_name}")
current_schema = bronze_df.schema.json()
try:
    latest_schema_row = (spark.table("inventory_project.metadata.schema_registry")
                              .filter(col("source") == source_name)
                              .orderBy(col("schema_version").desc())
                              .first())
    latest_schema = latest_schema_row['schema_json'] if latest_schema_row else None
    latest_version = latest_schema_row['schema_version'] if latest_schema_row else 0
except Exception as exc:
    # Fall back to "nothing registered", but report the reason rather than
    # swallowing it silently.
    print(f"Could not read schema registry for {source_name}: {exc}")
    latest_schema = None
    latest_version = 0

print(latest_schema)
print(latest_version)

In [0]:
# Fetch the highest-versioned registry row for `source_name` (set in an earlier
# cell), this time without any exception handling around the lookup.
latest_schema_row = (
    spark.table("inventory_project.metadata.schema_registry")
    .where(col("source") == source_name)
    .sort(col("schema_version").desc())
    .first()
)

if latest_schema_row:
    latest_schema = latest_schema_row['schema_json']
    latest_version = latest_schema_row['schema_version']
else:
    # No entry registered for this source.
    latest_schema = None
    latest_version = 0

print(latest_schema)
print(latest_version)