![Medallion framework diagram](/Workspace/Users/beenpornkanok@gmail.com/medallian_framework/Medallian.png)

In [0]:
# --- Pipeline configuration ---------------------------------------------
# Name of the bronze source table.
sources_name = "dev_catalog.bronze.shop_name_raw"

# Column(s) forming the business key; checked for NULLs and for duplicates.
keys = ["shop_id"]

# Target column -> Spark type name; drives type validation and the final cast.
schema = dict(
    shop_id="int",
    shop_name="string",
    branch_name="string",
    file_dt="date",
)

# Silver table the result is written to, and the write mode
# (a one-element list because it is unpacked with `*mode` at write time).
target_table = "dev_catalog.silver.shop_nameb_dwh"
mode = ["overwrite"]

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import *

In [0]:
# Load the bronze table named by `sources_name` (config cell) instead of
# repeating the table name here, so the source can be changed in one place.
stage_1 = (
    spark.table(sources_name)
    # Surrogate key `sk`: unique per row (not consecutive). Later cells join on
    # it to remove rejected rows.
    .withColumn("sk", monotonically_increasing_id())
    # Drop `_load_dt` / `_load_dttm`, which are not used downstream.
    .drop("_load_dt", "_load_dttm")
)
stage_1.display()

In [0]:
# function control column
def list_all_control_col(df:DataFrame)->list[str]:
    """Return the names of the control columns of ``df``.

    A control column is any column whose name starts with an underscore.
    The names are returned in the same order as ``df.columns``.
    """
    control_cols = []
    for name in df.columns:
        if name.startswith("_"):
            control_cols.append(name)
    return control_cols


In [0]:
# unhardcode
# Regex that the text form of a value must match for each non-string type.
data_rules = {
    "int": "^[0-9]+$",
    "date": "^\\d{4}-\\d{2}-\\d{2}$"
}

# Start from stage_1 once and accumulate every flag on the same frame.
# (Re-deriving from stage_1 inside the loop would keep only the last key's
# flag when `keys` holds more than one column.)
invalid_df = stage_1

# key is null
for _col_name in keys:
    invalid_df = invalid_df.withColumn(f"_{_col_name}_missing", col(_col_name).isNull())

# invalid datatype
for _col_name, _col_type in schema.items():
    # Strings need no format check.
    if _col_type == "string":
        continue
    invalid_df = invalid_df.withColumn(
        f"_{_col_name}_invalid",
        # Anything that does not positively match the rule is flagged.
        when(col(_col_name).rlike(data_rules[_col_type]), False).otherwise(True),
    )

# SQL expression "flag1 OR flag2 OR ..." over every control column.
invalid_col = " OR ".join(list_all_control_col(invalid_df))

# Keep only the rows with at least one flag raised.
invalid_df = invalid_df.filter(invalid_col)
display(invalid_df)
    

In [0]:
# # stage 2 clear key is missing
# # invalid datatype
# int_rules = "^[0-9]+$"
# date_rules = "^\\d{4}-\\d{2}-\\d{2}$"
# invalid_df = (
#     stage_1
#     .withColumn("_shop_id_missing", col("shop_id").isNull())
#     .withColumn("_shop_id_invalid",when(col("shop_id").rlike(int_rules),False).otherwise(True))
#     .withColumn("_file_dt_invalid",when(col("file_dt").rlike(date_rules),False).otherwise(True))
#               )

# invalid_df.display()

In [0]:
# Keep only rows with at least one control flag set, then remove the flag
# columns so invalid_df lines up with the other reject sets in the later
# unionByName.
_control_cols = list_all_control_col(invalid_df)
invalid_col = " OR ".join(_control_cols)

# Guard: if the flags are already gone (e.g. the cell is re-run) there is
# nothing left to filter on, and an empty filter expression would fail.
if _control_cols:
    invalid_df = (
        invalid_df
        .filter(invalid_col)
        # Drop whichever flags were generated from `keys` / `schema` rather
        # than a hard-coded list of column names.
        .drop(*_control_cols)
    )
invalid_df.display()

In [0]:
# Rows of invalid_df whose file_dt fails the date rule. The check is re-derived
# from `data_rules["date"]` instead of reading `_file_dt_invalid`, because that
# flag column has already been dropped from invalid_df before this cell runs.
# NOTE(review): a row selected here may also have a bad key; it is still
# corrected and added back later — confirm that is intended.
date_invalid_df = invalid_df.filter(
    when(col("file_dt").rlike(data_rules["date"]), False).otherwise(True)
)
date_invalid_df.display()

In [0]:
# Re-read file_dt using the compact `yyyyMMdd` pattern and overwrite the
# column with the parsed result.
correct_date_df = date_invalid_df.withColumn(
    "file_dt",
    to_date(col("file_dt"), "yyyyMMdd"),
)
correct_date_df.display()

In [0]:
# # row duplicate
# partition_by_all = Window.partitionBy("shop_id","shop_name","branch_name","file_dt").orderBy("shop_id")

# row_dup_df = (
#   stage_2
#   .withColumn("rn", row_number().over(partition_by_all))
#   .filter(col("rn")>1)
#   .drop("rn")
# #   .withColumn("_remarks",lit("row_dup"))
# )
# row_dup_df.display()
# # key duplicate
# partition_by_key = Window.partitionBy("shop_id").orderBy("shop_id")

# key_dup_df = (
#     stage_2
#     .join(row_dup_df, ["sk"],"left_anti")
#     .withColumn("rn", count("*").over(partition_by_key))
#     .filter(col("rn") == 2)
#     .drop("rn")
#     # .withColumn("_remarks",lit("key_dup"))
# )
# key_dup_df.display()

In [0]:
# unhardcode
# row duplicate: rows identical on every schema column. Within each group of
# identical rows the first one (rn == 1) is kept; the others are rejected.
partition_by_all = Window.partitionBy(list(schema.keys())).orderBy(keys)

row_dup_df = (
    stage_1
    .withColumn("rn", row_number().over(partition_by_all))
    .filter(col("rn") > 1)
    .drop("rn")
)
row_dup_df.display()

# key duplicate: once exact row duplicates are set aside, any key still shared
# by more than one row is ambiguous, so every row carrying that key is rejected.
partition_by_key = Window.partitionBy(keys).orderBy(keys)

key_dup_df = (
    stage_1
    .join(row_dup_df, ["sk"], "left_anti")
    # Number of rows sharing the key.
    .withColumn("rn", count("*").over(partition_by_key))
    # `> 1` rather than `== 2`: a key repeated three or more times is a
    # duplicate as well and must not slip through.
    .filter(col("rn") > 1)
    .drop("rn")
)
key_dup_df.display()


In [0]:
# All duplicate rejects: exact row duplicates followed by key duplicates.
duplicate_df = (
    row_dup_df
    .unionByName(key_dup_df)
)
duplicate_df.display()

In [0]:
# Rows without a branch name are treated as errors.
_branch_is_missing = col("branch_name").isNull()
error_df = stage_1.where(_branch_is_missing)

print("error_df")
error_df.display()

In [0]:
# Everything that must not reach the clean set: invalid rows, duplicates and
# error rows, stacked by column name.
reject_df = (
    invalid_df
    .unionByName(duplicate_df)
    .unionByName(error_df)
)
print("reject_df")
reject_df.display()

In [0]:
# Keep only the stage_1 rows whose surrogate key does not appear in reject_df.
stage_2 = stage_1.join(reject_df, on=["sk"], how="left_anti")
stage_2.display()

In [0]:
# Clean rows (stage_1 minus every rejected `sk`) plus the rows whose date was
# repaired. Built from stage_1 rather than from the previous `stage_2` so that
# re-running this cell does not append the corrected rows a second time.
stage_2 = (
    stage_1
    .join(reject_df, ["sk"], "left_anti")
    .unionByName(correct_date_df)
)
stage_2.display()

In [0]:
# stage_3 = (
#     stage_2
#     .select(
#     col("sk")
#     ,col("shop_id").cast("int")
#     ,col("shop_name").cast("string")
#     ,col("branch_name").cast("string")
#     ,col("file_dt").cast("date") 
#     )
# )
# stage_3.display()

In [0]:
# unhardcode
# One cast expression per schema entry, in schema order.
select_cast_col = []
for _col_name, _col_type in schema.items():
    select_cast_col.append(col(_col_name).cast(_col_type))

In [0]:
# Show the generated cast expressions on one line, separated by spaces.
print(" ".join(str(_expr) for _expr in select_cast_col))

In [0]:
# Project the typed schema columns followed by the surrogate key.
_stage_3_cols = [*select_cast_col, "sk"]
stage_3 = stage_2.select(_stage_3_cols)
stage_3.display()

In [0]:

# Recreate the silver table from stage_3. The table name is taken from
# `target_table` for the drop, the write and the read-back so that all three
# always refer to the same table.
# NOTE(review): dropping the table first presumably resets its history —
# confirm this is intended given the `desc history` check that follows.
spark.sql(f"drop table if exists {target_table}")

stage_3.write.mode(*mode).saveAsTable(
    target_table
)

# Read back the table that was just written.
spark.table(target_table).display()

In [0]:
# Show the change history of the table this notebook writes to.
spark.sql(f"desc history {target_table}").display()
