In [0]:
# `os` is imported here because this cell runs before the notebook's import cell
# and nothing else brings it into scope on a fresh kernel.
import os

adls_account = "insurancedatalake01"

# The account key is read from the environment so it never appears in the notebook.
storage_key = os.environ.get("AZURE_STORAGE_KEY")
if not storage_key:
    # Fail fast with an actionable message instead of handing None/"" to Spark.
    raise RuntimeError(
        "Environment variable AZURE_STORAGE_KEY is not set; "
        f"cannot configure access to storage account '{adls_account}'."
    )

# Register the account key on the Spark session for this ADLS Gen2 endpoint.
spark.conf.set(
    f"fs.azure.account.key.{adls_account}.dfs.core.windows.net",
    storage_key
)

In [0]:
from pyspark.sql import functions as F

# =========================
# CONFIG - ADJUST AS NEEDED
# =========================

# Storage account and container that hold the data lake.
adls_account = "insurancedatalake01"
container = "datalake"

# Name of the metastore database the gold tables are registered in.
gold_db_name = "gold"

# ABFSS URI of the gold layer root, built from the container and account above.
gold_path = "abfss://{}@{}.dfs.core.windows.net/gold".format(container, adls_account)

print("Gold root path:", gold_path)

# =========================
# HELPER: run a SQL statement and log it
# =========================

def run_sql(sql_text: str):
    """Print a SQL statement to stdout, then execute it via ``spark.sql``.

    Args:
        sql_text: The SQL statement to run on the notebook's Spark session.

    The result of ``spark.sql`` is not returned.
    """
    banner = "\n>>> Running SQL:\n{}".format(sql_text)
    print(banner)
    spark.sql(sql_text)


Gold root path: abfss://datalake@insurancedatalake01.dfs.core.windows.net/gold


In [0]:
# =========================
# CREATE THE GOLD DATABASE
# =========================

# Create the database if it is missing, then make it the session default.
for statement in (
    f"CREATE DATABASE IF NOT EXISTS {gold_db_name}",
    f"USE {gold_db_name}",
):
    run_sql(statement)

# Echo the active database so the switch is visible in the cell output.
active_db = spark.catalog.currentDatabase()
print(f"Current database: {active_db}")



>>> Running SQL:
CREATE DATABASE IF NOT EXISTS gold

>>> Running SQL:
USE gold
Current database: gold


In [0]:
# =========================
# LIST THE "TABLE FOLDERS" UNDER GOLD
# =========================

# `folders` is reused by the registration cell further down.
folders = dbutils.fs.ls(gold_path)

print("Found subfolders under /gold:")
for entry in folders:
    # One line per listed entry: its name and its full path.
    print(f"- {entry.name} => {entry.path}")


Found subfolders under /gold:
- dim_customer/ => abfss://datalake@insurancedatalake01.dfs.core.windows.net/gold/dim_customer/
- dim_date/ => abfss://datalake@insurancedatalake01.dfs.core.windows.net/gold/dim_date/
- dim_handler/ => abfss://datalake@insurancedatalake01.dfs.core.windows.net/gold/dim_handler/
- dim_policy/ => abfss://datalake@insurancedatalake01.dfs.core.windows.net/gold/dim_policy/
- fact_claims/ => abfss://datalake@insurancedatalake01.dfs.core.windows.net/gold/fact_claims/
- fact_claims_daily/ => abfss://datalake@insurancedatalake01.dfs.core.windows.net/gold/fact_claims_daily/
- mart_customer_claims/ => abfss://datalake@insurancedatalake01.dfs.core.windows.net/gold/mart_customer_claims/
- mart_handler_performance/ => abfss://datalake@insurancedatalake01.dfs.core.windows.net/gold/mart_handler_performance/
- mart_policy_performance/ => abfss://datalake@insurancedatalake01.dfs.core.windows.net/gold/mart_policy_performance/


In [0]:
# =========================
# AUTO-REGISTER GOLD TABLES
# =========================

# For every directory directly under the gold root, register an external
# Delta table named after the folder and pointing at the folder's path.
for folder in folders:
    # Skip loose files; only directories are treated as table roots.
    if not folder.isDir():
        continue

    # Table name = folder name without the trailing "/", e.g. "dim_customer".
    table_name = folder.name.rstrip("/")

    # Backtick-quote the identifiers (doubling any embedded backtick) so that
    # folder names which are not bare SQL identifiers -- e.g. starting with a
    # digit or matching a reserved word -- still parse.
    quoted_db = "`" + gold_db_name.replace("`", "``") + "`"
    quoted_table = "`" + table_name.replace("`", "``") + "`"

    table_location = folder.path      # full abfss path
    sql_stmt = f"""
    CREATE TABLE IF NOT EXISTS {quoted_db}.{quoted_table}
    USING DELTA
    LOCATION '{table_location}'
    """

    run_sql(sql_stmt)

print("\n✅ Done registering GOLD tables.")



>>> Running SQL:

    CREATE TABLE IF NOT EXISTS gold.dim_customer
    USING DELTA
    LOCATION 'abfss://datalake@insurancedatalake01.dfs.core.windows.net/gold/dim_customer/'
    

>>> Running SQL:

    CREATE TABLE IF NOT EXISTS gold.dim_date
    USING DELTA
    LOCATION 'abfss://datalake@insurancedatalake01.dfs.core.windows.net/gold/dim_date/'
    

>>> Running SQL:

    CREATE TABLE IF NOT EXISTS gold.dim_handler
    USING DELTA
    LOCATION 'abfss://datalake@insurancedatalake01.dfs.core.windows.net/gold/dim_handler/'
    

>>> Running SQL:

    CREATE TABLE IF NOT EXISTS gold.dim_policy
    USING DELTA
    LOCATION 'abfss://datalake@insurancedatalake01.dfs.core.windows.net/gold/dim_policy/'
    

>>> Running SQL:

    CREATE TABLE IF NOT EXISTS gold.fact_claims
    USING DELTA
    LOCATION 'abfss://datalake@insurancedatalake01.dfs.core.windows.net/gold/fact_claims/'
    

>>> Running SQL:

    CREATE TABLE IF NOT EXISTS gold.fact_claims_daily
    USING DELTA
    LOCATION 'abfss://

In [0]:
# =========================
# VERIFY THE TABLES IN THE GOLD DB
# =========================

# List the tables through the catalog API, printing name and type for each.
tables = spark.catalog.listTables(gold_db_name)
print("Tables in database '{}':".format(gold_db_name))
for table_info in tables:
    print(f"- {table_info.name} (type: {table_info.tableType} )")

# Show the same database through SQL as a rendered table.
show_tables_df = spark.sql("SHOW TABLES IN {}".format(gold_db_name))
display(show_tables_df)


Tables in database 'gold':
- dim_customer (type: EXTERNAL )
- dim_date (type: EXTERNAL )
- dim_handler (type: EXTERNAL )
- dim_policy (type: EXTERNAL )
- fact_claims (type: EXTERNAL )
- fact_claims_daily (type: EXTERNAL )
- mart_customer_claims (type: EXTERNAL )
- mart_handler_performance (type: EXTERNAL )
- mart_policy_performance (type: EXTERNAL )


database,tableName,isTemporary
gold,dim_customer,False
gold,dim_date,False
gold,dim_handler,False
gold,dim_policy,False
gold,fact_claims,False
gold,fact_claims_daily,False
gold,mart_customer_claims,False
gold,mart_handler_performance,False
gold,mart_policy_performance,False
