In [0]:
from pyspark.sql import functions as F, Window
from pyspark.sql.types import DecimalType, IntegerType, TimestampType, DateType, StringType

def trim_all(df):
    """Return ``df`` with every string column whitespace-normalized.

    For each column whose dtype is reported as ``"string"``, every run of
    whitespace is collapsed to a single space and leading/trailing spaces are
    then removed. All other columns pass through untouched. Column order and
    column names are preserved.

    Args:
        df: Spark DataFrame to clean.

    Returns:
        A new DataFrame produced by a single ``select`` over the cleaned
        column expressions.
    """
    exprs = []
    for name, dtype in df.dtypes:
        # Backtick-quote the name so that a column called e.g. "a.b" resolves
        # as one top-level column rather than as field "b" of struct "a".
        source = F.col("`" + name.replace("`", "``") + "`")
        if dtype == "string":
            # Collapse first, trim second: Spark's trim() strips only space
            # characters, so a leading/trailing tab or newline has to be turned
            # into a space *before* trimming or it would survive as a stray
            # space at the edge of the value.
            collapsed = F.regexp_replace(source, r"\s+", " ")
            exprs.append(F.trim(collapsed).alias(name))
        else:
            exprs.append(source)
    return df.select(*exprs)

def clean_number(col):
    """Build a Column expression that strips numeric formatting from ``col``.

    Two regex passes are applied to the named column:

    1. A leading run of characters that are not digits, ``-``, ``.`` or ``,``
       (for example a currency symbol), plus any whitespace around it, is
       removed.
    2. Every comma and whitespace character is removed.

    Args:
        col: Name of the column to clean.

    Returns:
        A Column holding the cleaned text; no numeric cast is applied here.
    """
    leading_symbol = r"^\s*[^\d\-\.,]+\s*"
    separators = r"[,\s]"

    without_prefix = F.regexp_replace(F.col(col), leading_symbol, "")
    return F.regexp_replace(without_prefix, separators, "")

def to_decimal(col, p=18, s=2):
    """Clean the named column with ``clean_number`` and cast it to a decimal.

    Args:
        col: Name of the column to convert.
        p: Precision passed to ``DecimalType``.
        s: Scale passed to ``DecimalType``.

    Returns:
        A Column expression of type ``DecimalType(p, s)``.
    """
    cleaned = clean_number(col)
    target_type = DecimalType(p, s)
    return cleaned.cast(target_type)

def to_int(col):
    """Clean the named column with ``clean_number`` and cast it to an integer.

    Args:
        col: Name of the column to convert.

    Returns:
        A Column expression of type ``IntegerType``.
    """
    cleaned = clean_number(col)
    return cleaned.cast(IntegerType())

def to_ts(col):
    """Build a timestamp-parsing expression for the named column.

    Several explicit patterns are tried in a fixed order, followed by a final
    ``to_timestamp`` call with no pattern. The attempts are wrapped in
    ``coalesce``, so the first one that yields a non-null value is used.

    Args:
        col: Name of the column holding the timestamp text.

    Returns:
        A Column expression combining all parse attempts.
    """
    source = F.col(col)
    explicit_formats = (
        "M/d/yyyy H:mm",
        "M/d/yyyy HH:mm",
        "yyyy-MM-dd HH:mm:ss",
        "yyyy/MM/dd HH:mm:ss",
    )
    attempts = [F.to_timestamp(source, fmt) for fmt in explicit_formats]
    # Last resort: no explicit pattern, let Spark apply its default parsing.
    attempts.append(F.to_timestamp(source))
    return F.coalesce(*attempts)

def dedupe_latest(df, key_cols, order_col):
    """Keep one row per key: the one with the greatest ``order_col`` value.

    Rows are partitioned by ``key_cols`` and numbered by ``order_col``
    descending with nulls last; only row number 1 of each partition is kept.
    When several rows of a key tie on ``order_col``, which of them is kept is
    not determined by this function.

    Args:
        df: Spark DataFrame to deduplicate.
        key_cols: Column name, or list/tuple of column names, forming the key.
        order_col: Name of the column used to pick the latest row
            (e.g. ``LastModifiedDate``).

    Returns:
        A DataFrame with the same columns as ``df`` and one row per key.
    """
    # A bare string would otherwise be splatted into single characters by
    # ``*key_cols`` ("ID" -> "I", "D"), so treat it as a one-column key.
    if isinstance(key_cols, str):
        key_cols = [key_cols]

    # Choose a scratch column name that cannot overwrite (and then drop) a
    # real column. Compare case-insensitively, matching Spark's default
    # column-name resolution.
    taken = {existing.lower() for existing in df.columns}
    rank_col = "_rn"
    while rank_col.lower() in taken:
        rank_col = "_" + rank_col

    w = Window.partitionBy(*key_cols).orderBy(F.col(order_col).desc_nulls_last())
    return (
        df.withColumn(rank_col, F.row_number().over(w))
        .filter(F.col(rank_col) == 1)
        .drop(rank_col)
    )


In [0]:
CATALOG = "workspace"
SCHEMA  = "ai_bi_reporting"
spark.sql(f"USE CATALOG {CATALOG}")
spark.sql(f"USE SCHEMA  {SCHEMA}")

# Source (`_bz`) and target (`_slv`) table names for the account data.
bz = f"{CATALOG}.{SCHEMA}.anon_bst_acct_bz"
slv = f"{CATALOG}.{SCHEMA}.anon_bst_acct_slv"

# Load the source table and normalize whitespace in its string columns.
df = trim_all(spark.table(bz))

# Cast the numeric columns, one column per statement.
df = df.withColumn("Id", to_int("Id"))
df = df.withColumn("FY_Revenue_Target", to_decimal("FY_Revenue_Target", 18, 2))
df = df.withColumn("CVA_NPS", to_int("CVA_NPS"))
df = df.withColumn("CVA_CSAT", to_int("CVA_CSAT"))

# No modification timestamp is used for this table, so simply keep one row
# per Id. If a reliable timestamp column becomes available, dedupe_latest
# can be used to keep the most recent row instead.
df = df.dropDuplicates(["Id"])

# Replace the target table, allowing its schema to be overwritten.
(
    df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("delta.columnMapping.mode", "name")
    .saveAsTable(slv)
)


In [0]:
from pyspark.sql.functions import expr

# Source (`_bz`) and target (`_slv`) table names for the opportunity data.
bz = f"{CATALOG}.{SCHEMA}.anon_bst_opp_bz"
slv = f"{CATALOG}.{SCHEMA}.anon_bst_opp_slv"

# Load the source table and normalize whitespace in its string columns.
df = trim_all(spark.table(bz))

# Integer columns: converted with SQL try_cast. Columns that are absent from
# the table are skipped.
for c in ("ID", "AccountID", "Auto_Number", "ContractRenewalMonths"):
    if c not in df.columns:
        continue
    df = df.withColumn(c, expr(f"try_cast({c} AS INT)"))

# Decimal columns: cleaned and cast to DecimalType(18, 2) when present.
for c in (
    "Amount_ACV",
    "CX_Ops_Overall_ACV",
    "CX_Ops_Overall_ECR",
    "Amount_ACV_Static",
    "ECR_Converted",
    "Amount_ACV_USD",
):
    if c not in df.columns:
        continue
    df = df.withColumn(c, to_decimal(c, 18, 2))

# Timestamp columns: parsed with to_ts when present.
for c in ("CloseDate", "LastModifiedDate"):
    if c not in df.columns:
        continue
    df = df.withColumn(c, to_ts(c))

# One row per ID: the latest by LastModifiedDate when that column exists,
# otherwise a plain dropDuplicates on ID.
if "LastModifiedDate" in df.columns:
    order_col = "LastModifiedDate"
    df = dedupe_latest(df, ["ID"], order_col)
else:
    order_col = None
    df = df.dropDuplicates(["ID"])

# Replace the target table, allowing its schema to be overwritten.
(
    df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("delta.columnMapping.mode", "name")
    .saveAsTable(slv)
)

In [0]:
# Source (`_bz`) and target (`_slv`) table names for the customer-intelligence
# opportunity data.
bz = f"{CATALOG}.{SCHEMA}.anon_cust_intelligence_opportunity_bz"
slv = f"{CATALOG}.{SCHEMA}.anon_cust_intelligence_opportunity_slv"

# Load the source table and normalize whitespace in its string columns.
df = trim_all(spark.table(bz))

# ID is cast only when the column is present.
if "ID" in df.columns:
    df = df.withColumn("ID", to_int("ID"))

# Keep one row per ID. Unlike the cast above, this step references ID
# unconditionally. Revisit the key here if a better unique key is identified.
df = df.dropDuplicates(["ID"])

# Replace the target table, allowing its schema to be overwritten.
(
    df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("delta.columnMapping.mode", "name")
    .saveAsTable(slv)
)
