# Import libraries / Initialization

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.functions import trim, col
from pyspark.sql.types import StringType

# Mapping/Renaming Columns
## Dictionary: source column name (key) -> renamed column name (value)

In [0]:

# Column rename mapping applied by rename_columns(): each key is a column name
# as it appears in the source table, each value is the name it is renamed to.
RENAME_MAP = {
    "prd_id": "product_id",
    "cat_id": "category_id",
    "prd_key": "product_number_key",
    "prd_nm": "product_name",
    "prd_cost": "product_cost",
    "prd_line": "product_line",
    "prd_start_dt": "start_date",
    "prd_end_dt": "end_date"
}

# Reading from Bronze

In [0]:

# Load the bronze CRM product table as the starting DataFrame.
df = spark.read.table("workspace.bronze.crm_prd_info")

# Silver transformation

## Check data types/schema

In [0]:
# Print the column names and types of the table as loaded, before any
# transformation is applied.
df.printSchema()



## Trim whitespace


In [0]:
# Trim leading/trailing spaces from every string column in one projection.
# Chaining withColumn in a loop adds one projection per column to the query
# plan; a single select keeps the plan flat while preserving column order
# and names. Non-string columns are passed through untouched.
df = df.select(
    [
        trim(col(field.name)).alias(field.name)
        if isinstance(field.dataType, StringType)
        else col(field.name)
        for field in df.schema.fields
    ]
)

## Rename columns

In [0]:
def rename_columns(df, RENAME_MAP: dict):
    """Return ``df`` with columns renamed according to ``RENAME_MAP``.

    Args:
        df: DataFrame-like object exposing ``withColumnRenamed(old, new)``.
        RENAME_MAP: Mapping of existing column name -> new column name.
            Entries are applied one at a time, in the mapping's iteration
            order.

    Returns:
        The result of applying every rename; ``df`` itself when the mapping
        is empty.
    """
    renamed = df
    for source_name in RENAME_MAP:
        target_name = RENAME_MAP[source_name]
        renamed = renamed.withColumnRenamed(source_name, target_name)
    return renamed

In [0]:
# Apply the column renames from RENAME_MAP, then print the schema so the new
# column names can be checked.
df = rename_columns(df, RENAME_MAP)
df.printSchema()

# Trim all string columns

In [0]:
def trim_string_columns(df):
    """Return ``df`` with leading/trailing spaces trimmed from string columns.

    Columns whose data type is not ``StringType`` are passed through
    unchanged. Column order and names are preserved.

    Args:
        df: Spark DataFrame to clean.

    Returns:
        A new DataFrame with every string column trimmed.
    """
    # Build a single projection instead of chaining withColumn per column:
    # each withColumn call adds another projection to the query plan, which
    # the Spark documentation warns against doing inside loops.
    projected = [
        F.trim(F.col(field.name)).alias(field.name)
        if isinstance(field.dataType, StringType)
        else F.col(field.name)
        for field in df.schema.fields
    ]
    return df.select(projected)


In [0]:
def is_missing_str(col_name: str):
    """Build a boolean Column that is true where ``col_name`` is NULL or blank.

    "Blank" means the value equals the empty string after ``F.trim``.

    Args:
        col_name: Name of the column to test.

    Returns:
        A Column expression combining the NULL check and the blank check
        with a logical OR.
    """
    column = F.col(col_name)
    is_null = column.isNull()
    is_blank = F.trim(column) == ""
    return is_null | is_blank


In [0]:
# is_missing_str() is a module-level helper that takes a column name; it is
# not a DataFrame method, so calling df.is_missing_str() raises an
# AttributeError. Report how many rows are NULL/blank in each string column
# instead. F.when(...) without otherwise() yields NULL for non-matching rows,
# and F.count ignores NULLs, so each cell is the number of missing values.
missing_counts = df.select(
    [
        F.count(F.when(is_missing_str(field.name), 1)).alias(field.name)
        for field in df.schema.fields
        if isinstance(field.dataType, StringType)
    ]
)
display(missing_counts)

In [0]:
# Combinations of the six listed columns that occur on more than one row.
exact_dups = df.groupBy(
    "product_number_key",
    "start_date",
    "end_date",
    "product_cost",
    "product_name",
    "product_line",
).count()
exact_dups = exact_dups.filter("count > 1")
display(exact_dups)


In [0]:
# (product key, start date, end date) combinations that occur on more than
# one row.
period_dups = df.groupBy(
    "product_number_key",
    "start_date",
    "end_date",
).count()
period_dups = period_dups.filter("count > 1")
display(period_dups)

In [0]:
# Product keys that occur on more than one row, most frequent first.
# Rows whose key is NULL or blank after trimming are excluded before counting.
dups_after_df = df.where(
    F.col("product_number_key").isNotNull()
    & (F.trim(F.col("product_number_key")) != "")
)
dups_after_df = dups_after_df.groupBy("product_number_key").count()
dups_after_df = dups_after_df.where(F.col("count") > 1)
dups_after_df = dups_after_df.orderBy(F.col("count").desc())

display(dups_after_df)

# Write into the Silver table

In [0]:
# Overwrite the silver products table with the current DataFrame, stored in
# Delta format.
# NOTE(review): the source table is addressed with a three-part name
# ("workspace.bronze...") while this target omits the catalog, so it resolves
# against the session's current catalog — confirm that is intended.
df.write.format("delta").mode("overwrite").saveAsTable("silver.crm_products")