**SalesLT_Customer**

In [0]:
# For analytical purposes, the companies "Trust Sports", "RYZN Store", and "RadC Outlet" should be grouped together under the unified store name "Ryzin Sports".

In [0]:
# Enable Delta schema auto-merge for this Spark session (issued as a SQL SET command).
# NOTE(review): presumably required so the upsert at the end of this notebook can
# evolve the target table's schema — confirm against upsert_table in Utils.

spark.sql("SET spark.databricks.delta.schema.autoMerge.enabled = true")

In [0]:
%run "/Workspace/Utils/Utils"


In [0]:
from pyspark.sql import DataFrame, Window
from pyspark.sql import functions as F
from pyspark.sql.types import (
    IntegerType, StringType, TimestampType, StructType, StructField
)
from pyspark.sql.functions import col, desc, count

In [0]:
# Load the Bronze customer table; `df` is the input DataFrame used by the
# exploration and transformation cells that follow.

df = spark.table("adlslmcompany_bronze.managed_bronze.saleslt_customer")

In [0]:
# Preview the first rows of the Bronze table.
# Direct identifiers and credential columns are removed from the preview only
# (`df` itself is unchanged) so that names, contact details and password material
# are not rendered into the saved notebook output. DataFrame.drop ignores any
# listed column that is not present in the schema.
preview_hidden_columns = [
    "FirstName", "MiddleName", "LastName", "Suffix",
    "EmailAddress", "Phone",
    "PasswordHash", "PasswordSalt",
]

df.drop(*preview_hidden_columns).limit(10).display()

In [0]:
# Check the Bronze data for duplicated values in the CustomerID column.
# `checkduplicates` is not defined in this notebook — presumably it comes from the
# Utils notebook loaded via %run; its exact output is not visible here (TODO confirm).

checkduplicates(df, "CustomerID" )

In [0]:
# Analyze the distribution of the Bronze rows by CompanyName.
# `graphbycolumnd` is not defined in this notebook — presumably it comes from the
# Utils notebook loaded via %run and charts row counts per value (TODO confirm).
graphbycolumnd(df, "CompanyName")

In [0]:
# Based on the previous analysis, the function will be created to clean the Customer table. 
# It will involve changing certain company names, as well as removing data such as "PasswordSalt" and "PasswordHash" which are deemed unnecessary for analytical purposes. 
# Additionally, personal information such as names, phone numbers, and email addresses will either be deleted or anonymized in accordance with company policies, ensuring that such sensitive data is not accessible to those managing the Silver data.

In [0]:
def silver_clean_salescustomer(df):
    """Build the Silver-layer customer DataFrame from the Bronze customer rows.

    The transformation:

    * groups the stores "Trust Sports", "RYZN Store" and "RadC Outlet" under the
      single company name "Ryzin Sports";
    * derives ``Gender`` from ``Title`` ("M", "F", or null when the title is
      missing or not recognised);
    * shortens ``SalesPerson`` by cutting off its first 16 characters;
    * replaces ``Phone`` and ``EmailAddress`` with presence flags ``HasPhone`` and
      ``HasEmail`` (1 when the value is not null, else 0, stored as strings);
    * stamps every row with the transformation time;
    * keeps only the ten output columns, cast to their target types. Every other
      input column (names, password hash/salt, title, phone, e-mail, ...) is left
      out of the result.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Bronze customer data. It must expose the columns ``CustomerID``, ``Title``,
        ``SalesPerson``, ``CompanyName``, ``Phone``, ``EmailAddress``, ``rowguid``,
        ``ModifiedDate`` and ``bronze_ingestion_timestamp``.

    Returns
    -------
    pyspark.sql.DataFrame
        One row per input row with the columns ``CustomerID``, ``Gender``,
        ``SalesPerson``, ``CompanyName``, ``HasPhone``, ``HasEmail``, ``rowguid``,
        ``ModifiedDate``, ``bronze_ingestion_timestamp`` and
        ``silves_transformed_timestamp``.
    """
    # Stores that are reported together under one unified company name.
    store_names = ["Trust Sports", "RYZN Store", "RadC Outlet"]
    company_name = (
        F.when(F.col("CompanyName").isin(store_names), "Ryzin Sports")
         .otherwise(F.col("CompanyName"))
    )

    # Map titles to a gender explicitly. Previously every title other than "Mr."
    # -- including a null title and other male titles such as "Sr." -- was
    # labelled "F". With no otherwise() branch, unmatched or null titles yield
    # null, which the nullable Gender field of expected_schema allows.
    male_titles = ["Mr.", "Sr."]
    female_titles = ["Ms.", "Mrs.", "Miss", "Sra."]
    gender = (
        F.when(F.col("Title").isin(male_titles), "M")
         .when(F.col("Title").isin(female_titles), "F")
    )

    # Keep everything from the 17th character on. The requested length
    # (length - 14) is larger than what remains, so the rest of the string is taken.
    # NOTE(review): presumably the values look like "adventure-works\<login>",
    # i.e. a fixed 16-character domain prefix — confirm against the Bronze data.
    sales_person = F.expr("substring(SalesPerson, 17, length(SalesPerson) - 14)")

    # Presence flags: expose whether contact data exists without exposing the data.
    has_phone = F.when(F.col("Phone").isNotNull(), 1).otherwise(0)
    has_email = F.when(F.col("EmailAddress").isNotNull(), 1).otherwise(0)

    # Single whitelist projection: only the columns listed here reach the Silver
    # layer, so no separate drop of the sensitive columns is needed.
    # The "silves_" spelling is kept on purpose: expected_schema uses the same name.
    return df.select(
        F.col("CustomerID").cast(IntegerType()).alias("CustomerID"),
        gender.cast(StringType()).alias("Gender"),
        sales_person.cast(StringType()).alias("SalesPerson"),
        company_name.cast(StringType()).alias("CompanyName"),
        has_phone.cast(StringType()).alias("HasPhone"),
        has_email.cast(StringType()).alias("HasEmail"),
        F.col("rowguid").cast(StringType()).alias("rowguid"),
        F.col("ModifiedDate").cast(TimestampType()).alias("ModifiedDate"),
        F.col("bronze_ingestion_timestamp").cast(TimestampType()).alias("bronze_ingestion_timestamp"),
        F.current_timestamp().cast(TimestampType()).alias("silves_transformed_timestamp"),
    )

In [0]:
# Schema the transformed Silver customer DataFrame is validated against.
# Each .add(...) call is (column name, data type, nullable).

expected_schema = (
    StructType()
    .add("CustomerID", IntegerType(), False)
    .add("Gender", StringType(), True)
    .add("SalesPerson", StringType(), True)
    .add("CompanyName", StringType(), True)
    .add("HasPhone", StringType(), False)
    .add("HasEmail", StringType(), False)
    .add("rowguid", StringType(), False)
    .add("ModifiedDate", TimestampType(), False)
    .add("bronze_ingestion_timestamp", TimestampType(), False)
    .add("silves_transformed_timestamp", TimestampType(), False)
)


In [0]:
# Apply the Silver cleaning function to the Bronze DataFrame. The result is bound
# to a new name, so `df` stays available for the comparisons further down.


silver_df = silver_clean_salescustomer(df)

In [0]:
# Re-check the distribution by CompanyName on the transformed data, where the
# three grouped stores should now show up as "Ryzin Sports".

graphbycolumnd(silver_df, "CompanyName")

In [0]:
# Comparing lengths of the Bronze input and the Silver output.
# `compare_lengths` is not defined in this notebook — presumably it comes from the
# Utils notebook loaded via %run and compares row counts (TODO confirm).

compare_lengths(df, silver_df)

In [0]:
# Check the transformed DataFrame against expected_schema.
# `_validate_schema` is not defined in this notebook — presumably it comes from the
# Utils notebook loaded via %run; how strictly it compares nullability is not
# visible here (TODO confirm).
_validate_schema(silver_df, expected_schema)

**IMPORTANT: Please note that this is a simulated project; the upsert operation will be executed within this notebook. In a production environment, a dedicated notebook containing only the function and validations would be developed. All function notebooks would be orchestrated by Azure Data Factory (ADF) pipelines or Azure Databricks (ADB) workflows. The method of upsert may vary based on the utilization of auto loader, streaming, or Change Data Feed (CDF).**


In [0]:
# Loading into the Silver layer

# Location of the Silver target, from outermost to innermost level.
catalog = "adlslmcompany_silver"
schema = "managed_silver"
target_table = "saleslt_customer"

# Key column(s) handed to upsert_table as `primary_keys`.
primary_keys = ["CustomerID"]

# upsert_table is not defined in this notebook — presumably it comes from the Utils
# notebook loaded via %run. Arguments stay positional because its parameter names
# are not visible here.
upsert_table(silver_df, target_table, primary_keys, schema, catalog)