**SalesLT_Address**

In [0]:
# Business rule for this table: for analytical purposes, every address located in South America
# is grouped under the single value "Others". The same value also replaces the corresponding
# StateProvince and City fields of those rows.
#
# The rule applies to all South American countries.

In [0]:
# Enable Delta schema auto-merge for the current Spark session.
# NOTE(review): presumably required so the upsert at the end of this notebook can evolve
# the target table schema — confirm against upsert_table in the Utils notebook.

spark.sql("SET spark.databricks.delta.schema.autoMerge.enabled = true")

In [0]:
%run "/Workspace/Utils/Utils"


In [0]:
from pyspark.sql import DataFrame, Window
from pyspark.sql import functions as F
from pyspark.sql.types import (
    IntegerType, StringType, TimestampType, StructType, StructField
)


In [0]:
# Load the Bronze-layer SalesLT Address table; `df` is the input for every later cell.

df = spark.table("adlslmcompany_bronze.managed_bronze.saleslt_address")

In [0]:
# Preview up to 10 rows of the Bronze table.

df.limit(10).display()

In [0]:
# Check the AddressID column of the Bronze table for duplicated values.
# NOTE(review): checkduplicates is not defined in this notebook — presumably provided by
# the Utils notebook executed above; confirm what it reports.

checkduplicates(df, "AddressID" )

In [0]:
# Analyze how the Bronze addresses are distributed across CountryRegion values (before cleaning).
# NOTE(review): graphbycolumnd is not defined in this notebook — presumably provided by Utils; confirm.
graphbycolumnd(df, "CountryRegion")

In [0]:
def silver_clean_salesaddress(df):
    """Build the Silver-layer version of the SalesLT Address table.

    Steps, in order:
      1. Drop the street-level columns (AddressLine1, AddressLine2, PostalCode).
      2. Replace CountryRegion with "Others" for the South American countries listed below.
      3. Replace StateProvince and City with "Others" on every row whose CountryRegion
         is "Others" after step 2.
      4. Add the transformation timestamp column.
      5. Select the output columns, casting each one to its target type.

    Args:
        df: Spark DataFrame with the address rows to clean.

    Returns:
        A new Spark DataFrame with the columns AddressID, City, StateProvince,
        CountryRegion, rowguid, ModifiedDate, bronze_ingestion_timestamp and
        silves_transformed_timestamp, in that order.
    """
    others_label = "Others"

    # Countries whose addresses are collapsed into the "Others" bucket.
    south_america = [
        "Argentina", "Bolivia", "Brazil", "Chile", "Colombia", "Ecuador",
        "Guyana", "Paraguay", "Peru", "Suriname", "Uruguay", "Venezuela",
    ]

    country = F.col("CountryRegion")

    # Street-level detail is not kept for analytics.
    without_street = df.drop("AddressLine1", "AddressLine2", "PostalCode")

    regrouped = without_street.withColumn(
        "CountryRegion",
        F.when(country.isin(south_america), others_label).otherwise(country),
    )

    # Applied after CountryRegion has been rewritten, so the condition matches the rows
    # regrouped above (plus any row that already carried the value "Others").
    is_others = country == others_label
    masked = regrouped.withColumn(
        "StateProvince",
        F.when(is_others, others_label).otherwise(F.col("StateProvince")),
    ).withColumn(
        "City",
        F.when(is_others, others_label).otherwise(F.col("City")),
    )

    # The column name is spelled "silves_..." on purpose: the expected schema in this
    # notebook uses the same spelling, so it must not be changed here alone.
    stamped = masked.withColumn("silves_transformed_timestamp", F.current_timestamp())

    # Output columns in their final order, each paired with the type it is cast to.
    output_types = [
        ("AddressID", IntegerType()),
        ("City", StringType()),
        ("StateProvince", StringType()),
        ("CountryRegion", StringType()),
        ("rowguid", StringType()),
        ("ModifiedDate", TimestampType()),
        ("bronze_ingestion_timestamp", TimestampType()),
        ("silves_transformed_timestamp", TimestampType()),
    ]
    casted_columns = [
        F.col(name).cast(dtype).alias(name) for name, dtype in output_types
    ]

    return stamped.select(*casted_columns)

In [0]:
# Expected schema of the Silver DataFrame; it is handed to _validate_schema further down.
# Each add() call takes (column name, Spark type, nullable flag), in output column order.
expected_schema = (
    StructType()
    .add("AddressID", IntegerType(), False)
    .add("City", StringType(), True)
    .add("StateProvince", StringType(), True)
    .add("CountryRegion", StringType(), True)
    .add("rowguid", StringType(), False)
    .add("ModifiedDate", TimestampType(), False)
    .add("bronze_ingestion_timestamp", TimestampType(), False)
    .add("silves_transformed_timestamp", TimestampType(), False)
)

In [0]:
# Apply the Silver cleaning function to the Bronze DataFrame.
# `df` itself is not reassigned here; the cleaned result is kept separately as `silver_df`.

silver_df = silver_clean_salesaddress(df)

In [0]:
# Check the distribution by CountryRegion again, this time on the transformed DataFrame,
# to compare it with the Bronze distribution plotted earlier.
graphbycolumnd(silver_df, "CountryRegion")

In [0]:
# Compare the lengths of the Bronze and Silver DataFrames.
# The cleaning function only drops, rewrites and casts columns (no filter or join).
# NOTE(review): compare_lengths presumably compares row counts — confirm in Utils.

compare_lengths(df, silver_df)

In [0]:
# Check the schema of the transformed DataFrame against expected_schema.
# NOTE(review): _validate_schema is not defined in this notebook; whether it also checks
# nullability, and whether it raises or only reports on mismatch, must be confirmed in Utils.
_validate_schema(silver_df, expected_schema)

**IMPORTANT: This is a simulated project, so the upsert operation is executed within this notebook. In a production environment, a dedicated notebook containing only the function and its validations would be developed, and all function notebooks would be orchestrated by Azure Data Factory (ADF) pipelines or Azure Databricks (ADB) workflows. The upsert method may vary depending on whether Auto Loader, streaming, or Change Data Feed (CDF) is used.**

In [0]:
# Load the transformed DataFrame into the Silver layer.
# NOTE(review): the destination is presumably <catalog>.<schema>.<target_table>, with rows
# matched on the primary key columns — confirm against upsert_table in Utils.
catalog = "adlslmcompany_silver"
schema = "managed_silver"
target_table = "saleslt_address"
primary_keys = ["AddressID"]

upsert_table(silver_df, target_table, primary_keys, schema, catalog)