In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Load the customers dataset (Parquet format) from the bronze container.
df = (
    spark.read
    .format("parquet")
    .load("abfss://bronze@databricksdevetl.dfs.core.windows.net/customers")
)

In [0]:
# Render the current contents of `df` in the notebook output for inspection.
df.display()

In [0]:
# Remove the columns that are not carried forward from this point on.
# NOTE(review): these look like order-level fields plus the ingestion
# `_rescued_data` column — confirm none of them is needed downstream.
df = df.drop(
    "order_id",
    "product_id",
    "order_date",
    "quantity",
    "total_amount",
    "_rescued_data",
)

In [0]:
# Render `df` again so the remaining columns can be checked visually.
df.display()

In [0]:
# Derive the "domains" column: split the email on "@" and keep the second
# piece (index 1), i.e. the text that follows the first "@".
email_parts = split(col("email"), "@")
df = df.withColumn("domains", email_parts.getItem(1))
df.display()

In [0]:
# Build a single "full_name" column from first_name and last_name.
#
# concat() returns NULL as soon as any input is NULL, so a customer with only
# one of the two name parts populated would lose the part that is present.
# concat_ws() skips NULL inputs instead. The when() guard (no otherwise())
# keeps full_name NULL, rather than an empty string, when both parts are NULL.
df = df.withColumn(
    "full_name",
    when(
        col("first_name").isNotNull() | col("last_name").isNotNull(),
        concat_ws(" ", col("first_name"), col("last_name")),
    ),
)
# The separate name columns are no longer needed once full_name exists.
df = df.drop("first_name", "last_name")
df.display()

In [0]:
# Rename "state" to "state_abbr"; a later cell adds a new "state" column
# holding the full state name.
df = df.withColumnRenamed("state", "state_abbr")
# Call display(): the bare attribute `df.display` only references the method
# and never renders the DataFrame.
df.display()

In [0]:
# Two-letter abbreviation -> full name, for the 50 US states plus DC.
state_map = {
    "OR": "Oregon", "MS": "Mississippi", "SC": "South Carolina",
    "AZ": "Arizona", "IL": "Illinois", "NE": "Nebraska",
    "MT": "Montana", "PA": "Pennsylvania", "CO": "Colorado",
    "IA": "Iowa", "KY": "Kentucky", "RI": "Rhode Island",
    "ME": "Maine", "MD": "Maryland", "GA": "Georgia",
    "NM": "New Mexico", "MI": "Michigan", "MN": "Minnesota",
    "LA": "Louisiana", "WY": "Wyoming", "WI": "Wisconsin",
    "FL": "Florida", "DE": "Delaware", "NC": "North Carolina",
    "WV": "West Virginia", "CA": "California", "OH": "Ohio",
    "AL": "Alabama", "TX": "Texas", "AK": "Alaska",
    "NV": "Nevada", "MA": "Massachusetts", "ND": "North Dakota",
    "NH": "New Hampshire", "NY": "New York", "IN": "Indiana",
    "ID": "Idaho", "KS": "Kansas", "UT": "Utah",
    "DC": "District of Columbia", "MO": "Missouri", "OK": "Oklahoma",
    "WA": "Washington", "NJ": "New Jersey", "AR": "Arkansas",
    "HI": "Hawaii", "VT": "Vermont", "VA": "Virginia",
    "TN": "Tennessee", "CT": "Connecticut", "SD": "South Dakota",
}


def abbr_to_full(abbr):
    """Return the full state name for `abbr`, or "Unknown" if it is not in state_map."""
    return state_map.get(abbr, "Unknown")


# Kept so that any other cell referencing the UDF keeps working; the column
# below no longer uses it.
abbr_to_full_udf = udf(abbr_to_full, StringType())

# Resolve the state name with a native map-literal lookup instead of the
# Python UDF, so rows are not shipped to a Python worker for a plain
# dictionary lookup. create_map() takes alternating key/value columns.
state_lookup = create_map(
    *[lit(item) for pair in state_map.items() for item in pair]
)

# A missing or NULL key yields NULL from the map lookup; coalesce() then
# supplies the same "Unknown" default that abbr_to_full() returns.
# NOTE(review): on Spark versions before 3.4 with ANSI mode enabled, a missing
# map key may raise instead of returning NULL — confirm the runtime in use.
df = df.withColumn(
    "state",
    coalesce(state_lookup[col("state_abbr")], lit("Unknown")),
)
df.display()

In [0]:
# Persist the transformed customers DataFrame to the silver container as a
# Delta dataset, overwriting whatever is already at that path. The writer is
# configured with mergeSchema="true".
(
    df.write
    .mode("overwrite")
    .format("delta")
    .option("mergeSchema", "true")
    .save("abfss://silver@databricksdevetl.dfs.core.windows.net/customers")
)

In [0]:
%sql
-- Register the Delta data at the silver customers path as a table.
-- IF NOT EXISTS makes this statement a no-op once the table exists.
-- The location is single-quoted so it is always parsed as a string literal;
-- a double-quoted value is treated as an identifier when ANSI double-quoted
-- identifiers are enabled.
CREATE TABLE IF NOT EXISTS databricks_dev.silver.silver_customers
USING DELTA
LOCATION 'abfss://silver@databricksdevetl.dfs.core.windows.net/customers'