# Read the source table

In [0]:
# Load the bronze ERP location table and render it for a first look.
SOURCE_TABLE = 'workspace.bronze_pyspark.erp_loc_a101'
df = spark.table(SOURCE_TABLE)
df.display()

# Import methods & libraries

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.functions import col, when, concat, coalesce, trim, lit, substring, regexp_replace
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

# Explore Source Table

In [0]:
# Distinct raw CNTRY values present in the source frame.
# Takeaway: the values need standardising, and NULL / empty entries need handling.
distinct_countries = df.select('CNTRY').distinct()
distinct_countries.show()

In [0]:
# Rows whose CNTRY changes length when trimmed, i.e. it carries leading or trailing spaces.
raw_cntry = col('CNTRY')
has_padding = length(raw_cntry) != length(trim(raw_cntry))
df.select(raw_cntry).filter(has_padding).show()

In [0]:
# Build the silver-layer location frame from the bronze source `df`:
#   * customer_number - CID with every "-" removed.
#   * country         - CNTRY trimmed and mapped to a standard display name.
# Trim once and reuse the expression so that every branch, including the
# fall-through, works on the same space-free value.
cntry_trimmed = trim(col("CNTRY"))

cust_loc = df.select(
    regexp_replace(col("CID"), "-", "").alias("customer_number"),
    when(cntry_trimmed.isin("US", "USA"), lit("United States"))
    .when(cntry_trimmed == "DE", lit("Germany"))
    # NULL, empty and space-only values all collapse to the "n/a" placeholder
    # (a trimmed value can never equal " ", so no separate check is needed).
    .when(cntry_trimmed.isNull() | (cntry_trimmed == ""), lit("n/a"))
    # Emit the trimmed value: passing the raw column through would leak padded
    # values such as " Germany" into the target as separate distinct countries.
    .otherwise(cntry_trimmed)
    .alias("country"),
)

# Drop Target table if it already exists

In [0]:
# Remove any previous copy of the silver table; .show() prints the statement's result.
drop_stmt = """DROP TABLE IF EXISTS workspace.silver_pyspark.erp_loc_a101 """
spark.sql(drop_stmt).show()

# Create Target table & load transformed data into it

In [0]:
# Persist the transformed frame as a Delta table in overwrite mode.
(
    cust_loc.write
    .format('delta')
    .mode('overwrite')
    .saveAsTable('workspace.silver_pyspark.erp_loc_a101')
)

# Sanity Checks

In [0]:
df = spark.sql("""select * from workspace.silver_pyspark.erp_loc_a101""")
df.display()

In [0]:
(df.select(col('country')).distinct()).display()
# no nulls / empty values

# View table changes

In [0]:
# Show the Delta history of the target table.
history_sql = """describe history workspace.silver_pyspark.erp_loc_a101"""
spark.sql(history_sql).display()