# Read the Source Table

In [0]:
# Load the bronze-layer CRM products table that the rest of this notebook profiles.
df = spark.table('workspace.bronze_pyspark.crm_products')
# Use the notebook's rich display, as every other cell here does, instead of
# show(), which prints plain text and truncates long string values.
df.display()

# Import the methods & libraries

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.functions import col, when, concat, coalesce, trim, lit
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

# Explore the data in Source Table

In [0]:
# Primary-key check on prd_id: list ids that occur more than once or are NULL.
# Expectation: no rows returned.
prd_id_counts = df.groupby(col('prd_id')).agg(count("*").alias('cnt'))
is_duplicated = col('cnt') > 1
is_missing = col('prd_id').isNull()
prd_id_counts.filter(is_duplicated | is_missing).display()

In [0]:
# Whitespace check: rows whose prd_key or prd_nm changes length when trimmed.
# Expectation: no rows returned.
key_len, key_trim_len = length(col('prd_key')), length(trim(col('prd_key')))
name_len, name_trim_len = length(col('prd_nm')), length(trim(col('prd_nm')))

(
    df
    # Keep only rows where trimming shortens at least one of the two columns.
    .where((key_len != key_trim_len) | (name_len != name_trim_len))
    # Show raw vs. trimmed lengths side by side for the offending rows.
    .select(
        key_len.alias('actual_len_prd_key'),
        key_trim_len.alias('trim_len_prd_key'),
        name_len.alias('actual_len_prd_nm'),
        name_trim_len.alias('trim_len_prd_nm'),
    )
    .display()
)

In [0]:
# Cost check: rows where prd_cost is NULL or negative.
# Expectation: no rows returned.
cost_is_invalid = col('prd_cost').isNull() | (col('prd_cost') < 0)
df.where(cost_is_invalid).display()
# Finding: NULL costs are present and need to be handled in the transformation.

In [0]:
# Profile the distinct prd_line codes present in the source table.
df.select(col('prd_line')).distinct().display()
# Finding: NULLs need to be handled and the codes need to be standardized.

In [0]:
# Start-date check: rows that have no prd_start_dt.
# Expectation: no rows returned.
start_is_missing = col('prd_start_dt').isNull()
df.where(start_is_missing).display()

In [0]:
# Date-order check: rows whose prd_end_dt falls before prd_start_dt.
# Expectation: no rows returned.
df.where(col('prd_end_dt') < col('prd_start_dt')).display()
# Finding: some records end before they start (end_date < start_date); the end
# date needs to be rebuilt with a window function in the transformation step.

# Drop the target table if it already exists

In [0]:
# Remove any previous copy of the silver table so the load further down recreates it.
# NOTE(review): a dropped-and-recreated table presumably starts with a fresh
# history, so the DESCRIBE HISTORY cell at the end of this notebook would only
# ever show the latest load -- confirm the drop is still wanted given that the
# write already uses overwrite mode.
spark.sql('DROP TABLE IF EXISTS workspace.silver_pyspark.crm_products')

# Import the methods & libraries

In [0]:

from pyspark.sql.functions import *
from pyspark.sql.functions import col, when, concat, coalesce, trim, lit, substring, regexp_replace,lead
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

# Transformations

In [0]:
from pyspark.sql import functions as f
# Re-read the bronze table so this cell does not depend on earlier `df` state.
df = spark.table('workspace.bronze_pyspark.crm_products')

# prd_key is split below: characters 1-5 become the category id and everything
# from position 7 onwards becomes the product number.
prd_key = col('prd_key')

# Normalised product-line code: surrounding spaces removed, upper-cased.
line_code = upper(trim(col('prd_line')))

# All rows sharing one prd_key, ordered by start date, so that each row can
# look at the start date of the row that follows it.
versions_by_start = Window.partitionBy('prd_key').orderBy('prd_start_dt')

# NOTE: the result holds products; the name `customers` is kept only because
# the following cells reference it.
customers = df.select(
    col('prd_id').alias('product_id'),
    # Category id uses '_' instead of '-' as its separator.
    regexp_replace(substring(prd_key, 1, 5), '-', '_').alias('category_id'),
    # Column.substr takes Column arguments on every PySpark version, whereas
    # functions.substring only accepts a Column length on newer releases.
    prd_key.substr(lit(7), length(prd_key)).alias('product_number'),
    col('prd_nm').alias('product_name'),
    # Missing costs are loaded as 0.
    coalesce(col('prd_cost'), lit(0)).alias('cost'),
    # Expand the product-line code to a readable label; unknown/NULL -> 'n/a'.
    # NOTE(review): the original matched only 'MR' for Mountain while every
    # other code is a single letter, so 'M' is accepted too -- confirm against
    # the distinct prd_line values in the source.
    when(line_code == 'R', lit('Road'))
    .when((line_code == 'M') | (line_code == 'MR'), lit('Mountain'))
    .when(line_code == 'S', lit('Other Sales'))
    .when(line_code == 'T', lit('Touring'))
    .otherwise(lit('n/a')).alias('product_line'),
    col('prd_start_dt').alias('start_date'),
    # End date = one day before the next row's start date for the same key;
    # the last row of each key has no successor, so its end date is NULL.
    date_sub(lead(col('prd_start_dt')).over(versions_by_start), 1).alias('end_date'),
)


In [0]:
# Preview the transformed rows before they are written to the silver table.
customers.display()

# Create the Target Table & load the transformed data to it

In [0]:
# Save the transformed rows as a Delta table, using overwrite mode.
(
    customers.write
    .format('delta')
    .mode('overwrite')
    .saveAsTable('workspace.silver_pyspark.crm_products')
)

# Sanity Checks in the Target Table

In [0]:
# From here on `df` refers to the silver table that was just written, not the
# bronze source; the sanity checks below run against it.
df = spark.table(
    'workspace.silver_pyspark.crm_products'
)
df.display()

In [0]:
# Cost check on the silver table: rows where cost is NULL or negative.
# Expectation: no rows returned.
silver_cost_is_invalid = col('cost').isNull() | (col('cost') < 0)
df.where(silver_cost_is_invalid).display()

In [0]:
# List the distinct product_line labels that were loaded into the silver table.
df.select(col('product_line')).distinct().display()
# Result: NULLs are handled and the labels are standardized.

In [0]:
# Date-order check on the silver table: rows whose end_date falls before
# start_date.
# Expectation: no rows returned.
df.where(col('end_date') < col('start_date')).display()

# View table changes

In [0]:
# Show the history recorded for the silver table.
history_df = spark.sql("""DESCRIBE HISTORY workspace.silver_pyspark.crm_products """)
history_df.display()