# Read the Source Table

In [0]:
# Quick look at two rows of the bronze source table before any cleaning.
spark.sql("""SELECT * FROM workspace.bronze_pyspark.crm_customers LIMIT 2""").show()

In [0]:
# Load the bronze CRM customers table into `df`, the frame the exploration
# cells below query, and render two sample rows.
df = spark.table('workspace.bronze_pyspark.crm_customers')
display(df.limit(2))

# Import the methods & libraries

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.functions import col, when, concat, coalesce, trim, lit,count
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

# Explore the data in Source Table

## Check for duplicate records or nulls in the primary key

In [0]:
# Count rows per cst_id, then surface every id that repeats or is missing.
rows_per_id = df.groupBy('cst_id').agg(count("*").alias("cnt"))
suspect_ids = rows_per_id.where(col('cst_id').isNull() | (col("cnt") > 1))
suspect_ids.display()

## Check for whitespaces in col firstname & lastname

In [0]:
# Compare raw vs trimmed lengths of both name columns; a row where they differ
# has leading or trailing spaces in that name.
first_raw_len = length(col('cst_firstname'))
first_trim_len = length(trim(col('cst_firstname')))
last_raw_len = length(col('cst_lastname'))
last_trim_len = length(trim(col('cst_lastname')))

padded_names = (
    df
    .where((first_raw_len != first_trim_len) | (last_raw_len != last_trim_len))
    .select(
        first_raw_len.alias('actual_len_firstname'),
        first_trim_len.alias('trim_len_firstname'),
        last_raw_len.alias('actual_len_lastname'),
        last_trim_len.alias('trim_len_lastname'),
    )
)
padded_names.display()

## Explore null values in col first & lastname

In [0]:
# Full rows where either name column is missing.
name_is_missing = col('cst_firstname').isNull() | col('cst_lastname').isNull()
df.filter(name_is_missing).show()

## Explore null values in col cst_id & create_date

In [0]:
# Id / create-date pairs where at least one of the two values is null.
id_or_date_missing = col('cst_id').isNull() | col('cst_create_date').isNull()
df.filter(id_or_date_missing).select('cst_id', 'cst_create_date').show()

## Explore for duplicated records

In [0]:
# Number the rows inside each cst_id by create date; every row numbered above 1
# is an additional record for an id that already has one.
window_spec = Window.partitionBy('cst_id').orderBy('cst_create_date')
numbered_rows = df.withColumn('RN', row_number().over(window_spec))
numbered_rows.where(col('RN') > 1).show()

## Explore white spaces & Null in col cst_key

In [0]:
# Full rows that have no cst_key value.
df.filter(col('cst_key').isNull()).show()

In [0]:
# Full rows whose cst_key changes length when trimmed, i.e. carries padding spaces.
raw_key = col('cst_key')
df.filter(length(raw_key) != length(trim(raw_key))).show()

In [0]:
# Print the bronze table again, this time without a LIMIT clause in the query.
spark.sql("""SELECT * FROM workspace.bronze_pyspark.crm_customers """).show()

In [0]:
# Remove any existing silver customers table before the load further down rewrites it.
spark.sql("""drop table if exists workspace.silver_pyspark.crm_customers""")

In [0]:
# Disabled draft: bronze -> silver column-name mapping. Nothing in this notebook
# reads it; the transformation cell renames columns with .alias() and builds a
# single customer_name column instead of first_name / last_name.
# renamed_cols =  {'cst_id': 'customer_id',
#      'cst_key': 'customer_number',
#      'cst_firstname': 'first_name',
#      'cst_lastname': 'last_name',
#      'cst_marital_status' : 'marital_status',
#      'cst_gndr': 'gender',
#      'cst_create_date': 'create_date'}


In [0]:
# Disabled draft: loop that would rename each mapped column on `df` in place
# (and display the frame after every rename). Kept for reference only.
# for key, value in renamed_cols.items():
#     if key in df.columns:
#         df=df.withColumnRenamed(key, value)
#         df.display()

# Transformation

## Whitespaces removed

## Null values replaced with 'n/a'

## Column renaming is done

## Duplicate records dropped

In [0]:

from pyspark.sql.functions import *
from pyspark.sql.functions import col, when, concat, coalesce, trim, lit
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

# Build the silver customers DataFrame from the bronze CRM customers table:
#   1. discard rows that have no customer id
#   2. keep one row per customer id (the most recent by create date)
#   3. trim the names, decode marital-status / gender codes, default to 'n/a'
#   4. rename the columns to their silver names
# Reads the same bronze table that the exploration cells above inspected.
df = spark.table("workspace.bronze_pyspark.crm_customers")

# Newest row first within each customer id. Nulls are sorted last explicitly:
# with a plain ascending sort Spark places null dates first, so row number 1
# would be a row without a create date (or the oldest one) instead of the latest.
window_spec = Window.partitionBy('cst_id').orderBy(col('cst_create_date').desc_nulls_last())

# Trimmed name parts, with 'n/a' standing in for a missing part.
first_name_expr = coalesce(trim(col("cst_firstname")), lit("n/a"))
last_name_expr = coalesce(trim(col("cst_lastname")), lit("n/a"))

# A single 'n/a' when both parts are missing, otherwise "<first> <last>".
customer_name_expr = (
    when(col("cst_firstname").isNull() & col("cst_lastname").isNull(), lit("n/a"))
    .otherwise(concat_ws(" ", first_name_expr, last_name_expr))
)

# Codes are compared after trim + upper so ' m' and 'M' decode the same way;
# anything that is not a known code (including null) becomes 'n/a'.
marital_code = upper(trim(col('cst_marital_status')))
marital_status_expr = (
    when(marital_code == 'M', lit('Married'))
    .when(marital_code == 'S', lit('Single'))
    .otherwise(lit('n/a'))
)

gender_code = upper(trim(col('cst_gndr')))
gender_expr = (
    when(gender_code == 'M', lit('Male'))
    .when(gender_code == 'F', lit('Female'))
    .otherwise(lit('n/a'))
)

silver = (
    df
    # Rows without a customer id cannot be keyed in the silver table.
    .filter(col('cst_id').isNotNull())
    # Rank rows per customer id and keep only the top-ranked one.
    .withColumn("RN", row_number().over(window_spec))
    .filter(col("RN") == 1)
    # Project the silver columns; the helper RN column is dropped here.
    .select(
        col('cst_id').alias('customer_id'),
        col('cst_key').alias('customer_number'),
        customer_name_expr.alias('customer_name'),
        marital_status_expr.alias('marital_status'),
        gender_expr.alias('gender'),
        col('cst_create_date').alias('create_date'),
    )
)

In [0]:
# Preview the transformed rows before they are written out.
silver.show()

In [0]:
# Transformed rows that do carry a create date.
has_create_date = col('create_date').isNotNull()
silver.filter(has_create_date).show()

# Create Target Table

In [0]:
# Drop the previous target table, if any, before it is reloaded.
# The name must match the table written by saveAsTable: crm_customers (plural).
spark.sql("""drop table if exists workspace.silver_pyspark.crm_customers""")


# Load the transformed data into the target table (overwrite)

In [0]:
# Persist the transformed frame as a Delta table, overwriting existing contents.
silver.write.format('delta').mode('overwrite').saveAsTable('workspace.silver_pyspark.crm_customers')


# Sanity Checks

In [0]:
# Re-read the written table. From this cell on, `df` refers to the silver
# table rather than the bronze source.
df = spark.sql("""SELECT * FROM workspace.silver_pyspark.crm_customers""")
df.show()

## Check for whitespace in col customer_name

In [0]:
# Raw vs trimmed length of customer_name; any row returned still has
# leading or trailing spaces.
name_raw_len = length(col('customer_name'))
name_trim_len = length(trim(col('customer_name')))
padded_customer_names = (
    df
    .where(name_raw_len != name_trim_len)
    .select(name_raw_len.alias('org_name'), name_trim_len.alias('trim_name'))
)
padded_customer_names.show()

## Explore null values in col customer_name

In [0]:
# SQL form of the null check: silver rows whose customer_name is null.
spark.sql("""SELECT 
          *
          FROM workspace.silver_pyspark.crm_customers  
          WHERE customer_name is null """).show()

In [0]:
# DataFrame form of the same check: rows whose customer_name is null.
df.filter(col('customer_name').isNull()).show()

## Check for null values in col customer_id & create_date

In [0]:
# Id / create-date pairs in the silver table where either value is null.
id_or_date_missing = col('customer_id').isNull() | col('create_date').isNull()
df.filter(id_or_date_missing).select('customer_id', 'create_date').show()

## Check for duplicated records

In [0]:
# Number the silver rows per customer_id; any row numbered above 1 means the
# id still appears more than once.
window_spec = Window.partitionBy('customer_id').orderBy('create_date')
numbered_silver = df.withColumn('RN', row_number().over(window_spec))
numbered_silver.where(col('RN') > 1).show()

## Check whitespace & nulls in col customer_number

In [0]:
# Silver rows that have no customer_number.
df.filter(col('customer_number').isNull()).show()

In [0]:
# Silver rows whose customer_number changes length when trimmed.
raw_number = col('customer_number')
df.filter(length(raw_number) != length(trim(raw_number))).show()

In [0]:

# Total number of rows in `df`.
df.agg(count("*")).show()

# View table changes

In [0]:
# List the Delta commit history of the table this notebook writes.
# The schema must be silver_pyspark, the one used by saveAsTable.
spark.sql("""DESCRIBE HISTORY workspace.silver_pyspark.crm_customers""").show()