### Reading the Bronze Table

In [0]:
# Load the raw retail data from the bronze table into a DataFrame.
bronze_df = spark.table("workspace.default.bronze_retail_table_raw")

# Render the DataFrame in the notebook for a quick visual check.
bronze_df.display()

In [0]:
# Print column names, types and nullability to see which columns need casting.
bronze_df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: long (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: long (nullable = true)
 |-- Country: string (nullable = true)



In [0]:
from pyspark.sql.functions import *

### Correcting or Adding relevant columns with correct Data Types

Create a new column, `InvoiceDateTS`, of timestamp type by parsing the string column `InvoiceDate`.

In [0]:
from pyspark.sql.functions import col, to_timestamp

# InvoiceDate is stored as a string in the bronze table; parse it into a real
# timestamp column while leaving the original string column in place.
# NOTE(review): the pattern assumes values shaped like "12/1/2010 8:26"
# (month/day/year, 24-hour clock without seconds) - confirm against the data.
silver_df = bronze_df.withColumn(
    "InvoiceDateTS", to_timestamp(col("InvoiceDate"), "M/d/yyyy H:mm")
)


### Data Cleaning (Removing Duplicates, Cleaning Nulls)

Drop rows in which every column is NULL.

In [0]:
# Discard only the rows in which every single column is NULL.
silver_df = silver_df.na.drop(how="all")

Drop rows in which any of the five important columns below is NULL.

In [0]:
# how="any" is the dropna default: a row is removed as soon as at least one of
# the listed columns is NULL.
silver_df = silver_df.na.drop(
    how="any",
    subset=["InvoiceDate", "StockCode", "InvoiceNo", "Quantity", "UnitPrice"],
)

Remove duplicate rows that share the same `InvoiceNo` and `StockCode`, keeping one row per pair.

In [0]:
# Keep a single row for each (InvoiceNo, StockCode) pair.
# dropDuplicates gives no guarantee about which of the duplicate rows is kept.
silver_df = silver_df.dropDuplicates(["InvoiceNo", "StockCode"])

In [0]:
# Derive the calendar year and month from the parsed invoice timestamp.
silver_df = (
    silver_df
    .withColumn("InvoiceYear", year("InvoiceDateTS"))
    .withColumn("InvoiceMonth", month("InvoiceDateTS"))
)

### Writing the result as a Silver table for consumption by the Gold layer

In [0]:
# Persist the cleaned DataFrame as a Delta table, replacing any previous contents.
# The table name is fully qualified (catalog.schema.table), like the bronze table
# this notebook reads from, so the write target does not depend on whichever
# catalog/schema happens to be current in the session.
(
    silver_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.default.silver_retail_table_clean")
)