# Silver Layer
The silver layer is an intermediate layer where data is refined, cleansed, and transformed for analysis.
The code below contains three sections (one each for `dim1_cleansed`, `dim2_cleansed`, and `fact_cleansed`) that load and transform data into the respective Delta tables (`dim1_cleansed`, `dim2_cleansed`, and `fact_cleansed`). Each section follows the same structure and performs the following steps:

1. Reads the data from a streaming source using `spark.readStream` for `dim1_cleansed`, `dim2_cleansed` and `fact_cleansed`.
2. Selects all columns from the source.
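3. Converts a Unix epoch-seconds column into a formatted datetime string using `withColumn("datetime_comun", from_unixtime("formatted_datettime_column"))`. Note that `from_unixtime` goes from epoch seconds to a timestamp string; use `unix_timestamp` for the reverse direction.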
4. Renames a column from 'OldColumnName' to 'New_Column_Name' using `withColumnRenamed`.
5. Replaces a specific string value in a column with a new string value using `withColumn("column_name", regexp_replace("column_name", "string_value", "new_string_value"))`.
6. Parses a JSON string column into a structured column with the given schema using `withColumn("column_name", from_json(col("column_name"), new_schema))`.
7. Explodes an array column to obtain individual rows using `withColumn("column_name", explode("column_name"))`.

In [None]:
# Schema that from_json applies below for table dim1_cleansed.
# Template placeholder: replace {new_schema} with a StructType or a DDL schema string.
dim1_schema = {new_schema}

# Create the cleansed dimension1 table from Delta table dim1_raw by applying transformations.
# Read the Delta Lake table as a stream. The chain is wrapped in parentheses so that
# every step can carry its own comment; a comment placed after a backslash
# continuation is a syntax error.
df1 = (
    spark.readStream.format("delta")
    # Maximum number of new files considered in every trigger (Delta default is 1000).
    .option("maxFilesPerTrigger", 5)
    # Keep the stream running when source files are rewritten by updates/deletes.
    # Rewritten rows may be emitted again, so downstream should tolerate duplicates.
    .option("ignoreChanges", "true")
    .load("{Delta table path for dim1_raw}")
)

## Apply the transformations
df1_cleansed = (
    df1
    # from_unixtime reads the input as seconds since the Unix epoch and returns a
    # formatted timestamp string.
    # NOTE(review): if the source column already holds a formatted datetime and an
    # epoch value is wanted, unix_timestamp is the function for that direction.
    .withColumn("datetime_comun", from_unixtime("formatted_datettime_column"))
    # Rename a column.
    .withColumnRenamed('OldColumnName', 'New_Column_Name')
    # Replace every match of a pattern with another string (the pattern is a regex).
    .withColumn("column_name", regexp_replace("column_name", "string_value", "new_string_value"))
    # Parse the JSON string column using the schema declared above for this table.
    .withColumn("column_name", from_json(col("column_name"), dim1_schema))
    # Explode the parsed column into one row per element.
    # Assumes dim1_schema describes an array (or map) - TODO confirm.
    .withColumn("column_name", explode("column_name"))
    # {Tranformations}: chain any additional transformations here.
)

# Write into delta table (/data/delta/dim1_cleansed), creating a silver Delta table
# from the df1_cleansed DataFrame.
(
    df1_cleansed.writeStream.format("delta")
    .outputMode("append")
    # Let new columns in the DataFrame be added to the target table schema.
    .option("mergeSchema", "true")
    # trigger() only accepts keyword arguments; run a micro-batch every 30 seconds.
    .trigger(processingTime="30 seconds")
    # A checkpoint location is required to track streaming progress across restarts.
    # NOTE(review): the original template states that an unspecified checkpoint
    # defaults to /local_disk0/tmp/ - confirm for your runtime and do not rely on it.
    .option("checkpointLocation", "</data/delta/dim1_cleansed_checkpoint_path>")
    .start("/data/delta/dim1_cleansed")
)
# Alternative: end the chain with .toTable("dim1_cleansed") instead of .start(...)
# to write to a named table rather than a path.

###########################################################
# Schema that from_json applies below for table dim2_cleansed.
# Template placeholder: replace {new_schema} with a StructType or a DDL schema string.
dim2_schema = {new_schema}

# Create the cleansed dimension2 table from Delta table dim2_raw by applying transformations.
# Read the Delta Lake table as a stream. The chain is wrapped in parentheses so that
# every step can carry its own comment; a comment placed after a backslash
# continuation is a syntax error.
df2 = (
    spark.readStream.format("delta")
    # Maximum number of new files considered in every trigger (Delta default is 1000).
    .option("maxFilesPerTrigger", 5)
    # Keep the stream running when source files are rewritten by updates/deletes.
    # Rewritten rows may be emitted again, so downstream should tolerate duplicates.
    .option("ignoreChanges", "true")
    .load("{Delta table path for dim2_raw}")
)

## Apply the transformations
df2_cleansed = (
    df2
    # from_unixtime reads the input as seconds since the Unix epoch and returns a
    # formatted timestamp string.
    # NOTE(review): if the source column already holds a formatted datetime and an
    # epoch value is wanted, unix_timestamp is the function for that direction.
    .withColumn("datetime_comun", from_unixtime("formatted_datettime_column"))
    # Rename a column.
    .withColumnRenamed('OldColumnName', 'New_Column_Name')
    # Replace every match of a pattern with another string (the pattern is a regex).
    .withColumn("column_name", regexp_replace("column_name", "string_value", "new_string_value"))
    # Parse the JSON string column using the schema declared above for this table.
    .withColumn("column_name", from_json(col("column_name"), dim2_schema))
    # Explode the parsed column into one row per element.
    # Assumes dim2_schema describes an array (or map) - TODO confirm.
    .withColumn("column_name", explode("column_name"))
    # {Tranformations}: chain any additional transformations here.
)

# Write into delta table (/data/delta/dim2_cleansed), creating a silver Delta table
# from the df2_cleansed DataFrame.
(
    df2_cleansed.writeStream.format("delta")
    .outputMode("append")
    # Let new columns in the DataFrame be added to the target table schema.
    .option("mergeSchema", "true")
    # trigger() only accepts keyword arguments; run a micro-batch every 30 seconds.
    .trigger(processingTime="30 seconds")
    # A checkpoint location is required to track streaming progress across restarts.
    # NOTE(review): the original template states that an unspecified checkpoint
    # defaults to /local_disk0/tmp/ - confirm for your runtime and do not rely on it.
    .option("checkpointLocation", "</data/delta/dim2_cleansed_checkpoint_path>")
    .start("/data/delta/dim2_cleansed")
)
# Alternative: end the chain with .toTable("dim2_cleansed") instead of .start(...)
# to write to a named table rather than a path.

###########################################################
# Schema that from_json applies below for table fact_cleansed.
# Template placeholder: replace {new_schema} with a StructType or a DDL schema string.
fact_schema = {new_schema}

# Create the cleansed fact table from Delta table fact_raw by applying transformations.
# Read the Delta Lake table as a stream. The chain is wrapped in parentheses so that
# every step can carry its own comment; a comment placed after a backslash
# continuation is a syntax error.
df3 = (
    spark.readStream.format("delta")
    # Maximum number of new files considered in every trigger (Delta default is 1000).
    .option("maxFilesPerTrigger", 5)
    # Keep the stream running when source files are rewritten by updates/deletes.
    # Rewritten rows may be emitted again, so downstream should tolerate duplicates.
    .option("ignoreChanges", "true")
    .load("{Delta table path for fact_raw}")
)

## Apply the transformations
df3_cleansed = (
    df3
    # from_unixtime reads the input as seconds since the Unix epoch and returns a
    # formatted timestamp string.
    # NOTE(review): if the source column already holds a formatted datetime and an
    # epoch value is wanted, unix_timestamp is the function for that direction.
    .withColumn("datetime_comun", from_unixtime("formatted_datettime_column"))
    # Rename a column.
    .withColumnRenamed('OldColumnName', 'New_Column_Name')
    # Replace every match of a pattern with another string (the pattern is a regex).
    .withColumn("column_name", regexp_replace("column_name", "string_value", "new_string_value"))
    # Parse the JSON string column using the schema declared above for this table.
    .withColumn("column_name", from_json(col("column_name"), fact_schema))
    # Explode the parsed column into one row per element.
    # Assumes fact_schema describes an array (or map) - TODO confirm.
    .withColumn("column_name", explode("column_name"))
    # {Tranformations}: chain any additional transformations here.
)

# Write into delta table (/data/delta/fact_cleansed), creating a silver Delta table
# from the df3_cleansed DataFrame.
(
    df3_cleansed.writeStream.format("delta")
    .outputMode("append")
    # Let new columns in the DataFrame be added to the target table schema.
    .option("mergeSchema", "true")
    # trigger() only accepts keyword arguments; run a micro-batch every 30 seconds.
    .trigger(processingTime="30 seconds")
    # A checkpoint location is required to track streaming progress across restarts.
    # NOTE(review): the original template states that an unspecified checkpoint
    # defaults to /local_disk0/tmp/ - confirm for your runtime and do not rely on it.
    .option("checkpointLocation", "</data/delta/fact_cleansed_checkpoint_path>")
    .start("/data/delta/fact_cleansed")
)
# Alternative: end the chain with .toTable("fact_cleansed") instead of .start(...)
# to write to a named table rather than a path.

##############End of File #################