In [0]:
# Load the coaches dataset, stored as Parquet, from the Bronze container
# and render it with the notebook's table viewer.
bronze_coaches_path = "abfss://bronze@ahmedolympicsdatalake.dfs.core.windows.net/coaches/"
parquet_reader = spark.read.format("parquet")
df = parquet_reader.load(bronze_coaches_path)
df.display()

In [0]:
# Narrow projection of the raw frame: whitespace-trimmed name plus country.
# It is bound to its own name rather than `df_transformed`, because
# `df_transformed` is rebuilt from `df` in the birth_date conversion cell and
# is the frame that gets written to Silver. Sharing the name meant this
# two-column frame was silently discarded on a top-to-bottom run, and could
# be written to Silver by mistake if this cell was re-run out of order.
df_name_country = df.selectExpr("trim(Name) as Name", "Country")

In [0]:
# Print the column names, types and nullability of the frame loaded from Bronze.
df.printSchema()

root
 |-- code: string (nullable = true)
 |-- current: string (nullable = true)
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- function: string (nullable = true)
 |-- category: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- country: string (nullable = true)
 |-- country_long: string (nullable = true)
 |-- disciplines: string (nullable = true)
 |-- events: string (nullable = true)
 |-- birth_date: string (nullable = true)



In [0]:
from pyspark.sql.functions import to_date

# Parse the string-typed 'birth_date' column into a DateType column,
# replacing the original column in place (same column name).
birth_date_pattern = "yyyy-MM-dd"
birth_date_as_date = to_date(df["birth_date"], birth_date_pattern)
df_transformed = df.withColumn("birth_date", birth_date_as_date)

# Show the resulting schema so the new type of 'birth_date' can be checked.
df_transformed.printSchema()

root
 |-- code: string (nullable = true)
 |-- current: string (nullable = true)
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- function: string (nullable = true)
 |-- category: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- country: string (nullable = true)
 |-- country_long: string (nullable = true)
 |-- disciplines: string (nullable = true)
 |-- events: string (nullable = true)
 |-- birth_date: date (nullable = true)



In [0]:
# Persist the transformed coaches frame to the Silver container in Delta
# format, replacing whatever is currently stored at the target path.
silver_coaches_path = "abfss://silver@ahmedolympicsdatalake.dfs.core.windows.net/coaches/"
(
    df_transformed.write
    .format("delta")
    .mode("overwrite")
    .save(silver_coaches_path)
)
