### **Dynamic Capabilities**

In [0]:
from pyspark.sql.types import *
# Declare the "file_name" text widget (empty default) so the dataset to process
# can be supplied as a notebook parameter.
dbutils.widgets.text(name="file_name", defaultValue="")

In [0]:
# Read the dataset name passed in through the "file_name" widget.
# Surrounding whitespace is stripped because the value is used verbatim as a
# schema key and as a storage path segment in the cells below.
file_name = dbutils.widgets.get("file_name").strip()
if not file_name:
    # The widget defaults to an empty string; fail here with an actionable
    # message instead of failing later on the schema lookup.
    raise ValueError(
        "The 'file_name' widget is empty; set it to the dataset to process."
    )
print(f"Processing dataset: {file_name}")

### **Define Schemas and Paths**

In [0]:
# Explicit read schema for each supported dataset, keyed by the value of the
# `file_name` widget. Every column is declared nullable.
schemas = {
    "orders": StructType([
        StructField("order_id", StringType(), True),
        StructField("customer_id", StringType(), True),
        StructField("product_id", StringType(), True),
        StructField("order_date", DateType(), True),
        StructField("quantity", IntegerType(), True),
        StructField("total_amount", DoubleType(), True)
    ]),
    "customers": StructType([
        StructField("customer_id", StringType(), True),
        StructField("first_name", StringType(), True),
        StructField("last_name", StringType(), True),
        StructField("email", StringType(), True),
        StructField("city", StringType(), True),
        StructField("state", StringType(), True)
    ]),
    "products": StructType([
        StructField("product_id", StringType(), True),
        StructField("product_name", StringType(), True),
        StructField("category", StringType(), True),
        StructField("brand", StringType(), True),
        StructField("price", DoubleType(), True)
    ])
}

# Resolve the schema for the requested dataset. An unsupported name is reported
# with the list of valid choices rather than surfacing as a bare KeyError.
if file_name not in schemas:
    raise ValueError(
        f"Unknown dataset {file_name!r}; expected one of: {', '.join(sorted(schemas))}"
    )
current_schema = schemas[file_name]
print(f"Using schema for {file_name}")


In [0]:
# Storage account hosting both the "source" and "bronze" containers. It is
# defined once so the three URIs below cannot drift apart.
STORAGE_ACCOUNT = "ayaneteprojstor"
SOURCE_ROOT = f"abfss://source@{STORAGE_ACCOUNT}.dfs.core.windows.net"
BRONZE_ROOT = f"abfss://bronze@{STORAGE_ACCOUNT}.dfs.core.windows.net"

# Per-dataset locations: input folder, streaming checkpoint folder, and the
# bronze output folder.
source_path = f"{SOURCE_ROOT}/{file_name}"
checkpoint_path = f"{BRONZE_ROOT}/checkpoint_{file_name}"
output_path = f"{BRONZE_ROOT}/{file_name}"

print(f"Source path: {source_path}")
print(f"Checkpoint path: {checkpoint_path}")
print(f"Output path: {output_path}")

### **Data Reading**

In [0]:
# Build a streaming DataFrame over the source folder using the "cloudFiles"
# source with parquet input and the explicit schema selected above.
# The schema location is set to the same folder as the stream checkpoint.
df = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "parquet")
    .option("cloudFiles.schemaLocation", checkpoint_path)
    .schema(current_schema)
    .load(source_path)
)

print(f"Created streaming DataFrame for {file_name}")

### **Data Writing**

In [0]:
# Append the stream to the bronze output folder as parquet, tracking progress
# in `checkpoint_path`. Any failure is logged and then re-raised so the
# notebook run is marked as failed.
# NOTE(review): newer runtimes deprecate trigger(once=True) in favour of
# trigger(availableNow=True) -- confirm the cluster runtime before switching.
try:
    stream = (
        df.writeStream
        .format("parquet")
        .outputMode("append")
        .option("checkpointLocation", checkpoint_path)
        .option("path", output_path)
        .trigger(once=True)
        .start()
    )

    # Block this cell until the streaming query has terminated.
    stream.awaitTermination()
    print(f"Successfully processed {file_name}")

except Exception as err:
    print(f"Error processing {file_name}: {str(err)}")
    raise

In [0]:
# Best-effort check of what landed in the bronze folder: report the row count
# and print the schema. A failure here is only logged, not re-raised.
try:
    result_df = spark.read.format("parquet").load(output_path)
    row_count = result_df.count()
    print(f"Verification: {file_name} contains {row_count} rows in bronze layer")

    print(f"Schema for {file_name}:")
    result_df.printSchema()

except Exception as err:
    print(f"Could not verify {file_name}: {str(err)}")

### **Manual Validation of Bronze Layer Auto Loader Output**

In [0]:
# Manual spot check of the bronze output for this dataset.
# Reads from `output_path` (the folder the stream wrote to) instead of
# rebuilding the URI by hand, and binds the result to its own name so the
# streaming `df` used by the write cell is not overwritten.
bronze_df = spark.read.format("parquet").load(output_path)
display(bronze_df)