In [None]:
# Import required libraries
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit

In [None]:
# Obtain the Spark session for this notebook (reuses an active session if one exists).
spark = (
    SparkSession.builder
    .appName("ReadProcessedData")
    .getOrCreate()
)

In [None]:
# Azure Data Lake settings (using the same approach as in InspectRawData)
storage_account_name = "datalakestoragetask"  # Replace with your storage account name
processed_container = "processed"
analytics_ready_container = "analytics-ready"

# Never paste the account key into the notebook: read it from the environment
# instead. Falls back to an empty string when the variable is not set, which
# matches the previous placeholder value.
# NOTE(review): on Databricks a secret scope (dbutils.secrets.get) is the usual
# alternative -- swap it in here if one is configured for this workspace.
storage_key = os.environ.get("AZURE_STORAGE_KEY", "")

# Configure Spark to access Azure Data Lake.
# Only set the account key when one was actually supplied, so an empty
# placeholder does not clobber credentials configured elsewhere (e.g. at
# cluster level).
if storage_key:
    spark.conf.set(f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net", storage_key)

# Define paths for processed and analytics-ready data
paths = {
    "price_processed": f"abfss://{processed_container}@{storage_account_name}.dfs.core.windows.net/price_and_stock/price",
    "stock_processed": f"abfss://{processed_container}@{storage_account_name}.dfs.core.windows.net/price_and_stock/stock",
    "analytics_ready": f"abfss://{analytics_ready_container}@{storage_account_name}.dfs.core.windows.net"
}

In [None]:
# Read the processed price and stock datasets (Parquet) from the data lake.
price_df, stock_df = (
    spark.read.format("parquet").load(paths[dataset_key])
    for dataset_key in ("price_processed", "stock_processed")
)

In [None]:
# Step 2: Merge price and stock data using the composite key (including manufacturer).
# Joining on a list of column names keeps a single copy of each key column.
composite_key = ["manufacturer", "manufacturer_pid", "retailer_pid", "order_unit"]
joined_df = price_df.join(stock_df, on=composite_key, how="inner")

# The replenishment column used to be referenced as " replenishment_time"
# (with a leading space). Resolve it by its stripped name so the select works
# whether or not the stored column carries stray whitespace; if no such column
# exists, fall back to the clean name so Spark reports a readable error.
replenishment_source = next(
    (name for name in joined_df.columns if name.strip() == "replenishment_time"),
    "replenishment_time",
)

# Step 3: Clean and restructure the merged data.
# NOTE(review): of the join keys only `manufacturer` is kept here;
# manufacturer_pid, retailer_pid and order_unit are dropped -- confirm that
# downstream consumers do not need them.
merged_df = joined_df.select(
    col("manufacturer"),
    col("price").alias("price_amount"),
    col("price_base"),
    col("currency"),
    col("tax_class"),
    col("saleable").alias("is_saleable"),
    col("quantity"),
    # Always expose the column under a whitespace-free name.
    col(replenishment_source).alias("replenishment_time"),
    col("deeplink"),
)


In [None]:
# Display the merged DataFrame: schema first, then sample rows.
print("\n--- Displaying Merged Data ---")
# Print the schema of the merged frame (the cell previously printed price_df's
# schema under this "Merged Data" banner).
merged_df.printSchema()
merged_df.limit(30).show(truncate=False)
# Hand the Spark DataFrame straight to display(). Converting to pandas first
# makes Databricks convert it back via createDataFrame, which raised the
# ValueError recorded in this cell's previous output.
display(merged_df.limit(10))




--- Displaying Merged Data ---
root
 |-- manufacturer: string (nullable = true)
 |-- manufacturer_pid: string (nullable = true)
 |-- retailer_pid: string (nullable = true)
 |-- order_unit: string (nullable = true)
 |-- price: double (nullable = true)
 |-- price_base: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- tax_class: string (nullable = true)
 |-- saleable: string (nullable = true)



[0;31m---------------------------------------------------------------------------[0m
[0;31mValueError[0m                                Traceback (most recent call last)
File [0;32m<command-513621778872262>, line 6[0m
[1;32m      4[0m price_df[38;5;241m.[39mprintSchema()
[1;32m      5[0m [38;5;66;03m#merged_df.limit(30).show(truncate=False)[39;00m
[0;32m----> 6[0m display(merged_df[38;5;241m.[39mlimit([38;5;241m10[39m)[38;5;241m.[39mtoPandas())

File [0;32m/databricks/python_shell/dbruntime/display.py:152[0m, in [0;36mDisplay.display[0;34m(self, input, *args, **kwargs)[0m
[1;32m    150[0m     [38;5;28mself[39m[38;5;241m.[39mdisplay([38;5;28mself[39m[38;5;241m.[39msparkSession[38;5;241m.[39mcreateDataFrame([38;5;28minput[39m))
[1;32m    151[0m [38;5;28;01melif[39;00m [38;5;28mtype[39m([38;5;28minput[39m)[38;5;241m.[39m[38;5;18m__module__[39m [38;5;241m==[39m [38;5;124m'[39m[38;5;124mpandas.core.frame[39m[38;5;124m'[39m [38;5;

###  Write Merged Data to Analytics-Ready Container


In [None]:
# Destination for the merged price/stock dataset inside the analytics-ready
# container. The path was previously referenced without ever being defined,
# which raises NameError on a fresh run.
# NOTE(review): the "price_and_stock" sub-folder mirrors the layout of the
# processed container -- confirm this is the intended target location.
price_stock_analytics_path = f"{paths['analytics_ready']}/price_and_stock"

# Overwrite any previous export so re-running the notebook is idempotent.
merged_df.write.format("parquet").mode("overwrite").save(price_stock_analytics_path)

print("\n--- Merged Data Written to Analytics-Ready Container ---")