In [None]:
# Import necessary libraries
import os

from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, when, lit, split, concat, regexp_extract



In [None]:
# Create the Spark session for this notebook, or reuse one that is already
# running (getOrCreate returns the existing session when there is one).
spark = (
    SparkSession.builder
    .appName("InspectRawData")
    .getOrCreate()
)


In [None]:
# Azure Data Lake settings (replace placeholders with actual values)
storage_account_name = "datalakestoragetask"  # Replace with your storage account name
raw_container = "raw"
processed_container = "processed"

# Read the account key from the environment so the secret is never saved inside
# the notebook (or its version history). Falls back to an empty string, which
# is the value the placeholder used to hold.
storage_key = os.environ.get("AZURE_STORAGE_KEY", "")

# Host name shared by the Spark configuration key and every abfss:// URI below.
account_host = f"{storage_account_name}.dfs.core.windows.net"

# Configure Spark to access Azure Data Lake with the account key.
# Only register the key when one was actually supplied: an empty key cannot
# authenticate anything.
# NOTE(review): presumably the cluster may provide another credential method
# in that case - confirm for your environment.
if storage_key:
    spark.conf.set(f"fs.azure.account.key.{account_host}", storage_key)
else:
    print("AZURE_STORAGE_KEY is not set; skipping account-key configuration.")

# Define paths for each dataset (raw input folder and processed output folder)
paths = {
    "price_stock_raw": f"abfss://{raw_container}@{account_host}/price_and_stock",
    "price_stock_processed": f"abfss://{processed_container}@{account_host}/price_and_stock",
}


In [None]:
# Function to inspect a dataset
def inspect_dataset(name, path, format_type, options=None, sample_rows=30):
    """Load a dataset with Spark, print its schema and a sample, and return it.

    Args:
        name: Human-readable dataset label used in the printed messages.
        path: Location passed to the Spark reader's ``load``.
        format_type: Spark data source format passed to ``format`` (e.g. "csv").
        options: Optional mapping of reader options, forwarded as keyword
            arguments to ``options``. ``None`` (the default) means no options.
        sample_rows: Maximum number of rows to print, untruncated. Defaults to 30.

    Returns:
        The loaded DataFrame, or ``None`` if reading or displaying raised an
        exception (the error is printed, not re-raised). Callers that need the
        data must check for ``None`` before using the result.
    """
    print(f"\n=== Inspecting {name} Dataset ===")
    # Avoid a mutable default argument; a missing or empty mapping means "no options".
    reader_options = options or {}
    try:
        # Load dataset based on format
        df = spark.read.format(format_type).options(**reader_options).load(path)

        # Show schema and a sample of the data
        df.printSchema()
        df.limit(sample_rows).show(truncate=False)

        # Return DataFrame for further analysis if needed
        return df
    except Exception as e:
        # Best-effort inspection: report the failure and let the caller decide.
        print(f"Error reading {name} data: {e}")
        return None

# Inspect datasets one by one
print("\n--- Starting Inspection ---\n")




--- Starting Inspection ---



In [None]:
# Load and preview the stock extract (CSV format).
# The reader is told the file has a header row, uses ';' as the delimiter
# (semicolon-separated values) and '"' as the quote character.
stock_df = inspect_dataset(
    "Stock",
    paths["price_stock_raw"] + "/STOCK-RET0001-202410151330.csv",  # Specific stock file
    "csv",
    dict(header=True, delimiter=";", quote='"'),
)

print("\n--- Inspection Completed ---")



=== Inspecting Stock Dataset ===
root
 |-- manufacturer: string (nullable = true)
 |-- manufacturer_pid: string (nullable = true)
 |-- retailer_pid: string (nullable = true)
 |-- order_unit: string (nullable = true)
 |-- quantity: string (nullable = true)
 |--  replenishment_time: string (nullable = true)
 |-- deeplink: string (nullable = true)

+------------+----------------+------------+----------+--------+-------------------+----------------------------------+
|manufacturer|manufacturer_pid|retailer_pid|order_unit|quantity| replenishment_time|deeplink                          |
+------------+----------------+------------+----------+--------+-------------------+----------------------------------+
|tequip_de   |11060           |11060       |C62       |99      |3                  |https://www.test.tequip/11060     |
|tequip_de   |DEMO            |DEMO        |C62       |0       |3                  |https://www.test.tequip/DEMO      |
|tequip_de   |9999911111      |9999911111  |C62    

In [None]:
# Show the Stock dataset again: schema first, then the first 10 rows converted
# to pandas so the notebook renders them as a table.
# The heading names the dataset actually shown (stock_df), not "price".
print("=== Inspecting Stock Dataset ===")
stock_df.printSchema()
display(stock_df.limit(10).toPandas())  # Display as table-like format

=== Inspecting price Dataset ===
root
 |-- manufacturer: string (nullable = true)
 |-- manufacturer_pid: string (nullable = true)
 |-- retailer_pid: string (nullable = true)
 |-- order_unit: string (nullable = true)
 |-- quantity: string (nullable = true)
 |--  replenishment_time: string (nullable = true)
 |-- deeplink: string (nullable = true)



manufacturer,manufacturer_pid,retailer_pid,order_unit,quantity,replenishment_time,deeplink
tequip_de,11060,11060,C62,99,3,https://www.test.tequip/11060
tequip_de,DEMO,DEMO,C62,0,3,https://www.test.tequip/DEMO
tequip_de,9999911111,9999911111,C62,15,3,https://www.test.tequip/9999911111


In [None]:
# Stock Dataset
# Clean Stock Data

# inspect_dataset returns None when the load fails; stop here with a clear
# message instead of an AttributeError on None further down.
if stock_df is None:
    raise ValueError(
        "stock_df is None: the Stock dataset failed to load, nothing to transform"
    )

# Cleaning steps:
#   1. Strip stray whitespace from the column names. The raw header produces
#      " replenishment_time" (leading space), which would otherwise be carried
#      into the processed output.
#      NOTE(review): some Spark versions reject Parquet column names that
#      contain spaces - confirm for your runtime.
#   2. Cast quantity to int once.
#   3. Keep only rows with a positive quantity. Null quantities, and values
#      that fail the cast (they become null), are dropped by the comparison.
stock_transformed = (
    stock_df
    .toDF(*[column_name.strip() for column_name in stock_df.columns])
    .withColumn("quantity", col("quantity").cast("int"))
    .filter(col("quantity") > 0)
)

# Validate the cleaned data
stock_transformed.show()

+------------+----------------+------------+----------+--------+-------------------+--------------------+
|manufacturer|manufacturer_pid|retailer_pid|order_unit|quantity| replenishment_time|            deeplink|
+------------+----------------+------------+----------+--------+-------------------+--------------------+
|   tequip_de|           11060|       11060|       C62|      99|                  3|https://www.test....|
|   tequip_de|      9999911111|  9999911111|       C62|      15|                  3|https://www.test....|
+------------+----------------+------------+----------+--------+-------------------+--------------------+



## 4. Write Transformed Data to Processed Folder
Save the transformed datasets into the processed folder.

In [None]:
# Persist the cleaned stock data as Parquet under the processed folder,
# replacing whatever a previous run wrote to the same location.
(
    stock_transformed.write
    .mode("overwrite")
    .format("parquet")
    .save(paths["price_stock_processed"] + "/stock")
)
