In [None]:
# Import necessary libraries
import os

from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, when, lit, split, concat, regexp_extract



In [None]:
# Create the Spark session used by the rest of the notebook, or reuse the
# one that is already active.
spark = (
    SparkSession.builder
    .appName("InspectRawData")
    .getOrCreate()
)


In [None]:
# Azure Data Lake settings (replace placeholders with actual values)
storage_account_name = "datalakestoragetask"  # Replace with your storage account name
raw_container = "raw"
processed_container = "processed"

# Do not paste the account key into the notebook: it would be saved with the
# file and could leak through sharing or version control. Supply it through
# the AZURE_STORAGE_KEY environment variable instead. It defaults to an empty
# string when the variable is not set.
storage_key = os.environ.get("AZURE_STORAGE_KEY", "")

# Host name shared by the credential setting and every abfss:// path below.
account_host = f"{storage_account_name}.dfs.core.windows.net"

# Configure Spark to access Azure Data Lake with the account key.
# The setting is skipped when no key was supplied, so an empty key is never
# written to the Spark conf.
# NOTE(review): presumably another credential method is configured for the
# cluster in that case; confirm for your environment.
if storage_key:
    spark.conf.set(f"fs.azure.account.key.{account_host}", storage_key)

# Define paths for each dataset
paths = {
    "price_stock_raw": f"abfss://{raw_container}@{account_host}/price_and_stock",
    "price_stock_processed": f"abfss://{processed_container}@{account_host}/price_and_stock",
}


In [None]:
# Function to inspect a dataset
def inspect_dataset(name, path, format_type, options=None):
    """Load a dataset with Spark, print its schema and a sample, and return it.

    Args:
        name: Label for the dataset, used only in the printed messages.
        path: Location handed to the Spark reader's ``load``.
        format_type: Data source format name handed to the reader's ``format``
            (for example ``"csv"``).
        options: Optional dict of reader options, expanded as keyword
            arguments to the reader's ``options``. ``None`` means no options.

    Returns:
        The loaded DataFrame, or ``None`` if any step raised. The error is
        printed rather than re-raised, so callers must check for ``None``.
    """
    # A None default avoids sharing one mutable dict object across calls.
    reader_options = {} if options is None else options

    print(f"\n=== Inspecting {name} Dataset ===")
    try:
        # Load dataset based on format
        df = spark.read.format(format_type).options(**reader_options).load(path)

        # Show schema and a sample of the data (at most 30 rows, untruncated)
        df.printSchema()
        df.limit(30).show(truncate=False)

        # Return DataFrame for further analysis if needed
        return df
    except Exception as e:
        # Deliberately best-effort: report the failure and keep going.
        print(f"Error reading {name} data: {e}")
        return None

# Inspect datasets one by one.
# Print a banner (with a blank line before and after) so the start of the
# inspection output is easy to find.
print("\n--- Starting Inspection ---\n")




--- Starting Inspection ---



In [None]:
# Price data: one specific CSV file, semicolon-delimited, with a header row
# and double-quote as the quote character.
price_df = inspect_dataset(
    "Price",
    f'{paths["price_stock_raw"]}/PRICE-RET0001-202410151405.csv',
    "csv",
    dict(header=True, delimiter=";", quote='"'),
)


=== Inspecting Price Dataset ===
root
 |-- manufacturer: string (nullable = true)
 |-- manufacturer_pid: string (nullable = true)
 |-- retailer_pid: string (nullable = true)
 |-- order_unit: string (nullable = true)
 |-- price: string (nullable = true)
 |-- price_base: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- tax_class: string (nullable = true)
 |-- saleable: string (nullable = true)

+------------+----------------+------------+----------+-----+----------+--------+---------+--------+
|manufacturer|manufacturer_pid|retailer_pid|order_unit|price|price_base|currency|tax_class|saleable|
+------------+----------------+------------+----------+-----+----------+--------+---------+--------+
|tequip      |11060           |11060       |C62       |579  |1         |EUR     |1        |1       |
|tequip      |DEMO            |DEMO        |C62       |99.99|1         |EUR     |1        |1       |
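|tequip      |9999911111      |9999911111  |C62       |0.02 |1         |EUR     |1        |1       |
+------------+----------------+------------+----------+-----+----------+--------+---------+--------+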

In [None]:
# inspect_dataset returns None when loading fails. Stop here with a clear
# message rather than an AttributeError raised on None.
if price_df is None:
    raise RuntimeError(
        "Price dataset failed to load; see the error printed by inspect_dataset."
    )

print("=== Inspecting price Dataset ===")
price_df.printSchema()
# Convert a 7-row sample to pandas so it is displayed as a table.
display(price_df.limit(7).toPandas())  # Display as table-like format

=== Inspecting price Dataset ===
root
 |-- manufacturer: string (nullable = true)
 |-- manufacturer_pid: string (nullable = true)
 |-- retailer_pid: string (nullable = true)
 |-- order_unit: string (nullable = true)
 |-- price: string (nullable = true)
 |-- price_base: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- tax_class: string (nullable = true)
 |-- saleable: string (nullable = true)



manufacturer,manufacturer_pid,retailer_pid,order_unit,price,price_base,currency,tax_class,saleable
tequip,11060,11060,C62,579.0,1,EUR,1,1
tequip,DEMO,DEMO,C62,99.99,1,EUR,1,1
tequip,9999911111,9999911111,C62,0.02,1,EUR,1,1


In [None]:
# Price Dataset
# Clean the raw price rows: convert the price to a number, drop rows without
# a usable price or manufacturer_pid, and remove exact duplicate rows.
price_transformed = (
    price_df
    # Cast BEFORE filtering. With ANSI mode off, a string that cannot be
    # parsed as a number becomes null when cast, so filtering first would let
    # those rows through with a null price.
    # NOTE(review): with ANSI mode on, the cast raises instead; confirm the
    # cluster setting.
    .withColumn("price", col("price").cast("double"))
    .filter(col("price").isNotNull() & col("manufacturer_pid").isNotNull())
    .dropDuplicates()
)

# Validate
price_transformed.show()


+------------+----------------+------------+----------+-----+----------+--------+---------+--------+
|manufacturer|manufacturer_pid|retailer_pid|order_unit|price|price_base|currency|tax_class|saleable|
+------------+----------------+------------+----------+-----+----------+--------+---------+--------+
|      tequip|      9999911111|  9999911111|       C62| 0.02|         1|     EUR|        1|       1|
|      tequip|           11060|       11060|       C62|579.0|         1|     EUR|        1|       1|
|      tequip|            DEMO|        DEMO|       C62|99.99|         1|     EUR|        1|       1|
+------------+----------------+------------+----------+-----+----------+--------+---------+--------+



## 4. Write Transformed Data to Processed Folder
Save the transformed datasets into the processed folder.

In [None]:
# Save the cleaned price data as Parquet under the processed location,
# replacing whatever an earlier run wrote there.
(
    price_transformed.write
    .format("parquet")
    .mode("overwrite")
    .save(f'{paths["price_stock_processed"]}/price')
)
