In [None]:
# Import necessary libraries
import os

from pyspark.sql import SparkSession
from pyspark.sql import DataFrame


In [None]:
# Build (or reuse) the Spark session that every later cell reads through
session_builder = SparkSession.builder.appName("InspectRawData")
spark = session_builder.getOrCreate()


In [None]:
# Azure Data Lake settings (replace placeholders with actual values)
storage_account_name = "datalakestoragetask"  # Replace with your storage account name
raw_container = "raw"
processed_container = "processed"

# Read the account key from the environment so the secret never has to be
# pasted into the notebook source. When AZURE_STORAGE_KEY is unset this falls
# back to the same empty placeholder the notebook used before.
storage_key = os.environ.get("AZURE_STORAGE_KEY", "")

# Configure Spark to access Azure Data Lake.
# Only register the key when one was actually supplied: pushing an empty key
# into the Spark conf cannot authenticate and would shadow any other
# credential method configured for this storage account.
if storage_key:
    spark.conf.set(f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net", storage_key)

# Root URI of the raw container; every dataset folder lives directly below it
raw_base_uri = f"abfss://{raw_container}@{storage_account_name}.dfs.core.windows.net"

# Define paths for each dataset (key -> folder under the raw container)
paths = {
    "catalog": f"{raw_base_uri}/catalog",
    "customer": f"{raw_base_uri}/customer",
    "order": f"{raw_base_uri}/order",
    "price_stock": f"{raw_base_uri}/price_and_stock"
}



In [None]:
# Function to inspect a dataset
def inspect_dataset(name, path, format_type, options=None, sample_rows=30):
    """Load a dataset with Spark, print its schema and a sample, and return it.

    Args:
        name: Human-readable dataset label used in the printed messages.
        path: Location handed to the Spark reader's ``load``.
        format_type: Spark data source format name (e.g. "csv", "xml", "text").
        options: Optional mapping of reader options; ``None`` (the default)
            means no extra options are passed to the reader.
        sample_rows: How many rows to show in the printed sample (default 30).

    Returns:
        The loaded DataFrame, or ``None`` when loading or showing it raised.
        Callers must check for ``None`` before using the result.
    """
    # Use None as the default instead of a shared mutable {} and copy whatever
    # the caller passed so the caller's dict is never aliased.
    reader_options = dict(options) if options else {}

    print(f"\n=== Inspecting {name} Dataset ===")
    try:
        # Load dataset based on format
        df = spark.read.format(format_type).options(**reader_options).load(path)

        # Show schema and a sample of the data
        df.printSchema()
        df.limit(sample_rows).show(truncate=False)

        # Return DataFrame for further analysis if needed
        return df
    except Exception as e:
        # Deliberate best-effort: report the failure and keep the notebook
        # running so the remaining datasets can still be inspected.
        print(f"Error reading {name} data: {e}")
        return None

# Inspect datasets one by one; this banner marks where the per-dataset output begins
print("\n--- Starting Inspection ---\n")




--- Starting Inspection ---



In [None]:
# Catalog data (XML format); rowTag names the XML element read as one row
# ('PRODUCT', chosen from the XML structure)
catalog_reader_options = {"rowTag": "PRODUCT"}
catalog_df = inspect_dataset("Catalog", paths["catalog"], "xml", catalog_reader_options)

In [None]:
# Show the catalog schema again plus a 10-row table-style preview.
# inspect_dataset returns None when the read failed, so guard before
# dereferencing instead of crashing with an AttributeError.
print("=== Inspecting Catalog Dataset ===")
if catalog_df is None:
    print("Catalog dataset was not loaded; nothing to display.")
else:
    catalog_df.printSchema()
    display(catalog_df.limit(10).toPandas())  # Display as table-like format

MIME_INFO,PRODUCT_DETAILS,PRODUCT_FEATURES,PRODUCT_ORDER_DETAILS,PRODUCT_PRICE_DETAILS,SUPPLIER_PID,USER_DEFINED_EXTENSIONS,_mode
"List(List(List(List(Hersteller-Logo, deu), List(Hersteller-Logo, deu), logo, List(panel_logo.gif, deu), image/gif), List(List(Bedienungsanleitung, deu), List(Bedienungsanleitung, deu), data_sheet, List(bedienungsanleitung_11010.pdf, deu), application/pdf), List(null, null, normal, List(11070_normal.jpg, deu), image/jpeg)))","List(DEMO, List(DEMO 1 (deu, de) FOR PRODUCT MANAGEMENT DEMONSTRATION PURPOSES ✓, deu), List(List(DEMO 1 (deu, de), deu, de), List(DEMO 1 (eng, en), eng, en)), List(1, gtin), tequip, DEMO_MPID, DEMO_APID)","List(List(null, 1.0E9, null, udf_NMCATEGORIES-1.0), List(List(List(List(tec_detail_1, deu), DEMO UNIT, List(42_1, deu)), List(List(tec_detail_2, deu), DEMO UNIT, List(42_2, deu)), List(List(tec_detail_3, deu), DEMO UNIT, List(42_3, deu))), null, List(List(Technische Daten, deu), List(Technical Data, eng), List(Datos tecnicos, spa)), udf_NMTECHNICALDETAILS-1.0))","List(C62, 1, 1)","List(List(0.01, EUR, List(standard_rate), net_list))",DEMO,"List(true, false)",new
"List(List(List(List(Hersteller-Logo, deu), List(Hersteller-Logo, deu), logo, List(panel_logo.gif, deu), image/gif), List(List(Bedienungsanleitung, deu), List(Bedienungsanleitung, deu), data_sheet, List(bedienungsanleitung_11010.pdf, deu), application/pdf), List(null, null, normal, List(11070_normal.jpg, deu), image/jpeg)))","List(null, List(DEMO 9999911111 (deu, de) FOR PRODUCT MANAGEMENT DEMONSTRATION PURPOSES, deu), List(List(DEMO 9999911111 (deu, de), deu, de), List(DEMO 9999911111 (eng, en), eng, en)), List(2, gtin), tequip, 9999933333, 9999922222)","List(List(null, 1.0E9, null, udf_NMCATEGORIES-1.0), List(List(List(List(tec_detail_1, deu), DEMO UNIT, List(42_1, deu)), List(List(tec_detail_2, deu), DEMO UNIT, List(42_2, deu)), List(List(tec_detail_3, deu), DEMO UNIT, List(42_3, deu))), null, List(List(Technische Daten, deu), List(Technical Data, eng), List(Datos tecnicos, spa)), udf_NMTECHNICALDETAILS-1.0))","List(C62, 1, 1)","List(List(0.01, EUR, List(standard_rate), net_list))",9999911111,"List(true, true)",new


In [None]:
# Customer data (CSV format): header row, semicolon-separated values
customer_reader_options = {"header": True, "delimiter": ";"}
customer_df = inspect_dataset("Customer", paths["customer"], "csv", customer_reader_options)


In [None]:
# Step 1: list and log every file in the order folder before reading it
order_path = paths["order"]
print("\n--- Processing Order Files ---")
order_files = dbutils.fs.ls(order_path)

# One log line per file: name, size and modification time as reported by dbutils
print("Files in the Order folder:")
for file_info in order_files:
    print(f"- {file_info.name} (size: {file_info.size} bytes, modification time: {file_info.modificationTime})")

# Step 2: read the Order dataset (EDI files) as plain text; pointing the
# reader at the folder combines all files in it
order_df = inspect_dataset("Order", order_path, "text")


--- Processing Order Files ---
Files in the Order folder:
- example-order-1.edi (size: 252 bytes, modification time: 1733905923000)
- example-order-2.edi (size: 261 bytes, modification time: 1733905923000)
- example-order-3.edi (size: 293 bytes, modification time: 1733905923000)
- example-order-4.edi (size: 249 bytes, modification time: 1733905923000)

=== Inspecting Order Dataset ===
root
 |-- value: string (nullable = true)

+----------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                     |
+----------------------------------------------------------------------------------------------------------------------------------------------------------+
|HDR;ORD;2.0;NOID;tequip;KDNR000001;Test.User;999999;;;2024-09-06T12:05;NORML;;;;;;;;test.user@ne

In [None]:
# Table-style preview of the first 10 catalog rows.
# inspect_dataset returns None when the read failed, so guard before use.
if catalog_df is None:
    print("Catalog dataset was not loaded; nothing to display.")
else:
    display(catalog_df.limit(10).toPandas())  # Display as table-like format

In [None]:
# Price data (CSV format): one specific price file, semicolon-separated,
# header row, double-quote as the quote character
price_file = paths["price_stock"] + "/PRICE-RET0001-202410151405.csv"
price_reader_options = {"header": True, "delimiter": ";", "quote": '"'}
price_df = inspect_dataset("Price", price_file, "csv", price_reader_options)


=== Inspecting Price Dataset ===
root
 |-- manufacturer: string (nullable = true)
 |-- manufacturer_pid: string (nullable = true)
 |-- retailer_pid: string (nullable = true)
 |-- order_unit: string (nullable = true)
 |-- price: string (nullable = true)
 |-- price_base: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- tax_class: string (nullable = true)
 |-- saleable: string (nullable = true)

+------------+----------------+------------+----------+-----+----------+--------+---------+--------+
|manufacturer|manufacturer_pid|retailer_pid|order_unit|price|price_base|currency|tax_class|saleable|
+------------+----------------+------------+----------+-----+----------+--------+---------+--------+
|tequip      |11060           |11060       |C62       |579  |1         |EUR     |1        |1       |
|tequip      |DEMO            |DEMO        |C62       |99.99|1         |EUR     |1        |1       |
|tequip      |9999911111      |9999911111  |C62       |0.02 |1         |EUR  

In [None]:
# Table-style preview of the first 10 price rows.
# inspect_dataset returns None when the read failed, so guard before use.
if price_df is None:
    print("Price dataset was not loaded; nothing to display.")
else:
    display(price_df.limit(10).toPandas())  # Display as table-like format

manufacturer,manufacturer_pid,retailer_pid,order_unit,price,price_base,currency,tax_class,saleable
tequip,11060,11060,C62,579.0,1,EUR,1,1
tequip,DEMO,DEMO,C62,99.99,1,EUR,1,1
tequip,9999911111,9999911111,C62,0.02,1,EUR,1,1


In [None]:
# Stock data (CSV format): one specific stock file, semicolon-separated,
# header row, double-quote as the quote character
stock_file = paths["price_stock"] + "/STOCK-RET0001-202410151330.csv"
stock_reader_options = {"header": True, "delimiter": ";", "quote": '"'}
# NOTE(review): the printed schema lists " replenishment_time" with a leading
# space in the column name - confirm against the raw header before relying on it.
stock_df = inspect_dataset("Stock", stock_file, "csv", stock_reader_options)

# Closing banner for the inspection run
print("\n--- Inspection Completed ---")



=== Inspecting Stock Dataset ===
root
 |-- manufacturer: string (nullable = true)
 |-- manufacturer_pid: string (nullable = true)
 |-- retailer_pid: string (nullable = true)
 |-- order_unit: string (nullable = true)
 |-- quantity: string (nullable = true)
 |--  replenishment_time: string (nullable = true)
 |-- deeplink: string (nullable = true)

+------------+----------------+------------+----------+--------+-------------------+----------------------------------+
|manufacturer|manufacturer_pid|retailer_pid|order_unit|quantity| replenishment_time|deeplink                          |
+------------+----------------+------------+----------+--------+-------------------+----------------------------------+
|tequip_de   |11060           |11060       |C62       |99      |3                  |https://www.test.tequip/11060     |
|tequip_de   |DEMO            |DEMO        |C62       |0       |3                  |https://www.test.tequip/DEMO      |
|tequip_de   |9999911111      |9999911111  |C62    

In [None]:
# Table-style preview of the first 10 stock rows.
# inspect_dataset returns None when the read failed, so guard before use.
if stock_df is None:
    print("Stock dataset was not loaded; nothing to display.")
else:
    display(stock_df.limit(10).toPandas())  # Display as table-like format

manufacturer,manufacturer_pid,retailer_pid,order_unit,quantity,replenishment_time,deeplink
tequip_de,11060,11060,C62,99,3,https://www.test.tequip/11060
tequip_de,DEMO,DEMO,C62,0,3,https://www.test.tequip/DEMO
tequip_de,9999911111,9999911111,C62,15,3,https://www.test.tequip/9999911111
