In [None]:
# Import necessary libraries
import os

from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, when, lit, split, concat, regexp_extract



In [None]:
# Build the Spark session used by every cell below; getOrCreate() hands back an
# already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("InspectRawData")
    .getOrCreate()
)


In [None]:
# Azure Data Lake settings (replace the placeholders with values for your environment)
storage_account_name = "datalakestoragetask"  # Replace with your storage account name
raw_container = "raw"
processed_container = "processed"

# Read the account key from the environment so the secret is never saved inside the
# notebook. Export AZURE_STORAGE_KEY before starting the kernel, or replace this line
# with your own credential method.
storage_key = os.environ.get("AZURE_STORAGE_KEY", "")

# Configure Spark to access Azure Data Lake with the account key.
# When no key is supplied the setting is left untouched, so an empty value never
# overrides credentials that may already be configured for the Spark session.
if storage_key:
    spark.conf.set(f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net", storage_key)
else:
    print("AZURE_STORAGE_KEY is not set; relying on credentials already configured for Spark.")

# Define paths for each dataset
paths = {
    "catalog_raw": f"abfss://{raw_container}@{storage_account_name}.dfs.core.windows.net/catalog",
    "catalog_processed": f"abfss://{processed_container}@{storage_account_name}.dfs.core.windows.net/catalog/catalog.parquet",
}


In [None]:
# Function to inspect a dataset
def inspect_dataset(name, path, format_type, options=None, sample_rows=30):
    """Load a dataset with Spark, print its schema and show a sample of rows.

    Args:
        name: Human-readable dataset name, used only in the printed messages.
        path: Location passed to the Spark reader's ``load``.
        format_type: Reader format name passed to ``spark.read.format``.
        options: Optional dict of reader options; ``None`` means no options.
        sample_rows: Maximum number of rows to show (defaults to 30).

    Returns:
        The loaded DataFrame, or ``None`` when loading or previewing raised an
        exception (the error is printed rather than re-raised, so callers must
        check for ``None`` before using the result).
    """
    print(f"\n=== Inspecting {name} Dataset ===")
    # A None default avoids sharing one mutable dict between calls and also lets
    # callers pass options=None explicitly.
    reader_options = options or {}
    try:
        # Load dataset based on format
        df = spark.read.format(format_type).options(**reader_options).load(path)

        # Show schema and a sample of the data
        df.printSchema()
        df.limit(sample_rows).show(truncate=False)

        # Return DataFrame for further analysis if needed
        return df
    except Exception as e:
        # Best-effort inspection: report the failure and keep the notebook running.
        print(f"Error reading {name} data: {e}")
        return None

# Inspect datasets one by one; this banner is printed before the per-dataset cells run
print("\n--- Starting Inspection ---\n")




--- Starting Inspection ---



In [None]:
# Catalog data (XML format)
# rowTag is 'PRODUCT' based on the XML structure of the catalog files.
catalog_reader_options = {"rowTag": "PRODUCT"}
catalog_raw_path = paths["catalog_raw"]

catalog_df = inspect_dataset(
    name="Catalog",
    path=catalog_raw_path,
    format_type="xml",
    options=catalog_reader_options,
)

# Alternative kept for reference: load the raw dataset without the inspection helper
#catalog_df = spark.read.format("xml").options(rowTag="PRODUCT").load(paths["catalog_raw"])


=== Inspecting Catalog Dataset ===
root
 |-- MIME_INFO: struct (nullable = true)
 |    |-- MIME: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- MIME_ALT: struct (nullable = true)
 |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |    |-- _lang: string (nullable = true)
 |    |    |    |-- MIME_DESCR: struct (nullable = true)
 |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |    |-- _lang: string (nullable = true)
 |    |    |    |-- MIME_PURPOSE: string (nullable = true)
 |    |    |    |-- MIME_SOURCE: struct (nullable = true)
 |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |    |-- _lang: string (nullable = true)
 |    |    |    |-- MIME_TYPE: string (nullable = true)
 |-- PRODUCT_DETAILS: struct (nullable = true)
 |    |-- BUYER_PID: string (nullable = true)
 |    |-- DESCRIPTION_LONG: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 

In [None]:
# Print the catalog schema again, then render the first 10 rows as a pandas table
print("=== Inspecting Catalog Dataset ===")
catalog_df.printSchema()
catalog_preview = catalog_df.limit(10).toPandas()
display(catalog_preview)  # Display as table-like format

=== Inspecting Catalog Dataset ===
root
 |-- MIME_INFO: struct (nullable = true)
 |    |-- MIME: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- MIME_ALT: struct (nullable = true)
 |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |    |-- _lang: string (nullable = true)
 |    |    |    |-- MIME_DESCR: struct (nullable = true)
 |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |    |-- _lang: string (nullable = true)
 |    |    |    |-- MIME_PURPOSE: string (nullable = true)
 |    |    |    |-- MIME_SOURCE: struct (nullable = true)
 |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |    |-- _lang: string (nullable = true)
 |    |    |    |-- MIME_TYPE: string (nullable = true)
 |-- PRODUCT_DETAILS: struct (nullable = true)
 |    |-- BUYER_PID: string (nullable = true)
 |    |-- DESCRIPTION_LONG: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |

MIME_INFO,PRODUCT_DETAILS,PRODUCT_FEATURES,PRODUCT_ORDER_DETAILS,PRODUCT_PRICE_DETAILS,SUPPLIER_PID,USER_DEFINED_EXTENSIONS,_mode
"List(List(List(List(Hersteller-Logo, deu), List(Hersteller-Logo, deu), logo, List(panel_logo.gif, deu), image/gif), List(List(Bedienungsanleitung, deu), List(Bedienungsanleitung, deu), data_sheet, List(bedienungsanleitung_11010.pdf, deu), application/pdf), List(null, null, normal, List(11070_normal.jpg, deu), image/jpeg)))","List(DEMO, List(DEMO 1 (deu, de) FOR PRODUCT MANAGEMENT DEMONSTRATION PURPOSES ✓, deu), List(List(DEMO 1 (deu, de), deu, de), List(DEMO 1 (eng, en), eng, en)), List(1, gtin), tequip, DEMO_MPID, DEMO_APID)","List(List(null, 1.0E9, null, udf_NMCATEGORIES-1.0), List(List(List(List(tec_detail_1, deu), DEMO UNIT, List(42_1, deu)), List(List(tec_detail_2, deu), DEMO UNIT, List(42_2, deu)), List(List(tec_detail_3, deu), DEMO UNIT, List(42_3, deu))), null, List(List(Technische Daten, deu), List(Technical Data, eng), List(Datos tecnicos, spa)), udf_NMTECHNICALDETAILS-1.0))","List(C62, 1, 1)","List(List(0.01, EUR, List(standard_rate), net_list))",DEMO,"List(true, false)",new
"List(List(List(List(Hersteller-Logo, deu), List(Hersteller-Logo, deu), logo, List(panel_logo.gif, deu), image/gif), List(List(Bedienungsanleitung, deu), List(Bedienungsanleitung, deu), data_sheet, List(bedienungsanleitung_11010.pdf, deu), application/pdf), List(null, null, normal, List(11070_normal.jpg, deu), image/jpeg)))","List(null, List(DEMO 9999911111 (deu, de) FOR PRODUCT MANAGEMENT DEMONSTRATION PURPOSES, deu), List(List(DEMO 9999911111 (deu, de), deu, de), List(DEMO 9999911111 (eng, en), eng, en)), List(2, gtin), tequip, 9999933333, 9999922222)","List(List(null, 1.0E9, null, udf_NMCATEGORIES-1.0), List(List(List(List(tec_detail_1, deu), DEMO UNIT, List(42_1, deu)), List(List(tec_detail_2, deu), DEMO UNIT, List(42_2, deu)), List(List(tec_detail_3, deu), DEMO UNIT, List(42_3, deu))), null, List(List(Technische Daten, deu), List(Technical Data, eng), List(Datos tecnicos, spa)), udf_NMTECHNICALDETAILS-1.0))","List(C62, 1, 1)","List(List(0.01, EUR, List(standard_rate), net_list))",9999911111,"List(true, true)",new


In [None]:
# Catalog dataset: flatten the nested catalog records into a narrow, tabular frame.
# Each entry maps an output column name to the (possibly nested) source field it is
# read from; dict order determines the column order of the result.
catalog_column_sources = {
    "manufacturer": "PRODUCT_DETAILS.MANUFACTURER_NAME",
    "manufacturer_pid": "PRODUCT_DETAILS.MANUFACTURER_PID",
    "description": "PRODUCT_DETAILS.DESCRIPTION_LONG._VALUE",
    "price": "PRODUCT_PRICE_DETAILS.PRODUCT_PRICE.PRICE_AMOUNT",
    "currency": "PRODUCT_PRICE_DETAILS.PRODUCT_PRICE.PRICE_CURRENCY",
    "supplier_pid": "SUPPLIER_PID",
}

catalog_transformed = catalog_df.select(
    *[
        col(source_field).alias(output_name)
        for output_name, source_field in catalog_column_sources.items()
    ]
)


## 4. Write Transformed Data to Processed Folder
Save the transformed catalog dataset to the processed folder in Parquet format.

In [None]:
# Write the transformed catalog as Parquet; "overwrite" replaces any existing output
# at the target path.
catalog_output_path = paths["catalog_processed"]
catalog_writer = catalog_transformed.write.format("parquet").mode("overwrite")
catalog_writer.save(catalog_output_path)
