In [None]:
# Import necessary libraries
import os

from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, when, lit, split, concat, regexp_extract



In [None]:
# Create the Spark session used by the rest of the notebook, or reuse an
# already-running one (getOrCreate).
spark = (
    SparkSession.builder
    .appName("InspectRawData")
    .getOrCreate()
)


In [None]:
# Azure Data Lake settings (replace placeholders with actual values)
storage_account_name = "datalakestoragetask"  # Replace with your storage account name
raw_container = "raw"
processed_container = "processed"

# Read the account key from the environment rather than pasting it into the
# notebook, so the secret is never stored in a saved or shared copy.
# Falls back to an empty string (the original placeholder) when unset.
storage_key = os.environ.get("AZURE_STORAGE_KEY", "")

# Configure Spark to access Azure Data Lake with the account key.
# Skipped when no key was supplied so that an empty string is never
# registered as the credential for this storage account.
if storage_key:
    spark.conf.set(f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net", storage_key)

# Locations of each dataset: raw input and processed output
paths = {
    "customer_raw": f"abfss://{raw_container}@{storage_account_name}.dfs.core.windows.net/customer",
    "customer_processed": f"abfss://{processed_container}@{storage_account_name}.dfs.core.windows.net/customer/",
}


In [None]:
# Function to inspect a dataset
def inspect_dataset(name, path, format_type, options=None):
    """Load a dataset with Spark and print its schema plus a 30-row sample.

    Reads through the notebook-level ``spark`` session.

    Args:
        name: Label used in the printed banner and in the error message.
        path: Location handed to the reader's ``load``.
        format_type: Data source format name passed to ``format`` (e.g. "csv").
        options: Optional mapping of reader options, expanded as keyword
            arguments to ``options``. ``None`` means no extra options.

    Returns:
        The loaded DataFrame, or ``None`` if reading or displaying raised.
        The error is printed rather than re-raised, so callers must check
        for ``None`` before using the result.
    """
    # A None sentinel avoids sharing one mutable dict across calls.
    reader_options = {} if options is None else options

    print(f"\n=== Inspecting {name} Dataset ===")
    try:
        # Load dataset based on format
        df = spark.read.format(format_type).options(**reader_options).load(path)

        # Show schema and a sample of the data
        df.printSchema()
        df.limit(30).show(truncate=False)

        # Return DataFrame for further analysis if needed
        return df
    except Exception as e:
        # Deliberate best-effort: report the failure and keep the notebook running.
        print(f"Error reading {name} data: {e}")
        return None

# Inspect datasets one by one
print("\n--- Starting Inspection ---\n")




--- Starting Inspection ---



In [None]:
# Customer data: semicolon-delimited CSV with a header row.
# inspect_dataset prints the schema and a sample, then returns the DataFrame
# (or None if the read failed).
customer_df = inspect_dataset(
    "Customer",
    paths["customer_raw"],
    format_type="csv",
    options=dict(header=True, delimiter=";"),
)



=== Inspecting Customer Dataset ===
root
 |-- customerID: string (nullable = true)
 |-- User: string (nullable = true)
 |-- SupplierID: string (nullable = true)
 |-- Customername1: string (nullable = true)
 |-- Customername2: string (nullable = true)
 |-- Street: string (nullable = true)
 |-- Postcode: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Email1: string (nullable = true)
 |-- Email2: string (nullable = true)
 |-- AdditionalInformation: string (nullable = true)

+----------+----+----------+-------------+-------------+------------------+--------+----------+-------+------+------------------------+---------------------+
|customerID|User|SupplierID|Customername1|Customername2|Street            |Postcode|City      |Country|Email1|Email2                  |AdditionalInformation|
+----------+----+----------+-------------+-------------+------------------+--------+----------+-------+------+------------------------+--------------

In [None]:
# Print the schema again, then render the first 10 rows as a pandas table
print("=== Inspecting customer Dataset ===")
customer_df.printSchema()
customer_preview = customer_df.limit(10).toPandas()
display(customer_preview)

=== Inspecting customer Dataset ===
root
 |-- customerID: string (nullable = true)
 |-- User: string (nullable = true)
 |-- SupplierID: string (nullable = true)
 |-- Customername1: string (nullable = true)
 |-- Customername2: string (nullable = true)
 |-- Street: string (nullable = true)
 |-- Postcode: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Email1: string (nullable = true)
 |-- Email2: string (nullable = true)
 |-- AdditionalInformation: string (nullable = true)



customerID,User,SupplierID,Customername1,Customername2,Street,Postcode,City,Country,Email1,Email2,AdditionalInformation
,,1,Tequip,,Gropiusplatz 10,70563,Stuttgart,DE,,test.user2@nexmart.com,
,,2,Tequip,,Hauptstraße 45,40210,Düsseldorf,DE,,maria.muster@firma.com,
,,3,Tequip,,Karlstraße 12,80331,München,DE,,franz.meier@firma.com,
,,4,Tequip,,Friedrichstraße 22,10969,Berlin,DE,,lisa.schmidt@firma.com,
,,5,Tequip,,Rathausplatz 3,10178,Berlin,DE,,paul.bauer@firma.com,
,,6,Tequip,,Schlossallee 5,1067,Dresden,DE,,anna.klein@firma.com,
,,7,Tequip,,Marktplatz 10,28195,Bremen,DE,,julia.weber@firma.com,
,,8,Tequip,,Bahnhofstraße 8,70173,Stuttgart,DE,,karl.schulz@firma.com,
,,9,Tequip,,Kirchweg 16,60311,Frankfurt,DE,,martin.huber@firma.com,
,,10,Tequip,,Am See 2,78462,Konstanz,DE,,sophie.fischer@firma.com,


In [None]:
# Customer dataset: clean the data and handle missing values.

# Each address component, with nulls replaced by the literal "Unknown"
address_parts = [
    when(col(field).isNotNull(), col(field)).otherwise(lit("Unknown"))
    for field in ("Street", "City", "Country")
]

# "Street, City, Country" joined with a comma and a space
full_address = concat(
    address_parts[0], lit(", "),
    address_parts[1], lit(", "),
    address_parts[2],
)

# NOTE(review): the 10-row preview shown earlier in this notebook has an empty
# customerID on every row, so the final filter may drop those rows — confirm
# that customerID is really populated in the raw data.
customer_transformed = (
    customer_df
    .withColumn("FullAddress", full_address)
    # Missing e-mail addresses become "Unknown"
    .fillna({"Email1": "Unknown", "Email2": "Unknown"})
    # Keep only rows whose critical field `customerID` is not NULL
    .filter(col("customerID").isNotNull())
)



## 4. Write Transformed Data to Processed Folder
Save the transformed customer dataset to the processed container in Parquet format.

In [None]:
# Persist the cleaned customer data as Parquet, replacing any previous output
(
    customer_transformed.write
    .mode("overwrite")
    .format("parquet")
    .save(paths["customer_processed"])
)
