In [0]:
from pyspark.sql.functions import col
 
# Raw load of the sales CSV. ProductMetadata holds JSON text, so quoted fields
# may span several lines (multiLine) and a doubled quote character inside a
# quoted field is treated as an escaped quote (escape == quote).
sales_dfv1 = spark.read.format("csv") \
    .option("header", "true") \
    .option("multiLine", "true") \
    .option("quote", '"') \
    .option("escape", '"') \
    .load("/Volumes/workspace/default/data_volume/new_sales.csv")

# Render the frame in the notebook's table viewer.
display(sales_dfv1)

In [0]:
# Load the sales CSV with column types inferred from the data.
#
# ProductMetadata contains JSON text, which is parsed with from_json further
# down. The reader therefore needs the quote/escape/multiLine settings below:
# Spark's CSV reader escapes with a backslash by default, so doubled quotes
# inside a quoted field would otherwise not be read back as literal quotes.
# NOTE(review): this path differs from the /Volumes path used by the raw read
# at the top of the notebook - confirm both point at the same file.
sales_df = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .option("multiLine", True)
    .option("quote", "\"")
    .option("escape", "\"")
    .load("dbfs:/FileStore/data/new_sales.csv")
)

print("Initial data sample:")
sales_df.show(5)

In [0]:
from pyspark.sql.functions import col, when

# Remove exact duplicate rows.
sales_df_clean = sales_df.dropDuplicates()

# Drop rows with missing Quantity or CustomerName (critical fields).
sales_df_clean = sales_df_clean.dropna(subset=["Quantity", "CustomerName"])

# Normalise the numeric columns. A cast on its own is enough here: wrapping it
# in when(cast.isNotNull(), cast).otherwise(None) evaluates to exactly the same
# value as the cast, so the wrapper is dropped and the three conversions share
# one loop.
# NOTE(review): with spark.sql.ansi.enabled=true an unparseable value makes the
# cast raise instead of yielding NULL - confirm the cluster setting.
# NOTE(review): "float" is 32-bit; double or decimal may suit monetary columns
# better. Left unchanged so the saved table schema stays the same.
numeric_casts = {
    "Quantity": "int",
    "UnitPrice": "float",
    "TaxAmount": "float",
}
for column_name, target_type in numeric_casts.items():
    sales_df_clean = sales_df_clean.withColumn(
        column_name, col(column_name).cast(target_type)
    )

# Drop rows where conversions failed (null in Quantity or UnitPrice).
# TaxAmount is deliberately not in this list, so a null tax value is kept.
sales_df_clean = sales_df_clean.dropna(subset=["Quantity", "UnitPrice"])

print("Cleaned data sample:")
sales_df_clean.show(5)

In [0]:
# Save the cleaned frame as the Delta table `sales_delta_cleaned`;
# "overwrite" mode replaces whatever the table held before.
(
    sales_df_clean.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("sales_delta_cleaned")
)

print("Cleaned Delta table created: sales_delta_cleaned")

In [0]:
from pyspark.sql.functions import col

# Read the cleaned Delta table back and derive the per-row total
# (Quantity multiplied by UnitPrice) in a single expression.
df = spark.table("sales_delta_cleaned").withColumn(
    "TotalPrice",
    col("Quantity") * col("UnitPrice"),
)

# Keep only the rows whose Quantity is greater than 5.
filtered_df = df.where(col("Quantity") > 5)

# Preview the first ten matching rows, limited to the columns of interest.
filtered_df.select(
    "SalesOrderNumber",
    "CustomerName",
    "Item",
    "Quantity",
    "UnitPrice",
    "TotalPrice",
).show(10)

In [0]:
%sql
-- Top 10 customers ranked by total sales value, with the units each one bought
SELECT
    CustomerName,
    SUM(Quantity)             AS TotalQuantity,
    SUM(Quantity * UnitPrice) AS TotalSales
FROM sales_delta_cleaned
GROUP BY CustomerName
ORDER BY TotalSales DESC
LIMIT 10;

In [0]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType

# Layout of the JSON stored in ProductMetadata: two nullable string fields.
schema = (
    StructType()
    .add("color", StringType(), True)
    .add("warranty", StringType(), True)
)

# Parse the JSON text into a struct column so its fields can be selected by name.
df_with_json = df.withColumn(
    "ProductDetails",
    from_json(col("ProductMetadata"), schema),
)

# Show the raw JSON next to the extracted fields for a quick visual check.
df_with_json.select(
    "SalesOrderNumber",
    "ProductMetadata",
    "ProductDetails.color",
    "ProductDetails.warranty",
).show(5)
