In [0]:
# Location of the raw sales CSV file.
csv_path = "dbfs:/FileStore/data/new_sales.csv"

# Batch-read the CSV: the first line is a header and column types are inferred.
sales_df = (
    spark.read.format("csv")
    .options(header="true", inferSchema="true")
    .load(csv_path)
)

# Show the first five rows without truncating long cell values.
print("Raw data sample:")
sales_df.show(5, truncate=False)

In [0]:
from pyspark.sql.functions import col

# Clean the raw sales rows:
#   1. drop exact duplicate rows,
#   2. cast Quantity / UnitPrice to numeric types,
#   3. drop rows that lack a Quantity or CustomerName.
#
# The casts run *before* dropna so that a Quantity that cannot be parsed (and
# is turned into NULL by a non-ANSI cast) is dropped as well instead of
# slipping through as a NULL.
#
# UnitPrice is cast to double rather than float: single precision adds visible
# rounding error to prices, which compounds in Quantity * UnitPrice sums, and
# double matches the DoubleType declared for this column in the explicit
# schemas used elsewhere in this notebook.
# NOTE(review): a 'sales_cleaned' table written earlier with a FLOAT UnitPrice
# may need its schema overwritten before the next save - confirm.
clean_df = (
    sales_df.dropDuplicates()
    .withColumn("Quantity", col("Quantity").cast("int"))
    .withColumn("UnitPrice", col("UnitPrice").cast("double"))
    .dropna(subset=["Quantity", "CustomerName"])
)

print("Cleaned data preview:")
clean_df.show(5, truncate=False)

In [0]:
from pyspark.sql.functions import from_json, regexp_replace
from pyspark.sql.types import StructType, StructField, StringType

# Normalise the quoting of ProductMetadata in two passes:
#   - strip any run of double quotes at the very start or end of the string,
#   - collapse each doubled quote ("") into a single quote.
metadata_without_outer_quotes = regexp_replace(col("ProductMetadata"), '^"+|"+$', '')
metadata_json_text = regexp_replace(metadata_without_outer_quotes, '""', '"')
clean_df = clean_df.withColumn("ProductMetadata_clean", metadata_json_text)

# Schema of the JSON payload: two nullable string attributes.
json_schema = (
    StructType()
    .add("color", StringType(), True)
    .add("warranty", StringType(), True)
)

# Parse the cleaned text into a struct, then lift its fields to top-level columns.
parsed_details = from_json(col("ProductMetadata_clean"), json_schema)
clean_df = clean_df.withColumn("ProductDetails", parsed_details)
clean_df = clean_df.withColumn("color", col("ProductDetails.color"))
clean_df = clean_df.withColumn("warranty", col("ProductDetails.warranty"))

print("Data with parsed JSON fields:")
clean_df.select("SalesOrderNumber", "color", "warranty").show(5)


In [0]:
# Save the cleaned DataFrame as the Delta table 'sales_cleaned', using overwrite mode.
clean_df.write.saveAsTable("sales_cleaned", format="delta", mode="overwrite")
print("Delta table 'sales_cleaned' created.")

In [0]:
from pyspark.sql.functions import sum as _sum

# Value of one sales row: quantity multiplied by unit price.
line_total = col("Quantity") * col("UnitPrice")

# Sum the row values per customer.
agg_df = clean_df.groupBy("CustomerName").agg(_sum(line_total).alias("TotalSales"))

print("Aggregated sales per customer:")
agg_df.show(10)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DateType, IntegerType, DoubleType

# Explicit schema applied to the streamed sales CSV files.
schema = StructType([
    StructField("SalesOrderNumber", StringType(), True),
    StructField("OrderDate", DateType(), True),
    StructField("CustomerName", StringType(), True),
    StructField("EmailAddress", StringType(), True),
    StructField("Item", StringType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("UnitPrice", DoubleType(), True),
    StructField("TaxAmount", DoubleType(), True),
    StructField("ProductMetadata", StringType(), True)
])

# Folder watched by Auto Loader, and a separate folder for the stream's own
# metadata. The checkpoint must not live inside the source folder: its
# bookkeeping files would otherwise sit among the data files that this very
# stream lists for ingestion.
autoloader_source_path = "/FileStore/data/sales/"
autoloader_checkpoint_path = "/FileStore/checkpoints/sales_raw_autoloader/"

sales_df = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    # The other readers in this notebook treat these CSVs as having a header
    # row; without this option the header line would be parsed as a data row.
    .option("header", "true")
    .option("cloudFiles.schemaLocation", autoloader_checkpoint_path)
    .schema(schema)
    .load(autoloader_source_path)
)

# Keep a single row per order number in the stream.
deduped_df = sales_df.dropDuplicates(["SalesOrderNumber"])

from delta.tables import DeltaTable


def _insert_new_orders(batch_df, epoch_id):
    """Insert the rows of one micro-batch whose order number is not in sales_raw.

    The merge only has a not-matched/insert clause, so rows already present in
    the target are left untouched.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        epoch_id: Batch identifier passed in by foreachBatch; not used here.

    NOTE(review): DeltaTable.forName presumably needs 'sales_raw' to exist
    already - confirm the table is created before this stream is started.
    """
    (
        DeltaTable.forName(spark, "sales_raw")
        .alias("target")
        .merge(
            batch_df.alias("source"),
            "target.SalesOrderNumber = source.SalesOrderNumber"
        )
        .whenNotMatchedInsertAll()
        .execute()
    )


(
    deduped_df.writeStream
    .foreachBatch(_insert_new_orders)
    .outputMode("update")
    .option("checkpointLocation", autoloader_checkpoint_path)
    .start()
)


display(sales_df.limit(5))

In [0]:
# Glob pattern matching every "new_sales*.csv" file in the data folder.
csv_path = "dbfs:/FileStore/data/new_sales*.csv"

# Batch-read all matching files (header row present, column types inferred).
sales_df = (
    spark.read.format("csv")
    .options(header="true", inferSchema="true")
    .load(csv_path)
)

from pyspark.sql.functions import col

# Remove rows that are exact duplicates of each other.
# NOTE(review): clean_df is not referenced again in this cell - confirm whether
# it was meant to be the data merged into 'sales_raw' below.
clean_df = sales_df.dropDuplicates()


from delta.tables import DeltaTable


def _merge_batch_into_sales_raw(batch_df, epoch_id):
    """Merge one micro-batch into 'sales_raw', inserting only unmatched orders.

    Rows are matched on SalesOrderNumber; matched rows are left as they are.
    `epoch_id` is supplied by foreachBatch and is not used.
    """
    target_table = DeltaTable.forName(spark, "sales_raw").alias("target")
    insert_only_merge = target_table.merge(
        batch_df.alias("source"),
        "target.SalesOrderNumber = source.SalesOrderNumber",
    ).whenNotMatchedInsertAll()
    return insert_only_merge.execute()


# NOTE(review): deduped_df is not defined in this cell; it has to come from an
# earlier cell of the notebook - confirm this is the intended input.
(
    deduped_df.writeStream
    .foreachBatch(_merge_batch_into_sales_raw)
    .outputMode("update")
    .option("checkpointLocation", "/mnt/checkpoints/sales_raw/")
    .start()
)





In [0]:
from pyspark.sql.functions import input_file_name
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType

# Define schema for sales data
schema = StructType([
    StructField("SalesOrderNumber", StringType(), True),
    StructField("OrderDate", DateType(), True),
    StructField("CustomerName", StringType(), True),
    StructField("EmailAddress", StringType(), True),
    StructField("Item", StringType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("UnitPrice", DoubleType(), True),
    StructField("TaxAmount", DoubleType(), True),
    StructField("ProductMetadata", StringType(), True)
])

# Incremental batch ingestion: CSV files are expected to land in `landing_path`
# (uploaded externally or via dbutils.fs.cp). Each run loads only the files
# whose path is not yet recorded in the `source_file` column of 'sales_raw'.

# List all CSV files in the landing folder
landing_path = "/FileStore/data/sales/"
all_files = [f.path for f in dbutils.fs.ls(landing_path) if f.path.endswith(".csv")]

# Check once whether the target table exists; the answer drives both the
# "already ingested" lookup and the write mode below.
sales_raw_exists = spark._jsparkSession.catalog().tableExists("sales_raw")

# Collect the paths of files that were already ingested. A set gives cheap
# membership tests and avoids the RDD API.
if sales_raw_exists:
    ingested_files = {
        row["source_file"]
        for row in (
            spark.read.format("delta").table("sales_raw")
            .select("source_file")
            .distinct()
            .collect()
        )
    }
else:
    ingested_files = set()

# Detect new files
new_files = [f for f in all_files if f not in ingested_files]

if new_files:
    # Read new files with the explicit schema defined above and record the
    # originating file of every row.
    new_df = (
        spark.read
        .option("header", True)
        .schema(schema)
        .csv(new_files)
        .withColumn("source_file", input_file_name())
    )

    if sales_raw_exists:
        # Append only rows whose (SalesOrderNumber, source_file) pair is not
        # already present in the table.
        dedup_keys = ["SalesOrderNumber", "source_file"]
        sales_raw_df = spark.read.format("delta").table("sales_raw")
        new_df = new_df.join(
            sales_raw_df.select(*dedup_keys),
            on=dedup_keys,
            how="left_anti"
        )
        new_df.write.format("delta").mode("append").saveAsTable("sales_raw")
    else:
        # First load: create the table from the new files.
        new_df.write.format("delta").mode("overwrite").saveAsTable("sales_raw")

# The table exists if it was there before this run or was just created above;
# otherwise there is nothing to display and reading it would fail.
if sales_raw_exists or new_files:
    display(spark.read.table("sales_raw"))
else:
    print("No CSV files found to ingest and table 'sales_raw' does not exist yet.")

In [0]:
from pyspark.sql.functions import col, avg, when, to_date, regexp_replace
from pyspark.sql.window import Window

# Read raw sales data
raw_df = spark.read.table("sales_raw")

# 1. Correct inconsistent date formats in OrderDate
# Parse as yyyy-MM-dd first; fall back to MM/dd/yyyy when that yields NULL.
iso_order_date = to_date(col("OrderDate"), "yyyy-MM-dd")
us_order_date = to_date(col("OrderDate"), "MM/dd/yyyy")
cleaned_dates_df = raw_df.withColumn(
    "OrderDate",
    when(iso_order_date.isNotNull(), iso_order_date).otherwise(us_order_date)
)

# 2. Impute missing UnitPrice with the average price of the same product.
# The product column in the sales schema is named "Item"; there is no
# "Product" column, so the window has to partition by "Item".
window_spec = Window.partitionBy("Item")
imputed_df = cleaned_dates_df.withColumn(
    "UnitPrice",
    when(
        col("UnitPrice").isNull(),
        avg(col("UnitPrice")).over(window_spec)
    ).otherwise(col("UnitPrice"))
)

# 3. Flag suspicious records (Quantity <= 0) and then remove them.
# Because the filter keeps only rows with Quantity > 0, `is_suspicious` is
# False on every surviving row; the column is kept so the output schema of
# this cell stays the same.
final_clean_df = imputed_df.withColumn(
    "is_suspicious",
    when(col("Quantity") <= 0, True).otherwise(False)
).filter(col("Quantity") > 0)

display(final_clean_df)

In [0]:
from pyspark.sql.functions import sum as _sum, rank, avg as _avg, col, lag
from pyspark.sql.window import Window

# Assume 'final_clean_df' is available from previous cell

# Value of a single sales row; shared by all three metrics below.
line_sales = col("Quantity") * col("UnitPrice")

# 1. Running total of sales per customer sorted by OrderDate
running_total_window = (
    Window.partitionBy("CustomerName")
    .orderBy("OrderDate")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
df_running_total = final_clean_df.withColumn(
    "RunningTotalSales",
    _sum(line_sales).over(running_total_window)
)

# 2. Rank of customers by total sales, per region when a region is available.
# The sales schema used in this notebook has no 'Region' column, so it is only
# used for grouping/partitioning when it is actually present; otherwise the
# customers are ranked across the whole data set instead of failing.
has_region = "Region" in final_clean_df.columns
customer_keys = ["Region", "CustomerName"] if has_region else ["CustomerName"]
total_sales_df = final_clean_df.groupBy(*customer_keys).agg(
    _sum(line_sales).alias("TotalSales")
)
sales_descending = col("TotalSales").desc()
if has_region:
    rank_window = Window.partitionBy("Region").orderBy(sales_descending)
else:
    rank_window = Window.orderBy(sales_descending)
df_ranked = total_sales_df.withColumn(
    "CustomerRank",
    rank().over(rank_window)
)

# 3. Identify customers with a sudden spike in purchases compared to their average
# Average row value over all rows of the same customer
avg_sales_window = Window.partitionBy("CustomerName")
# Value of the customer's preceding row when ordered by OrderDate
order_window = Window.partitionBy("CustomerName").orderBy("OrderDate")
# A row is a spike when its value is more than twice the customer's average
# and it is not the customer's first row (PrevSales is NULL there).
df_spike = final_clean_df.withColumn(
    "DailySales", line_sales
).withColumn(
    "AvgSales", _avg("DailySales").over(avg_sales_window)
).withColumn(
    "PrevSales", lag("DailySales").over(order_window)
).withColumn(
    "Spike",
    (col("DailySales") > 2 * col("AvgSales")) & (col("PrevSales").isNotNull())
)

display(df_running_total)
display(df_ranked)
display(df_spike.filter(col("Spike")))