In [0]:
from pyspark.sql.functions import col, lit, current_timestamp, sum as _sum
from delta.tables import DeltaTable
from pydeequ.checks import Check, CheckLevel
from pydeequ.verification import VerificationResult, VerificationSuite
import os

# --- Run configuration -------------------------------------------------------
# Echo the Spark version taken from the environment (raises KeyError if unset).
print(os.environ['SPARK_VERSION'])

# Date suffix that selects which daily CSV extracts are loaded.
date_str = "2024-07-25"

# Input files for this run, one per source feed.
booking_data = f"/Volumes/incremental_load/project1/orders_data/booking_data/bookings_{date_str}.csv"
customer_data = f"/Volumes/incremental_load/project1/orders_data/customer_data/customers_{date_str}.csv"

# Show the resolved paths, one per line.
print(booking_data, customer_data, sep="\n")


# Reader settings shared by both CSV feeds: first row is a header, column types
# are inferred, double quotes delimit fields, and quoted values may span lines.
csv_read_options = {
    "header": "true",
    "inferSchema": "true",
    "quote": "\"",
    "multiline": "true",
}

# Bookings feed for the configured date.
booking_df = spark.read.format("csv").options(**csv_read_options).load(booking_data)
booking_df.printSchema()
display(booking_df)

# Customers feed for the configured date.
customer_df = spark.read.format("csv").options(**csv_read_options).load(customer_data)
customer_df.printSchema()
display(customer_df)

# Data-quality rules for the bookings feed (Error level):
#   * the file must contain at least one row
#   * booking_id and amount must have no nulls
#   * amount, quantity and discount must not be negative
# The hint on booking_id now describes the completeness rule that is actually
# enforced; the previous text reported a uniqueness failure and was misspelled.
check_incremental = (
    Check(spark, CheckLevel.Error, "Bookings Data check")
    .hasSize(lambda x: x > 0)
    .isComplete("booking_id", hint="Booking ID contains null values")
    .isComplete("amount")
    .isNonNegative("amount")
    .isNonNegative("quantity")
    .isNonNegative("discount")
)

# Data-quality rules for the customers feed (Error level):
#   * the file must contain at least one row
#   * customer_id must be unique
#   * customer_name, customer_address and email must have no nulls
check_scd = (
    Check(spark, CheckLevel.Error, "Customer Data check")
    .hasSize(lambda x: x > 0)
    .isUnique("customer_id")
    .isComplete("customer_name")
    .isComplete("customer_address")
    .isComplete("email")
)
# Evaluate each rule set against its own DataFrame.
booking_dq_check = (
    VerificationSuite(spark)
    .onData(booking_df)
    .addCheck(check_incremental)
    .run()
)

customer_dq_check = (
    VerificationSuite(spark)
    .onData(customer_df)
    .addCheck(check_scd)
    .run()
)

# Render the per-constraint outcomes so failures can be inspected in the notebook.
booking_dq_check_df = VerificationResult.checkResultsAsDataFrame(spark, booking_dq_check)
display(booking_dq_check_df)

customer_dq_check_df = VerificationResult.checkResultsAsDataFrame(spark, customer_dq_check)
display(customer_dq_check_df)

# Stop the run unless both suites report an overall "Success" status;
# bookings are evaluated first, then customers.
if booking_dq_check.status != "Success":
    raise ValueError("Data quality check failed for booking data")

if customer_dq_check.status != "Success":
    raise ValueError("Data quality check failed for customer data")

# Stamp every booking row with the time it was ingested.
booking_df_incremental = booking_df.withColumn("ingestion_time", current_timestamp())

# Keep only bookings that have a matching customer record.
df_joined = booking_df_incremental.join(customer_df, on=["customer_id"], how="inner")

# Net cost per booking (amount minus discount); rows with no positive quantity are dropped.
df_transformed = (
    df_joined
    .withColumn("total_cost", col("amount") - col("discount"))
    .filter(col("quantity") > 0)
)

# Totals per (booking_type, customer_id) for this batch.
df_transformed_agg = (
    df_transformed
    .groupBy("booking_type", "customer_id")
    .agg(
        _sum("total_cost").alias("total_amount_sum"),
        _sum("quantity").alias("total_quantity_sum"),
    )
)
# Fact table that accumulates booking totals across runs.
fact_table_path = "incremental_load.default.booking_fact"
fact_table_exists = spark.catalog.tableExists(fact_table_path)

if fact_table_exists:
    # Fold this batch into the totals already stored in the table.
    df_existing_fact = spark.read.format("delta").table(fact_table_path)

    df_combined = df_existing_fact.unionByName(df_transformed_agg, allowMissingColumns=True)

    # Re-aggregate on the same keys as the batch aggregate (booking_type, customer_id).
    # Grouping by booking_type alone would drop customer_id and change the table's
    # grain between the first load and every later one.
    df_final_agg = (
        df_combined
        .groupBy("booking_type", "customer_id")
        .agg(
            _sum("total_amount_sum").alias("total_amount_sum"),
            _sum("total_quantity_sum").alias("total_quantity_sum"),
        )
    )
else:
    # First load: the batch aggregate is the whole table.
    df_final_agg = df_transformed_agg
display(df_final_agg)

# Replace the table contents (and schema, if it changed) with the merged totals.
# Fixed the misspelled `.modet(...)` / `.saveASTable(...)` calls, which raised
# AttributeError before anything was written.
(
    df_final_agg.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(fact_table_path)
)


# Customer dimension table, maintained as history rows with a valid_to column.
scd_table_path = "incremental_load.default.customer_dim"
scd_table_exists = spark.catalog.tableExists(scd_table_path)

if scd_table_exists:
    scd_table = DeltaTable.forName(spark, scd_table_path)
    display(scd_table.toDF())

    # Step 1: for every incoming customer, update the stored row whose valid_to is
    # '9999-12-31' (presumably the "still current" sentinel) so that it ends where
    # the incoming version starts.
    # NOTE(review): the original set valid_to from `updates.from`; this assumes the
    # customer feed carries a `valid_from` column to pair with `valid_to` - confirm
    # against the CSV schema.
    (
        scd_table.alias("scd")
        .merge(
            customer_df.alias("updates"),
            "scd.customer_id = updates.customer_id and scd.valid_to = '9999-12-31'",
        )
        .whenMatchedUpdate(
            set={
                "valid_to": "updates.valid_from"
            }
        )
        .execute()
    )

    # Step 2: append the incoming rows as the new versions.
    customer_df.write.format("delta").mode("append").saveAsTable(scd_table_path)
else:
    # First load: create the dimension table from the incoming rows.
    customer_df.write.format("delta").mode("overwrite").saveAsTable(scd_table_path)
            

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode

# Obtain (or reuse) a Spark session for the array-explode demo.
spark = SparkSession.builder.appName("ExplodeExample").getOrCreate()

# In-memory sample records; each person has a list of past jobs
# (the same shape could come from a JSON file).
data = [
    {
        "name": "Alice",
        "age": 30,
        "experience": [
            {"company": "Company A", "years": 2},
        ],
    },
    {
        "name": "Bob",
        "age": 35,
        "experience": [
            {"company": "Company B", "years": 3},
            {"company": "Company C", "years": 4},
        ],
    },
]

# DDL schema: `experience` is an array of (company, years) structs.
schema = "name STRING, age INT, experience ARRAY<STRUCT<company: STRING, years: INT>>"

# Build the DataFrame and print it without truncating the nested column.
df = spark.createDataFrame(data, schema)
df.show(truncate=False)


In [0]:
# One output row per element of the `experience` array, exposed as struct column `exp`.
experience_entry = explode("experience").alias("exp")
df_exploded = df.select("name", experience_entry)

# Promote the struct's fields to top-level columns next to `name`.
flat_columns = ["name", "exp.company", "exp.years"]
df_flat = df_exploded.select(*flat_columns)

# Print the flattened result without truncation.
df_flat.show(truncate=False)


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode

# Obtain (or reuse) a Spark session for the map-explode demo.
spark = SparkSession.builder.appName("ExplodeMapExample").getOrCreate()

# In-memory sample records; `skills` maps a skill name to an integer rating.
data = [
    {
        "name": "Alice",
        "age": 30,
        "skills": {"Python": 5, "Spark": 4, "SQL": 3},
    },
    {
        "name": "Bob",
        "age": 35,
        "skills": {"Java": 5, "Scala": 4},
    },
]

# DDL schema: `skills` is a string-to-int map.
schema = "name STRING, age INT, skills MAP<STRING, INT>"

# Build the DataFrame and print it without truncating the map column.
df = spark.createDataFrame(data, schema)
df.show(truncate=False)


In [0]:
# Exploding a map yields two columns per entry (key, value); name them explicitly.
skill_entry = explode("skills").alias("skill_name", "skill_rating")

# One output row per (person, skill) pair.
df_exploded = df.select("name", "age", skill_entry)

# Print the result without truncation.
df_exploded.show(truncate=False)
