In [1]:
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.window import Window

# Spark entry point; getOrCreate() hands back the active session if one exists.
spark = SparkSession.builder.appName("ProductsAndBudget").getOrCreate()

# ---- Products: (product_id, cost) ------------------------------------------
products_schema = (
    StructType()
    .add("product_id", StringType(), True)
    .add("cost", IntegerType(), True)
)
products_data = [
    ('P1', 200),
    ('P2', 300),
    ('P3', 500),
    ('P4', 800),
]
products_df = spark.createDataFrame(products_data, schema=products_schema)
# Expose the frame to Spark SQL under the name used by the SQL cell.
products_df.createOrReplaceTempView("Products")

# ---- Customer budgets: (customer_id, budget) -------------------------------
customer_budget_schema = (
    StructType()
    .add("customer_id", IntegerType(), True)
    .add("budget", IntegerType(), True)
)
customer_budget_data = [
    (100, 400),
    (200, 800),
    (300, 1500),
]
customer_budget_df = spark.createDataFrame(
    customer_budget_data,
    schema=customer_budget_schema,
)
customer_budget_df.createOrReplaceTempView("Customer_budget")

# ---- Display both inputs ---------------------------------------------------
print("Products Table:")
products_df.show()

print("Customer Budget Table:")
customer_budget_df.show()


Products Table:
+----------+----+
|product_id|cost|
+----------+----+
|        P1| 200|
|        P2| 300|
|        P3| 500|
|        P4| 800|
+----------+----+

Customer Budget Table:
+-----------+------+
|customer_id|budget|
+-----------+------+
|        100|   400|
|        200|   800|
|        300|  1500|
+-----------+------+



In [2]:
# For each customer, list the products they can afford when buying the
# cheapest products first: a product qualifies while the running total of
# costs (cheapest -> most expensive) is still within the customer's budget.
#
# * The window is explicitly ordered by (cost, product_id). Without an
#   ORDER BY the running total follows whatever physical row order Spark
#   happens to produce, so the result is not deterministic.
# * array_agg gives no ordering guarantee after the join/group-by shuffle, so
#   the products are collected together with their running total, sorted on
#   it, and then reduced back to the product ids.
# * The window has no PARTITION BY (the total is global), so Spark evaluates
#   it on a single partition -- acceptable for a small products table.
# * Inner join: a customer whose budget is below the first running total
#   matches no product and does not appear in the result.
spark.sql(
"""
    with running_cost as
    (
    select *,
    sum(cost) over (
        order by cost, product_id
        rows between unbounded preceding and current row
    ) as running
    from products
    )

    select customer_id, min(budget) as budget,
    count(*) as no_of_products,
    transform(
        sort_array(array_agg(struct(p.running, p.product_id))),
        x -> x.product_id
    ) as list_of_products
    from customer_budget c join running_cost p
    on p.running <= c.budget
    group by customer_id
    order by customer_id
""").show()

+-----------+------+--------------+----------------+
|customer_id|budget|no_of_products|list_of_products|
+-----------+------+--------------+----------------+
|        100|   400|             1|            [P1]|
|        200|   800|             2|        [P1, P2]|
|        300|  1500|             3|    [P1, P2, P3]|
+-----------+------+--------------+----------------+



In [6]:
from pyspark.sql.functions import *

In [8]:
# DataFrame-API version of the "products within budget" query.
#
# The running-total window is explicitly ordered by (cost, product_id):
# a rows-based frame without an ordering accumulates in whatever physical
# order the rows arrive, which makes the running total non-deterministic.
# There is no partitionBy because the total is global, so Spark evaluates the
# window on a single partition -- acceptable for a small products table.
window_spec = (
    Window.orderBy("cost", "product_id")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Cumulative cost of buying products from cheapest to most expensive.
df_running_cost = products_df.withColumn(
    "running", F.sum(F.col("cost")).over(window_spec)
)

# Functions are referenced through the `F` namespace so this cell does not
# depend on wildcard-imported names that shadow Python builtins (sum, min).
df_result = (
    customer_budget_df.alias("c")
    # Non-equi inner join: keep every product whose running total still fits
    # the budget; customers that can afford nothing drop out of the result.
    .join(df_running_cost.alias("p"), F.col("p.running") <= F.col("c.budget"), "inner")
    .groupBy("customer_id")
    .agg(
        F.min(F.col("budget")).alias("budget"),
        F.count("*").alias("no_of_products"),
        # collect_list has no ordering guarantee after a shuffle: collect
        # (running, product_id) pairs, sort them by running total, then keep
        # only the product ids.
        F.transform(
            F.sort_array(F.collect_list(F.struct("running", "product_id"))),
            lambda item: item["product_id"],
        ).alias("list_of_products"),
    )
    .orderBy("customer_id")
)


In [9]:
# Print the per-customer result (budget, product count, product list) as a text table.
df_result.show()

+-----------+------+--------------+----------------+
|customer_id|budget|no_of_products|list_of_products|
+-----------+------+--------------+----------------+
|        100|   400|             1|            [P1]|
|        200|   800|             2|        [P1, P2]|
|        300|  1500|             3|    [P1, P2, P3]|
+-----------+------+--------------+----------------+

