You are given a products table where a new row is inserted every time the price of a product changes. Additionally, there is a transaction table containing details such as order_date and product_id for each order.

Write an SQL query to calculate the total sales value for each product, using the price of the product that was in effect on the order date. Display the output in ascending order of product_id.


In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from datetime import datetime
from pyspark.sql.window import *
# Obtain the Spark session used by every cell below (reuses one if it already exists).
spark = (
    SparkSession.builder
    .appName("DynamicPricing")
    .getOrCreate()
)

from pyspark.sql.types import StructType,StructField,IntegerType,StringType,DateType
# Price history: one row per price change of a product. All fields are nullable.
products_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("product_id", IntegerType()),
        ("price_date", DateType()),
        ("price", IntegerType()),
    )
])

from datetime import datetime

# Sample price history as (product_id, price_date, price); the ISO date
# strings are parsed into datetime objects in one place.
products_data = [
    (product_id, datetime.strptime(iso_day, '%Y-%m-%d'), price)
    for product_id, iso_day, price in (
        (100, '2024-01-01', 150),
        (100, '2024-01-21', 170),
        (100, '2024-02-01', 190),
        (101, '2024-01-01', 1000),
        (101, '2024-01-27', 1200),
        (101, '2024-02-05', 1250),
    )
]

# Materialise the sample price history as a DataFrame with the explicit schema.
product_df = spark.createDataFrame(products_data, schema=products_schema)

# Orders: one row per order with its date and the product ordered. All fields are nullable.
order_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("order_id", IntegerType()),
        ("order_date", DateType()),
        ("product_id", IntegerType()),
    )
])

from datetime import datetime

# Sample orders as (order_id, order_date, product_id); the ISO date strings
# are parsed into datetime objects in one place.
orders_data = [
    (order_id, datetime.strptime(iso_day, '%Y-%m-%d'), product_id)
    for order_id, iso_day, product_id in (
        (1, '2024-01-05', 100),
        (2, '2024-01-21', 100),
        (3, '2024-02-20', 100),
        (4, '2024-01-07', 101),
        (5, '2024-02-04', 101),
        (6, '2024-02-05', 101),
    )
]

# Materialise the sample orders as a DataFrame with the explicit schema.
orders_df = spark.createDataFrame(orders_data, schema=order_schema)

In [0]:
# Goal: price every order at the price that was in effect on its order date,
# then total those prices per product.
#
# Approach: turn each price row into a validity interval
# [price_date, lead_price_date], match each order to the interval that
# contains its order_date, and sum the matched prices.

# Start date of the *next* price for the same product (null on the latest
# price row). The window partitions and orders the rows itself, so no
# separate global sort of product_df is needed beforehand.
window_spec = Window.partitionBy('product_id').orderBy('price_date')
product_df = (
    product_df
    .withColumn('lead_price_date', lead('price_date', 1).over(window_spec))
    # A price stops applying the day before the next one starts.
    .withColumn('lead_price_date', date_sub('lead_price_date', 1))
    # The latest price has no successor; keep its interval open-ended by
    # closing it with a far-future sentinel date.
    .fillna({"lead_price_date": '2222-12-30'})
)

# Pair every order with every price row of the same product ...
merged_df = product_df.join(orders_df, on="product_id", how="inner")

# ... then keep only the pairing whose interval covers the order date
# (both bounds inclusive). order_date and price_date are declared DateType
# and lead_price_date is derived from price_date, so no to_date() is needed.
merged_df = merged_df.filter(
    col('order_date').between(col('price_date'), col('lead_price_date'))
)

# Total sales per product, in ascending product_id order.
# NOTE: `sum` here is pyspark.sql.functions.sum (brought in by the wildcard
# import), not the Python builtin.
final_df = (
    merged_df
    .groupBy('product_id')
    .agg(sum('price').alias('total_sales'))
    .orderBy('product_id')
)

# Display the final DataFrame
final_df.show()




+----------+-----------+
|product_id|total_sales|
+----------+-----------+
|       100|        510|
|       101|       3450|
+----------+-----------+



In [0]:
# Inspect the order/price pairings that survived the date-range filter,
# i.e. the rows that feed the per-product totals.
merged_df.show()

+----------+----------+-----+---------------+--------+----------+
|product_id|price_date|price|lead_price_date|order_id|order_date|
+----------+----------+-----+---------------+--------+----------+
|       100|2024-01-01|  150|     2024-01-20|       1|2024-01-05|
|       100|2024-01-21|  170|     2024-01-31|       2|2024-01-21|
|       100|2024-02-01|  190|     2222-12-30|       3|2024-02-20|
|       101|2024-01-01| 1000|     2024-01-26|       4|2024-01-07|
|       101|2024-01-27| 1200|     2024-02-04|       5|2024-02-04|
|       101|2024-02-05| 1250|     2222-12-30|       6|2024-02-05|
+----------+----------+-----+---------------+--------+----------+

