In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DateType, StringType
from pyspark.sql.functions import to_date


# Start (or reuse) the Spark session that backs this notebook
spark = (
    SparkSession.builder
    .appName("ProductsTable")
    .getOrCreate()
)

# Products schema. change_date is loaded as text and parsed into a real
# date column once the DataFrame exists (see to_date below).
schema = (
    StructType()
    .add("product_id", IntegerType(), True)
    .add("new_price", IntegerType(), True)
    .add("change_date", StringType(), True)
)

# Sample price-change history: (product_id, new_price, change_date)
data = [
    (1, 20, "2019-08-14"),
    (2, 50, "2019-08-14"),
    (1, 30, "2019-08-15"),
    (1, 35, "2019-08-16"),
    (2, 65, "2019-08-17"),
    (3, 20, "2019-08-18"),
]

# Build the DataFrame and convert the textual change_date in one chain
products_df = (
    spark.createDataFrame(data, schema=schema)
    .withColumn("change_date", to_date("change_date"))
)

# Expose the DataFrame to Spark SQL under the name "Products"
products_df.createOrReplaceTempView("Products")

# Show the table contents
spark.sql("SELECT * FROM Products").show()


+----------+---------+-----------+
|product_id|new_price|change_date|
+----------+---------+-----------+
|         1|       20| 2019-08-14|
|         2|       50| 2019-08-14|
|         1|       30| 2019-08-15|
|         1|       35| 2019-08-16|
|         2|       65| 2019-08-17|
|         3|       20| 2019-08-18|
+----------+---------+-----------+



In [40]:
# Price of every product as of 2019-08-16 (products with no price change on
# or before that date fall back to the default price of 10).
#
# latest_price: for each product, the new_price of its most recent change on
#   or before the cutoff. The window frame spans the whole partition, so every
#   row of a product carries the same value and `select distinct` leaves one
#   row per product.
# The product list is de-duplicated *before* the join as well, so the join is
# one-to-one instead of fanning out rows per product and collapsing them with
# a trailing DISTINCT. ORDER BY makes the displayed row order deterministic.
spark.sql("""
    with latest_price as (
        select distinct
            product_id,
            last_value(new_price) over (
                partition by product_id
                order by change_date
                range between unbounded preceding and unbounded following
            ) as price
        from products
        where change_date <= '2019-08-16'
    )

    select p.product_id, coalesce(lp.price, 10) as price
    from (select distinct product_id from products) p
    left join latest_price lp on p.product_id = lp.product_id
    order by p.product_id
""").show()

+----------+-----+
|product_id|price|
+----------+-----+
|         1|   35|
|         2|   50|
|         3|   10|
+----------+-----+



In [46]:
from pyspark.sql.functions import *
from pyspark.sql import Window
# Window over each product's changes in date order; the frame covers the
# entire partition so last() sees every row of the product.
windowSpec = (
    Window
    .partitionBy("product_id")
    .orderBy("change_date")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

# Changes dated on or before 2019-08-16, each tagged with the product's
# final new_price within that window ("last_value").
cte = products_df.where(col("change_date") <= "2019-08-16").withColumn(
    "last_value",
    last("new_price").over(windowSpec),
)

In [62]:
# DataFrame-API version of the "price as of 2019-08-16" query.
#
# Both join inputs are reduced to one row per product first:
#   p - every distinct product_id in products_df
#   c - the distinct (product_id, last_value) pairs from cte; last_value is
#       computed over a whole-partition frame, so it is constant per product
# This keeps the left join one-to-one rather than multiplying rows per product
# and removing the duplicates afterwards. Products absent from cte get the
# default price of 10. The final orderBy makes the displayed order
# deterministic.
(
    products_df.select("product_id").distinct().alias("p")
    .join(
        cte.select("product_id", "last_value").distinct().alias("c"),
        on=col("p.product_id") == col("c.product_id"),
        how="left",
    )
    .select(
        col("p.product_id"),
        coalesce(col("c.last_value"), lit(10)).alias("price"),
    )
    .orderBy("product_id")
    .show()
)

+----------+-----+
|product_id|price|
+----------+-----+
|         1|   35|
|         2|   50|
|         3|   10|
+----------+-----+

