In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DateType
from datetime import datetime, timedelta

# Initialize Spark session
spark = SparkSession.builder.appName("SalesData").getOrCreate()

# Define schema for Sales table
sales_schema = StructType([
    StructField("product_id", IntegerType(), True),
    StructField("period_start", DateType(), True),
    StructField("period_end", DateType(), True),
    StructField("average_daily_sales", IntegerType(), True)
])

# Correctly parse date strings into datetime objects
sales_data = [
    (1, datetime.strptime("2019-01-25", "%Y-%m-%d"), datetime.strptime("2019-02-28", "%Y-%m-%d"), 100),
    (2, datetime.strptime("2018-12-01", "%Y-%m-%d"), datetime.strptime("2020-01-01", "%Y-%m-%d"), 10),
    (3, datetime.strptime("2019-12-01", "%Y-%m-%d"), datetime.strptime("2020-01-31", "%Y-%m-%d"), 1)
]

# Create DataFrame
sales_df = spark.createDataFrame(sales_data, schema=sales_schema)

# Create a temporary SQL view
sales_df.createOrReplaceTempView("Sales")



print("Sales table and view created successfully.")


Sales table and view created successfully.


In [5]:
spark.sql("""
    WITH r_cte AS (
    SELECT MIN(period_start) AS dates, MAX(period_end) AS max_date FROM Sales
    UNION ALL
    SELECT DATE_ADD(dates, 1), max_date FROM r_cte WHERE dates < max_date
    )

    SELECT * FROM r_cte;
""")

AnalysisException: Table or view not found: r_cte; line 5 pos 45;
'WithCTE
:- 'CTERelationDef 1, false
:  +- 'SubqueryAlias r_cte
:     +- 'Union false, false
:        :- Aggregate [min(period_start#1) AS dates#12, max(period_end#2) AS max_date#13]
:        :  +- SubqueryAlias sales
:        :     +- View (`Sales`, [product_id#0,period_start#1,period_end#2,average_daily_sales#3])
:        :        +- LogicalRDD [product_id#0, period_start#1, period_end#2, average_daily_sales#3], false
:        +- 'Project [unresolvedalias('DATE_ADD('dates, 1), None), 'max_date]
:           +- 'Filter ('dates < 'max_date)
:              +- 'UnresolvedRelation [r_cte], [], false
+- 'Project [*]
   +- 'SubqueryAlias r_cte
      +- 'CTERelationRef 1, false


In [20]:
min_date, max_date = sales_df.select(min("period_start"), max("period_end")).first()

# Initialize result list with start date
date_list = [(min_date, max_date)]

# Generate dates iteratively
current_date = min_date
while current_date < max_date:
    current_date = current_date + timedelta(days=1)
    date_list.append((current_date, max_date))

# Create DataFrame from the date list
date_df = spark.createDataFrame(date_list, ["dates", "max_date"])

# Show the results
result_df = date_df.alias("d").join(
    sales_df.alias("s"), 
    (col("d.dates").between(col("s.period_start"), col("s.period_end"))),  # Correct usage of between
    "inner"
).orderBy(col("product_id"), col("dates"))

result_df = result_df.groupBy(
    col("product_id"), year(col("dates"))
).agg(
    sum(col("average_daily_sales")).alias("TA")
)

In [21]:
result_df.show()

+----------+-----------+----+
|product_id|year(dates)|  TA|
+----------+-----------+----+
|         2|       2018| 310|
|         2|       2019|3650|
|         1|       2019|3500|
|         3|       2019|  31|
|         2|       2020|  10|
|         3|       2020|  31|
+----------+-----------+----+

