You are tasked with analyzing the sales growth of products over the years 2022, 2023, and 2024. Your goal is to identify months where the sales for a product have consistently increased from 2022 to 2023 and from 2023 to 2024.
Your task is to write an SQL query to generate a report that includes the sales for each product at the month level for the years 2022, 2023, and 2024. However, you should only include product-and-month combinations where the sales have consistently increased from 2022 to 2023 and from 2023 to 2024. Display the output in ascending order of product_id.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from datetime import datetime, timedelta
from pyspark.sql.functions import *

# Spark entry point for this notebook.
spark = SparkSession.builder.appName("OrdersTable").getOrCreate()

# Orders schema: every column is nullable, and order_date is kept as a plain
# string at this stage.
schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("order_id", IntegerType()),
            ("customer_id", IntegerType()),
            ("order_date", StringType()),
            ("product_id", IntegerType()),
            ("sales", IntegerType()),
        )
    ]
)

# Sample rows: (order_id, customer_id, order_date, product_id, sales).
data = [
    # 2022 orders
    (1001, 101, "2022-01-01", 1, 90),
    (1002, 102, "2022-01-03", 2, 75),
    (1003, 103, "2022-01-05", 3, 90),
    (1004, 104, "2022-01-08", 1, 50),
    (1005, 105, "2022-01-10", 2, 150),
    (1006, 106, "2022-02-02", 3, 30),
    (1007, 107, "2022-02-05", 1, 180),
    (1008, 108, "2022-02-08", 2, 75),
    (1009, 109, "2022-02-12", 3, 60),
    (1010, 110, "2022-02-15", 1, 100),
    (1011, 111, "2022-03-01", 2, 75),
    (1012, 112, "2022-03-03", 3, 60),
    # 2023 orders
    (2001, 113, "2023-01-02", 1, 95),
    (2002, 114, "2023-01-04", 2, 80),
    (2003, 115, "2023-01-06", 3, 85),
    (2004, 116, "2023-01-09", 1, 55),
    (2005, 117, "2023-01-11", 2, 160),
    (2006, 118, "2023-02-03", 3, 35),
    (2007, 119, "2023-02-06", 1, 185),
    (2008, 120, "2023-02-09", 2, 70),
    (2009, 121, "2023-02-13", 3, 65),
    (2010, 122, "2023-02-16", 1, 105),
    (2011, 123, "2023-03-02", 2, 80),
    (2012, 124, "2023-03-04", 3, 65),
    # 2024 orders
    (3001, 125, "2024-01-03", 1, 100),
    (3002, 126, "2024-01-05", 2, 85),
    (3003, 127, "2024-01-07", 3, 95),
    (3004, 128, "2024-01-10", 1, 60),
    (3005, 129, "2024-01-12", 2, 170),
    (3006, 130, "2024-02-04", 3, 40),
    (3007, 131, "2024-02-07", 1, 190),
    (3008, 132, "2024-02-10", 2, 85),
    (3009, 133, "2024-02-14", 3, 70),
    (3010, 134, "2024-02-17", 1, 110),
    (3011, 135, "2024-03-03", 2, 85),
    (3012, 136, "2024-03-05", 3, 70),
]

# Materialise the sample rows as a DataFrame with the explicit schema above.
order_df = spark.createDataFrame(data=data, schema=schema)

In [0]:
# Report: product/month combinations whose total sales rose from 2022 to 2023
# and again from 2023 to 2024, with one sales column per year.

# order_date arrives as a string; cast it to a date before taking year/month.
order_date_as_date = col("order_date").cast("date")

# Aggregate straight from the raw order rows into one row per
# (product_id, order_month) with a conditional sum per year. Rows from other
# years add 0 to a given year's column, so a product/month with no orders in
# some year gets 0 for that year instead of being dropped.
order_df = (
    order_df
    .withColumn("order_year", year(order_date_as_date))
    .withColumn("order_month", month(order_date_as_date))
    .groupBy("product_id", "order_month")
    .agg(
        *[
            sum(
                when(col("order_year") == report_year, col("sales")).otherwise(0)
            ).alias(f"{report_year}_Sales")
            for report_year in (2022, 2023, 2024)
        ]
    )
)

# Keep only strictly increasing sales across the three years. Sort by
# product_id ascending as required; order_month is a tie-breaker so the row
# order within a product is deterministic.
order_df = (
    order_df
    .filter(
        (col("2023_Sales") > col("2022_Sales"))
        & (col("2024_Sales") > col("2023_Sales"))
    )
    .orderBy(col("product_id").asc(), col("order_month").asc())
)
