You are a data analyst working for an e-commerce company, responsible for analysing customer orders to gain insights into their purchasing behaviour. Your task is to write a SQL query to retrieve the details of the penultimate (second most recent) order for each customer. However, if a customer has placed only one order, retrieve the details of that order instead. Display the output in ascending order of customer name.

In [0]:
from pyspark.sql import SparkSession

from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import *
from pyspark.sql.window import *

# Start (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("Orders DataFrame")
    .getOrCreate()
)

# Sample rows: (order_id, order_date, customer_name, product_name, sales)
data = [
    (1, "2023-01-01", "Alexa", "iphone", 100),
    (2, "2023-01-02", "Alexa", "boAt", 300),
    (3, "2023-01-03", "Alexa", "Rolex", 400),
    (4, "2023-01-01", "Ramesh", "Titan", 200),
    (5, "2023-01-02", "Ramesh", "Shirt", 300),
    (6, "2023-01-03", "Neha", "Dress", 100),
]

# Explicit schema for the rows above: every field is nullable, and
# order_date is loaded as a plain string at this stage.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("order_id", IntegerType()),
        ("order_date", StringType()),
        ("customer_name", StringType()),
        ("product_name", StringType()),
        ("sales", IntegerType()),
    )
])

# Materialise the sample rows as a DataFrame with that schema
orders = spark.createDataFrame(data, schema=schema)

# Print the DataFrame contents
orders.show()



+--------+----------+-------------+------------+-----+
|order_id|order_date|customer_name|product_name|sales|
+--------+----------+-------------+------------+-----+
|       1|2023-01-01|        Alexa|      iphone|  100|
|       2|2023-01-02|        Alexa|        boAt|  300|
|       3|2023-01-03|        Alexa|       Rolex|  400|
|       4|2023-01-01|       Ramesh|       Titan|  200|
|       5|2023-01-02|       Ramesh|       Shirt|  300|
|       6|2023-01-03|         Neha|       Dress|  100|
+--------+----------+-------------+------------+-----+



In [0]:
# Inspect the schema before transforming anything
orders.printSchema()

# Parse order_date (format yyyy-MM-dd) into a date column
orders = orders.withColumn("order_date", to_date(col("order_date"), "yyyy-MM-dd"))

# Window that ranks each customer's orders from newest to oldest.
# order_id is a secondary sort key so row_number() stays deterministic when
# a customer has more than one order on the same date
# (assumes a larger order_id means a later order -- TODO confirm).
window_spec = Window.partitionBy(col("customer_name")).orderBy(
    col("order_date").desc(),
    col("order_id").desc(),
)

# Keep the second-latest order of every customer; a customer with only one
# order keeps that single order. The result is sorted by customer_name in
# ascending order, as the task statement requires.
# NOTE: this rebinds `orders` to the filtered result, so re-running the
# expression on it recomputes rn/cnt over one row per customer.
orders = (
    orders
    .withColumn("rn", row_number().over(window_spec))
    .withColumn("cnt", count("*").over(Window.partitionBy(col("customer_name"))))
    .filter((col("rn") == 2) | (col("cnt") == 1))
    .orderBy(col("customer_name").asc())
)


In [0]:

# Second-latest order per customer (or the only order for single-order
# customers), sorted by customer_name ascending as the task requires.
# rn  = position of the order within its customer, per window_spec
# cnt = number of rows for that customer
# NOTE(review): if the previous cell has already rebound `orders` to its
# filtered result, each customer has a single row here, so rn and cnt are
# both recomputed as 1 -- confirm which input this cell is meant to use.
orders = (
    orders
    .withColumn("rn", row_number().over(window_spec))
    .withColumn("cnt", count("*").over(Window.partitionBy(col("customer_name"))))
    .filter((col("rn") == 2) | (col("cnt") == 1))
    .orderBy(col("customer_name").asc())
)


order_id,order_date,customer_name,product_name,sales,rn,cnt
2,2023-01-02,Alexa,boAt,300,2,3
6,2023-01-03,Neha,Dress,100,1,1
4,2023-01-01,Ramesh,Titan,200,2,2
