In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *


# Initialize a Spark session
spark = SparkSession.builder.appName("CreateOrdersDataFrame").getOrCreate()

# Create a list of tuples with the data
data = [
    (1, 1, "p1"),
    (1, 1, "p2"),
    (1, 1, "p3"),
    (2, 2, "p1"),
    (2, 2, "p2"),
    (2, 2, "p4"),
    (3, 1, "p5"),
    (3, 1, "p6"),
    (4, 3, "p1"),
    (4, 3, "p3"),
    (4, 3, "p5"),
    (5, 4, "p2"),
    (5, 4, "p4"),
    (5, 4, "p1"),
    
]

# Define the schema
columns = ["order_id", "customer_id", "product_id"]

# Create the DataFrame
orders = spark.createDataFrame(data, schema=columns)

# Show the DataFrame
orders.show()


+--------+-----------+----------+
|order_id|customer_id|product_id|
+--------+-----------+----------+
|       1|          1|        p1|
|       1|          1|        p2|
|       1|          1|        p3|
|       2|          2|        p1|
|       2|          2|        p2|
|       2|          2|        p4|
|       3|          1|        p5|
|       3|          1|        p6|
|       4|          3|        p1|
|       4|          3|        p3|
|       4|          3|        p5|
|       5|          4|        p2|
|       5|          4|        p4|
|       5|          4|        p1|
+--------+-----------+----------+



In [0]:
order_o1 = orders.alias("o1")
order_o2 = orders.alias("o2")

result = (
    order_o1
    .join(
        order_o2,
        [
            (col("o1.order_id") == col("o2.order_id")) &
            (col("o1.product_id") > col("o2.product_id"))
        ],
        how="inner"
    )
    .select(
        col("o1.product_id").alias("product_1"),
        col("o2.product_id").alias("product_2")
    )
    .groupBy(col("product_1"), col("product_2"))
    .agg(count("*").alias("purchase_frequency"))
    .orderBy(col("purchase_frequency").desc())
)
result.show()


+---------+---------+------------------+
|product_1|product_2|purchase_frequency|
+---------+---------+------------------+
|       p2|       p1|                 3|
|       p3|       p1|                 2|
|       p4|       p1|                 2|
|       p4|       p2|                 2|
|       p3|       p2|                 1|
|       p6|       p5|                 1|
|       p5|       p3|                 1|
|       p5|       p1|                 1|
+---------+---------+------------------+

