In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import *
from pyspark.sql.window import Window
# Initialize Spark Session.
# A single call is enough: getOrCreate() returns the already-active session
# when one exists, so repeating the statement adds nothing.
spark = SparkSession.builder.appName("PySparkTables").getOrCreate()

# Orders schema: three nullable integer columns, in this order.
orders_schema = StructType(
    [
        StructField(column_name, IntegerType(), True)
        for column_name in ("order_id", "customer_id", "product_id")
    ]
)

# Orders rows, each tuple being (order_id, customer_id, product_id).
# An order_id repeats once per product contained in that order.
orders_data = [
    (1, 1, 1), (1, 1, 2), (1, 1, 3),  # order 1, customer 1
    (2, 2, 1), (2, 2, 2), (2, 2, 4),  # order 2, customer 2
    (3, 1, 5),                        # order 3, customer 1
]
orders_df = spark.createDataFrame(data=orders_data, schema=orders_schema)

# Expose the DataFrame to Spark SQL under the name "orders".
orders_df.createOrReplaceTempView("orders")

# Products schema: nullable integer id plus nullable string name.
products_schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
)

# Products rows as (id, name): ids 1..5 paired with names 'A'..'E'.
products_data = list(enumerate("ABCDE", start=1))
products_df = spark.createDataFrame(data=products_data, schema=products_schema)

# Expose the DataFrame to Spark SQL under the name "products".
products_df.createOrReplaceTempView("products")

# Sanity check: print every row of both temp views, orders first.
for verification_query in ("SELECT * FROM orders", "SELECT * FROM products"):
    spark.sql(verification_query).show()


+--------+-----------+----------+
|order_id|customer_id|product_id|
+--------+-----------+----------+
|       1|          1|         1|
|       1|          1|         2|
|       1|          1|         3|
|       2|          2|         1|
|       2|          2|         2|
|       2|          2|         4|
|       3|          1|         5|
+--------+-----------+----------+

+---+----+
| id|name|
+---+----+
|  1|   A|
|  2|   B|
|  3|   C|
|  4|   D|
|  5|   E|
+---+----+



In [2]:
# Product-name pairs that occur together in the same order.
# - orders is self-joined on order_id to pair up line items of one order.
# - o1.product_id < o2.product_id keeps a single ordering of each pair and
#   excludes a product paired with itself.
# - GROUP BY with no aggregate collapses pairs that appear in several orders.
product_pairs_sql = """
    select concat(pr1.name, pr2.name)
    from orders o1 
    inner join orders o2 on o1.order_id = o2.order_id
    inner join products pr1 on o1.product_id = pr1.id
    inner join products pr2 on o2.product_id = pr2.id
    where o1.product_id < o2.product_id
    group by pr1.name, pr2.name
"""
spark.sql(product_pairs_sql).show()

+------------------+
|concat(name, name)|
+------------------+
|                BC|
|                AC|
|                AD|
|                BD|
|                AB|
+------------------+



In [3]:
# Two aliased views each of orders and products, so both sides of a product
# pair can be referenced unambiguously in the self-join.
o1, o2 = orders_df.alias("o1"), orders_df.alias("o2")
pr1, pr2 = products_df.alias("pr1"), products_df.alias("pr2")

# Join / filter predicates, named so the pipeline below reads top-down.
same_order = col("o1.order_id") == col("o2.order_id")
first_product = col("o1.product_id") == col("pr1.id")
second_product = col("o2.product_id") == col("pr2.id")
ascending_pair = col("o1.product_id") < col("o2.product_id")  # one ordering per pair, no self-pairs

# NOTE(review): unlike the spark.sql(...) version of this query, the two names
# are separated by a space here and the column is aliased "combined_names" --
# confirm which output format is intended.
pair_label = concat(col("pr1.name"), lit(" "), col("pr2.name")).alias("combined_names")

result_df = (
    o1.join(o2, same_order)
    .join(pr1, first_product)
    .join(pr2, second_product)
    .filter(ascending_pair)
    .select(pair_label)
    .distinct()  # plays the role of the SQL GROUP BY: one row per distinct label
)
