In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

# Initialize Spark session
spark = SparkSession.builder.appName("EcommerceData").getOrCreate()

# Define schema for Users table
users_schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("join_date", StringType(), True),
    StructField("favorite_brand", StringType(), True)
])

# Define schema for Items table
items_schema = StructType([
    StructField("item_id", IntegerType(), True),
    StructField("item_brand", StringType(), True)
])

# Define schema for Orders table
orders_schema = StructType([
    StructField("order_id", IntegerType(), True),
    StructField("order_date", StringType(), True),
    StructField("item_id", IntegerType(), True),
    StructField("buyer_id", IntegerType(), True),
    StructField("seller_id", IntegerType(), True)
])

# Create Users DataFrame
users_data = [
    (1, "2019-01-01", "Lenovo"),
    (2, "2019-02-09", "Samsung"),
    (3, "2019-01-19", "LG"),
    (4, "2019-05-21", "HP")
]

users_df = spark.createDataFrame(users_data, schema=users_schema)

# Create Items DataFrame
items_data = [
    (1, "Samsung"),
    (2, "Lenovo"),
    (3, "LG"),
    (4, "HP")
]

items_df = spark.createDataFrame(items_data, schema=items_schema)

# Create Orders DataFrame
orders_data = [
    (1, "2019-08-01", 4, 1, 2),
    (2, "2019-08-02", 2, 1, 3),
    (3, "2019-08-03", 3, 2, 3),
    (4, "2019-08-04", 1, 4, 2),
    (5, "2019-08-04", 1, 3, 4),
    (6, "2019-08-05", 2, 2, 4)
]

orders_df = spark.createDataFrame(orders_data, schema=orders_schema)

# Create temporary views for SQL queries
users_df.createOrReplaceTempView("Users")
items_df.createOrReplaceTempView("Items")
orders_df.createOrReplaceTempView("Orders")


print("Users, Items, and Orders tables and views created successfully.")


Users, Items, and Orders tables and views created successfully.


In [27]:
spark.sql("""
    with second_orders as (
    select * from (
    select *,
    rank() over(partition by seller_id order by order_date asc) as rn
    from Orders) where rn = 2)
    
    select u.user_id, o.seller_id, o.buyer_id, i.item_brand, u.favorite_brand,
    case when i.item_brand = u.favorite_brand then 'yes' else 'no' end as 2nditemfavbrand

    from second_orders o 
    left join items i on i.item_id = o.item_id
    right join users u on u.user_id = o.seller_id
""").show()

+-------+---------+--------+----------+--------------+---------------+
|user_id|seller_id|buyer_id|item_brand|favorite_brand|2nditemfavbrand|
+-------+---------+--------+----------+--------------+---------------+
|      1|     null|    null|      null|        Lenovo|             no|
|      2|        2|       4|   Samsung|       Samsung|            yes|
|      3|        3|       2|        LG|            LG|            yes|
|      4|        4|       2|    Lenovo|            HP|             no|
+-------+---------+--------+----------+--------------+---------------+



In [29]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [37]:
rank_df = orders_df.select("*",
    rank().over(Window.partitionBy(col("seller_id")).orderBy(col("order_date").asc())).alias("rn")
).filter(col("rn") == 2)

In [40]:
result_df = (
    rank_df.alias("o")
    .join(orders_df.alias("o"), col("u.user_id") == col("o.seller_id"), "right")  # Right Join
    .join(items_df.alias("i"), col("o.item_id") == col("i.item_id"), "left")  # Left Join
    .select(
        col("u.user_id"),
        col("o.seller_id"),
        col("o.buyer_id"),
        col("i.item_brand"),
        col("u.favorite_brand"),
        when(col("i.item_brand") == col("u.favorite_brand"), "yes").otherwise("no").alias("2nditemfavbrand")
    )
)

AnalysisException: Reference 'o.seller_id' is ambiguous, could be: o.seller_id, o.seller_id.