In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DateType, FloatType
from datetime import datetime
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Start (or reuse) the Spark session used by this notebook.
spark = (
    SparkSession.builder
    .appName("OrdersTable")
    .getOrCreate()
)

# Schema of the int_orders table. Every column is declared non-nullable
# (third StructField argument is False).
int_orders_schema = StructType(
    [
        StructField(column_name, column_type, False)
        for column_name, column_type in (
            ("order_number", IntegerType()),
            ("order_date", DateType()),
            ("cust_id", IntegerType()),
            ("salesperson_id", IntegerType()),
            ("amount", FloatType()),
        )
    ]
)

# Sample orders, written with the order date as ISO text so the rows stay
# readable: (order_number, order_date, cust_id, salesperson_id, amount).
_ORDER_DATE_FORMAT = '%Y-%m-%d'
_raw_orders = [
    (30, '1995-07-14', 9, 1, 460.0),
    (10, '1996-08-02', 4, 2, 540.0),
    (40, '1998-01-29', 7, 2, 2400.0),
    (50, '1998-02-03', 6, 7, 600.0),
    (60, '1998-03-02', 6, 7, 720.0),
    (70, '1998-05-06', 9, 7, 150.0),
    (20, '1999-01-30', 4, 8, 1800.0),
]

# Rows handed to Spark: identical to the raw rows except that the date text
# is parsed into a datetime object.
int_orders_data = [
    (
        order_number,
        datetime.strptime(order_date, _ORDER_DATE_FORMAT),
        cust_id,
        salesperson_id,
        amount,
    )
    for order_number, order_date, cust_id, salesperson_id, amount in _raw_orders
]

# Build the DataFrame against the explicit schema and print it.
int_orders_df = spark.createDataFrame(int_orders_data, schema=int_orders_schema)
int_orders_df.show()

# Register the DataFrame as the temp view "Orders" so it can be queried
# through spark.sql.
int_orders_df.createOrReplaceTempView("Orders")



+------------+----------+-------+--------------+------+
|order_number|order_date|cust_id|salesperson_id|amount|
+------------+----------+-------+--------------+------+
|          30|1995-07-14|      9|             1| 460.0|
|          10|1996-08-02|      4|             2| 540.0|
|          40|1998-01-29|      7|             2|2400.0|
|          50|1998-02-03|      6|             7| 600.0|
|          60|1998-03-02|      6|             7| 720.0|
|          70|1998-05-06|      9|             7| 150.0|
|          20|1999-01-30|      4|             8|1800.0|
+------------+----------+-------+--------------+------+



In [7]:
# Largest order (by amount) for each salesperson, expressed with a self-join
# only:
#   * o1 is the candidate order; o2 ranges over every order that has the same
#     salesperson_id (including o1 itself).
#   * Grouping by all of o1's columns folds the joined rows back to the
#     candidate order, so max(o2.amount) is that salesperson's largest amount.
#   * HAVING keeps the candidate only when its amount reaches that maximum;
#     orders tied for the maximum are all kept.
largest_order_per_salesperson_sql = '''
    select o1.order_number, o1.order_date, o1.cust_id, o1.salesperson_id, o1.amount from Orders o1 join Orders o2
    on o1.salesperson_id = o2.salesperson_id
    group by o1.order_number, o1.order_date, o1.cust_id, o1.salesperson_id, o1.amount
    having o1.amount >= max(o2.amount)
'''

largest_order_per_salesperson_df = spark.sql(largest_order_per_salesperson_sql)
largest_order_per_salesperson_df.show()

+------------+----------+-------+--------------+------+
|order_number|order_date|cust_id|salesperson_id|amount|
+------------+----------+-------+--------------+------+
|          30|1995-07-14|      9|             1| 460.0|
|          40|1998-01-29|      7|             2|2400.0|
|          60|1998-03-02|      6|             7| 720.0|
|          20|1999-01-30|      4|             8|1800.0|
+------------+----------+-------+--------------+------+

