In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Initialize SparkSession
spark = SparkSession.builder.appName("EmployeeTable").getOrCreate()

# Define schema
schema = StructType([
    StructField("emp_name", StringType(), True),
    StructField("dep_id", IntegerType(), True),
    StructField("salary", IntegerType(), True)
])

# Data (list of tuples)
data = [
    ("Siva", 1, 30000),
    ("Ravi", 2, 40000),
    ("Prasad", 1, 50000),
    ("Sai", 2, 20000)
]

# Create DataFrame
employee_df = spark.createDataFrame(data, schema)
employee_df.createOrReplaceTempView("employee_df")

# Show DataFrame
employee_df.show()


+--------+------+------+
|emp_name|dep_id|salary|
+--------+------+------+
|    Siva|     1| 30000|
|    Ravi|     2| 40000|
|  Prasad|     1| 50000|
|     Sai|     2| 20000|
+--------+------+------+



In [17]:
spark.sql("""
    with cte as (
    select *, rank() over(partition by dep_id order by salary) as rasc,
    rank() over(partition by dep_id order by salary desc) as rdesc
    from employee)
    
    select dep_id,
        max(case when rasc = 1 then salary end) as lowsalary,
        max(case when rdesc = 1 then salary end) as highsalary
    from cte
    group by dep_id
""").show()

AnalysisException: expression 'cte.rasc' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;
WithCTE
:- CTERelationDef 4, false
:  +- SubqueryAlias cte
:     +- Project [emp_name#48, dep_id#49, salary#50, rasc#273, rdesc#274]
:        +- Project [emp_name#48, dep_id#49, salary#50, rasc#273, rdesc#274, rasc#273, rdesc#274]
:           +- Window [rank(salary#50) windowspecdefinition(dep_id#49, salary#50 DESC NULLS LAST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS rdesc#274], [dep_id#49], [salary#50 DESC NULLS LAST]
:              +- Window [rank(salary#50) windowspecdefinition(dep_id#49, salary#50 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS rasc#273], [dep_id#49], [salary#50 ASC NULLS FIRST]
:                 +- Project [emp_name#48, dep_id#49, salary#50]
:                    +- SubqueryAlias employee
:                       +- View (`employee`, [emp_name#48,dep_id#49,salary#50])
:                          +- LogicalRDD [emp_name#48, dep_id#49, salary#50], false
+- Aggregate [dep_id#49], [dep_id#49, coalesce(CASE WHEN (rasc#273 = 1) THEN salary#50 END) AS lowsalary#271, coalesce(CASE WHEN (rdesc#274 = 1) THEN salary#50 END) AS highsalary#272]
   +- SubqueryAlias cte
      +- CTERelationRef 4, true, [emp_name#48, dep_id#49, salary#50, rasc#273, rdesc#274]
