In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import dense_rank
from pyspark.sql.window import Window

# Get (or create) the Spark session used by every cell below,
# registered under this notebook's application name.
spark = (
    SparkSession.builder
    .appName("TopThreeSalaries")
    .getOrCreate()
)

In [0]:
# Employee fixture: each tuple is one employee, with fields in the same
# order as employee_columns (id, name, salary, departmentId).
employee_columns = ['id', 'name', 'salary', 'departmentId']
employee_data = [
    (1, 'Joe', 85000, 1),
    (2, 'Henry', 80000, 2),
    (3, 'Sam', 60000, 2),
    (4, 'Max', 90000, 1),
    (5, 'Janet', 69000, 1),
    (6, 'Randy', 85000, 1),
    (7, 'Will', 70000, 1),
]

# Column types are left for Spark to infer from the Python values.
employee_df = spark.createDataFrame(data=employee_data, schema=employee_columns)

display(employee_df)


In [0]:
# Department fixture: each tuple is one department, with fields in the same
# order as department_columns (id, name).
department_columns = ['id', 'name']
department_data = [
    (1, 'IT'),
    (2, 'Sales'),
]

# Column types are left for Spark to infer from the Python values.
department_df = spark.createDataFrame(data=department_data, schema=department_columns)

display(department_df)

In [0]:
# Inner-join each employee to the department its departmentId points at.
# Both inputs carry columns named `id` and `name`, so the frames are aliased
# as 'e' (employee) and 'd' (department) to keep those columns addressable
# as e.name / d.name after the join.
joined_df = employee_df.alias('e').join(
    department_df.alias('d'),
    on=employee_df['departmentId'] == department_df['id'],
    how='inner',
)
display(joined_df)

In [0]:
from pyspark.sql.functions import col, dense_rank
from pyspark.sql.window import Window

# How many distinct salary levels to keep per department.
TOP_N = 3

# Rank salaries inside each department, highest first. dense_rank assigns
# equal salaries the same rank and leaves no gaps in the sequence, so the
# cut-off below keeps every employee whose salary is one of the TOP_N highest
# distinct values in their department (ties are all retained).
salary_window = Window.partitionBy('departmentId').orderBy(joined_df.salary.desc())
ranked_df = joined_df.withColumn('denseRank', dense_rank().over(salary_window))

# Apply the rank cut-off BEFORE projecting. The final select does not carry
# denseRank, so filtering after it would depend on Spark implicitly resolving
# a column that is no longer part of the projected frame.
top_three_df = (
    ranked_df
    .filter(col('denseRank') <= TOP_N)
    .select(
        col('d.name').alias('department'),
        col('e.name').alias('employee'),
        col('e.salary').alias('salary'),
    )
)
display(top_three_df)