In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,rank
from pyspark.sql.window import Window

In [2]:
# Reuse the active SparkSession if one exists, otherwise start one;
# the application is registered under the name "topSalary".
spark = (
    SparkSession.builder
    .appName("topSalary")
    .getOrCreate()
)

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/home/glue_user/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/spark/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/aws-glue-libs/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/aws-glue-libs/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/17 15:21:49 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/03/17 15:21:49 WARN 

In [3]:
# Column names, in the same positional order as the fields of each row tuple below.
columns = ["department", "employee", "salary"]

# Sample rows: (department, employee, salary), four employees per department.
employee_data = [
    # HR
    ("HR", "Alice", 60_000),
    ("HR", "Bob", 75_000),
    ("HR", "Charlie", 80_000),
    ("HR", "David", 72_000),
    # IT
    ("IT", "Eve", 90_000),
    ("IT", "Frank", 85_000),
    ("IT", "Grace", 95_000),
    ("IT", "Hank", 87_000),
    # Finance
    ("Finance", "Ivy", 70_000),
    ("Finance", "Jack", 65_000),
    ("Finance", "Kevin", 72_000),
    ("Finance", "Liam", 71_000),
]

In [4]:
# Build the DataFrame from the in-memory rows; passing only column names
# lets Spark infer each column's type from the Python values.
employee_df = spark.createDataFrame(
    data=employee_data,
    schema=columns,
)

In [5]:
# Print the DataFrame as a text table to check the rows that were just loaded.
employee_df.show()

                                                                                

+----------+--------+------+
|department|employee|salary|
+----------+--------+------+
|        HR|   Alice| 60000|
|        HR|     Bob| 75000|
|        HR| Charlie| 80000|
|        HR|   David| 72000|
|        IT|     Eve| 90000|
|        IT|   Frank| 85000|
|        IT|   Grace| 95000|
|        IT|    Hank| 87000|
|   Finance|     Ivy| 70000|
|   Finance|    Jack| 65000|
|   Finance|   Kevin| 72000|
|   Finance|    Liam| 71000|
+----------+--------+------+



In [6]:
# Register employee_df as the temporary view "empSalary" so that Spark SQL
# queries can refer to it by that name.
employee_df.createOrReplaceTempView("empSalary")

In [8]:
# Spark SQL version of "top salaries per department":
#   1. The CTE rankSalaryQuery ranks every row of the empSalary view within its
#      department, highest salary first (salary_ranking = 1).
#   2. The outer select keeps only rows whose salary_ranking is 3 or less.
# NOTE: RANK() gives tied salaries the same rank, so a tie at the cutoff can
# return more than three rows for a department.
spark.sql(
    """
    with rankSalaryQuery as (
        select department,employee,salary, RANK() over(partition by department order by salary desc) as salary_ranking 
        from empSalary 
    )
    select department,employee,salary,salary_ranking from rankSalaryQuery where salary_ranking <= 3
    """
         ).show()



+----------+--------+------+--------------+
|department|employee|salary|salary_ranking|
+----------+--------+------+--------------+
|   Finance|   Kevin| 72000|             1|
|   Finance|    Liam| 71000|             2|
|   Finance|     Ivy| 70000|             3|
|        HR| Charlie| 80000|             1|
|        HR|     Bob| 75000|             2|
|        HR|   David| 72000|             3|
|        IT|   Grace| 95000|             1|
|        IT|     Eve| 90000|             2|
|        IT|    Hank| 87000|             3|
+----------+--------+------+--------------+



                                                                                

In [None]:
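# Scratch note (not executed): the bare ranking query on its own, with table_name as a placeholder.
# select department,employee,salary, RANK() over(partition by department order by salary desc) as salary_ranking 
# from table_name 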

In [10]:
# Print each column's name, data type, and nullability for employee_df.
employee_df.printSchema()

root
 |-- department: string (nullable = true)
 |-- employee: string (nullable = true)
 |-- salary: long (nullable = true)



In [11]:
# Window definition for the DataFrame-API ranking: one partition per
# department, rows ordered from highest to lowest salary.
windowSpec = (
    Window
    .partitionBy("department")
    .orderBy(col("salary").desc())
)

In [12]:
# Add a "salary_ranking" column holding rank() evaluated over windowSpec;
# employee_df itself is left unchanged.
employee_rank_sal_df = employee_df.withColumn(
    "salary_ranking",
    rank().over(windowSpec),
)

In [13]:
# Display the ranked rows before any top-N filter is applied.
employee_rank_sal_df.show()



+----------+--------+------+--------------+
|department|employee|salary|salary_ranking|
+----------+--------+------+--------------+
|   Finance|   Kevin| 72000|             1|
|   Finance|    Liam| 71000|             2|
|   Finance|     Ivy| 70000|             3|
|   Finance|    Jack| 65000|             4|
|        HR| Charlie| 80000|             1|
|        HR|     Bob| 75000|             2|
|        HR|   David| 72000|             3|
|        HR|   Alice| 60000|             4|
|        IT|   Grace| 95000|             1|
|        IT|     Eve| 90000|             2|
|        IT|    Hank| 87000|             3|
|        IT|   Frank| 85000|             4|
+----------+--------+------+--------------+



                                                                                

In [15]:
# Keep only the rows ranked 1 or 2 within their department and print them.
(
    employee_rank_sal_df
    .where(col("salary_ranking") <= 2)
    .show()
)



+----------+--------+------+--------------+
|department|employee|salary|salary_ranking|
+----------+--------+------+--------------+
|   Finance|   Kevin| 72000|             1|
|   Finance|    Liam| 71000|             2|
|        HR| Charlie| 80000|             1|
|        HR|     Bob| 75000|             2|
|        IT|   Grace| 95000|             1|
|        IT|     Eve| 90000|             2|
+----------+--------+------+--------------+



                                                                                