In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import row_number, rank, dense_rank, lag, lead
from pyspark.sql.window import Window
from pyspark.sql.types import *

# Start (or reuse) the Spark session for this notebook. `SparkSession.builder`
# is the documented entry point for configuring a session; getOrCreate()
# returns the already-running session if there is one instead of creating
# a second one.
spark = (
    SparkSession.builder
    .appName('Rank Function in PySpark')
    .getOrCreate()
)

In [5]:
# Sample employee records, one tuple per employee:
# (empID, Name, Gender, Salary, dept).
empData = [
    (1, "Rohit", "M", 3000, "Data"),
    (2, "Ajay", "M", 2000, "Data"),
    (6, "Deepshika", "F", 2000, "Data"),
    (3, "Hemma", "F", 2000, "HR"),
    (4, "Arti", "F", 2000, "Marketing"),
    (5, "Kanchan", "F", 2000, "Marketing"),
]

# Column names, in the same positional order as the tuple fields above.
empDataSchema = ["empID", "Name", "Gender", "Salary", "dept"]

# Only column names are supplied, so Spark infers each column's type
# from the data.
df = spark.createDataFrame(data=empData, schema=empDataSchema)

# Preview the data ordered by department, then by salary.
df.orderBy("dept", "Salary").show()

+-----+---------+------+------+---------+
|empID|     Name|Gender|Salary|     dept|
+-----+---------+------+------+---------+
|    6|Deepshika|     F|  2000|     Data|
|    2|     Ajay|     M|  2000|     Data|
|    1|    Rohit|     M|  3000|     Data|
|    3|    Hemma|     F|  2000|       HR|
|    5|  Kanchan|     F|  2000|Marketing|
|    4|     Arti|     F|  2000|Marketing|
+-----+---------+------+------+---------+



#### row_number()

In [16]:
# row_number() assigns consecutive numbers 1, 2, 3, ... within each dept.
# Ordering by Salary alone leaves ties (several employees earn 2000), so which
# tied row receives which number would be arbitrary and could differ between
# runs. empID is unique per employee and is added as a tie-breaker so the
# numbering is reproducible.
window = Window.partitionBy('dept').orderBy('Salary', 'empID')
df.withColumn('rowNum', row_number().over(window)).show()

+-----+---------+------+------+---------+------+
|empID|     Name|Gender|Salary|     dept|rowNum|
+-----+---------+------+------+---------+------+
|    2|     Ajay|     M|  2000|     Data|     1|
|    6|Deepshika|     F|  2000|     Data|     2|
|    1|    Rohit|     M|  3000|     Data|     3|
|    3|    Hemma|     F|  2000|       HR|     1|
|    4|     Arti|     F|  2000|Marketing|     1|
|    5|  Kanchan|     F|  2000|Marketing|     2|
+-----+---------+------+------+---------+------+



#### rank() --> ranks rows within each partition (group) and leaves gaps in the ranking when there are ties

In [17]:
# rank() numbers rows by salary within each department. Rows with equal
# salaries share a rank, and the rank following a tie is skipped (1, 1, 3).
# The ordering column is spelled 'Salary', exactly as in the DataFrame schema,
# so the cell does not depend on case-insensitive column resolution.
df.withColumn(
    'rank',
    rank().over(Window.partitionBy('dept').orderBy('Salary')),
).show()

+-----+---------+------+------+---------+----+
|empID|     Name|Gender|Salary|     dept|rank|
+-----+---------+------+------+---------+----+
|    2|     Ajay|     M|  2000|     Data|   1|
|    6|Deepshika|     F|  2000|     Data|   1|
|    1|    Rohit|     M|  3000|     Data|   3|
|    3|    Hemma|     F|  2000|       HR|   1|
|    4|     Arti|     F|  2000|Marketing|   1|
|    5|  Kanchan|     F|  2000|Marketing|   1|
+-----+---------+------+------+---------+----+



#### dense_rank() --> ranks rows within each partition (group) and does not leave gaps in the ranking when there are ties

In [18]:
# dense_rank() numbers rows by salary within each department. Rows with equal
# salaries share a rank, and the next distinct salary gets the very next rank
# with no gap (1, 1, 2).
# The ordering column is spelled 'Salary', exactly as in the DataFrame schema,
# so the cell does not depend on case-insensitive column resolution.
df.withColumn(
    'denseRank',
    dense_rank().over(Window.partitionBy('dept').orderBy('Salary')),
).show()

+-----+---------+------+------+---------+---------+
|empID|     Name|Gender|Salary|     dept|denseRank|
+-----+---------+------+------+---------+---------+
|    2|     Ajay|     M|  2000|     Data|        1|
|    6|Deepshika|     F|  2000|     Data|        1|
|    1|    Rohit|     M|  3000|     Data|        2|
|    3|    Hemma|     F|  2000|       HR|        1|
|    4|     Arti|     F|  2000|Marketing|        1|
|    5|  Kanchan|     F|  2000|Marketing|        1|
+-----+---------+------+------+---------+---------+



#### lag() --> returns the value from a given number of rows before the current row within the partition

In [28]:
# lag(columnName, 2) --> returns the value of columnName from 2 rows before
# the current row in the same partition, or NULL when no such row exists.
#
# Tied salaries make a Salary-only ordering ambiguous, so "two rows back"
# could point at a different row from run to run. empID (unique per employee)
# is added as a tie-breaker to keep the row order, and therefore the lag
# value, well defined.
df.withColumn(
    'lag',
    lag('Name', 2).over(Window.partitionBy('dept').orderBy('Salary', 'empID')),
).show()

+-----+---------+------+------+---------+----+
|empID|     Name|Gender|Salary|     dept| lag|
+-----+---------+------+------+---------+----+
|    2|     Ajay|     M|  2000|     Data|NULL|
|    6|Deepshika|     F|  2000|     Data|NULL|
|    1|    Rohit|     M|  3000|     Data|Ajay|
|    3|    Hemma|     F|  2000|       HR|NULL|
|    4|     Arti|     F|  2000|Marketing|NULL|
|    5|  Kanchan|     F|  2000|Marketing|NULL|
+-----+---------+------+------+---------+----+



#### lead() --> returns the value from a given number of rows after the current row within the partition

In [27]:
# lead(columnName, 1) --> returns the value of columnName from the next row
# in the same partition, or NULL for the last row of the partition.
#
# The result column is named 'lead' (it was previously mislabelled 'lag',
# which described the opposite function).
# Tied salaries make a Salary-only ordering ambiguous, so empID (unique per
# employee) is added as a tie-breaker to keep the "next" row well defined.
df.withColumn(
    'lead',
    lead('Name', 1).over(Window.partitionBy('dept').orderBy('Salary', 'empID')),
).show()

+-----+---------+------+------+---------+---------+
|empID|     Name|Gender|Salary|     dept|      lag|
+-----+---------+------+------+---------+---------+
|    2|     Ajay|     M|  2000|     Data|Deepshika|
|    6|Deepshika|     F|  2000|     Data|    Rohit|
|    1|    Rohit|     M|  3000|     Data|     NULL|
|    3|    Hemma|     F|  2000|       HR|     NULL|
|    4|     Arti|     F|  2000|Marketing|  Kanchan|
|    5|  Kanchan|     F|  2000|Marketing|     NULL|
+-----+---------+------+------+---------+---------+



In [29]:
# Stop the Spark session that was created at the top of the notebook.
spark.stop()