In [1]:
from pyspark.sql import SparkSession,Row

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
# "local[*]" runs Spark in local mode on this machine.
spark = (
    SparkSession.builder
    .appName("test")
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# Field names shared by every employee record, in positional order.
EMPLOYEE_FIELDS = ("ID", "Name", "gender", "Salary", "Department")

# Row "template": calling header(v1, ..., v5) yields a Row carrying these field names.
header = Row(*EMPLOYEE_FIELDS)

In [19]:
# Sample employee records, one Row per employee, built with the `header` Row template
# (ID, Name, gender, Salary, Department).
# NOTE(review): the last four rows all carry ID 15 -- this looks like a copy-paste slip
# (15..18 intended?). Left unchanged so the recorded outputs below still match; confirm
# before relying on ID as a unique key.
employee_rows = [
    header(1, "Deva", "Male", 5000, "Sales"),
    header(2, "Jugnu", "Female", 6000, "HR"),
    header(3, "Kavita", "Female", 7500, "IT"),
    header(4, "Vikram", "Male", 6500, "Marketing"),
    header(5, "Shabana", "Female", 5500, "Finance"),
    header(6, "Shantilal", "Male", 8000, "Sales"),
    header(7, "Vinod", "Male", 7200, "HR"),
    header(8, "Vimla", "Female", 6600, "IT"),
    header(9, "Jasmin", "Female", 5400, "Marketing"),
    header(10, "Lovely", "Female", 6300, "Finance"),
    header(11, "Mohan", "Male", 5700, "Sales"),
    header(12, "Purvish", "Male", 7000, "HR"),
    header(13, "Jinat", "Female", 7100, "IT"),
    header(14, "Eva", "Female", 6800, "Marketing"),
    header(15, "Jitendra", "Male", 5000, "Finance"),
    header(15, "Rajkumar", "Male", 4500, "Finance"),
    header(15, "Satish", "Male", 4500, "Finance"),
    header(15, "Himmat", "Male", 3500, "Finance"),
]

# Build the DataFrame straight from the local rows instead of going through
# sparkContext.parallelize(...).toDF(): same columns, no RDD detour, and it does not
# require direct access to the SparkContext.
employee_df = spark.createDataFrame(employee_rows)

In [9]:
from pyspark.sql.functions import desc_nulls_last
from pyspark.sql.window import Window

In [10]:
# Window over each gender, highest salary first (nulls last).
# Salary alone is not a unique ordering key: the data contains tied salaries inside a
# partition, and lag()/lead() over tied rows would otherwise depend on an arbitrary
# row order. ID and then Name are added as tie-breakers so the results are
# reproducible from run to run.
genderPartitionedSpec = Window.partitionBy("gender").orderBy(
    desc_nulls_last("Salary"), "ID", "Name"
)

In [11]:
# lag() returns the value of a column from an earlier row of the same window (the previous row by default).
from pyspark.sql.functions import lag 

In [12]:
# Salary found one row earlier in the gender window; the first row of each
# partition has no predecessor, so it gets null.
salary_one_row_back = lag("Salary", 1).over(genderPartitionedSpec)
employee_df.withColumn("previousValue", salary_one_row_back).show()

+---+---------+------+------+----------+-------------+
| ID|     Name|gender|Salary|Department|previousValue|
+---+---------+------+------+----------+-------------+
|  3|   Kavita|Female|  7500|        IT|         null|
| 13|    Jinat|Female|  7100|        IT|         7500|
| 14|      Eva|Female|  6800| Marketing|         7100|
|  8|    Vimla|Female|  6600|        IT|         6800|
| 10|   Lovely|Female|  6300|   Finance|         6600|
|  2|    Jugnu|Female|  6000|        HR|         6300|
|  5|  Shabana|Female|  5500|   Finance|         6000|
|  9|   Jasmin|Female|  5400| Marketing|         5500|
|  6|Shantilal|  Male|  8000|     Sales|         null|
|  7|    Vinod|  Male|  7200|        HR|         8000|
| 12|  Purvish|  Male|  7000|        HR|         7200|
|  4|   Vikram|  Male|  6500| Marketing|         7000|
| 11|    Mohan|  Male|  5700|     Sales|         6500|
|  1|     Deva|  Male|  5000|     Sales|         5700|
| 15| Jitendra|  Male|  5000|   Finance|         5000|
| 15| Rajk

In [13]:
# Same idea with an offset of 2: the salary two rows earlier in the gender window
# (null for the first two rows of each partition).
salary_two_rows_back = lag("Salary", 2).over(genderPartitionedSpec)
employee_df.withColumn("previousValue", salary_two_rows_back).show()

+---+---------+------+------+----------+-------------+
| ID|     Name|gender|Salary|Department|previousValue|
+---+---------+------+------+----------+-------------+
|  3|   Kavita|Female|  7500|        IT|         null|
| 13|    Jinat|Female|  7100|        IT|         null|
| 14|      Eva|Female|  6800| Marketing|         7500|
|  8|    Vimla|Female|  6600|        IT|         7100|
| 10|   Lovely|Female|  6300|   Finance|         6800|
|  2|    Jugnu|Female|  6000|        HR|         6600|
|  5|  Shabana|Female|  5500|   Finance|         6300|
|  9|   Jasmin|Female|  5400| Marketing|         6000|
|  6|Shantilal|  Male|  8000|     Sales|         null|
|  7|    Vinod|  Male|  7200|        HR|         null|
| 12|  Purvish|  Male|  7000|        HR|         8000|
|  4|   Vikram|  Male|  6500| Marketing|         7200|
| 11|    Mohan|  Male|  5700|     Sales|         7000|
|  1|     Deva|  Male|  5000|     Sales|         6500|
| 15| Jitendra|  Male|  5000|   Finance|         5700|
| 15| Rajk

In [14]:
# Likewise for three rows back; raise the offset to look further back as needed.
salary_three_rows_back = lag("Salary", 3).over(genderPartitionedSpec)
employee_df.withColumn("previousValue", salary_three_rows_back).show()

+---+---------+------+------+----------+-------------+
| ID|     Name|gender|Salary|Department|previousValue|
+---+---------+------+------+----------+-------------+
|  3|   Kavita|Female|  7500|        IT|         null|
| 13|    Jinat|Female|  7100|        IT|         null|
| 14|      Eva|Female|  6800| Marketing|         null|
|  8|    Vimla|Female|  6600|        IT|         7500|
| 10|   Lovely|Female|  6300|   Finance|         7100|
|  2|    Jugnu|Female|  6000|        HR|         6800|
|  5|  Shabana|Female|  5500|   Finance|         6600|
|  9|   Jasmin|Female|  5400| Marketing|         6300|
|  6|Shantilal|  Male|  8000|     Sales|         null|
|  7|    Vinod|  Male|  7200|        HR|         null|
| 12|  Purvish|  Male|  7000|        HR|         null|
|  4|   Vikram|  Male|  6500| Marketing|         8000|
| 11|    Mohan|  Male|  5700|     Sales|         7200|
|  1|     Deva|  Male|  5000|     Sales|         7000|
| 15| Jitendra|  Male|  5000|   Finance|         6500|
| 15| Rajk

In [16]:
from pyspark.sql.functions import expr

In [17]:
# Attach the preceding row's salary (within the gender window) to every employee ...
employee_df2 = employee_df.withColumn(
    "previousValue",
    lag("Salary", 1).over(genderPartitionedSpec),
)

# ... then show it next to the current salary together with their difference
# (current minus previous; null where there is no previous row).
salary_diff = expr("Salary-previousValue").alias("SalaryDiff")
employee_df2.select(
    "ID",
    "Name",
    "gender",
    "Department",
    "Salary",
    "previousValue",
    salary_diff,
).show()

+---+---------+------+----------+------+-------------+----------+
| ID|     Name|gender|Department|Salary|previousValue|SalaryDiff|
+---+---------+------+----------+------+-------------+----------+
|  3|   Kavita|Female|        IT|  7500|         null|      null|
| 13|    Jinat|Female|        IT|  7100|         7500|      -400|
| 14|      Eva|Female| Marketing|  6800|         7100|      -300|
|  8|    Vimla|Female|        IT|  6600|         6800|      -200|
| 10|   Lovely|Female|   Finance|  6300|         6600|      -300|
|  2|    Jugnu|Female|        HR|  6000|         6300|      -300|
|  5|  Shabana|Female|   Finance|  5500|         6000|      -500|
|  9|   Jasmin|Female| Marketing|  5400|         5500|      -100|
|  6|Shantilal|  Male|     Sales|  8000|         null|      null|
|  7|    Vinod|  Male|        HR|  7200|         8000|      -800|
| 12|  Purvish|  Male|        HR|  7000|         7200|      -200|
|  4|   Vikram|  Male| Marketing|  6500|         7000|      -500|
| 11|    M

In [18]:
# lead() is the opposite of lag(): it returns the value from a later row of the same window.
from pyspark.sql.functions import lead 

In [20]:
# Show the salary one row ahead ("lead") and two rows ahead ("leadBy2") within the
# gender window; rows with no such successor get null.
for lead_column, rows_ahead in (("lead", 1), ("leadBy2", 2)):
    employee_df.withColumn(
        lead_column, lead("Salary", rows_ahead).over(genderPartitionedSpec)
    ).show()

+---+---------+------+------+----------+----+
| ID|     Name|gender|Salary|Department|lead|
+---+---------+------+------+----------+----+
|  3|   Kavita|Female|  7500|        IT|7100|
| 13|    Jinat|Female|  7100|        IT|6800|
| 14|      Eva|Female|  6800| Marketing|6600|
|  8|    Vimla|Female|  6600|        IT|6300|
| 10|   Lovely|Female|  6300|   Finance|6000|
|  2|    Jugnu|Female|  6000|        HR|5500|
|  5|  Shabana|Female|  5500|   Finance|5400|
|  9|   Jasmin|Female|  5400| Marketing|null|
|  6|Shantilal|  Male|  8000|     Sales|7200|
|  7|    Vinod|  Male|  7200|        HR|7000|
| 12|  Purvish|  Male|  7000|        HR|6500|
|  4|   Vikram|  Male|  6500| Marketing|5700|
| 11|    Mohan|  Male|  5700|     Sales|5000|
|  1|     Deva|  Male|  5000|     Sales|5000|
| 15| Jitendra|  Male|  5000|   Finance|4500|
| 15| Rajkumar|  Male|  4500|   Finance|4500|
| 15|   Satish|  Male|  4500|   Finance|3500|
| 15|   Himmat|  Male|  3500|   Finance|null|
+---+---------+------+------+-----