In [1]:
from pyspark.sql import SparkSession
# Get the active Spark session, or start one named "window_functions" if none exists.
spark = (
    SparkSession.builder
    .appName("window_functions")
    .getOrCreate()
)

In [5]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number,rank,dense_rank

In [2]:
# Column names for the sample employee table; Spark infers the column types from the rows.
schema = ["name", "department", "salary"]

# One tuple per employee: (name, department, salary).
# Several salaries repeat within a department, so ordering by salary produces ties.
data = [
    ("Alice", "Sales", 5000),
    ("Bob", "Sales", 4800),
    ("Charlie", "IT", 6000),
    ("David", "IT", 5800),
    ("Eve", "Sales", 5000),
    ("Frank", "IT", 6000),
    ("Grace", "IT", 5800),
]

# Build the DataFrame used by every example below and display it.
df = spark.createDataFrame(data, schema)
df.show()

+-------+----------+------+
|   name|department|salary|
+-------+----------+------+
|  Alice|     Sales|  5000|
|    Bob|     Sales|  4800|
|Charlie|        IT|  6000|
|  David|        IT|  5800|
|    Eve|     Sales|  5000|
|  Frank|        IT|  6000|
|  Grace|        IT|  5800|
+-------+----------+------+



In [6]:
# Shared window spec: one partition per department, rows ordered by ascending salary.
window = (
    Window
    .partitionBy('department')
    .orderBy('salary')
)

In [7]:
# row_number() numbers the rows 1, 2, 3, ... within each department.
# Employees can share a salary, and ordering by salary alone leaves the relative
# order of such ties (and therefore their row numbers) up to Spark. Adding `name`
# as a secondary sort key makes the numbering reproducible from run to run.
row_number_window = Window.partitionBy('department').orderBy('salary', 'name')

df_row_num = df.withColumn('row_number', row_number().over(row_number_window))

df_row_num.show()

+-------+----------+------+----------+
|   name|department|salary|row_number|
+-------+----------+------+----------+
|  David|        IT|  5800|         1|
|  Grace|        IT|  5800|         2|
|Charlie|        IT|  6000|         3|
|  Frank|        IT|  6000|         4|
|    Bob|     Sales|  4800|         1|
|  Alice|     Sales|  5000|         2|
|    Eve|     Sales|  5000|         3|
+-------+----------+------+----------+



In [8]:
# rank(): equal salaries share a rank and the following rank(s) are skipped
# (e.g. 1, 1, 3, 3 within a department).
df_rank = df.withColumn(
    'rnk',
    rank().over(window),
)

df_rank.show()

+-------+----------+------+---+
|   name|department|salary|rnk|
+-------+----------+------+---+
|  David|        IT|  5800|  1|
|  Grace|        IT|  5800|  1|
|Charlie|        IT|  6000|  3|
|  Frank|        IT|  6000|  3|
|    Bob|     Sales|  4800|  1|
|  Alice|     Sales|  5000|  2|
|    Eve|     Sales|  5000|  2|
+-------+----------+------+---+



In [9]:
# dense_rank(): equal salaries share a rank and no rank is skipped afterwards
# (e.g. 1, 1, 2, 2 within a department).
df_dense_rnk = df.withColumn(
    'dense_rank',
    dense_rank().over(window),
)

df_dense_rnk.show()

+-------+----------+------+----------+
|   name|department|salary|dense_rank|
+-------+----------+------+----------+
|  David|        IT|  5800|         1|
|  Grace|        IT|  5800|         1|
|Charlie|        IT|  6000|         2|
|  Frank|        IT|  6000|         2|
|    Bob|     Sales|  4800|         1|
|  Alice|     Sales|  5000|         2|
|    Eve|     Sales|  5000|         2|
+-------+----------+------+----------+



In [10]:
from pyspark.sql.functions import lag


In [11]:
# Window for the lag() example: per department, ordered by salary.
# `name` is a secondary sort key so that employees with equal salaries have a
# fixed order; otherwise which of the tied rows counts as "previous" could vary
# between runs.
window = Window.partitionBy('department').orderBy('salary', 'name')

In [12]:
# lag('salary', 1) reads the salary of the row one position earlier in the window;
# the first row of each department has no predecessor and gets null.
df_lag = df.withColumn(
    'lag',
    lag('salary', 1).over(window),
)
df_lag.show()

+-------+----------+------+----+
|   name|department|salary| lag|
+-------+----------+------+----+
|  David|        IT|  5800|null|
|  Grace|        IT|  5800|5800|
|Charlie|        IT|  6000|5800|
|  Frank|        IT|  6000|6000|
|    Bob|     Sales|  4800|null|
|  Alice|     Sales|  5000|4800|
|    Eve|     Sales|  5000|5000|
+-------+----------+------+----+



In [14]:
from pyspark.sql.functions import lead




In [15]:
# Window for the lead() example: per department, ordered by salary.
# `name` is a secondary sort key so that employees with equal salaries have a
# fixed order; otherwise which of the tied rows counts as "next" could vary
# between runs.
window = Window.partitionBy('department').orderBy('salary', 'name')


In [18]:
# lead('salary', 1) reads the salary of the row one position later in the window;
# the last row of each department has no successor and gets null.
df_lead = df.withColumn(
    'lead',
    lead('salary', 1).over(window),
)
df_lead.show()

+-------+----------+------+----+
|   name|department|salary|lead|
+-------+----------+------+----+
|  David|        IT|  5800|5800|
|  Grace|        IT|  5800|6000|
|Charlie|        IT|  6000|6000|
|  Frank|        IT|  6000|null|
|    Bob|     Sales|  4800|5000|
|  Alice|     Sales|  5000|5000|
|    Eve|     Sales|  5000|null|
+-------+----------+------+----+



In [19]:
from pyspark.sql.functions import first, last

In [20]:
# Window for the first()/last() examples: per department, ascending salary.
window = (
    Window
    .partitionBy('department')
    .orderBy('salary')
)


In [21]:
df.withColumn('first_sal',first('salary').over(window)).show()
df.withColumn('last_sal',last('salary').over(window)).show()

+-------+----------+------+---------+
|   name|department|salary|first_sal|
+-------+----------+------+---------+
|  David|        IT|  5800|     5800|
|  Grace|        IT|  5800|     5800|
|Charlie|        IT|  6000|     5800|
|  Frank|        IT|  6000|     5800|
|    Bob|     Sales|  4800|     4800|
|  Alice|     Sales|  5000|     4800|
|    Eve|     Sales|  5000|     4800|
+-------+----------+------+---------+

+-------+----------+------+--------+
|   name|department|salary|last_sal|
+-------+----------+------+--------+
|  David|        IT|  5800|    5800|
|  Grace|        IT|  5800|    5800|
|Charlie|        IT|  6000|    6000|
|  Frank|        IT|  6000|    6000|
|    Bob|     Sales|  4800|    4800|
|  Alice|     Sales|  5000|    5000|
|    Eve|     Sales|  5000|    5000|
+-------+----------+------+--------+



In [22]:
from pyspark.sql.functions import ntile

In [23]:
# Window for the ntile() examples: per department, ordered by salary.
# `name` is a secondary sort key so that employees with equal salaries have a
# fixed order; ntile() assigns buckets by row position, so without it tied rows
# could land in different buckets from run to run.
window = Window.partitionBy('department').orderBy('salary', 'name')


In [31]:
# Salary-sorted view of the data, for comparison with the bucket assignment below.
df.sort('salary').show()

# ntile(2) splits each department's ordered rows into 2 buckets numbered 1 and 2.
df.withColumn(
    'ntile',
    ntile(2).over(window),
).show()


+-------+----------+------+
|   name|department|salary|
+-------+----------+------+
|    Bob|     Sales|  4800|
|    Eve|     Sales|  5000|
|  Alice|     Sales|  5000|
|  Grace|        IT|  5800|
|  David|        IT|  5800|
|Charlie|        IT|  6000|
|  Frank|        IT|  6000|
+-------+----------+------+

+-------+----------+------+-----+
|   name|department|salary|ntile|
+-------+----------+------+-----+
|  David|        IT|  5800|    1|
|  Grace|        IT|  5800|    1|
|Charlie|        IT|  6000|    2|
|  Frank|        IT|  6000|    2|
|    Bob|     Sales|  4800|    1|
|  Alice|     Sales|  5000|    1|
|    Eve|     Sales|  5000|    2|
+-------+----------+------+-----+



In [32]:
# ntile(3) splits each department's ordered rows into 3 buckets numbered 1 to 3.
df.withColumn(
    'ntile',
    ntile(3).over(window),
).show()

+-------+----------+------+-----+
|   name|department|salary|ntile|
+-------+----------+------+-----+
|  David|        IT|  5800|    1|
|  Grace|        IT|  5800|    1|
|Charlie|        IT|  6000|    2|
|  Frank|        IT|  6000|    3|
|    Bob|     Sales|  4800|    1|
|  Alice|     Sales|  5000|    2|
|    Eve|     Sales|  5000|    3|
+-------+----------+------+-----+



In [34]:
# Use a namespaced alias for the Spark aggregates: importing sum/min/max by
# name would replace Python's builtins of the same name for the rest of the
# session.
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Per department, ordered by salary. Because an ordering is given and no frame
# is specified, each aggregate below covers the rows from the start of the
# department up to the current row (rows tied on salary included), so the
# values are cumulative rather than whole-department totals.
windowSpec = Window.partitionBy('department').orderBy('salary')

# Cumulative sum of salaries
df_with_sum = df.withColumn("sum_salary", F.sum("salary").over(windowSpec))

# Cumulative average salary
df_with_avg = df.withColumn("avg_salary", F.avg("salary").over(windowSpec))

# Lowest salary seen so far in the department
df_with_min = df.withColumn("min_salary", F.min("salary").over(windowSpec))

# Highest salary seen so far in the department
df_with_max = df.withColumn("max_salary", F.max("salary").over(windowSpec))

# Show the results without truncating cell values
df_with_sum.show(truncate=False)
df_with_avg.show(truncate=False)
df_with_min.show(truncate=False)
df_with_max.show(truncate=False)


+-------+----------+------+----------+
|name   |department|salary|sum_salary|
+-------+----------+------+----------+
|David  |IT        |5800  |11600     |
|Grace  |IT        |5800  |11600     |
|Charlie|IT        |6000  |23600     |
|Frank  |IT        |6000  |23600     |
|Bob    |Sales     |4800  |4800      |
|Alice  |Sales     |5000  |14800     |
|Eve    |Sales     |5000  |14800     |
+-------+----------+------+----------+

+-------+----------+------+-----------------+
|name   |department|salary|avg_salary       |
+-------+----------+------+-----------------+
|David  |IT        |5800  |5800.0           |
|Grace  |IT        |5800  |5800.0           |
|Charlie|IT        |6000  |5900.0           |
|Frank  |IT        |6000  |5900.0           |
|Bob    |Sales     |4800  |4800.0           |
|Alice  |Sales     |5000  |4933.333333333333|
|Eve    |Sales     |5000  |4933.333333333333|
+-------+----------+------+-----------------+

+-------+----------+------+----------+
|name   |department|sala

In [35]:
from pyspark.sql.functions import percent_rank

# percent_rank() is the relative rank of a row within its department:
# (rank - 1) / (rows in the department - 1), so the lowest salary gets 0.0.
df_with_percent_rank = df.withColumn(
    "percent_rank",
    percent_rank().over(windowSpec),
)

# Display the result without truncating cell values
df_with_percent_rank.show(truncate=False)


+-------+----------+------+------------------+
|name   |department|salary|percent_rank      |
+-------+----------+------+------------------+
|David  |IT        |5800  |0.0               |
|Grace  |IT        |5800  |0.0               |
|Charlie|IT        |6000  |0.6666666666666666|
|Frank  |IT        |6000  |0.6666666666666666|
|Bob    |Sales     |4800  |0.0               |
|Alice  |Sales     |5000  |0.5               |
|Eve    |Sales     |5000  |0.5               |
+-------+----------+------+------------------+

