In [None]:
# Import necessary libraries
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

# Start (or reuse) the Spark session used by this example
spark = SparkSession.builder.appName("WindowFunctionExample").getOrCreate()

# Column names, followed by one Row per record (id, price, discount)
schema = ["id", "price", "discount"]
row_list = [
    Row(id=item_id, price=price, discount=discount)
    for item_id, price, discount in (
        (1, 100.0, 10.0),
        (2, 200.0, 20.0),
        (3, 300.0, 30.0),
    )
]

# Build the DataFrame from the rows and the column names above
df = spark.createDataFrame(data=row_list, schema=schema)

# Define a Python function that operates on PySpark DataFrames
def get_discounted_price(df):
    """Return ``df`` with an additional ``discounted_price`` column.

    The new column is computed as ``price - (price * discount) / 100``, so
    ``discount`` is applied as a percentage of ``price``. The function takes a
    DataFrame and returns a DataFrame, which lets it be passed directly to
    ``DataFrame.transform``.
    """
    price = df["price"]
    discount = df["discount"]
    reduction = (price * discount) / 100
    return df.withColumn("discounted_price", price - reduction)

# Pipe the DataFrame through the discount helper via DataFrame.transform
df_discounted = df.transform(get_discounted_price)

# One global window ordered by ascending discounted price.
# NOTE: there is no partitionBy, so Spark moves every row into a single
# partition to evaluate it -- fine for this tiny example, costly on large data.
window = Window.orderBy(col("discounted_price"))

# Number the rows 1..n following the window ordering
df_with_row_number = df_discounted.withColumn(
    "row_number",
    row_number().over(window),
)

# Print the resulting rows
df_with_row_number.show()

In [None]:
import pyspark
from pyspark.sql import SparkSession
# getOrCreate() hands back the already-active session when one exists
spark = SparkSession.builder.appName("example").getOrCreate()

# (employee_name, department, salary) records.
# ("James", "Sales", 3000) appears twice and several salaries repeat within a
# department, so the ordered window functions below have ties to work with.
simpleData = (
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
)

columns = ["employee_name", "department", "salary"]

df = spark.createDataFrame(simpleData, schema=columns)

# Inspect the inferred schema, then print every row without truncating values
df.printSchema()
df.show(truncate=False)

from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

# Per-department window with rows ordered by ascending salary
windowSpec = Window.partitionBy("department").orderBy("salary")

# row_number: sequential 1-based position inside each department; rows with
# equal salary still receive distinct numbers
df.withColumn("row_number", row_number().over(windowSpec)).show(truncate=False)

from pyspark.sql.functions import (
    cume_dist,
    dense_rank,
    lag,
    lead,
    ntile,
    percent_rank,
    rank,
)

# (new column name, window function) pairs. Each pair is evaluated over
# windowSpec (per department, ordered by salary) and shown in the order listed.
window_columns = [
    ("rank", rank()),                  # ties share a rank; gaps follow ties
    ("dense_rank", dense_rank()),      # ties share a rank; no gaps
    ("percent_rank", percent_rank()),  # relative rank within the partition
    ("ntile", ntile(2)),               # split each partition into 2 buckets
    ("cume_dist", cume_dist()),        # cumulative distribution of salary
    ("lag", lag("salary", 2)),         # salary two rows earlier, null if none
    ("lead", lead("salary", 2)),       # salary two rows later, null if none
]

for column_name, column_expr in window_columns:
    df.withColumn(column_name, column_expr.over(windowSpec)).show()
    
# Spark's sum/min/max are reached through the F namespace rather than imported
# by bare name: `from pyspark.sql.functions import sum, min, max` would replace
# Python's built-in sum(), min() and max() for every cell executed afterwards.
from pyspark.sql import functions as F
from pyspark.sql.functions import avg, col, row_number

# Department-wide window with no ordering: every aggregate below is computed
# over all rows of the department and repeated on each of those rows.
windowSpecAgg = Window.partitionBy("department")

# "row" numbers the rows inside each department (via the ordered windowSpec)
# so that the final filter keeps exactly one summary row per department.
(
    df.withColumn("row", row_number().over(windowSpec))
    .withColumn("avg", avg(col("salary")).over(windowSpecAgg))
    .withColumn("sum", F.sum(col("salary")).over(windowSpecAgg))
    .withColumn("min", F.min(col("salary")).over(windowSpecAgg))
    .withColumn("max", F.max(col("salary")).over(windowSpecAgg))
    .where(col("row") == 1)
    .select("department", "avg", "sum", "min", "max")
    .show()
)