In [1]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("SP").getOrCreate()

# read csv file
file_path = "C:\\Users\\pcc\\Desktop\\daily-website-visitors.csv"
df=spark.read.csv(file_path,header=True,inferSchema=True)
df

df=df.withColumnsRenamed({"Day.Of.Week":"Day_Of_Week","Page.Loads":"Page_Loads",
                          "Unique.Visits":"Unique_Visits","First.Time.Visits":"First_Time_Visits",
                          "Returning.Visits":"Returning_Visits"})

In [2]:
from pyspark.sql.functions import *

In [6]:
# row_number() window function is used to give the sequential row number 
# starting from 1 to the result for each window partition.

from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

window_spec=Window.partitionBy("Day").orderBy("Returning_Visits")

df.withColumn("row_number",row_number().over(window_spec)).show(truncate=False)


+----+------+-----------+----------+----------+-------------+-----------------+----------------+----------+
|Row |Day   |Day_Of_Week|Date      |Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|row_number|
+----+------+-----------+----------+----------+-------------+-----------------+----------------+----------+
|468 |Friday|6          |12/25/2015|1,017     |724          |586              |138             |1         |
|475 |Friday|6          |1/1/2016  |1,411     |960          |786              |174             |2         |
|104 |Friday|6          |12/26/2014|1,486     |1,005        |808              |197             |3         |
|1931|Friday|6          |12/27/2019|1,927     |1,319        |1,109            |210             |4         |
|13  |Friday|6          |9/26/2014 |3,323     |2,249        |2,033            |216             |5         |
|6   |Friday|6          |9/19/2014 |2,815     |1,863        |1,622            |241             |6         |
|20  |Friday|6          |10/

In [7]:
# rank() window function is used to provide a rank to the result within a window partition.
# This function leaves gaps in rank when there is ties.
# This is same as RANK function in SQL.

from pyspark.sql.functions import rank
df.withColumn("rank",rank().over(window_spec)).show()

+----+------+-----------+----------+----------+-------------+-----------------+----------------+----+
| Row|   Day|Day_Of_Week|      Date|Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|rank|
+----+------+-----------+----------+----------+-------------+-----------------+----------------+----+
| 468|Friday|          6|12/25/2015|     1,017|          724|              586|             138|   1|
| 475|Friday|          6|  1/1/2016|     1,411|          960|              786|             174|   2|
| 104|Friday|          6|12/26/2014|     1,486|        1,005|              808|             197|   3|
|1931|Friday|          6|12/27/2019|     1,927|        1,319|            1,109|             210|   4|
|  13|Friday|          6| 9/26/2014|     3,323|        2,249|            2,033|             216|   5|
|   6|Friday|          6| 9/19/2014|     2,815|        1,863|            1,622|             241|   6|
|  20|Friday|          6| 10/3/2014|     3,005|        2,097|            1,856|   

In [9]:
# dense_rank() window function is used to get the result with rank of rows within a window partition
# without any gaps. This is similar to rank() function, difference being rank function 
# leaves gaps in rank when there are ties.
# This is same as the Dense_Rank function in SQL.

from pyspark.sql.functions import dense_rank
df.withColumn("dense_rank",dense_rank().over(window_spec)).show()

+----+------+-----------+----------+----------+-------------+-----------------+----------------+----------+
| Row|   Day|Day_Of_Week|      Date|Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|dense_rank|
+----+------+-----------+----------+----------+-------------+-----------------+----------------+----------+
| 468|Friday|          6|12/25/2015|     1,017|          724|              586|             138|         1|
| 475|Friday|          6|  1/1/2016|     1,411|          960|              786|             174|         2|
| 104|Friday|          6|12/26/2014|     1,486|        1,005|              808|             197|         3|
|1931|Friday|          6|12/27/2019|     1,927|        1,319|            1,109|             210|         4|
|  13|Friday|          6| 9/26/2014|     3,323|        2,249|            2,033|             216|         5|
|   6|Friday|          6| 9/19/2014|     2,815|        1,863|            1,622|             241|         6|
|  20|Friday|          6| 10

In [11]:
# percent_rank()
# This is same as the PERCENT_RANK function in SQL.

from pyspark.sql.functions import percent_rank
df.withColumn("percent_rank",percent_rank().over(window_spec)).show()

+----+------+-----------+----------+----------+-------------+-----------------+----------------+--------------------+
| Row|   Day|Day_Of_Week|      Date|Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|        percent_rank|
+----+------+-----------+----------+----------+-------------+-----------------+----------------+--------------------+
| 468|Friday|          6|12/25/2015|     1,017|          724|              586|             138|                 0.0|
| 475|Friday|          6|  1/1/2016|     1,411|          960|              786|             174|0.003246753246753247|
| 104|Friday|          6|12/26/2014|     1,486|        1,005|              808|             197|0.006493506493506494|
|1931|Friday|          6|12/27/2019|     1,927|        1,319|            1,109|             210| 0.00974025974025974|
|  13|Friday|          6| 9/26/2014|     3,323|        2,249|            2,033|             216|0.012987012987012988|
|   6|Friday|          6| 9/19/2014|     2,815|        1

In [13]:
# ntile() window function returns the relative rank of the result rows within a window partition. 
# In the below example we have used 2 as an argument to ntile, hence it returns ranking between 2 values.

from pyspark.sql.functions import ntile
df.withColumn("ntile",ntile(2).over(window_spec)).show()

+----+------+-----------+----------+----------+-------------+-----------------+----------------+-----+
| Row|   Day|Day_Of_Week|      Date|Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|ntile|
+----+------+-----------+----------+----------+-------------+-----------------+----------------+-----+
| 468|Friday|          6|12/25/2015|     1,017|          724|              586|             138|    1|
| 475|Friday|          6|  1/1/2016|     1,411|          960|              786|             174|    1|
| 104|Friday|          6|12/26/2014|     1,486|        1,005|              808|             197|    1|
|1931|Friday|          6|12/27/2019|     1,927|        1,319|            1,109|             210|    1|
|  13|Friday|          6| 9/26/2014|     3,323|        2,249|            2,033|             216|    1|
|   6|Friday|          6| 9/19/2014|     2,815|        1,863|            1,622|             241|    1|
|  20|Friday|          6| 10/3/2014|     3,005|        2,097|            

In [14]:
# Cume_dist()vwindow function is used to get the cummulative distribution of values 
# within a window partition.
# This is same as the Dense_rank function in SQL.

from pyspark.sql.functions import cume_dist
df.withColumn("Cumm",cume_dist().over(window_spec)).show()

+----+------+-----------+----------+----------+-------------+-----------------+----------------+--------------------+
| Row|   Day|Day_Of_Week|      Date|Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|                Cumm|
+----+------+-----------+----------+----------+-------------+-----------------+----------------+--------------------+
| 468|Friday|          6|12/25/2015|     1,017|          724|              586|             138|0.003236245954692557|
| 475|Friday|          6|  1/1/2016|     1,411|          960|              786|             174|0.006472491909385114|
| 104|Friday|          6|12/26/2014|     1,486|        1,005|              808|             197|0.009708737864077669|
|1931|Friday|          6|12/27/2019|     1,927|        1,319|            1,109|             210|0.012944983818770227|
|  13|Friday|          6| 9/26/2014|     3,323|        2,249|            2,033|             216|0.016181229773462782|
|   6|Friday|          6| 9/19/2014|     2,815|        1

In [15]:
# lag()- This is same as the lag function in sql. 
from pyspark.sql.functions import lag
df.withColumn("lag",lag("Returning_Visits",2).over(window_spec)).show()

+----+------+-----------+----------+----------+-------------+-----------------+----------------+----+
| Row|   Day|Day_Of_Week|      Date|Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits| lag|
+----+------+-----------+----------+----------+-------------+-----------------+----------------+----+
| 468|Friday|          6|12/25/2015|     1,017|          724|              586|             138|NULL|
| 475|Friday|          6|  1/1/2016|     1,411|          960|              786|             174|NULL|
| 104|Friday|          6|12/26/2014|     1,486|        1,005|              808|             197| 138|
|1931|Friday|          6|12/27/2019|     1,927|        1,319|            1,109|             210| 174|
|  13|Friday|          6| 9/26/2014|     3,323|        2,249|            2,033|             216| 197|
|   6|Friday|          6| 9/19/2014|     2,815|        1,863|            1,622|             241| 210|
|  20|Friday|          6| 10/3/2014|     3,005|        2,097|            1,856|   

In [16]:
# lead() - This is same as the function in sql.

from pyspark.sql.functions import lead
df.withColumn("lead",lead("Returning_Visits",2).over(window_spec)).show()


+----+------+-----------+----------+----------+-------------+-----------------+----------------+----+
| Row|   Day|Day_Of_Week|      Date|Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|lead|
+----+------+-----------+----------+----------+-------------+-----------------+----------------+----+
| 468|Friday|          6|12/25/2015|     1,017|          724|              586|             138| 197|
| 475|Friday|          6|  1/1/2016|     1,411|          960|              786|             174| 210|
| 104|Friday|          6|12/26/2014|     1,486|        1,005|              808|             197| 216|
|1931|Friday|          6|12/27/2019|     1,927|        1,319|            1,109|             210| 241|
|  13|Friday|          6| 9/26/2014|     3,323|        2,249|            2,033|             216| 241|
|   6|Friday|          6| 9/19/2014|     2,815|        1,863|            1,622|             241| 258|
|  20|Friday|          6| 10/3/2014|     3,005|        2,097|            1,856|   

In [27]:
# PySpark window aggregation function
window_Spec_agg = Window.partitionBy("Day")
from pyspark.sql.functions import col, sum, avg, min, max, row_number
df.withColumn("row",row_number().over(window_spec)
             ).withColumn("avg",avg(col("Returning_Visits")).over(window_Spec_agg)
                         ).withColumn("sum",sum(col("Returning_Visits")).over(window_Spec_agg)
                                     ).withColumn("min",min(col("Returning_Visits")).over(window_Spec_agg)
                                                 ).withColumn("max",max(col("Returning_Visits")).over(window_Spec_agg)
                                                             ).show()




+---+------+-----------+----------+----------+-------------+-----------------+----------------+-----------------+--------+---+---+
|row|   Day|Day_Of_Week|      Date|Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|              avg|     sum|min|max|
+---+------+-----------+----------+----------+-------------+-----------------+----------------+-----------------+--------+---+---+
|  1|Friday|          6|12/25/2015|     1,017|          724|              586|             138|482.3527508090615|149047.0|138|806|
|  2|Friday|          6|  1/1/2016|     1,411|          960|              786|             174|482.3527508090615|149047.0|138|806|
|  3|Friday|          6|12/26/2014|     1,486|        1,005|              808|             197|482.3527508090615|149047.0|138|806|
|  4|Friday|          6|12/27/2019|     1,927|        1,319|            1,109|             210|482.3527508090615|149047.0|138|806|
|  5|Friday|          6| 9/26/2014|     3,323|        2,249|            2,033|     