In [1]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("SP").getOrCreate()

In [3]:
# read csv file
file_path = "C:\\Users\\pcc\\Desktop\\daily-website-visitors.csv"
df=spark.read.csv(file_path,header=True,inferSchema=True)
df

df=df.withColumnsRenamed({"Day.Of.Week":"Day_Of_Week","Page.Loads":"Page_Loads",
                          "Unique.Visits":"Unique_Visits","First.Time.Visits":"First_Time_Visits",
                          "Returning.Visits":"Returning_Visits"})

In [8]:
from pyspark.sql.functions import *

# approx_count_distinct() Agg function

In [10]:
# distinct counts
print("approx_count_distinct: " + str(df.select(approx_count_distinct("Day")).collect()[0][0]))

approx_count_distinct: 7


In [15]:
# Avg Agg Function
print("Avg: "+str(df.select(avg("First_Time_Visits")).collect()[0][0]))

Avg: 841.5584415584416


In [17]:
# collect_list() function returns all values from an input column with duplicates.
df.select(collect_list("Day")).show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [18]:
# collect_set() function returns all values from an input column with duplicate values eliminated.
df.select(collect_set("Day")).show(truncate=False)

+----------------------------------------------------------------+
|collect_set(Day)                                                |
+----------------------------------------------------------------+
|[Monday, Saturday, Friday, Thursday, Wednesday, Tuesday, Sunday]|
+----------------------------------------------------------------+



In [20]:
# countDistinct() function returns the number of distinct elements in a columns.
df2=df.select(countDistinct("Day","First_Time_Visits"))
df2.show(truncate=False)
print("Distinct Count of Day's & First time visitors: " + str(df2.collect()[0][0]))

+--------------------------------------+
|count(DISTINCT Day, First_Time_Visits)|
+--------------------------------------+
|2038                                  |
+--------------------------------------+

Distinct Count of Day's & First time visitors: 2038


In [25]:
# count() function returns number of elements in a column.
df.select(count("First_Time_Visits")).collect()[0]

Row(count(First_Time_Visits)=2167)

grouping() function - Indicates whether a given input column is aggreagated or not. returns 1 for
aggregated or 0 for not aggregated in the result. If we try grouping directly on the salary column 
you will get below error. 
`Exception in thread 'main' org.apache.spark.sql.AnalysisException:
    
    //grouping() can only be used with GroupingSets/Cube/Rollup`

In [26]:
# first() function returns the first element in a column when ignore Nulls is set to true, 
# it returns the first non-null element.
df.select(first("First_Time_Visits")).show()

+------------------------+
|first(First_Time_Visits)|
+------------------------+
|                   1,430|
+------------------------+



In [27]:
# last() function returns the last element in a column. When ignoreNulls is set to true, 
# it returns the last non-null element.

df.select(last("First_Time_Visits")).show()

+-----------------------+
|last(First_Time_Visits)|
+-----------------------+
|                  1,297|
+-----------------------+



In [28]:
# kurtosis() function returns the kurtosis of the values in a group.
df.select(kurtosis("First_Time_Visits")).show()

+---------------------------+
|kurtosis(First_Time_Visits)|
+---------------------------+
|        -0.5700303550600014|
+---------------------------+



In [34]:
df.select(max("Day_Of_Week")).show()

+----------------+
|max(Day_Of_Week)|
+----------------+
|               7|
+----------------+



In [33]:
df.select(min("Day_Of_Week")).show()

+----------------+
|min(Day_Of_Week)|
+----------------+
|               1|
+----------------+



In [37]:
df.select(mean("Returning_Visits")).show()

+---------------------+
|avg(Returning_Visits)|
+---------------------+
|    511.5803324099723|
+---------------------+



In [36]:
df.show(5)

+---+---------+-----------+---------+----------+-------------+-----------------+----------------+
|Row|      Day|Day_Of_Week|     Date|Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|
+---+---------+-----------+---------+----------+-------------+-----------------+----------------+
|  1|   Sunday|          1|9/14/2014|     2,146|        1,582|            1,430|             152|
|  2|   Monday|          2|9/15/2014|     3,621|        2,528|            2,297|             231|
|  3|  Tuesday|          3|9/16/2014|     3,698|        2,630|            2,352|             278|
|  4|Wednesday|          4|9/17/2014|     3,667|        2,614|            2,327|             287|
|  5| Thursday|          5|9/18/2014|     3,316|        2,366|            2,130|             236|
+---+---------+-----------+---------+----------+-------------+-----------------+----------------+
only showing top 5 rows



In [38]:
# skewness() function returns the skewness of the values in a group 
df.select(skewness("Returning_Visits")).show()

+--------------------------+
|skewness(Returning_Visits)|
+--------------------------+
|       0.13754046231086722|
+--------------------------+



In [39]:
# stddev() alias for  stddev_samp()
# stddev_samp() fumction returns the sample standard deviation of values in a column.
# stddev_pop() function returns the population standard deviation of the values in a column.

df.select(stddev("Returning_Visits"),stddev_pop("Returning_Visits"),
          stddev_samp("Returning_Visits")).show()

+------------------------+----------------------------+-----------------------------+
|stddev(Returning_Visits)|stddev_pop(Returning_Visits)|stddev_samp(Returning_Visits)|
+------------------------+----------------------------+-----------------------------+
|      168.39876451489633|          168.35988681277027|           168.39876451489633|
+------------------------+----------------------------+-----------------------------+



In [40]:
df.select(sum("Returning_Visits")).show()

+---------------------+
|sum(Returning_Visits)|
+---------------------+
|            1108083.0|
+---------------------+



In [41]:
# sumDistinct() function returns the sum of all distinct values in a column.
df.select(sumDistinct("Returning_Visits")).show()



+------------------------------+
|sum(DISTINCT Returning_Visits)|
+------------------------------+
|                      348756.0|
+------------------------------+



In [42]:
df.select(sum_distinct("Returning_Visits")).show()

+------------------------------+
|sum(DISTINCT Returning_Visits)|
+------------------------------+
|                      348756.0|
+------------------------------+



In [43]:
# variance() alias for var_samp
df.select(variance("Returning_Visits"),var_samp("Returning_Visits"),
         var_pop('Returning_Visits')).show()

+--------------------------+--------------------------+-------------------------+
|var_samp(Returning_Visits)|var_samp(Returning_Visits)|var_pop(Returning_Visits)|
+--------------------------+--------------------------+-------------------------+
|         28358.14389014351|         28358.14389014351|        28345.05148760882|
+--------------------------+--------------------------+-------------------------+

