## 02-pyspark-aggregate.py

In [0]:
# 02-pyspark-aggregate.py
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import approx_count_distinct, collect_list
from pyspark.sql.functions import collect_set, sum, avg, max, countDistinct, count
from pyspark.sql.functions import first, last, kurtosis, min, mean, skewness 
from pyspark.sql.functions import stddev, stddev_samp, stddev_pop, sumDistinct
from pyspark.sql.functions import variance, var_samp, var_pop

# Obtain the Spark session for this notebook (an existing one is reused).
spark = (
    SparkSession.builder
    .appName('PySparkExamples')
    .getOrCreate()
)

# Sample rows of (employee_name, department, salary). The ("Jaya", "Sales",
# 3000) row appears twice -- presumably on purpose, so that the distinct
# aggregates below differ from their plain counterparts.
simpleData = [
    ("Jaya", "Sales", 3000),
    ("Mithun", "Sales", 4600),
    ("Rohit", "Sales", 4100),
    ("Maya", "Finance", 3000),
    ("Jaya", "Sales", 3000),
    ("Satish", "Finance", 3300),
    ("Joy", "Finance", 3900),
    ("Jitendra", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Sunu", "Sales", 4100),
]
df_schema = ["employee_name", "department", "salary"]

# Only column names are supplied, so Spark infers the column types.
df = spark.createDataFrame(data=simpleData, schema=df_schema)
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|Jaya         |Sales     |3000  |
|Mithun       |Sales     |4600  |
|Rohit        |Sales     |4100  |
|Maya         |Finance   |3000  |
|Jaya         |Sales     |3000  |
|Satish       |Finance   |3300  |
|Joy          |Finance   |3900  |
|Jitendra     |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Sunu         |Sales     |4100  |
+-------------+----------+------+



In [0]:
# Each select() below yields a single-row, single-column result; collect()
# brings it to the driver and [0][0] extracts the scalar.
approx_distinct_salaries = df.select(approx_count_distinct("salary")).collect()[0][0]
print("approx_count_distinct: {}".format(approx_distinct_salaries))

average_salary = df.select(avg("salary")).collect()[0][0]
print("avg: {}".format(average_salary))

approx_count_distinct: 6
avg: 3400.0


In [0]:
# Show every salary value as one array (collect_list), then the array with
# duplicate values removed (collect_set).
for gather in (collect_list, collect_set):
    df.select(gather("salary")).show(truncate=False)

+------------------------------------------------------------+
|collect_list(salary)                                        |
+------------------------------------------------------------+
|[3000, 4600, 4100, 3000, 3000, 3300, 3900, 3000, 2000, 4100]|
+------------------------------------------------------------+

+------------------------------------+
|collect_set(salary)                 |
+------------------------------------+
|[4600, 3000, 3900, 4100, 3300, 2000]|
+------------------------------------+



In [0]:
# Number of distinct (department, salary) pairs. Without an alias the result
# column carries Spark's generated name.
df.select(countDistinct("department", "salary")).show(truncate=False)

# Same aggregate with a readable column name.
df2 = df.select(countDistinct("department", "salary").alias("dept_salary"))
df2.show(truncate=False)

# collect() is a Spark action: every call re-runs the aggregation. Collect
# each result once and reuse the rows to show the three access levels
# (list of Row -> Row -> scalar).
dept_salary_rows = df2.collect()
print("Distinct Count of Department and Salary: " + str(dept_salary_rows))
print("Distinct Count of Department and Salary: " + str(dept_salary_rows[0]))
print("Distinct Count of Department and Salary: " + str(dept_salary_rows[0][0]))

salary_count_rows = df.select(count("salary")).collect()
print("count: " + str(salary_count_rows))
print("count: " + str(salary_count_rows[0]))
print("count: " + str(salary_count_rows[0][0]))

+----------------------------------+
|count(DISTINCT department, salary)|
+----------------------------------+
|8                                 |
+----------------------------------+

+-----------+
|dept_salary|
+-----------+
|8          |
+-----------+

Distinct Count of Department and Salary: [Row(dept_salary=8)]
Distinct Count of Department and Salary: Row(dept_salary=8)
Distinct Count of Department and Salary: 8
count: [Row(count(salary)=10)]
count: Row(count(salary)=10)
count: 10


In [0]:
# NOTE: max, min and sum here are the pyspark.sql.functions aggregates
# imported at the top of the file (they shadow the Python builtins).
edge_value_columns = [
    first("salary"),
    last("salary"),
    max("salary"),
    min("salary"),
]
df.select(*edge_value_columns).show(truncate=False)

# NOTE(review): sumDistinct is deprecated in newer Spark releases in favour
# of sum_distinct -- confirm the target Spark version before switching.
total_columns = [
    sum("salary"),
    sumDistinct("salary"),
    mean("salary"),
    count("salary"),
]
df.select(*total_columns).show(truncate=False)

+-------------+------------+-----------+-----------+
|first(salary)|last(salary)|max(salary)|min(salary)|
+-------------+------------+-----------+-----------+
|3000         |4100        |4600       |2000       |
+-------------+------------+-----------+-----------+

+-----------+--------------------+-----------+-------------+
|sum(salary)|sum(DISTINCT salary)|avg(salary)|count(salary)|
+-----------+--------------------+-----------+-------------+
|34000      |20900               |3400.0     |10           |
+-----------+--------------------+-----------+-------------+



In [0]:
# Distribution statistics of the salary column, shown one table per group.
shape_columns = (kurtosis("salary"), skewness("salary"))
std_dev_columns = (stddev("salary"), stddev_samp("salary"), stddev_pop("salary"))
variance_columns = (variance("salary"), var_samp("salary"), var_pop("salary"))

for stat_columns in (shape_columns, std_dev_columns, variance_columns):
    df.select(*stat_columns).show(truncate=False)

+-------------------+--------------------+
|kurtosis(salary)   |skewness(salary)    |
+-------------------+--------------------+
|-0.6467803030303032|-0.12041791181069571|
+-------------------+--------------------+

+-------------------+-------------------+------------------+
|stddev_samp(salary)|stddev_samp(salary)|stddev_pop(salary)|
+-------------------+-------------------+------------------+
|765.9416862050705  |765.9416862050705  |726.636084983398  |
+-------------------+-------------------+------------------+

+-----------------+-----------------+---------------+
|var_samp(salary) |var_samp(salary) |var_pop(salary)|
+-----------------+-----------------+---------------+
|586666.6666666666|586666.6666666666|528000.0       |
+-----------------+-----------------+---------------+

