In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

In [4]:
# Obtain the Spark session for this notebook, creating it under the demo's
# application name if no session is active yet.
spark = (
    SparkSession.builder
    .appName("aggregation_demo")
    .getOrCreate()
)

In [3]:
# Evaluate the session object so the notebook displays it as this cell's output.
spark

In [5]:
# Load the employee CSV: the first row supplies the column names and the
# column types are inferred from the data.
employees = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("employee_data.csv")
)

In [11]:
# Preview the loaded data; show() prints at most 20 rows and truncates long
# cell values (both are its defaults, spelled out here).
employees.show(n=20, truncate=True)

+---+----------+----------+------+------+----------+
| id|first_name| last_name|gender|salary|department|
+---+----------+----------+------+------+----------+
|  1|    Merell|   Ruilton|  Male|   500|   Finance|
|  2|    Waring|     Dibbs|Female|   600| Marketing|
|  3|    Dewitt|    Steart|Female|   700|        IT|
|  4|   Thadeus|Winchcombe|  Male|   700|        HR|
|  5|      Ford|  Thorrold|  Male|   500|     Sales|
|  6|    Verile|   Edgeler|Female|   600|   Finance|
|  7|  Ezechiel|    Siggin|Female|   500| Marketing|
|  8|    Flossy| Stroobant|Female|   600|        IT|
|  9|  Kristian|    Osborn|  Male|   700|        HR|
| 10|    Tessie|    Sisley|  Male|   800|     Sales|
| 11|  Prentiss|   Biswell|Female|   500|   Finance|
| 12|     Elise|  Timbrell|  Male|   600| Marketing|
| 13|    Frieda|  Butchard|Female|   700|        IT|
| 14|   Reinold| McSperrin|Female|   600|        HR|
| 15|    Powell|    Mundee|  Male|   500|     Sales|
+---+----------+----------+------+------+----------+

## ========== GroupBy ==========

### Total salary

In [7]:
# Grand total over all rows: an empty groupBy() puts every row in a single
# group, which is what the shorthand employees.agg({"salary": "sum"}) does too.
employees.groupBy().sum("salary").show()

+-----------+
|sum(salary)|
+-----------+
|       9100|
+-----------+



### Salaries by gender

In [13]:
# Total salary per gender. groupBy() gives no guarantee about row order, so
# sort on the grouping key (as the other grouped cells do) to keep the
# displayed table stable from run to run.
employees.groupBy("gender").agg({"salary": "sum"}).sort("gender").show()

+------+-----------+
|gender|sum(salary)|
+------+-----------+
|Female|       4800|
|  Male|       4300|
+------+-----------+



### Salaries by department

In [20]:
# Total salary per department, listed in ascending department order.
(
    employees
    .groupBy("department")
    .sum("salary")
    .orderBy("department")
    .show()
)

+----------+-----------+
|department|sum(salary)|
+----------+-----------+
|   Finance|       1600|
|        HR|       2000|
|        IT|       2000|
| Marketing|       1700|
|     Sales|       1800|
+----------+-----------+



### Salaries of each department by gender

In [15]:
# Total salary for every (department, gender) pair present in the data,
# ordered by department and then by gender.
(
    employees
    .groupBy("department", "gender")
    .sum("salary")
    .orderBy("department", "gender")
    .show()
)



+----------+------+-----------+
|department|gender|sum(salary)|
+----------+------+-----------+
|   Finance|Female|       1100|
|   Finance|  Male|        500|
|        HR|Female|        600|
|        HR|  Male|       1400|
|        IT|Female|       2000|
| Marketing|Female|       1100|
| Marketing|  Male|        600|
|     Sales|  Male|       1800|
+----------+------+-----------+





## ========== RollUp ==========

### Total salary & salaries by gender

In [19]:
# rollup("gender") yields one subtotal per gender plus a grand-total row,
# which is the row whose gender is null.
(
    employees
    .rollup("gender")
    .agg({"salary": "sum"})
    .orderBy("gender")
    .show()
)

+------+-----------+
|gender|sum(salary)|
+------+-----------+
|  null|       9100|
|Female|       4800|
|  Male|       4300|
+------+-----------+



### Total salary & salaries by department

In [22]:
# rollup("department") yields one subtotal per department plus a grand-total
# row, which is the row whose department is null.
(
    employees
    .rollup("department")
    .agg({"salary": "sum"})
    .orderBy("department")
    .show()
)

+----------+-----------+
|department|sum(salary)|
+----------+-----------+
|      null|       9100|
|   Finance|       1600|
|        HR|       2000|
|        IT|       2000|
| Marketing|       1700|
|     Sales|       1800|
+----------+-----------+



### Total salary & salaries by department & salaries of each department by gender

In [18]:
# Hierarchical rollup: totals per (department, gender), per department
# (gender is null) and overall (both columns are null).
(
    employees
    .rollup("department", "gender")
    .agg({"salary": "sum"})
    .orderBy("department", "gender")
    .show()
)

+----------+------+-----------+
|department|gender|sum(salary)|
+----------+------+-----------+
|      null|  null|       9100|
|   Finance|  null|       1600|
|   Finance|Female|       1100|
|   Finance|  Male|        500|
|        HR|  null|       2000|
|        HR|Female|        600|
|        HR|  Male|       1400|
|        IT|  null|       2000|
|        IT|Female|       2000|
| Marketing|  null|       1700|
| Marketing|Female|       1100|
| Marketing|  Male|        600|
|     Sales|  null|       1800|
|     Sales|  Male|       1800|
+----------+------+-----------+





## -------------------- RollUp Pivot --------------------

In [42]:
# Reshape the rollup result into one row per department with one column per
# gender value; the rollup subtotals end up in the column named "null".
(
    employees
    .rollup("department", "gender")
    .sum("salary")
    .groupBy("department")
    .pivot("gender")
    .sum("sum(salary)")
    .orderBy("department")
    .show()
)

+----------+----+------+----+
|department|null|Female|Male|
+----------+----+------+----+
|      null|9100|  null|null|
|   Finance|1600|  1100| 500|
|        HR|2000|   600|1400|
|        IT|2000|  2000|null|
| Marketing|1700|  1100| 600|
|     Sales|1800|  null|1800|
+----------+----+------+----+



## ========== Cube ==========

### Total salary & salaries by gender & salaries by department & salaries of each department by gender

In [17]:
# cube() aggregates over every combination of the grouping columns: per
# (department, gender), per department, per gender, and the grand total.
(
    employees
    .cube("department", "gender")
    .agg({"salary": "sum"})
    .orderBy("department", "gender")
    .show()
)

+----------+------+-----------+
|department|gender|sum(salary)|
+----------+------+-----------+
|      null|  null|       9100|
|      null|Female|       4800|
|      null|  Male|       4300|
|   Finance|  null|       1600|
|   Finance|Female|       1100|
|   Finance|  Male|        500|
|        HR|  null|       2000|
|        HR|Female|        600|
|        HR|  Male|       1400|
|        IT|  null|       2000|
|        IT|Female|       2000|
| Marketing|  null|       1700|
| Marketing|Female|       1100|
| Marketing|  Male|        600|
|     Sales|  null|       1800|
|     Sales|  Male|       1800|
+----------+------+-----------+



## -------------------- Cube Pivot --------------------

In [38]:
# Pivot the cube result into a department x gender table, then make it
# readable: name the subtotal column, label the all-departments row "All",
# and replace the remaining (numeric) nulls with 0.
(
    employees
    .cube("department", "gender")
    .sum("salary")
    .groupBy("department")
    .pivot("gender")
    .sum("sum(salary)")
    .sort("department")
    .withColumnRenamed("null", "All Gender Combined")
    .fillna("All")  # a string fill value only touches string columns
    .fillna(0)      # a numeric fill value only touches numeric columns
    .show()
)

+----------+-------------------+------+----+
|department|All Gender Combined|Female|Male|
+----------+-------------------+------+----+
|       All|               9100|  4800|4300|
|   Finance|               1600|  1100| 500|
|        HR|               2000|   600|1400|
|        IT|               2000|  2000|   0|
| Marketing|               1700|  1100| 600|
|     Sales|               1800|     0|1800|
+----------+-------------------+------+----+



## ========== Pivot ==========

In [28]:
# Plain pivot: one row per department, one column per gender value, each
# cell holding the summed salary (null where the combination has no rows).
(
    employees
    .groupBy("department")
    .pivot("gender")
    .sum("salary")
    .orderBy("department")
    .show()
)

+----------+------+----+
|department|Female|Male|
+----------+------+----+
|   Finance|  1100| 500|
|        HR|   600|1400|
|        IT|  2000|null|
| Marketing|  1100| 600|
|     Sales|  null|1800|
+----------+------+----+

