## PySpark groupBy and aggregate functions

In [34]:
from pyspark.sql import SparkSession
# Build the Spark session for this notebook (getOrCreate reuses a running one).
spark = (
    SparkSession.builder
    .appName("Agg")
    .getOrCreate()
)

In [35]:
# Load the employee CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
employee_reader = spark.read.options(header=True, inferSchema=True)
df = employee_reader.csv("employee.csv")

In [36]:
# Preview the loaded rows; show()'s defaults are spelled out here:
# at most 20 rows, with long cell values truncated.
df.show(n=20, truncate=True)

+--------+----+----------+-------+--------------------+--------------------+
|    name| age|experience| salary|         departments|                 job|
+--------+----+----------+-------+--------------------+--------------------+
|   Roger|24.0|       2.0|24000.0|                Data|       Product Owner|
|   Marie|40.0|      10.0|40000.0|Customer relation...|             Manager|
|  Fatima|47.0|       7.0|80000.0|               sales|         Acquisition|
|    Alex|26.0|       3.0|38000.0|                Data|        Data analyst|
|   sunny|24.0|       2.0|   NULL|       Communication|         Media buyer|
|  Robert|50.0|      NULL|   NULL|Customer relation...|          Call agent|
|Amandine|NULL|       5.0|   NULL|                NULL|                NULL|
|  Armine|NULL|      NULL|   NULL|               Sales|             Manager|
|    Rico|43.0|       7.0|55600.0|                NULL|   Software engineer|
|     Zoé|20.0|       0.0|25000.0|                  HR|            Assitant|

In [37]:
# Summary statistics (count, mean, stddev, min, max) for every column.
summary_stats = df.describe()
summary_stats.show()

+-------+-----+------------------+-----------------+------------------+-------------+-----------------+
|summary| name|               age|       experience|            salary|  departments|              job|
+-------+-----+------------------+-----------------+------------------+-------------+-----------------+
|  count|   11|                 9|                9|                 7|            9|               10|
|   mean| NULL| 35.77777777777778|5.333333333333333| 47528.57142857143|         NULL|             NULL|
| stddev| NULL|12.091089464744044|              4.0|21747.085190565525|         NULL|             NULL|
|    min| Alex|              20.0|              0.0|           24000.0|Communication|      Acquisition|
|    max|sunny|              50.0|             12.0|           80000.0|        sales|Software engineer|
+-------+-----+------------------+-----------------+------------------+-------------+-----------------+



## Group by operation

### Max

In [38]:
# groupBy() with no keys treats the whole DataFrame as a single group,
# so the aggregate below is the overall maximum salary.
whole_frame = df.groupBy()
whole_frame.agg({"salary": "max"}).show()

+-----------+
|max(salary)|
+-----------+
|    80000.0|
+-----------+



## Average salary by department and job

In [39]:
from pyspark.sql import functions as f

In [42]:
# Average salary for each (department, job) pair.
# NOTE: grouping compares values exactly, so "sales" and "Sales" end up
# in separate groups.
by_department_and_job = df.groupBy("Departments", "job")
by_department_and_job.avg("salary").show()

+--------------------+--------------------+-----------+
|         Departments|                 job|avg(salary)|
+--------------------+--------------------+-----------+
|               sales|         Acquisition|    80000.0|
|                  IT|Head hardware man...|    70100.0|
|                NULL|                NULL|       NULL|
|                Data|        Data analyst|    38000.0|
|Customer relation...|             Manager|    40000.0|
|                NULL|   Software engineer|    55600.0|
|       Communication|         Media buyer|       NULL|
|                Data|       Product Owner|    24000.0|
|Customer relation...|          Call agent|       NULL|
|                  HR|            Assitant|    25000.0|
|               Sales|             Manager|       NULL|
+--------------------+--------------------+-----------+



## Aggregate on multiple columns with agg()

In [43]:
# Total salary per job, using the dict form of agg(): {column: aggregate}.
by_job = df.groupBy("job")
by_job.agg({"Salary": "sum"}).show()

+--------------------+-----------+
|                 job|sum(Salary)|
+--------------------+-----------+
|Head hardware man...|    70100.0|
|                NULL|       NULL|
|       Product Owner|    24000.0|
|            Assitant|    25000.0|
|          Call agent|       NULL|
|   Software engineer|    55600.0|
|         Acquisition|    80000.0|
|             Manager|    40000.0|
|         Media buyer|       NULL|
|        Data analyst|    38000.0|
+--------------------+-----------+



In [33]:
# Several aggregates in one pass: total and mean salary per department,
# each given an explicit output column name via alias().
salary_by_department = df.groupBy("Departments").agg(
    f.sum("salary").alias("sum_salary"),
    f.mean("salary").alias("mean_salary"),
)
salary_by_department.show()

+--------------------+----------+-----------+
|         Departments|sum_salary|mean_salary|
+--------------------+----------+-----------+
|                Data|   62000.0|    31000.0|
|               Sales|      NULL|       NULL|
|                  HR|   25000.0|    25000.0|
|                NULL|   55600.0|    55600.0|
|               sales|   80000.0|    80000.0|
|Custmer relationship|   40000.0|    40000.0|
|                  IT|   70100.0|    70100.0|
|       Communication|      NULL|       NULL|
+--------------------+----------+-----------+

