In [5]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession, DataFrame

# Creating Spark Session

In [6]:
spark = SparkSession.builder \
    .appName("Doc Learning") \
    .getOrCreate()

In [8]:
df = spark.read.csv("C:\Education\PySpark-Learning\data\data_per_100k_habitants.csv", header=True, inferSchema=True)

In [None]:
df.show()

# 1 - Aggregation function

## Aggregate on the entire DataFrame without groups (shorthand for df.groupBy().agg()).

In [None]:
agg_sample = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])

In [None]:
# example 1 - simple operations
agg_df_1 = agg_sample.agg({"age": "max"})
agg_df_1.show()

+--------+
|max(age)|
+--------+
|       5|
+--------+



In [None]:
# example 2 - aggregating multiple columns
agg_df_2 = agg_sample.agg(
    {
        "age": "max",
        "name": "min"
    }
)
agg_df_2.show()

+---------+--------+
|min(name)|max(age)|
+---------+--------+
|    Alice|       5|
+---------+--------+



In [None]:
# exempla 3 - using sql functions for aggregations
agg_df_3 = agg_sample.agg(
    F.max("age").alias("MAX_AGE"),
    F.count("name").alias("NAME_COUNT")
).select("MAX_AGE", "NAME_COUNT").show()

+-------+----------+
|MAX_AGE|NAME_COUNT|
+-------+----------+
|      5|         2|
+-------+----------+



In [None]:
# example 4 - using with groupBy
agg_df_4 = agg_sample.groupBy(F.col("age")).agg(F.count("*"))
agg_df_4.show()

+---+--------+
|age|count(1)|
+---+--------+
|  2|       1|
|  5|       1|
+---+--------+



In [None]:
# example 5 - Aggregating with Multiple Functions on the Same Column
agg_df_5 = agg_sample.agg(
    F.min("age").alias("min_age"),
    F.max("age").alias("max_age"),
    F.avg("age").alias("avg_age"),
    F.count("age").alias("count_age")
)
agg_df_5.show()

+-------+-------+-------+---------+
|min_age|max_age|avg_age|count_age|
+-------+-------+-------+---------+
|      2|      5|    3.5|        2|
+-------+-------+-------+---------+



In [None]:
# example 6 - using custom aggregations
from pyspark.sql import types as T

# Define a UDF for custom aggregation
@F.udf(T.DoubleType())
def custom_agg(values):
    return sum(values) / len(values)  # Custom example: mean calculation

# Apply custom aggregation
agg_df_6 = agg_sample.groupBy("name").agg(
    custom_agg(F.collect_list("age")).alias("custom_age_mean")
)
agg_df_6.show()


+-----+---------------+
| name|custom_age_mean|
+-----+---------------+
|Alice|            2.0|
|  Bob|            5.0|
+-----+---------------+



## Exercises "agg" function

In [None]:
# sample data
data = [(1, "Alice", "Sales", 34, 70000, 5),
        (2, "Bob", "HR", 45, 80000, 10),
        (3, "Catherine", "IT", 29, 90000, 3),
        (4, "David", "IT", 39, 85000, 7),
        (5, "Eve", "Sales", 41, 75000, 8),
        (6, "Frank", "HR", 30, 60000, 2),
        (7, "Grace", "IT", 35, 95000, 6),
        (8, "Hannah", "Sales", 50, 65000, 12),
        (9, "Ivy", "IT", 38, 87000, 9),
        (10, "Jack", "HR", 28, 72000, 4)]

columns = ["id", "name", "department", "age", "salary", "experience"]

agg_sample = spark.createDataFrame(data, columns)

# Show the sample data
agg_sample.show()

+---+---------+----------+---+------+----------+
| id|     name|department|age|salary|experience|
+---+---------+----------+---+------+----------+
|  1|    Alice|     Sales| 34| 70000|         5|
|  2|      Bob|        HR| 45| 80000|        10|
|  3|Catherine|        IT| 29| 90000|         3|
|  4|    David|        IT| 39| 85000|         7|
|  5|      Eve|     Sales| 41| 75000|         8|
|  6|    Frank|        HR| 30| 60000|         2|
|  7|    Grace|        IT| 35| 95000|         6|
|  8|   Hannah|     Sales| 50| 65000|        12|
|  9|      Ivy|        IT| 38| 87000|         9|
| 10|     Jack|        HR| 28| 72000|         4|
+---+---------+----------+---+------+----------+



In [None]:
# question 1: Find the maximum age, average salary, and total experience for all employees.
q1 = agg_sample.agg(
    F.max("age").alias("max_age"),
    F.avg("salary").alias("avg_salary"),
    F.sum("experience").alias("total_experience")
).show()

+-------+----------+----------------+
|max_age|avg_salary|total_experience|
+-------+----------+----------------+
|     50|   77900.0|              66|
+-------+----------+----------------+



In [None]:
# question 2: Group the employees by department and find the maximum age, average salary, and total experience for each department.
q2 = agg_sample.groupBy("department").agg(
    F.max("age").alias("max_age"),
    F.avg("salary").alias("avg_salary"),
    F.sum("experience").alias("total_experience")
).select(
    "department",
    "max_age",
    "avg_salary",
    "total_experience"
).show()

+----------+-------+-----------------+----------------+
|department|max_age|       avg_salary|total_experience|
+----------+-------+-----------------+----------------+
|     Sales|     50|          70000.0|              25|
|        HR|     45|70666.66666666667|              16|
|        IT|     39|          89250.0|              25|
+----------+-------+-----------------+----------------+



In [None]:
# question 3: Find the minimum age, maximum salary, and count of employees.
q3 = agg_sample.agg(
    F.min("age").alias("min_age"),
    F.max("salary").alias("max_salary"),
    F.count("*").alias("employee_count")
).show()

+-------+----------+--------------+
|min_age|max_salary|employee_count|
+-------+----------+--------------+
|     28|     95000|            10|
+-------+----------+--------------+



In [None]:
# question 4 - Group the employees by department and find the minimum age,
# maximum salary, and count of employees for each department.
q4 = agg_sample.groupBy("department").agg(
    F.min("age").alias("min_age"),
    F.max("salary").alias("max_salary"),
    F.count("*").alias("employee_count")
).show()

+----------+-------+----------+--------------+
|department|min_age|max_salary|employee_count|
+----------+-------+----------+--------------+
|     Sales|     34|     75000|             3|
|        HR|     28|     80000|             3|
|        IT|     29|     95000|             4|
+----------+-------+----------+--------------+



In [None]:
# question 5 - Find the average age and total salary of employees.
q5 = agg_sample.agg({
    "age": "avg",
    "salary": "sum"
})
columns = ["total_salary", "avg_age"

q5_df = q5.withColumns(columns)

PySparkTypeError: [NOT_DICT] Argument `colsMap` should be a dict, got set.

# 2 - withColumn function

A função withColumn no PySpark é usada para adicionar uma nova coluna a um DataFrame ou para substituir uma coluna existente com base em uma expressão especificada

Syntax:

**DataFrame.withColumn(colName, col)**

colName: O nome da nova coluna ou da coluna existente a ser substituída.
col: Uma expressão que define os valores da coluna, que pode ser uma instância de Column, uma expressão SQL, ou uma função do módulo pyspark.sql.functions.


## 2.1 - Examples

**Adicionar uma nova coluna**

Vamos adicionar uma nova coluna chamada "idade_5_anos" que será a idade atual acrescida de 5 anos.

In [9]:
data = [("Alice", 34), ("Bob", 45), ("Catherine", 29)]
columns = ["Nome", "Idade"]

df = spark.createDataFrame(data, columns)

# Adicionar uma nova coluna "idade_5_anos"
df = df.withColumn("idade_5_anos", F.col("Idade") + 5)

In [10]:
df.show()

+---------+-----+------------+
|     Nome|Idade|idade_5_anos|
+---------+-----+------------+
|    Alice|   34|          39|
|      Bob|   45|          50|
|Catherine|   29|          34|
+---------+-----+------------+



**Substituir uma coluna existente**

Vamos substituir a coluna "Idade" com a idade acrescida de 10 anos.

In [11]:
df = df.withColumn("Idade", F.col("Idade") + 10)
df.show()

+---------+-----+------------+
|     Nome|Idade|idade_5_anos|
+---------+-----+------------+
|    Alice|   44|          39|
|      Bob|   55|          50|
|Catherine|   39|          34|
+---------+-----+------------+



**Converter tipos de dados**

In [13]:
# Converter uma coluna de string para inteiro
df = df.withColumn("Idade", F.col("Idade").cast("int"))
df.show()

+---------+-----+------------+
|     Nome|Idade|idade_5_anos|
+---------+-----+------------+
|    Alice|   44|          39|
|      Bob|   55|          50|
|Catherine|   39|          34|
+---------+-----+------------+



**Aplicar funções SQL integradas**

In [14]:
# Adicionar uma nova coluna "AnoAtual" usando a função current_year
df = df.withColumn("AnoAtual", F.year(F.current_date()))
df.show()

+---------+-----+------------+--------+
|     Nome|Idade|idade_5_anos|AnoAtual|
+---------+-----+------------+--------+
|    Alice|   44|          39|    2024|
|      Bob|   55|          50|    2024|
|Catherine|   39|          34|    2024|
+---------+-----+------------+--------+



**Criar uma coluna calculada com base em outras colunas**

In [None]:
# Adicionar uma nova coluna "SalarioAnual" que é "SalarioMensal" vezes 12
df = df.withColumn("SalarioAnual", F.col("SalarioMensal") * 12)

## 2.2 - Exercises