In [21]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession, DataFrame

# Creating Spark Session

In [2]:
spark = SparkSession.builder \
    .appName("Doc Learning") \
    .getOrCreate()

In [3]:
df = spark.read.csv("C:\Education\PySpark-Learning\data\data_per_100k_habitants.csv", header=True, inferSchema=True)

In [4]:
df.show()

+----+-----------------------------------------------------------+---+---------+---------------+--------------------+----+-----------------------------------------------------------+---+---------+---------------+--------------------+----+-----------------------------------------------------------+----+---------+---------------+--------------------+----+-----------------------------------------------------------+----+---------+---------------+--------------------+----+-----------------------------------------------------------+----+---------+---------------+--------------------+----+-----------------------------------------------------------+----+---------+---------------+--------------------+----+-----------------------------------------------------------+----+---------+---------------+--------------------+----+-----------------------------------------------------------+----+---------+---------------+--------------------+----+-----------------------------------------------------------+

# 1 - Aggregation function

## Aggregate on the entire DataFrame without groups (shorthand for df.groupBy().agg()).

In [5]:
agg_sample = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])

In [6]:
# example 1 - simple operations
agg_df_1 = agg_sample.agg({"age": "max"})
agg_df_1.show()

+--------+
|max(age)|
+--------+
|       5|
+--------+



In [7]:
# example 2 - aggregating multiple columns
agg_df_2 = agg_sample.agg(
    {
        "age": "max",
        "name": "min"
    }
)
agg_df_2.show()

+---------+--------+
|min(name)|max(age)|
+---------+--------+
|    Alice|       5|
+---------+--------+



In [8]:
# exempla 3 - using sql functions for aggregations
agg_df_3 = agg_sample.agg(
    F.max("age").alias("MAX_AGE"),
    F.count("name").alias("NAME_COUNT")
).select("MAX_AGE", "NAME_COUNT").show()

+-------+----------+
|MAX_AGE|NAME_COUNT|
+-------+----------+
|      5|         2|
+-------+----------+



In [9]:
# example 4 - using with groupBy
agg_df_4 = agg_sample.groupBy(F.col("age")).agg(F.count("*"))
agg_df_4.show()

+---+--------+
|age|count(1)|
+---+--------+
|  2|       1|
|  5|       1|
+---+--------+



In [10]:
# example 5 - Aggregating with Multiple Functions on the Same Column
agg_df_5 = agg_sample.agg(
    F.min("age").alias("min_age"),
    F.max("age").alias("max_age"),
    F.avg("age").alias("avg_age"),
    F.count("age").alias("count_age")
)
agg_df_5.show()

+-------+-------+-------+---------+
|min_age|max_age|avg_age|count_age|
+-------+-------+-------+---------+
|      2|      5|    3.5|        2|
+-------+-------+-------+---------+



In [11]:
# example 6 - using custom aggregations
from pyspark.sql import types as T

# Define a UDF for custom aggregation
@F.udf(T.DoubleType())
def custom_agg(values):
    return sum(values) / len(values)  # Custom example: mean calculation

# Apply custom aggregation
agg_df_6 = agg_sample.groupBy("name").agg(
    custom_agg(F.collect_list("age")).alias("custom_age_mean")
)
agg_df_6.show()


+-----+---------------+
| name|custom_age_mean|
+-----+---------------+
|Alice|            2.0|
|  Bob|            5.0|
+-----+---------------+



## Exercises "agg" function

In [12]:
# sample data
data = [(1, "Alice", "Sales", 34, 70000, 5),
        (2, "Bob", "HR", 45, 80000, 10),
        (3, "Catherine", "IT", 29, 90000, 3),
        (4, "David", "IT", 39, 85000, 7),
        (5, "Eve", "Sales", 41, 75000, 8),
        (6, "Frank", "HR", 30, 60000, 2),
        (7, "Grace", "IT", 35, 95000, 6),
        (8, "Hannah", "Sales", 50, 65000, 12),
        (9, "Ivy", "IT", 38, 87000, 9),
        (10, "Jack", "HR", 28, 72000, 4)]

columns = ["id", "name", "department", "age", "salary", "experience"]

agg_sample = spark.createDataFrame(data, columns)

# Show the sample data
agg_sample.show()

+---+---------+----------+---+------+----------+
| id|     name|department|age|salary|experience|
+---+---------+----------+---+------+----------+
|  1|    Alice|     Sales| 34| 70000|         5|
|  2|      Bob|        HR| 45| 80000|        10|
|  3|Catherine|        IT| 29| 90000|         3|
|  4|    David|        IT| 39| 85000|         7|
|  5|      Eve|     Sales| 41| 75000|         8|
|  6|    Frank|        HR| 30| 60000|         2|
|  7|    Grace|        IT| 35| 95000|         6|
|  8|   Hannah|     Sales| 50| 65000|        12|
|  9|      Ivy|        IT| 38| 87000|         9|
| 10|     Jack|        HR| 28| 72000|         4|
+---+---------+----------+---+------+----------+



In [13]:
# question 1: Find the maximum age, average salary, and total experience for all employees.
q1 = agg_sample.agg(
    F.max("age").alias("max_age"),
    F.avg("salary").alias("avg_salary"),
    F.sum("experience").alias("total_experience")
).show()

+-------+----------+----------------+
|max_age|avg_salary|total_experience|
+-------+----------+----------------+
|     50|   77900.0|              66|
+-------+----------+----------------+



In [14]:
# question 2: Group the employees by department and find the maximum age, average salary, and total experience for each department.
q2 = agg_sample.groupBy("department").agg(
    F.max("age").alias("max_age"),
    F.avg("salary").alias("avg_salary"),
    F.sum("experience").alias("total_experience")
).select(
    "department",
    "max_age",
    "avg_salary",
    "total_experience"
).show()

+----------+-------+-----------------+----------------+
|department|max_age|       avg_salary|total_experience|
+----------+-------+-----------------+----------------+
|     Sales|     50|          70000.0|              25|
|        HR|     45|70666.66666666667|              16|
|        IT|     39|          89250.0|              25|
+----------+-------+-----------------+----------------+



In [15]:
# question 3: Find the minimum age, maximum salary, and count of employees.
q3 = agg_sample.agg(
    F.min("age").alias("min_age"),
    F.max("salary").alias("max_salary"),
    F.count("*").alias("employee_count")
).show()

+-------+----------+--------------+
|min_age|max_salary|employee_count|
+-------+----------+--------------+
|     28|     95000|            10|
+-------+----------+--------------+



In [16]:
# question 4 - Group the employees by department and find the minimum age,
# maximum salary, and count of employees for each department.
q4 = agg_sample.groupBy("department").agg(
    F.min("age").alias("min_age"),
    F.max("salary").alias("max_salary"),
    F.count("*").alias("employee_count")
).show()

+----------+-------+----------+--------------+
|department|min_age|max_salary|employee_count|
+----------+-------+----------+--------------+
|     Sales|     34|     75000|             3|
|        HR|     28|     80000|             3|
|        IT|     29|     95000|             4|
+----------+-------+----------+--------------+



In [17]:
# question 5 - Find the average age and total salary of employees.
q5 = agg_sample.agg({
    "age": "avg",
    "salary": "sum"
})
columns = ["total_salary", "avg_age"

q5_df = q5.withColumns(columns)

SyntaxError: '[' was never closed (515974048.py, line 6)

# 2 - withColumn function

A função withColumn no PySpark é usada para adicionar uma nova coluna a um DataFrame ou para substituir uma coluna existente com base em uma expressão especificada

Syntax:

**DataFrame.withColumn(colName, col)**

colName: O nome da nova coluna ou da coluna existente a ser substituída.
col: Uma expressão que define os valores da coluna, que pode ser uma instância de Column, uma expressão SQL, ou uma função do módulo pyspark.sql.functions.


## 2.1 - Examples

**Adicionar uma nova coluna**

Vamos adicionar uma nova coluna chamada "idade_5_anos" que será a idade atual acrescida de 5 anos.

In [None]:
data = [("Alice", 34), ("Bob", 45), ("Catherine", 29)]
columns = ["Nome", "Idade"]

df = spark.createDataFrame(data, columns)

# Adicionar uma nova coluna "idade_5_anos"
df = df.withColumn("idade_5_anos", F.col("Idade") + 5)

In [None]:
df.show()

+---------+-----+------------+
|     Nome|Idade|idade_5_anos|
+---------+-----+------------+
|    Alice|   34|          39|
|      Bob|   45|          50|
|Catherine|   29|          34|
+---------+-----+------------+



**Substituir uma coluna existente**

Vamos substituir a coluna "Idade" com a idade acrescida de 10 anos.

In [None]:
df = df.withColumn("Idade", F.col("Idade") + 10)
df.show()

+---------+-----+------------+
|     Nome|Idade|idade_5_anos|
+---------+-----+------------+
|    Alice|   44|          39|
|      Bob|   55|          50|
|Catherine|   39|          34|
+---------+-----+------------+



**Converter tipos de dados**

In [None]:
# Converter uma coluna de string para inteiro
df = df.withColumn("Idade", F.col("Idade").cast("int"))
df.show()

+---------+-----+------------+
|     Nome|Idade|idade_5_anos|
+---------+-----+------------+
|    Alice|   44|          39|
|      Bob|   55|          50|
|Catherine|   39|          34|
+---------+-----+------------+



**Aplicar funções SQL integradas**

In [None]:
# Adicionar uma nova coluna "AnoAtual" usando a função current_year
df = df.withColumn("AnoAtual", F.year(F.current_date()))
df.show()

+---------+-----+------------+--------+
|     Nome|Idade|idade_5_anos|AnoAtual|
+---------+-----+------------+--------+
|    Alice|   44|          39|    2024|
|      Bob|   55|          50|    2024|
|Catherine|   39|          34|    2024|
+---------+-----+------------+--------+



**Criar uma coluna calculada com base em outras colunas**

In [None]:
# Adicionar uma nova coluna "SalarioAnual" que é "SalarioMensal" vezes 12
df = df.withColumn("SalarioAnual", F.col("SalarioMensal") * 12)

## 2.2 - Exercises

In [23]:
# data
data = [
    ("John", "Doe", 28),
    ("Jane", "Smith", 32),
    ("Mike", "Johnson", 25),
    ("Emily", "Brown", 34),
    ("Kevin", "Davis", 30)
]

columns = ["first_name", "last_name", "age"]

# Create a DataFrame
df = spark.createDataFrame(data, columns)

In [24]:
# 1 - Add a new column full_name by concatenating first_name and last_name with a space in between.
full_name_df = df.withColumn(
    "full_name",
    F.concat(F.col("first_name"), F.lit(" "), F.col("last_name"))
    )
full_name_df.show()

+----------+---------+---+------------+
|first_name|last_name|age|   full_name|
+----------+---------+---+------------+
|      John|      Doe| 28|    John Doe|
|      Jane|    Smith| 32|  Jane Smith|
|      Mike|  Johnson| 25|Mike Johnson|
|     Emily|    Brown| 34| Emily Brown|
|     Kevin|    Davis| 30| Kevin Davis|
+----------+---------+---+------------+



In [25]:
# 2 - Create a new column age_after_5_years that shows each person's age 5 years from now.
age_df = df.withColumn(
    "age_after_5_years",
    F.col("age") + 5
)

age_df.show()

+----------+---------+---+-----------------+
|first_name|last_name|age|age_after_5_years|
+----------+---------+---+-----------------+
|      John|      Doe| 28|               33|
|      Jane|    Smith| 32|               37|
|      Mike|  Johnson| 25|               30|
|     Emily|    Brown| 34|               39|
|     Kevin|    Davis| 30|               35|
+----------+---------+---+-----------------+



In [26]:
# 3 - Add a column name_length that calculates the length of the first_name.
name_df = df.withColumn(
    "name_length",
    F.length("first_name")
)
name_df.show()

+----------+---------+---+-----------+
|first_name|last_name|age|name_length|
+----------+---------+---+-----------+
|      John|      Doe| 28|          4|
|      Jane|    Smith| 32|          4|
|      Mike|  Johnson| 25|          4|
|     Emily|    Brown| 34|          5|
|     Kevin|    Davis| 30|          5|
+----------+---------+---+-----------+



In [27]:
# 4 - Convert the age column to a string type and store it in a new column age_str.
str_df = df.withColumn(
    "age_str",
    F.col("age").cast("int")
)

str_df.show()

+----------+---------+---+-------+
|first_name|last_name|age|age_str|
+----------+---------+---+-------+
|      John|      Doe| 28|     28|
|      Jane|    Smith| 32|     32|
|      Mike|  Johnson| 25|     25|
|     Emily|    Brown| 34|     34|
|     Kevin|    Davis| 30|     30|
+----------+---------+---+-------+



In [28]:
# 5 - Create a column is_adult that indicates whether a person is an adult (age >= 18).
adult_df = df.withColumn(
    "is_adult",
    F.col("age") >= 18
)

adult_df.show()

+----------+---------+---+--------+
|first_name|last_name|age|is_adult|
+----------+---------+---+--------+
|      John|      Doe| 28|    true|
|      Jane|    Smith| 32|    true|
|      Mike|  Johnson| 25|    true|
|     Emily|    Brown| 34|    true|
|     Kevin|    Davis| 30|    true|
+----------+---------+---+--------+



In [31]:
# 6 - Add a column name_in_uppercase that converts the first_name to uppercase.
upper_df = df.withColumn(
    "name_in_upper",
    F.upper("first_name")
)

upper_df.show()

+----------+---------+---+-------------+
|first_name|last_name|age|name_in_upper|
+----------+---------+---+-------------+
|      John|      Doe| 28|         JOHN|
|      Jane|    Smith| 32|         JANE|
|      Mike|  Johnson| 25|         MIKE|
|     Emily|    Brown| 34|        EMILY|
|     Kevin|    Davis| 30|        KEVIN|
+----------+---------+---+-------------+



In [32]:
# 7 - Create a column age_category that categorizes age into "Young" (age < 30) and "Old" (age >= 30).
category_df = df.withColumn(
    "age_category",
    F.when(F.col("age") < 30, "Young")
    .when(F.col("age") >= 30, "Old")
)

category_df.show()

+----------+---------+---+------------+
|first_name|last_name|age|age_category|
+----------+---------+---+------------+
|      John|      Doe| 28|       Young|
|      Jane|    Smith| 32|         Old|
|      Mike|  Johnson| 25|       Young|
|     Emily|    Brown| 34|         Old|
|     Kevin|    Davis| 30|         Old|
+----------+---------+---+------------+



In [34]:
# 8 - Add a column last_name_length that calculates the length of the last_name.
last_name_df = df.withColumn(
    "last_name_length",
    F.length("last_name")
)
last_name_df.show()

+----------+---------+---+----------------+
|first_name|last_name|age|last_name_length|
+----------+---------+---+----------------+
|      John|      Doe| 28|               3|
|      Jane|    Smith| 32|               5|
|      Mike|  Johnson| 25|               7|
|     Emily|    Brown| 34|               5|
|     Kevin|    Davis| 30|               5|
+----------+---------+---+----------------+



In [37]:
# 9 - Create a column name_with_initial that combines first_name and only the first letter of last_name.
initial_df = df.withColumn(
    "name_with_initial",
    F.concat(F.col("first_name"), F.lit(" "), F.col("last_name").substr(1, 1))
)
initial_df.show()

+----------+---------+---+-----------------+
|first_name|last_name|age|name_with_initial|
+----------+---------+---+-----------------+
|      John|      Doe| 28|           John D|
|      Jane|    Smith| 32|           Jane S|
|      Mike|  Johnson| 25|           Mike J|
|     Emily|    Brown| 34|          Emily B|
|     Kevin|    Davis| 30|          Kevin D|
+----------+---------+---+-----------------+



In [38]:
# 10 - Add a column even_or_odd_age that labels each person's age as "Even" or "Odd".
even_odd = df.withColumn(
    "even_or_odd_age",
    F.when(F.col("age") % 2 == 0, "Even")
    .when(F.col("age") % 2 != 0, "Odd")
)

even_odd.show()

+----------+---------+---+---------------+
|first_name|last_name|age|even_or_odd_age|
+----------+---------+---+---------------+
|      John|      Doe| 28|           Even|
|      Jane|    Smith| 32|           Even|
|      Mike|  Johnson| 25|            Odd|
|     Emily|    Brown| 34|           Even|
|     Kevin|    Davis| 30|           Even|
+----------+---------+---+---------------+

