## Dealing with Array Type and Complex Fields in Apache Spark

In [1]:
!pip install -q pyspark==3.5.0

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import Row
from pyspark.sql import functions as F
from pyspark.sql import types as T

In [3]:
# Build (or reuse, via getOrCreate) a local SparkSession configured with
# the Kryo serializer and Hive support.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("New Session Example")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .enableHiveSupport()
    .getOrCreate()
)

## How to create nested items

In [5]:
# Raw sales data, one entry per customer:
# (customer id, customer name, [(product, quantity, unit value), ...]).
sales_by_customer = [
    (1, 'Pedro', [("Mouse", 1.0, 10.0), ("Water Cooler", 1.0, 150.9)]),
    (2, 'Maria', [("Keyboard", 2.0, 40.5)]),
    (3, 'Clara', [("Graphic Cards", 2.0, 340.0)]),
    (4, 'Joana', [("Sound System", 1.0, 340.0), ("Mouse", 1.0, 10.0)]),
    (5, 'Carlos', [("Graphic Cards", 1.0, 340.0),
                   ("Memory", 4.0, 50.0),
                   ("Hard Disk", 2.0, 10.0)]),
]

# One Row per customer; `sale` holds a list of nested Rows, which Spark
# infers as an array of structs (product, quant, unit_value).
rows = [
    Row(id=customer_id,
        name=customer_name,
        sale=[Row(product=product, quant=quant, unit_value=unit_value)
              for product, quant, unit_value in items])
    for customer_id, customer_name, items in sales_by_customer
]

df = spark.createDataFrame(rows)

In [6]:
# Print the inferred schema as a tree to confirm that `sale` is an array of structs.
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- sale: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- product: string (nullable = true)
 |    |    |-- quant: double (nullable = true)
 |    |    |-- unit_value: double (nullable = true)



In [7]:
# truncate=False prints the full content of every cell instead of shortening long values.
df.show(truncate=False)

+---+------+--------------------------------------------------------------------------+
|id |name  |sale                                                                      |
+---+------+--------------------------------------------------------------------------+
|1  |Pedro |[{Mouse, 1.0, 10.0}, {Water Cooler, 1.0, 150.9}]                          |
|2  |Maria |[{Keyboard, 2.0, 40.5}]                                                   |
|3  |Clara |[{Graphic Cards, 2.0, 340.0}]                                             |
|4  |Joana |[{Sound System, 1.0, 340.0}, {Mouse, 1.0, 10.0}]                          |
|5  |Carlos|[{Graphic Cards, 1.0, 340.0}, {Memory, 4.0, 50.0}, {Hard Disk, 2.0, 10.0}]|
+---+------+--------------------------------------------------------------------------+



## Exploding array items

In [8]:
# explode() emits one output row per element of the `sale` array,
# repeating the customer's id and name on each of those rows.
df_exp = df.select("id", "name", F.explode("sale").alias("sale"))
df_exp.show(n=100, truncate=False)

+---+------+---------------------------+
|id |name  |sale                       |
+---+------+---------------------------+
|1  |Pedro |{Mouse, 1.0, 10.0}         |
|1  |Pedro |{Water Cooler, 1.0, 150.9} |
|2  |Maria |{Keyboard, 2.0, 40.5}      |
|3  |Clara |{Graphic Cards, 2.0, 340.0}|
|4  |Joana |{Sound System, 1.0, 340.0} |
|4  |Joana |{Mouse, 1.0, 10.0}         |
|5  |Carlos|{Graphic Cards, 1.0, 340.0}|
|5  |Carlos|{Memory, 4.0, 50.0}        |
|5  |Carlos|{Hard Disk, 2.0, 10.0}     |
+---+------+---------------------------+



## Struct to Table

In [9]:
# Promote each field of the `sale` struct to its own top-level column.
(
    df_exp
    .select("id", "name", "sale.product", "sale.quant", "sale.unit_value")
    .show(n=100, truncate=False)
)

+---+------+-------------+-----+----------+
|id |name  |product      |quant|unit_value|
+---+------+-------------+-----+----------+
|1  |Pedro |Mouse        |1.0  |10.0      |
|1  |Pedro |Water Cooler |1.0  |150.9     |
|2  |Maria |Keyboard     |2.0  |40.5      |
|3  |Clara |Graphic Cards|2.0  |340.0     |
|4  |Joana |Sound System |1.0  |340.0     |
|4  |Joana |Mouse        |1.0  |10.0      |
|5  |Carlos|Graphic Cards|1.0  |340.0     |
|5  |Carlos|Memory       |4.0  |50.0      |
|5  |Carlos|Hard Disk    |2.0  |10.0      |
+---+------+-------------+-----+----------+



## Sum sales total using complex struct with Explode

In [10]:
# Total spent per customer: compute the value of each exploded sale line
# (unit_value * quant), then sum the lines of every (id, name) pair.
# The aggregate is aliased directly instead of renaming Spark's
# auto-generated "sum(total_by_product)" column afterwards, because
# withColumnRenamed silently does nothing when that name does not match.
(
    df_exp
    .selectExpr("id",
                "name",
                "sale.unit_value * sale.quant as total_by_product")
    .groupBy("id", "name")
    .agg(F.sum("total_by_product").alias("total"))
    .orderBy("id")
    .show(n=100, truncate=False)
)

+---+------+-----+
|id |name  |total|
+---+------+-----+
|1  |Pedro |160.9|
|2  |Maria |81.0 |
|3  |Clara |680.0|
|4  |Joana |350.0|
|5  |Carlos|560.0|
+---+------+-----+



## Sum sales total using complex struct with UDF

In [11]:
def total_sale(sale):
    """Return the total value of one customer's sales.

    Args:
        sale: Iterable of items exposing ``quant`` and ``unit_value``
            attributes (the elements of the ``sale`` array column), or
            ``None`` when the array itself is null.

    Returns:
        The sum of ``quant * unit_value`` over the items as a float
        (``0.0`` for an empty array), or ``None`` when ``sale`` is ``None``.
    """
    if sale is None:
        # The column is nullable: propagate the null instead of raising
        # TypeError when trying to iterate over None.
        return None

    total = 0.0
    for item in sale:
        # Null elements and items missing a quantity or unit value are
        # skipped so one incomplete item does not fail the whole row.
        if item is None or item.quant is None or item.unit_value is None:
            continue
        total += item.quant * item.unit_value
    return total

# Register the function as a Spark UDF. total_sale returns a Python float
# (64-bit), so the return type is declared as DoubleType; FloatType would
# narrow every total to 32-bit single precision.
udf_total_sale = F.udf(total_sale, T.DoubleType())
df.withColumn("total", udf_total_sale(df.sale)).show(truncate=False)

+---+------+--------------------------------------------------------------------------+-----+
|id |name  |sale                                                                      |total|
+---+------+--------------------------------------------------------------------------+-----+
|1  |Pedro |[{Mouse, 1.0, 10.0}, {Water Cooler, 1.0, 150.9}]                          |160.9|
|2  |Maria |[{Keyboard, 2.0, 40.5}]                                                   |81.0 |
|3  |Clara |[{Graphic Cards, 2.0, 340.0}]                                             |680.0|
|4  |Joana |[{Sound System, 1.0, 340.0}, {Mouse, 1.0, 10.0}]                          |350.0|
|5  |Carlos|[{Graphic Cards, 1.0, 340.0}, {Memory, 4.0, 50.0}, {Hard Disk, 2.0, 10.0}]|560.0|
+---+------+--------------------------------------------------------------------------+-----+

