In [1]:
import findspark
# Locate the local Spark installation so that `pyspark` can be imported by the
# following cells; this call has to run before any `pyspark` import.
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Reuse the already-running session if there is one; otherwise start a new
# session with the default configuration.
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/19 16:32:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# ARRAY ZIP

In [3]:
# Column names for the sample rows below.
array_schema = ["name", "score_1", "score_2"]

# One tuple per observation: (name, score_1, score_2).
# Two rows deliberately carry a missing score (None) so that the NULL handling
# of the array functions is visible later on.
array_data = [
    ("john",  5,    1),
    ("john",  6,    12),
    ("alice", 3,    7),
    ("bob",   9,    4),
    ("alice", 8,    15),
    ("alice", 7,    10),
    ("bob",   2,    14),
    ("bob",   4,    8),
    ("grace", None, 3),
    ("john",  1,    13),
    ("john",  5,    11),
    ("john",  4,    None),
    ("john",  1,    3),
]

In [4]:
# Build the flat (name, score_1, score_2) DataFrame; column types are inferred
# by Spark from the Python values.
df_array_data = spark.createDataFrame(data=array_data, schema=array_schema)

df_array_data.show()

                                                                                

+-----+-------+-------+
| name|score_1|score_2|
+-----+-------+-------+
| john|      5|      1|
| john|      6|     12|
|alice|      3|      7|
|  bob|      9|      4|
|alice|      8|     15|
|alice|      7|     10|
|  bob|      2|     14|
|  bob|      4|      8|
|grace|   NULL|      3|
| john|      1|     13|
| john|      5|     11|
| john|      4|   NULL|
| john|      1|      3|
+-----+-------+-------+



## Convert the data into a DataFrame of arrays

In [5]:
from pyspark.sql import functions as F

In [6]:
# Collapse the rows of each person into one array per score column.
# NOTE: collect_list skips NULL inputs, so the two arrays of a person can end
# up with different lengths (see "john" and "grace" in the output) and are no
# longer guaranteed to line up position by position.
score_arrays = [
    F.collect_list(score_column).alias(f"array_{score_column}")
    for score_column in ("score_1", "score_2")
]
df_master_array = df_array_data.groupBy("name").agg(*score_arrays)

df_master_array.show()

+-----+------------------+------------------+
| name|     array_score_1|     array_score_2|
+-----+------------------+------------------+
|alice|         [3, 8, 7]|       [7, 15, 10]|
| john|[5, 6, 1, 5, 4, 1]|[1, 12, 13, 11, 3]|
|  bob|         [9, 2, 4]|        [4, 14, 8]|
|grace|                []|               [3]|
+-----+------------------+------------------+



In [7]:
# Confirm that both aggregated columns are arrays of longs.
df_master_array.printSchema()

root
 |-- name: string (nullable = true)
 |-- array_score_1: array (nullable = false)
 |    |-- element: long (containsNull = false)
 |-- array_score_2: array (nullable = false)
 |    |-- element: long (containsNull = false)



## Apply the arrays_zip function

In [10]:
# Pair the two score arrays element by element into an array of structs.
# When the arrays differ in length, the missing side is filled with NULL.
zipped_scores = F.arrays_zip(F.col("array_score_1"), F.col("array_score_2"))
df_array_zip = df_master_array.withColumn("zipped_value", zipped_scores)

df_array_zip.show(n=10, truncate=False)

[Stage 8:>                                                          (0 + 4) / 4]

+-----+------------------+------------------+------------------------------------------------------+
|name |array_score_1     |array_score_2     |zipped_value                                          |
+-----+------------------+------------------+------------------------------------------------------+
|alice|[3, 8, 7]         |[7, 15, 10]       |[{3, 7}, {8, 15}, {7, 10}]                            |
|john |[5, 6, 1, 5, 4, 1]|[1, 12, 13, 11, 3]|[{5, 1}, {6, 12}, {1, 13}, {5, 11}, {4, 3}, {1, NULL}]|
|bob  |[9, 2, 4]         |[4, 14, 8]        |[{9, 4}, {2, 14}, {4, 8}]                             |
|grace|[]                |[3]               |[{NULL, 3}]                                           |
+-----+------------------+------------------+------------------------------------------------------+



                                                                                

## Practical use case: flattening nested data

In [12]:
# Key order shared by every employee record.
_EMPLOYEE_FIELDS = ("name", "age", "active", "years_service")

# Department -> staff, one (name, age, active, years_service) tuple per person.
_employees_by_department = {
    "sales": [
        ("william", 40, True, 10),
        ("mariam", 22, False, 2),
        ("john", 35, True, 7),
        ("susan", 29, True, 5),
    ],
    "HHRR": [
        ("marta", 33, True, 1),
        ("leonard", 29, True, 7),
        ("nancy", 31, False, 3),
        ("robert", 45, True, 20),
    ],
    "engineering": [
        ("alice", 28, True, 5),
        ("bob", 35, False, 12),
        ("charlie", 32, True, 8),
        ("diane", 27, True, 4),
        ("edward", 41, False, 15),
    ],
    "marketing": [
        ("carol", 26, True, 3),
        ("dave", 30, True, 8),
        ("ellen", 34, False, 6),
        ("frank", 39, True, 10),
    ],
    "finance": [
        ("eve", 31, True, 9),
        ("frank", 45, False, 20),
        ("george", 38, True, 14),
        ("hannah", 29, True, 5),
    ],
}

# Final shape: a list of (department, [employee dict, ...]) tuples, where each
# employee dict has the keys listed in _EMPLOYEE_FIELDS, in that order.
department_data = [
    (department, [dict(zip(_EMPLOYEE_FIELDS, person)) for person in staff])
    for department, staff in _employees_by_department.items()
]

department_schema = ["department", "employee"]

# One row per department; the employee dicts are stored as an array of maps.
df_department = spark.createDataFrame(department_data, department_schema)

In [18]:
# Show the nested rows without truncating the long employee column.
df_department.show(n=10, truncate=False)

+-----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|department |employee                                                                                                                                                                                                                                                                                                                             |
+-----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [14]:
# Inspect the inferred schema: `employee` is an array of string-to-string maps.
df_department.printSchema()

root
 |-- department: string (nullable = true)
 |-- employee: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)



In [21]:
# Zipping a single array wraps every element in a one-field struct, so each
# entry of `zip` becomes a struct with one `employee` map inside.
employee_structs = F.arrays_zip(F.col("employee"))
df_departmet_zip = df_department.withColumn("zip", employee_structs)

df_departmet_zip.show(n=10, truncate=False)

+-----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|department |employee                                                                                                                                                                                                                                                                                                               

## Apply explode

In [28]:
# Turn every element of the `zip` array into its own row, i.e. one row per
# (department, employee) pair.
one_employee_per_row = F.explode(F.col("zip"))
df_departmet_expand = df_departmet_zip.withColumn("explode", one_employee_per_row)

df_departmet_expand.show(n=10, truncate=False)

+-----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------+
|department |employee                                                                                                                                                                                                                                           

In [29]:
# Check the schema after exploding: `explode` is a struct that wraps a single
# `employee` map per row.
df_departmet_expand.printSchema()

root
 |-- department: string (nullable = true)
 |-- employee: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)
 |-- zip: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- employee: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |-- explode: struct (nullable = false)
 |    |-- employee: map (nullable = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)



In [32]:
# Flatten the exploded struct: pull each attribute out of the employee map into
# its own top-level column and keep only `department` next to them.
# The order of this tuple defines the order of the output columns.
employee_attributes = ("name", "age", "years_service", "active")

df_departmet_output = df_departmet_expand.select(
    "department",
    *[
        df_departmet_expand[f"explode.employee.{attribute}"].alias(f"employee_{attribute}")
        for attribute in employee_attributes
    ],
)

df_departmet_output.show(n=10, truncate=False)

+-----------+-------------+------------+----------------------+---------------+
|department |employee_name|employee_age|employee_years_service|employee_active|
+-----------+-------------+------------+----------------------+---------------+
|sales      |william      |40          |10                    |true           |
|sales      |mariam       |22          |2                     |false          |
|sales      |john         |35          |7                     |true           |
|sales      |susan        |29          |5                     |true           |
|HHRR       |marta        |33          |1                     |true           |
|HHRR       |leonard      |29          |7                     |true           |
|HHRR       |nancy        |31          |3                     |false          |
|HHRR       |robert       |45          |20                    |true           |
|engineering|alice        |28          |5                     |true           |
|engineering|bob          |35          |

# ARRAY INTERSECT

In [33]:
# Column names for the sample rows below.
values_schema = ["name", "value_1", "value_2"]

# One tuple per observation: (name, value_1, value_2).
# The values overlap on purpose so that the set-style array functions
# (intersect / except) have something to show.
array_values = [
    ("john",  5,    1),
    ("john",  6,    5),
    ("alice", 3,    5),
    ("bob",   1,    4),
    ("alice", 5,    2),
    ("alice", 7,    2),
    ("bob",   2,    14),
    ("bob",   4,    1),
    ("grace", None, 3),
    ("john",  1,    6),
    ("john",  5,    5),
    ("john",  4,    7),
    ("john",  7,    3),
]

In [34]:
# Build the flat (name, value_1, value_2) DataFrame used by the remaining sections.
df_array_values = spark.createDataFrame(data=array_values, schema=values_schema)

df_array_values.show()

+-----+-------+-------+
| name|value_1|value_2|
+-----+-------+-------+
| john|      5|      1|
| john|      6|      5|
|alice|      3|      5|
|  bob|      1|      4|
|alice|      5|      2|
|alice|      7|      2|
|  bob|      2|     14|
|  bob|      4|      1|
|grace|   NULL|      3|
| john|      1|      6|
| john|      5|      5|
| john|      4|      7|
| john|      7|      3|
+-----+-------+-------+



In [35]:
# Gather the values of each person into one array per value column.
# collect_list skips NULLs, which is why "grace" ends up with an empty first array.
values_1_array = F.collect_list(F.col("value_1")).alias("array_values_1")
values_2_array = F.collect_list(F.col("value_2")).alias("array_values_2")

df_master_values = df_array_values.groupBy("name").agg(values_1_array, values_2_array)

df_master_values.show()

+-----+------------------+------------------+
| name|    array_values_1|    array_values_2|
+-----+------------------+------------------+
|alice|         [3, 5, 7]|         [5, 2, 2]|
| john|[5, 6, 1, 5, 4, 7]|[1, 5, 6, 5, 7, 3]|
|  bob|         [1, 2, 4]|        [4, 14, 1]|
|grace|                []|               [3]|
+-----+------------------+------------------+



                                                                                

In [36]:
# Elements present in both arrays, without duplicates.
common_values = F.array_intersect(F.col("array_values_1"), F.col("array_values_2"))
df_array_intersect = df_master_values.select("*", common_values.alias("intersect"))

df_array_intersect.show()

+-----+------------------+------------------+------------+
| name|    array_values_1|    array_values_2|   intersect|
+-----+------------------+------------------+------------+
|alice|         [3, 5, 7]|         [5, 2, 2]|         [5]|
| john|[5, 6, 1, 5, 4, 7]|[1, 5, 6, 5, 7, 3]|[5, 6, 1, 7]|
|  bob|         [1, 2, 4]|        [4, 14, 1]|      [1, 4]|
|grace|                []|               [3]|          []|
+-----+------------------+------------------+------------+



# ARRAY EXCEPT

In [37]:
# Re-display the grouped arrays as a reference for the array_except examples.
df_master_values.show()

[Stage 49:>                                                         (0 + 4) / 4]

+-----+------------------+------------------+
| name|    array_values_1|    array_values_2|
+-----+------------------+------------------+
|alice|         [3, 5, 7]|         [5, 2, 2]|
| john|[5, 6, 1, 5, 4, 7]|[1, 5, 6, 5, 7, 3]|
|  bob|         [1, 2, 4]|        [4, 14, 1]|
|grace|                []|               [3]|
+-----+------------------+------------------+



                                                                                

In [39]:
# Elements of array_values_1 that do not appear in array_values_2 (duplicates removed).
only_in_values_1 = F.array_except(F.col("array_values_1"), F.col("array_values_2"))
df_array_except_1over2 = df_master_values.select("*", only_in_values_1.alias("except"))

df_array_except_1over2.show()

[Stage 55:>                                                         (0 + 4) / 4]

+-----+------------------+------------------+------+
| name|    array_values_1|    array_values_2|except|
+-----+------------------+------------------+------+
|alice|         [3, 5, 7]|         [5, 2, 2]|[3, 7]|
| john|[5, 6, 1, 5, 4, 7]|[1, 5, 6, 5, 7, 3]|   [4]|
|  bob|         [1, 2, 4]|        [4, 14, 1]|   [2]|
|grace|                []|               [3]|    []|
+-----+------------------+------------------+------+



                                                                                

In [40]:
# Same operation with the arguments swapped: elements of array_values_2 that do
# not appear in array_values_1. array_except is not symmetric.
only_in_values_2 = F.array_except(F.col("array_values_2"), F.col("array_values_1"))
df_array_except_2over1 = df_master_values.select("*", only_in_values_2.alias("except"))

df_array_except_2over1.show()

+-----+------------------+------------------+------+
| name|    array_values_1|    array_values_2|except|
+-----+------------------+------------------+------+
|alice|         [3, 5, 7]|         [5, 2, 2]|   [2]|
| john|[5, 6, 1, 5, 4, 7]|[1, 5, 6, 5, 7, 3]|   [3]|
|  bob|         [1, 2, 4]|        [4, 14, 1]|  [14]|
|grace|                []|               [3]|   [3]|
+-----+------------------+------------------+------+



# ARRAY SORT

In [42]:
# array_sort: ascending sort of the first array, shown next to the unsorted original.
df_array_sort = df_master_values.select(
    "name",
    "array_values_1",
    F.array_sort(F.col("array_values_1")).alias("sorted_value_1"),
)

df_array_sort.show()

+-----+------------------+------------------+
| name|    array_values_1|    sorted_value_1|
+-----+------------------+------------------+
|alice|         [3, 5, 7]|         [3, 5, 7]|
| john|[5, 6, 1, 5, 4, 7]|[1, 4, 5, 5, 6, 7]|
|  bob|         [1, 2, 4]|         [1, 2, 4]|
|grace|                []|                []|
+-----+------------------+------------------+



# SORT ARRAY

In [46]:
# sort_array with its default direction (ascending) gives the same ordering as
# array_sort for this data.
df_array_sort = df_master_values.select(
    "name",
    "array_values_1",
    F.sort_array(F.col("array_values_1")).alias("sorted_value_1"),
)

df_array_sort.show()

+-----+------------------+------------------+
| name|    array_values_1|    sorted_value_1|
+-----+------------------+------------------+
|alice|         [3, 5, 7]|         [3, 5, 7]|
| john|[5, 6, 1, 5, 4, 7]|[1, 4, 5, 5, 6, 7]|
|  bob|         [1, 2, 4]|         [1, 2, 4]|
|grace|                []|                []|
+-----+------------------+------------------+



In [47]:
# sort_array also supports descending order through asc=False.
descending_values_1 = F.sort_array(F.col("array_values_1"), asc=False)
df_array_sort = df_master_values.select(
    "name",
    "array_values_1",
    descending_values_1.alias("sorted_value_1"),
)

df_array_sort.show()

+-----+------------------+------------------+
| name|    array_values_1|    sorted_value_1|
+-----+------------------+------------------+
|alice|         [3, 5, 7]|         [7, 5, 3]|
| john|[5, 6, 1, 5, 4, 7]|[7, 6, 5, 5, 4, 1]|
|  bob|         [1, 2, 4]|         [4, 2, 1]|
|grace|                []|                []|
+-----+------------------+------------------+



----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 60224)
Traceback (most recent call last):
  File "/home/andresmunozpampillon/anaconda3/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/home/andresmunozpampillon/anaconda3/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/home/andresmunozpampillon/anaconda3/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/home/andresmunozpampillon/anaconda3/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/home/andresmunozpampillon/spark-3.5.2-bin-hadoop3/python/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/home/andresmunozpampillon/spark-3.5.2-bin-hadoop3/python/pyspark/accumulators.py", line 267, in poll