In [1]:
import os
from pyspark.sql.functions import col,explode,from_json
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType,StructType,StructField,IntegerType,StringType

In [2]:
from pathlib import Path

In [3]:
# Current working directory as an absolute Path; the input file location
# is resolved against it.
absolute_path = Path.cwd()

In [4]:
# Location of the sample JSON file under the working directory.
# os.path.join returns a plain str here.
input_path = os.path.join(
    os.fspath(absolute_path),
    'json/map_array_struct.json',
)

In [5]:
# Obtain the Spark session for this notebook (an already-running session is
# reused, otherwise a new one is created).
spark = (
    SparkSession.builder
    .appName("map_array_struct")
    .getOrCreate()
)

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/home/glue_user/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/spark/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/aws-glue-libs/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/aws-glue-libs/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [8]:
# Read the JSON file into a DataFrame; no schema is supplied, so Spark infers it.
# "PERMISSIVE" is the tolerant parse mode: malformed records do not abort the read.
df = (
    spark.read
    .option("mode", "PERMISSIVE")
    .json(input_path)
)

In [9]:
# Show the schema Spark inferred from the JSON input.
df.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- geo: struct (nullable = true)
 |    |    |-- lat: string (nullable = true)
 |    |    |-- long: string (nullable = true)
 |    |-- street: string (nullable = true)
 |-- contacts: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- type: string (nullable = true)
 |    |    |-- value: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- preferences: struct (nullable = true)
 |    |-- categories: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- notifications: struct (nullable = true)
 |    |    |-- email: boolean (nullable = true)
 |    |    |-- sms: boolean (nullable = true)



In [10]:
# Unnest `contacts`: one output row per array element, with the element
# stored in a new struct column `contact`.
# NOTE: explode() drops rows whose array is null or empty; explode_outer()
# would be needed to keep such rows.
df = df.withColumn("contact", explode("contacts"))

In [11]:
# Re-check the schema: the new `contact` struct now sits alongside the
# original `contacts` array.
df.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- geo: struct (nullable = true)
 |    |    |-- lat: string (nullable = true)
 |    |    |-- long: string (nullable = true)
 |    |-- street: string (nullable = true)
 |-- contacts: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- type: string (nullable = true)
 |    |    |-- value: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- preferences: struct (nullable = true)
 |    |-- categories: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- notifications: struct (nullable = true)
 |    |    |-- email: boolean (nullable = true)
 |    |    |-- sms: boolean (nullable = true)
 |-- contact: struct (nullable = true)
 |    |-- type: string (nullable = true)
 |    |-- value: string (nullable = true)



In [12]:
# The array column is redundant now that each element lives in `contact`.
df = df.drop("contacts")

In [14]:
# Display the rows without truncating cell contents; each contact entry
# is now its own row.
df.show(truncate=False)

+-------------------------------------------------+---+-----+-------------------------------------+--------------------------+
|address                                          |id |name |preferences                          |contact                   |
+-------------------------------------------------+---+-----+-------------------------------------+--------------------------+
|{New York, {40.7128, -74.0060}, 123 Main St}     |1  |Alice|{[electronics, books], {true, false}}|{email, alice@example.com}|
|{New York, {40.7128, -74.0060}, 123 Main St}     |1  |Alice|{[electronics, books], {true, false}}|{phone, +123456789}       |
|{San Francisco, {37.7749, -122.4194}, 456 Elm St}|2  |Bob  |{[fashion, sports], {false, true}}   |{email, bob@example.com}  |
+-------------------------------------------------+---+-----+-------------------------------------+--------------------------+



In [15]:
# Peek at the nested `categories` array before unnesting it.
df.select(col("preferences.categories")).show()

+--------------------+
|          categories|
+--------------------+
|[electronics, books]|
|[electronics, books]|
|   [fashion, sports]|
+--------------------+



In [16]:
# Unnest preferences.categories into a top-level string column `categories`
# (one row per category, so the row count multiplies with the contact rows).
# NOTE: the source array stays inside `preferences`, so re-running this cell
# explodes it again and duplicates rows.
df = df.withColumn("categories", explode("preferences.categories"))

In [17]:
# Display the result: every (contact, category) combination per user is a row.
df.show(truncate=False)

+-------------------------------------------------+---+-----+-------------------------------------+--------------------------+-----------+
|address                                          |id |name |preferences                          |contact                   |categories |
+-------------------------------------------------+---+-----+-------------------------------------+--------------------------+-----------+
|{New York, {40.7128, -74.0060}, 123 Main St}     |1  |Alice|{[electronics, books], {true, false}}|{email, alice@example.com}|electronics|
|{New York, {40.7128, -74.0060}, 123 Main St}     |1  |Alice|{[electronics, books], {true, false}}|{email, alice@example.com}|books      |
|{New York, {40.7128, -74.0060}, 123 Main St}     |1  |Alice|{[electronics, books], {true, false}}|{phone, +123456789}       |electronics|
|{New York, {40.7128, -74.0060}, 123 Main St}     |1  |Alice|{[electronics, books], {true, false}}|{phone, +123456789}       |books      |
|{San Francisco, {37.7749, 

In [18]:
# Schema after both explodes: `contacts` is gone, `contact` is a struct and
# `categories` is a top-level string column.
df.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- geo: struct (nullable = true)
 |    |    |-- lat: string (nullable = true)
 |    |    |-- long: string (nullable = true)
 |    |-- street: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- preferences: struct (nullable = true)
 |    |-- categories: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- notifications: struct (nullable = true)
 |    |    |-- email: boolean (nullable = true)
 |    |    |-- sms: boolean (nullable = true)
 |-- contact: struct (nullable = true)
 |    |-- type: string (nullable = true)
 |    |-- value: string (nullable = true)
 |-- categories: string (nullable = true)



In [19]:
# Project every nested leaf to a top-level column. Spark names each output
# column after the leaf field, so `email` below is the notification flag
# (not the contact address) and `type` / `value` come from `contact`.
df_flatter = df.select(
    "id",
    "name",
    # address.*
    "address.city",
    "address.geo.lat",
    "address.geo.long",
    "address.street",
    # preferences.notifications.*
    "preferences.notifications.email",
    "preferences.notifications.sms",
    # contact.* (exploded from the original contacts array)
    "contact.type",
    "contact.value",
    # already exploded to a top-level string column
    "categories",
)

In [20]:
# Display the fully flattened table without truncating cell contents.
df_flatter.show(truncate=False)

+---+-----+-------------+-------+---------+-----------+-----+-----+-----+-----------------+-----------+
|id |name |city         |lat    |long     |street     |email|sms  |type |value            |categories |
+---+-----+-------------+-------+---------+-----------+-----+-----+-----+-----------------+-----------+
|1  |Alice|New York     |40.7128|-74.0060 |123 Main St|true |false|email|alice@example.com|electronics|
|1  |Alice|New York     |40.7128|-74.0060 |123 Main St|true |false|email|alice@example.com|books      |
|1  |Alice|New York     |40.7128|-74.0060 |123 Main St|true |false|phone|+123456789       |electronics|
|1  |Alice|New York     |40.7128|-74.0060 |123 Main St|true |false|phone|+123456789       |books      |
|2  |Bob  |San Francisco|37.7749|-122.4194|456 Elm St |false|true |email|bob@example.com  |fashion    |
|2  |Bob  |San Francisco|37.7749|-122.4194|456 Elm St |false|true |email|bob@example.com  |sports     |
+---+-----+-------------+-------+---------+-----------+-----+---