In [1]:
import os
from pathlib import Path

In [2]:
# Directory the kernel was started in; the JSON input is resolved against it.
absolute_path = Path.cwd()

In [3]:
# Pass each path component separately so os.path.join supplies the separator
# instead of relying on a hard-coded "/" embedded in one component.
# The result stays a plain str, which is what is handed to spark.read later.
input_path = os.path.join(absolute_path, 'json', 'nested_order.json')

In [4]:
print(absolute_path)

/home/glue_user/workspace/jupyter_workspace/external_pra/question_ans


In [5]:
print(input_path)

/home/glue_user/workspace/jupyter_workspace/external_pra/question_ans/json/nested_order.json


In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,explode
from pyspark.sql.types import StructType,StructField,IntegerType,StringType,ArrayType

In [7]:
# Obtain the SparkSession for this notebook (application name "readJson").
# getOrCreate() hands back an already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("readJson")
    .getOrCreate()
)

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/home/glue_user/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/spark/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/aws-glue-libs/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/aws-glue-libs/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [8]:
# Top-level layout of one order record.
# `customer` and `items` are deliberately typed as strings here: the reader
# then keeps their raw JSON text, which later cells parse with from_json.
json_schema = (
    StructType()
    .add("order_id", IntegerType(), True)
    .add("customer", StringType(), True)
    .add("items", StringType(), True)
    .add("order_date", StringType(), True)
)

In [9]:
# Load the order file with the explicit schema instead of schema inference.
# NOTE(review): json_schema has no corrupt-record column, so in PERMISSIVE
# mode a malformed line presumably surfaces as a row of nulls — confirm.
df = spark.read.json(input_path, schema=json_schema, mode="PERMISSIVE")

In [10]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- customer: string (nullable = true)
 |-- items: string (nullable = true)
 |-- order_date: string (nullable = true)



In [11]:
df.show(truncate=False)

[Stage 0:>                                                          (0 + 1) / 1]

+--------+---------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|order_id|customer                                                                                                             |items                                                                                                                                                                                              |order_date|
+--------+---------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----

                                                                                

In [12]:
df.select("customer").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------+
|customer                                                                                                             |
+---------------------------------------------------------------------------------------------------------------------+
|{"name":"John Doe","email":"john.doe@example.com","address":{"street":"123 Main St","city":"New York","zip":"10001"}}|
+---------------------------------------------------------------------------------------------------------------------+



In [13]:
# Shape of the JSON text held in the `customer` column:
# name, email, and a nested address struct (city, street, zip).
customer_name_schema = (
    StructType()
    .add("name", StringType(), True)
    .add("email", StringType(), True)
    .add(
        "address",
        StructType()
        .add("city", StringType(), True)
        .add("street", StringType(), True)
        .add("zip", StringType(), True),
        True,
    )
)

In [15]:
from pyspark.sql.functions import from_json

In [16]:
# Parse the raw customer JSON text into a struct, replacing the string column
# of the same name. Because the column is overwritten in place, this cell
# expects `customer` to still be the unparsed string when it runs.
df = df.withColumn("customer", from_json("customer", customer_name_schema))

In [17]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- customer: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- address: struct (nullable = true)
 |    |    |-- city: string (nullable = true)
 |    |    |-- street: string (nullable = true)
 |    |    |-- zip: string (nullable = true)
 |-- items: string (nullable = true)
 |-- order_date: string (nullable = true)



In [18]:
df.select("customer.name").show(truncate=False)

+--------+
|name    |
+--------+
|John Doe|
+--------+



In [19]:
df.show(truncate=False)

+--------+----------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|order_id|customer                                                        |items                                                                                                                                                                                              |order_date|
+--------+----------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|12345   |{John Doe, john.doe@example.com, {New York, 123 Main St, 10001}}|[{"product":"Laptop","details":{"brand":"Apple","model":"MacBook Pro"},"pric

In [20]:
# Promote the customer's name out of the struct into its own top-level column.
df = df.withColumn("name", col("customer").getField("name"))

In [21]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- customer: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- address: struct (nullable = true)
 |    |    |-- city: string (nullable = true)
 |    |    |-- street: string (nullable = true)
 |    |    |-- zip: string (nullable = true)
 |-- items: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- name: string (nullable = true)



In [22]:
df.head()

Row(order_id=12345, customer=Row(name='John Doe', email='john.doe@example.com', address=Row(city='New York', street='123 Main St', zip='10001')), items='[{"product":"Laptop","details":{"brand":"Apple","model":"MacBook Pro"},"price":1200,"quantity":1},{"product":"Mouse","details":{"brand":"Logitech","model":"MX Master 3"},"price":50,"quantity":2}]', order_date='2025-03-19', name='John Doe')

In [23]:
# Promote the customer's email out of the struct into its own top-level column.
df = df.withColumn("email", col("customer").getField("email"))

In [24]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- customer: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- address: struct (nullable = true)
 |    |    |-- city: string (nullable = true)
 |    |    |-- street: string (nullable = true)
 |    |    |-- zip: string (nullable = true)
 |-- items: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)



In [25]:
df.show(truncate=False)

+--------+----------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+--------+--------------------+
|order_id|customer                                                        |items                                                                                                                                                                                              |order_date|name    |email               |
+--------+----------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+--------+--------------------+
|12345   |{John Doe, john.doe@example.com, {New York, 123 Mai

In [26]:
df.select("customer.address").show()

+--------------------+
|             address|
+--------------------+
|{New York, 123 Ma...|
+--------------------+



In [27]:
df.select("customer.address.city").show()

+--------+
|    city|
+--------+
|New York|
+--------+



In [28]:
# Lift the city out of the doubly nested customer.address struct.
df = df.withColumn("city", col("customer").getField("address").getField("city"))

In [29]:
# Lift the remaining address parts (street, then zip) to top-level columns.
customer_address = col("customer").getField("address")
df = (
    df
    .withColumn("street", customer_address.getField("street"))
    .withColumn("zip", customer_address.getField("zip"))
)

In [30]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- customer: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- address: struct (nullable = true)
 |    |    |-- city: string (nullable = true)
 |    |    |-- street: string (nullable = true)
 |    |    |-- zip: string (nullable = true)
 |-- items: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- city: string (nullable = true)
 |-- street: string (nullable = true)
 |-- zip: string (nullable = true)



In [31]:
# name, email, city, street and zip now exist as top-level columns, so the
# nested `customer` struct they were copied from is no longer needed.
df = df.drop("customer")

In [32]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- items: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- city: string (nullable = true)
 |-- street: string (nullable = true)
 |-- zip: string (nullable = true)



In [33]:
df.show()

+--------+--------------------+----------+--------+--------------------+--------+-----------+-----+
|order_id|               items|order_date|    name|               email|    city|     street|  zip|
+--------+--------------------+----------+--------+--------------------+--------+-----------+-----+
|   12345|[{"product":"Lapt...|2025-03-19|John Doe|john.doe@example.com|New York|123 Main St|10001|
+--------+--------------------+----------+--------+--------------------+--------+-----------+-----+



In [34]:
from pyspark.sql.types import LongType

In [35]:
# Shape of the JSON text held in the `items` column: an array of line items,
# each with a nested details struct (brand, model) plus price, product
# and quantity.
items_schema = ArrayType(
    StructType()
    .add(
        "details",
        StructType()
        .add("brand", StringType(), True)
        .add("model", StringType(), True),
        True,
    )
    .add("price", LongType(), True)
    .add("product", StringType(), True)
    .add("quantity", LongType(), True)
)

In [36]:
df.select("items").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|items                                                                                                                                                                                              |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[{"product":"Laptop","details":{"brand":"Apple","model":"MacBook Pro"},"price":1200,"quantity":1},{"product":"Mouse","details":{"brand":"Logitech","model":"MX Master 3"},"price":50,"quantity":2}]|
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+



In [37]:
# Parse the raw items JSON text into an array of structs, replacing the string
# column of the same name. Because the column is overwritten in place, this
# cell expects `items` to still be the unparsed string when it runs.
df = df.withColumn("items", from_json("items", items_schema))

In [38]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- details: struct (nullable = true)
 |    |    |    |-- brand: string (nullable = true)
 |    |    |    |-- model: string (nullable = true)
 |    |    |-- price: long (nullable = true)
 |    |    |-- product: string (nullable = true)
 |    |    |-- quantity: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- city: string (nullable = true)
 |-- street: string (nullable = true)
 |-- zip: string (nullable = true)



In [39]:
# Fan each order out to one row per line item; the element lands in `item`.
# NOTE(review): explode produces no row for an order whose `items` is null or
# empty — confirm that dropping such orders is intended (explode_outer keeps them).
df = df.withColumn("item", explode("items"))

In [40]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- details: struct (nullable = true)
 |    |    |    |-- brand: string (nullable = true)
 |    |    |    |-- model: string (nullable = true)
 |    |    |-- price: long (nullable = true)
 |    |    |-- product: string (nullable = true)
 |    |    |-- quantity: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- city: string (nullable = true)
 |-- street: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- item: struct (nullable = true)
 |    |-- details: struct (nullable = true)
 |    |    |-- brand: string (nullable = true)
 |    |    |-- model: string (nullable = true)
 |    |-- price: long (nullable = true)
 |    |-- product: string (nullable = true)
 |    |-- quantity: long (nullable = true)



In [41]:
df.show(truncate=False)

+--------+----------------------------------------------------------------------------------+----------+--------+--------------------+--------+-----------+-----+---------------------------------------+
|order_id|items                                                                             |order_date|name    |email               |city    |street     |zip  |item                                   |
+--------+----------------------------------------------------------------------------------+----------+--------+--------------------+--------+-----------+-----+---------------------------------------+
|12345   |[{{Apple, MacBook Pro}, 1200, Laptop, 1}, {{Logitech, MX Master 3}, 50, Mouse, 2}]|2025-03-19|John Doe|john.doe@example.com|New York|123 Main St|10001|{{Apple, MacBook Pro}, 1200, Laptop, 1}|
|12345   |[{{Apple, MacBook Pro}, 1200, Laptop, 1}, {{Logitech, MX Master 3}, 50, Mouse, 2}]|2025-03-19|John Doe|john.doe@example.com|New York|123 Main St|10001|{{Logitech, MX Master 3}, 50, M

In [42]:
df.select("item").show(truncate=False)

+---------------------------------------+
|item                                   |
+---------------------------------------+
|{{Apple, MacBook Pro}, 1200, Laptop, 1}|
|{{Logitech, MX Master 3}, 50, Mouse, 2}|
+---------------------------------------+



In [43]:
# Every row now carries its own exploded `item`, so the original `items`
# array repeated on each row is redundant.
df = df.drop("items")

In [44]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- city: string (nullable = true)
 |-- street: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- item: struct (nullable = true)
 |    |-- details: struct (nullable = true)
 |    |    |-- brand: string (nullable = true)
 |    |    |-- model: string (nullable = true)
 |    |-- price: long (nullable = true)
 |    |-- product: string (nullable = true)
 |    |-- quantity: long (nullable = true)



In [45]:
df.select("item.details").show(truncate=False)

+-----------------------+
|details                |
+-----------------------+
|{Apple, MacBook Pro}   |
|{Logitech, MX Master 3}|
+-----------------------+



In [46]:
df.select("item.details.brand").show(truncate=False)

+--------+
|brand   |
+--------+
|Apple   |
|Logitech|
+--------+



In [47]:
# Promote the line item's price out of the `item` struct.
df = df.withColumn("price", col("item").getField("price"))

In [48]:
# Promote product and quantity (in that order) out of the `item` struct.
df = (
    df
    .withColumn("product", col("item").getField("product"))
    .withColumn("quantity", col("item").getField("quantity"))
)

In [49]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- city: string (nullable = true)
 |-- street: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- item: struct (nullable = true)
 |    |-- details: struct (nullable = true)
 |    |    |-- brand: string (nullable = true)
 |    |    |-- model: string (nullable = true)
 |    |-- price: long (nullable = true)
 |    |-- product: string (nullable = true)
 |    |-- quantity: long (nullable = true)
 |-- price: long (nullable = true)
 |-- product: string (nullable = true)
 |-- quantity: long (nullable = true)



In [50]:
# Promote brand and model (in that order) out of the nested item.details struct.
item_details = col("item").getField("details")
df = (
    df
    .withColumn("brand", item_details.getField("brand"))
    .withColumn("model", item_details.getField("model"))
)

In [51]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- city: string (nullable = true)
 |-- street: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- item: struct (nullable = true)
 |    |-- details: struct (nullable = true)
 |    |    |-- brand: string (nullable = true)
 |    |    |-- model: string (nullable = true)
 |    |-- price: long (nullable = true)
 |    |-- product: string (nullable = true)
 |    |-- quantity: long (nullable = true)
 |-- price: long (nullable = true)
 |-- product: string (nullable = true)
 |-- quantity: long (nullable = true)
 |-- brand: string (nullable = true)
 |-- model: string (nullable = true)



In [52]:
# price, product, quantity, brand and model have all been copied to top-level
# columns, so the `item` struct can go; this leaves a fully flat frame.
df = df.drop("item")

In [53]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- city: string (nullable = true)
 |-- street: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- price: long (nullable = true)
 |-- product: string (nullable = true)
 |-- quantity: long (nullable = true)
 |-- brand: string (nullable = true)
 |-- model: string (nullable = true)



In [54]:
df.show(truncate=False)

+--------+----------+--------+--------------------+--------+-----------+-----+-----+-------+--------+--------+-----------+
|order_id|order_date|name    |email               |city    |street     |zip  |price|product|quantity|brand   |model      |
+--------+----------+--------+--------------------+--------+-----------+-----+-----+-------+--------+--------+-----------+
|12345   |2025-03-19|John Doe|john.doe@example.com|New York|123 Main St|10001|1200 |Laptop |1       |Apple   |MacBook Pro|
|12345   |2025-03-19|John Doe|john.doe@example.com|New York|123 Main St|10001|50   |Mouse  |2       |Logitech|MX Master 3|
+--------+----------+--------+--------------------+--------+-----------+-----+-----+-------+--------+--------+-----------+

