In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,explode,from_json
from pyspark.sql.types import StructType,StructField,StringType,IntegerType,ArrayType

In [66]:
spark = SparkSession.builder.appName("nestedJson").getOrCreate()

In [67]:
# Sample nested JSON data
json_data = [
    (12345, '{"name": "John Doe", "email": "john.doe@example.com"}',
     '[{"product": "Laptop", "price": 1200, "quantity": 1}, {"product": "Mouse", "price": 50, "quantity": 2}]',
     "2025-03-19")
]

In [68]:
schema = StructType([
    StructField("order_id",IntegerType(),True),
    StructField("customer",StringType(),True),
    StructField("items",StringType(),True),
    StructField("order_date",StringType(),True)
])

In [81]:
df = spark.createDataFrame(json_data,schema)

In [9]:
df.show(truncate= False)

                                                                                

+--------+-----------------------------------------------------+-------------------------------------------------------------------------------------------------------+----------+
|order_id|customer                                             |items                                                                                                  |order_date|
+--------+-----------------------------------------------------+-------------------------------------------------------------------------------------------------------+----------+
|12345   |{"name": "John Doe", "email": "john.doe@example.com"}|[{"product": "Laptop", "price": 1200, "quantity": 1}, {"product": "Mouse", "price": 50, "quantity": 2}]|2025-03-19|
+--------+-----------------------------------------------------+-------------------------------------------------------------------------------------------------------+----------+



In [34]:
customer_schema = StructType([
    StructField("name",StringType(),True),
    StructField("email",StringType(),True)
])

In [82]:
df = df.withColumn("customer",from_json(col("customer"),customer_schema))

In [36]:
df.show(truncate=False)

+--------+-----------------------------------------------------+-------------------------------------------------------------------------------------------------------+----------+--------------------------------+
|order_id|customer                                             |items                                                                                                  |order_date|customer_new                    |
+--------+-----------------------------------------------------+-------------------------------------------------------------------------------------------------------+----------+--------------------------------+
|12345   |{"name": "John Doe", "email": "john.doe@example.com"}|[{"product": "Laptop", "price": 1200, "quantity": 1}, {"product": "Mouse", "price": 50, "quantity": 2}]|2025-03-19|{John Doe, john.doe@example.com}|
+--------+-----------------------------------------------------+------------------------------------------------------------------------------------

In [73]:
df = df.withColumn("customer", from_json(col("customer"), customer_schema))

In [83]:
df = df.withColumn("customer_name", col("customer.name")).withColumn("customer_email", col("customer.email"))

In [74]:
df.select(col("customer")).show(truncate=False)

+--------------------------------+
|customer                        |
+--------------------------------+
|{John Doe, john.doe@example.com}|
+--------------------------------+



In [42]:
df.select(col("customer.name")).show(truncate=False)

+--------+
|name    |
+--------+
|John Doe|
+--------+



In [44]:
df.show(truncate=False)

+--------+--------------------------------+-------------------------------------------------------------------------------------------------------+----------+--------------------------------+-------------+--------------------+
|order_id|customer                        |items                                                                                                  |order_date|customer_new                    |customer_name|customer_email      |
+--------+--------------------------------+-------------------------------------------------------------------------------------------------------+----------+--------------------------------+-------------+--------------------+
|12345   |{John Doe, john.doe@example.com}|[{"product": "Laptop", "price": 1200, "quantity": 1}, {"product": "Mouse", "price": 50, "quantity": 2}]|2025-03-19|{John Doe, john.doe@example.com}|John Doe     |john.doe@example.com|
+--------+--------------------------------+-------------------------------------------------

In [76]:
df = df.drop(col("customer")).drop(col("customer_new"))

In [49]:
df.show(truncate=False)

+--------+-------------------------------------------------------------------------------------------------------+----------+-------------+--------------------+
|order_id|items                                                                                                  |order_date|customer_name|customer_email      |
+--------+-------------------------------------------------------------------------------------------------------+----------+-------------+--------------------+
|12345   |[{"product": "Laptop", "price": 1200, "quantity": 1}, {"product": "Mouse", "price": 50, "quantity": 2}]|2025-03-19|John Doe     |john.doe@example.com|
+--------+-------------------------------------------------------------------------------------------------------+----------+-------------+--------------------+



In [84]:
items_schema = ArrayType(StructType([
    StructField("product",StringType(),True),
    StructField("price",IntegerType(),True),
    StructField("quantity",StringType(),True)
]))

In [85]:
df = df.withColumn("items", from_json(col("items"),items_schema))

In [54]:
df.show(truncate=False)

+--------+------------------+----------+-------------+--------------------+
|order_id|items             |order_date|customer_name|customer_email      |
+--------+------------------+----------+-------------+--------------------+
|12345   |{null, null, null}|2025-03-19|John Doe     |john.doe@example.com|
+--------+------------------+----------+-------------+--------------------+



In [86]:
df.select("items.price").show()

                                                                                

+----------+
|     price|
+----------+
|[1200, 50]|
+----------+



In [88]:
df = df.withColumn("item",explode(col("items")))

In [89]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- customer: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- email: string (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- product: string (nullable = true)
 |    |    |-- price: integer (nullable = true)
 |    |    |-- quantity: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- item: struct (nullable = true)
 |    |-- product: string (nullable = true)
 |    |-- price: integer (nullable = true)
 |    |-- quantity: string (nullable = true)



In [90]:
df.show(truncate=False)

+--------+--------------------------------+-----------------------------------+----------+-------------+--------------------+-----------------+
|order_id|customer                        |items                              |order_date|customer_name|customer_email      |item             |
+--------+--------------------------------+-----------------------------------+----------+-------------+--------------------+-----------------+
|12345   |{John Doe, john.doe@example.com}|[{Laptop, 1200, 1}, {Mouse, 50, 2}]|2025-03-19|John Doe     |john.doe@example.com|{Laptop, 1200, 1}|
|12345   |{John Doe, john.doe@example.com}|[{Laptop, 1200, 1}, {Mouse, 50, 2}]|2025-03-19|John Doe     |john.doe@example.com|{Mouse, 50, 2}   |
+--------+--------------------------------+-----------------------------------+----------+-------------+--------------------+-----------------+



In [91]:
df = df.drop("customer","items")

In [92]:
df.show(truncate=False)

+--------+----------+-------------+--------------------+-----------------+
|order_id|order_date|customer_name|customer_email      |item             |
+--------+----------+-------------+--------------------+-----------------+
|12345   |2025-03-19|John Doe     |john.doe@example.com|{Laptop, 1200, 1}|
|12345   |2025-03-19|John Doe     |john.doe@example.com|{Mouse, 50, 2}   |
+--------+----------+-------------+--------------------+-----------------+



In [93]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- item: struct (nullable = true)
 |    |-- product: string (nullable = true)
 |    |-- price: integer (nullable = true)
 |    |-- quantity: string (nullable = true)



In [94]:
df = df.withColumn("product",col("item.product"))

In [96]:
df.select(col("product")).show()

+-------+
|product|
+-------+
| Laptop|
|  Mouse|
+-------+



In [97]:
df = df.withColumn("price",col("item.price")) \
        .withColumn("quantity",col("item.quantity")) \
        .drop("item")

In [98]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- product: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- quantity: string (nullable = true)



In [99]:
df.show()

+--------+----------+-------------+--------------------+-------+-----+--------+
|order_id|order_date|customer_name|      customer_email|product|price|quantity|
+--------+----------+-------------+--------------------+-------+-----+--------+
|   12345|2025-03-19|     John Doe|john.doe@example.com| Laptop| 1200|       1|
|   12345|2025-03-19|     John Doe|john.doe@example.com|  Mouse|   50|       2|
+--------+----------+-------------+--------------------+-------+-----+--------+

