In [1]:
import os
from pathlib import Path
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,explode,from_json
from pyspark.sql.types import StructType,StructField,IntegerType,StringType,ArrayType,LongType

In [2]:
absolute_path = Path().absolute()

In [3]:
input_path = os.path.join(absolute_path, 'json/customer_order.json')

In [4]:
spark = SparkSession.builder.appName("customerOrder").getOrCreate()

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/home/glue_user/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/spark/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/aws-glue-libs/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/aws-glue-libs/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/19 21:36:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [46]:
df = spark.read.option("mode", "PERMISSIVE").json(input_path)

In [47]:
df.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- street: string (nullable = true)
 |    |-- zip: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- email: string (nullable = true)
 |-- name: string (nullable = true)
 |-- orders: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- items: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- item_id: string (nullable = true)
 |    |    |    |    |-- price: long (nullable = true)
 |    |    |    |    |-- product: string (nullable = true)
 |    |    |    |    |-- quantity: long (nullable = true)
 |    |    |-- order_date: string (nullable = true)
 |    |    |-- order_id: string (nullable = true)
 |    |    |-- payment: struct (nullable = true)
 |    |    |    |-- method: string (nullable = true)
 |    |    |    |-- status: string (nullable = tru

In [48]:
df.show(truncate=False)

+--------------------------------------+-----------+-----------------+-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|address                               |customer_id|email            |name |orders                                                                                                                                                                          |
+--------------------------------------+-----------+-----------------+-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{New York, NY, 123 Main St, 10001}    |C001       |alice@example.com|Alice|[{[{I001, 1200, Laptop, 1}, {I002, 50, Mouse, 1}], 2024-01-01, O1001, {Credit Card, Completed}, 350}, {[{I003, 150, Headphones, 1}], 2024-02-15, O1002, {PayPal, P

In [49]:
df = df.withColumn("order",explode(col("orders")))

In [50]:
df.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- street: string (nullable = true)
 |    |-- zip: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- email: string (nullable = true)
 |-- name: string (nullable = true)
 |-- orders: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- items: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- item_id: string (nullable = true)
 |    |    |    |    |-- price: long (nullable = true)
 |    |    |    |    |-- product: string (nullable = true)
 |    |    |    |    |-- quantity: long (nullable = true)
 |    |    |-- order_date: string (nullable = true)
 |    |    |-- order_id: string (nullable = true)
 |    |    |-- payment: struct (nullable = true)
 |    |    |    |-- method: string (nullable = true)
 |    |    |    |-- status: string (nullable = tru

In [51]:
df = df.drop("orders")

In [52]:
df.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- street: string (nullable = true)
 |    |-- zip: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- email: string (nullable = true)
 |-- name: string (nullable = true)
 |-- order: struct (nullable = true)
 |    |-- items: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- item_id: string (nullable = true)
 |    |    |    |-- price: long (nullable = true)
 |    |    |    |-- product: string (nullable = true)
 |    |    |    |-- quantity: long (nullable = true)
 |    |-- order_date: string (nullable = true)
 |    |-- order_id: string (nullable = true)
 |    |-- payment: struct (nullable = true)
 |    |    |-- method: string (nullable = true)
 |    |    |-- status: string (nullable = true)
 |    |-- total_amount: long (nullable = true)



In [53]:
df = df.withColumn("item",explode(col("order.items")))

In [54]:
df.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- street: string (nullable = true)
 |    |-- zip: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- email: string (nullable = true)
 |-- name: string (nullable = true)
 |-- order: struct (nullable = true)
 |    |-- items: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- item_id: string (nullable = true)
 |    |    |    |-- price: long (nullable = true)
 |    |    |    |-- product: string (nullable = true)
 |    |    |    |-- quantity: long (nullable = true)
 |    |-- order_date: string (nullable = true)
 |    |-- order_id: string (nullable = true)
 |    |-- payment: struct (nullable = true)
 |    |    |-- method: string (nullable = true)
 |    |    |-- status: string (nullable = true)
 |    |-- total_amount: long (nullable = true)
 |-- item: struct (nullable = true)
 |    |-- item_id

In [55]:
df = df.drop("order.items")

In [56]:
df.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- street: string (nullable = true)
 |    |-- zip: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- email: string (nullable = true)
 |-- name: string (nullable = true)
 |-- order: struct (nullable = true)
 |    |-- items: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- item_id: string (nullable = true)
 |    |    |    |-- price: long (nullable = true)
 |    |    |    |-- product: string (nullable = true)
 |    |    |    |-- quantity: long (nullable = true)
 |    |-- order_date: string (nullable = true)
 |    |-- order_id: string (nullable = true)
 |    |-- payment: struct (nullable = true)
 |    |    |-- method: string (nullable = true)
 |    |    |-- status: string (nullable = true)
 |    |-- total_amount: long (nullable = true)
 |-- item: struct (nullable = true)
 |    |-- item_id

In [41]:
df.select("item.item_id","item.price","item.product","item.quantity").show(truncate=False)

+-------+-----+----------+--------+
|item_id|price|product   |quantity|
+-------+-----+----------+--------+
|I001   |1200 |Laptop    |1       |
|I002   |50   |Mouse     |1       |
|I001   |1200 |Laptop    |1       |
|I002   |50   |Mouse     |1       |
|I001   |1200 |Laptop    |1       |
|I002   |50   |Mouse     |1       |
|I001   |1200 |Laptop    |1       |
|I002   |50   |Mouse     |1       |
|I003   |150  |Headphones|1       |
|I004   |600  |Smartphone|1       |
|I005   |75   |Charger   |1       |
+-------+-----+----------+--------+



In [57]:
df_flat = df.select("customer_id","name","email","address.city","address.state","address.zip","address.street","order.order_date","order.order_id","order.payment.method","order.payment.status","order.total_amount","item.item_id","item.price","item.product","item.quantity")

In [58]:
df_flat.show(truncate=False)

+-----------+-----+-----------------+-------------+-----+-----+-----------+----------+--------+-----------+---------+------------+-------+-----+----------+--------+
|customer_id|name |email            |city         |state|zip  |street     |order_date|order_id|method     |status   |total_amount|item_id|price|product   |quantity|
+-----------+-----+-----------------+-------------+-----+-----+-----------+----------+--------+-----------+---------+------------+-------+-----+----------+--------+
|C001       |Alice|alice@example.com|New York     |NY   |10001|123 Main St|2024-01-01|O1001   |Credit Card|Completed|350         |I001   |1200 |Laptop    |1       |
|C001       |Alice|alice@example.com|New York     |NY   |10001|123 Main St|2024-01-01|O1001   |Credit Card|Completed|350         |I002   |50   |Mouse     |1       |
|C001       |Alice|alice@example.com|New York     |NY   |10001|123 Main St|2024-02-15|O1002   |PayPal     |Pending  |150         |I003   |150  |Headphones|1       |
|C002     

In [59]:
df_flat.count()

5