In [0]:
json_path = "dbfs:/FileStore/tables/sample_data.json"  # Replace with the actual path
orders_df = spark.read.option("multiline", "true").json(json_path)
orders_df.printSchema()

root
 |-- customer: struct (nullable = true)
 |    |-- address: struct (nullable = true)
 |    |    |-- city: string (nullable = true)
 |    |    |-- state: string (nullable = true)
 |    |    |-- street: string (nullable = true)
 |    |-- customer_id: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- item_id: string (nullable = true)
 |    |    |-- price: long (nullable = true)
 |    |    |-- product_name: string (nullable = true)
 |    |    |-- quantity: long (nullable = true)
 |-- metadata: struct (nullable = true)
 |    |-- delivery_status: string (nullable = true)
 |    |-- payment_method: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_id: long (nullable = true)



In [0]:
orders_df.display(truncate=False)

customer,items,metadata,order_date,order_id
"List(List(Springfield, IL, 123 Elm St), C001, John Doe)","List(List(I001, 1200, Laptop, 1), List(I002, 25, Mouse, 2))","List(Delivered, Credit Card)",2024-08-01,1
"List(List(Metropolis, NY, 456 Oak St), C002, Jane Smith)","List(List(I003, 800, Smartphone, 1), List(I004, 100, Headphones, 1), List(I002, 25, Mouse, 1))","List(Shipped, PayPal)",2024-08-03,2


In [0]:
from pyspark.sql.functions import explode

flattened_df = orders_df.withColumn("item", explode("items"))

In [0]:
from pyspark.sql.functions import col

flat_df = flattened_df.select(
    col("order_id"),
    col("customer.customer_id"),
    col("customer.name").alias("customer_name"),
    col("customer.address.street").alias("street"),
    col("customer.address.city").alias("city"),
    col("customer.address.state").alias("state"),
    col("item.item_id"),
    col("item.product_name"),
    col("item.quantity"),
    col("item.price"),
    col("order_date"),
    col("metadata.payment_method"),
    col("metadata.delivery_status")
)

flat_df.display()

order_id,customer_id,customer_name,street,city,state,item_id,product_name,quantity,price,order_date,payment_method,delivery_status
1,C001,John Doe,123 Elm St,Springfield,IL,I001,Laptop,1,1200,2024-08-01,Credit Card,Delivered
1,C001,John Doe,123 Elm St,Springfield,IL,I002,Mouse,2,25,2024-08-01,Credit Card,Delivered
2,C002,Jane Smith,456 Oak St,Metropolis,NY,I003,Smartphone,1,800,2024-08-03,PayPal,Shipped
2,C002,Jane Smith,456 Oak St,Metropolis,NY,I004,Headphones,1,100,2024-08-03,PayPal,Shipped
2,C002,Jane Smith,456 Oak St,Metropolis,NY,I002,Mouse,1,25,2024-08-03,PayPal,Shipped


In [0]:
from pyspark.sql.functions import sum

total_amount_by_each_customer= flat_df.groupBy('customer_name').agg(sum("price").alias('total_amount_spent'))

total_amount_by_each_customer.display()

customer_name,total_amount_spent
John Doe,1225
Jane Smith,925


In [0]:
from pyspark.sql.functions import countDistinct

distinct_item_by_customer=flat_df.groupBy('customer_name').agg(countDistinct('product_name'))

distinct_item_by_customer.display()

customer_name,count(product_name)
John Doe,2
Jane Smith,3


In [0]:
from pyspark.sql.functions import col, sum as spark_sum

extracted_df = orders_df.select(
    col("customer.customer_id"),
    col("items"),
    col("metadata.payment_method").alias("payment_method"),
    col("metadata.delivery_status").alias("delivery_status")
)

exploded_df=extracted_df.withColumn('item',explode("items"))
exploded_df.display()


customer_id,items,payment_method,delivery_status,item
C001,"List(List(I001, 1200, Laptop, 1), List(I002, 25, Mouse, 2))",Credit Card,Delivered,"List(I001, 1200, Laptop, 1)"
C001,"List(List(I001, 1200, Laptop, 1), List(I002, 25, Mouse, 2))",Credit Card,Delivered,"List(I002, 25, Mouse, 2)"
C002,"List(List(I003, 800, Smartphone, 1), List(I004, 100, Headphones, 1), List(I002, 25, Mouse, 1))",PayPal,Shipped,"List(I003, 800, Smartphone, 1)"
C002,"List(List(I003, 800, Smartphone, 1), List(I004, 100, Headphones, 1), List(I002, 25, Mouse, 1))",PayPal,Shipped,"List(I004, 100, Headphones, 1)"
C002,"List(List(I003, 800, Smartphone, 1), List(I004, 100, Headphones, 1), List(I002, 25, Mouse, 1))",PayPal,Shipped,"List(I002, 25, Mouse, 1)"


In [0]:
# Group by payment method and calculate total revenue 
total_revenue_by_payment_method= exploded_df.groupBy("payment_method").agg(sum(col('item.price')))

total_revenue_by_payment_method.display()

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-1331284502008089>:2[0m
[1;32m      1[0m [38;5;66;03m# Group by payment method and calculate total revenue [39;00m
[0;32m----> 2[0m total_revenue_by_payment_method[38;5;241m=[39m exploded_df[38;5;241m.[39mgroupBy([38;5;124m"[39m[38;5;124mpayment_method[39m[38;5;124m"[39m)[38;5;241m.[39magg([38;5;28msum[39m(col([38;5;124m'[39m[38;5;124mitem.price[39m[38;5;124m'[39m)))
[1;32m      4[0m total_revenue_by_payment_method[38;5;241m.[39mdisplay()

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     res [38;5;241m=[39m [43mfunc[49m

In [0]:
mouse_order=flat_df.groupBy("product_name").agg(count("customer_id").alias("total_customer")).filter(col("product_name")=='Mouse')

mouse_order.display()

product_name,total_customer
Mouse,2


In [0]:
from pyspark.sql.functions import desc

most_bought_item=flat_df.groupBy("product_name").agg(count("product_name").alias("most_bought_item")).orderBy(desc("most_bought_item"))

most_bought_item.show(1)





+------------+----------------+
|product_name|most_bought_item|
+------------+----------------+
|       Mouse|               2|
+------------+----------------+
only showing top 1 row



In [0]:
exploded_df=orders_df.withColumn("item",explode("items"))

# exploded_df.display()
item_count=exploded_df.groupBy("item.product_name").agg(count("item.product_name").alias("items_bought")).orderBy(desc("items_bought"))

item_count.show(1)

+------------+------------+
|product_name|items_bought|
+------------+------------+
|       Mouse|           2|
+------------+------------+
only showing top 1 row



In [0]:
%run /Users/sunnynuri12@gmail.com/Pyspark_Project02Json/Import_Notebooks

In [0]:
#calling Data Loader
json1=DataLoader("dbfs:/FileStore/tables/sample_data.json")
json_df=json1.load_json_df()

#calling Data Transformer
explode_obj=DataTransformer(json_df)
explode_df=explode_obj.exploded_df()
flat_df=explode_obj.flattened_df()

#calling Data Analyzer
data_analyze1=DataAnalyzer(flat_df)
amount_by_customer_df=data_analyze1.total_amount_by_customer()

#Amount_by_customer_df.display()
data_analyze2=DataAnalyzer(explode_df)
payment_method_df=data_analyze2.revenue_by_payment_method()
payment_method_df.display()




payment_method,sum(item.price)
Credit Card,1225
PayPal,925
