# In this notebook, find and show the highest cost per product and which customer has bought the highest number of items.

## Adding dummy data to the DataFrame

In [0]:
from pyspark.sql.functions import *
from pyspark.sql import functions as F

# Hand-written sample sales records, one tuple per sale, in the same field
# order as `columns` below.
data = [
    (101, 5, 150.0, 100.0, "Wireless Mouse", "Alice Johnson"),
    (102, 2, 1200.0, 800.0, "Gaming Laptop", "Bob Smith"),
    (103, 10, 25.0, 10.0, "Mouse Pad", "Alice Johnson"),
    (104, 1, 900.0, 750.0, "Office Chair", "Charlie Brown"),
    (105, 3, 300.0, 200.0, "Mechanical Keyboard", "Bob Smith"),
    (106, 15, 15.0, 5.0, "USB Cable", "Alice Johnson"),
    (107, 1, 5500.0, 4800.0, "Premium Workstation", "Diana Prince"),
    (108, 20, 12.0, 4.0, "Cable Ties", "Charlie Brown"),
    (109, 2, 450.0, 310.0, "27-inch Monitor", "Alice Johnson"),
    (110, 1, 3500.0, 3100.0, "Server Rack", "Edward Norton"),
    (111, 8, 80.0, 45.0, "Laptop Stand", "Bob Smith"),
    (112, 12, 60.0, 30.0, "Webcam", "Diana Prince"),
    (113, 1, 2200.0, 1850.0, "Smart Sofa", "Charlie Brown"),
    (114, 25, 5.0, 1.5, "AA Batteries", "Bob Smith"),
    (115, 4, 110.0, 75.0, "USB-C Hub", "Alice Johnson"),
]

# Column names, positionally matched to the tuple fields above.
columns = [
    "sales_id",
    "sales_quantity",
    "selling_price",
    "cost",
    "product_name",
    "customer_name",
]

# Build the Spark DataFrame; only names are supplied, so Spark infers the types.
df = spark.createDataFrame(data, schema=columns)


In [0]:
# Render the sales DataFrame as a table to sanity-check the loaded rows.
display(df)

sales_id,sales_quantity,selling_price,cost,product_name,customer_name
101,5,150.0,100.0,Wireless Mouse,Alice Johnson
102,2,1200.0,800.0,Gaming Laptop,Bob Smith
103,10,25.0,10.0,Mouse Pad,Alice Johnson
104,1,900.0,750.0,Office Chair,Charlie Brown
105,3,300.0,200.0,Mechanical Keyboard,Bob Smith
106,15,15.0,5.0,USB Cable,Alice Johnson
107,1,5500.0,4800.0,Premium Workstation,Diana Prince
108,20,12.0,4.0,Cable Ties,Charlie Brown
109,2,450.0,310.0,27-inch Monitor,Alice Johnson
110,1,3500.0,3100.0,Server Rack,Edward Norton


# Calculating the Profit

In [0]:
# Profit per sale: total selling amount minus total cost for the units sold.
df = df.withColumn(
    "profit",
    df["selling_price"] * df["sales_quantity"] - df["cost"] * df["sales_quantity"],
)
display(df)

sales_id,sales_quantity,selling_price,cost,product_name,customer_name,profit
101,5,150.0,100.0,Wireless Mouse,Alice Johnson,250.0
102,2,1200.0,800.0,Gaming Laptop,Bob Smith,800.0
103,10,25.0,10.0,Mouse Pad,Alice Johnson,150.0
104,1,900.0,750.0,Office Chair,Charlie Brown,150.0
105,3,300.0,200.0,Mechanical Keyboard,Bob Smith,300.0
106,15,15.0,5.0,USB Cable,Alice Johnson,150.0
107,1,5500.0,4800.0,Premium Workstation,Diana Prince,700.0
108,20,12.0,4.0,Cable Ties,Charlie Brown,160.0
109,2,450.0,310.0,27-inch Monitor,Alice Johnson,280.0
110,1,3500.0,3100.0,Server Rack,Edward Norton,400.0


# Calculating the revenue

In [0]:
# Revenue per sale: unit selling price multiplied by the number of units sold.
df = df.withColumn(
    "revenue",
    df["selling_price"] * df["sales_quantity"],
)
display(df)

sales_id,sales_quantity,selling_price,cost,product_name,customer_name,profit,revenue
101,5,150.0,100.0,Wireless Mouse,Alice Johnson,250.0,750.0
102,2,1200.0,800.0,Gaming Laptop,Bob Smith,800.0,2400.0
103,10,25.0,10.0,Mouse Pad,Alice Johnson,150.0,250.0
104,1,900.0,750.0,Office Chair,Charlie Brown,150.0,900.0
105,3,300.0,200.0,Mechanical Keyboard,Bob Smith,300.0,900.0
106,15,15.0,5.0,USB Cable,Alice Johnson,150.0,225.0
107,1,5500.0,4800.0,Premium Workstation,Diana Prince,700.0,5500.0
108,20,12.0,4.0,Cable Ties,Charlie Brown,160.0,240.0
109,2,450.0,310.0,27-inch Monitor,Alice Johnson,280.0,900.0
110,1,3500.0,3100.0,Server Rack,Edward Norton,400.0,3500.0


# Storing data in a Delta table

In [0]:
# Persist the enriched sales DataFrame as a Delta table, replacing any
# existing contents; the mergeSchema option is passed through to the writer.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable("bhim_bricks.dbo.sales_details_table")
)

In [0]:
%sql
-- Read back every row of the saved sales table to confirm the write.
SELECT *
FROM bhim_bricks.dbo.sales_details_table

sales_id,sales_quantity,selling_price,cost,product_name,customer_name,profit,revenue
110,1,3500.0,3100.0,Server Rack,Edward Norton,400.0,3500.0
111,8,80.0,45.0,Laptop Stand,Bob Smith,280.0,640.0
112,12,60.0,30.0,Webcam,Diana Prince,360.0,720.0
113,1,2200.0,1850.0,Smart Sofa,Charlie Brown,350.0,2200.0
114,25,5.0,1.5,AA Batteries,Bob Smith,87.5,125.0
115,4,110.0,75.0,USB-C Hub,Alice Johnson,140.0,440.0
104,1,900.0,750.0,Office Chair,Charlie Brown,150.0,900.0
105,3,300.0,200.0,Mechanical Keyboard,Bob Smith,300.0,900.0
106,15,15.0,5.0,USB Cable,Alice Johnson,150.0,225.0
107,1,5500.0,4800.0,Premium Workstation,Diana Prince,700.0,5500.0


# Finding the product with the highest cost

In [0]:
# Product with the highest unit cost: take each product's maximum `cost`,
# then keep only the single most expensive product.
# The aggregate and sort helpers are referenced through the `F` module alias
# so the call is unambiguously Spark's `max`, not Python's built-in `max`
# (which the wildcard import at the top of the notebook shadows).
high_cost_product = (
    df.groupBy("product_name")
    .agg(F.max("cost").alias("max_cost"))
    # Secondary sort on product_name keeps the chosen row deterministic
    # when two products share the same maximum cost.
    .orderBy(F.desc("max_cost"), F.asc("product_name"))
    .limit(1)
)
display(high_cost_product)

product_name,max_cost
Premium Workstation,4800.0


# Finding the customer who bought the highest number of items

In [0]:
# Customer who bought the most items: sum `sales_quantity` per customer,
# then keep only the customer with the largest total.
# The aggregate and sort helpers are referenced through the `F` module alias
# so the call is unambiguously Spark's `sum`, not Python's built-in `sum`
# (which the wildcard import at the top of the notebook shadows).
high_no_item = (
    df.groupBy("customer_name")
    .agg(F.sum("sales_quantity").alias("total_quantity"))
    # Secondary sort on customer_name keeps the chosen row deterministic
    # when two customers have the same total quantity.
    .orderBy(F.desc("total_quantity"), F.asc("customer_name"))
    .limit(1)
)
display(high_no_item)

customer_name,total_quantity
Bob Smith,38
