In [9]:
from pyspark.sql import SparkSession, Row
from datetime import date

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# 1. Create a new database named sales_db .
# 2. Set the current database to sales_db .
# 3. Create a table product_sales with columns:
# ProductID (INT)
# ProductName (STRING)
# Category (STRING)
# Price (DOUBLE)
# Quantity (INT)
# SaleDate (DATE)
# 4. Insert at least 5 rows into product_sales
#
# NOTE(review): the code below never issues CREATE DATABASE / USE /
# CREATE TABLE. "product_sales" is a session-scoped temp view over a Parquet
# copy of the rows, so steps 1-3 are only approximated - confirm this is the
# intended substitute for a real table in sales_db.

# Declare the column types explicitly: with schema inference Python ints
# become BIGINT, which does not match the INT columns required above.
PRODUCT_SALES_SCHEMA = (
    "ProductID INT, ProductName STRING, Category STRING, "
    "Price DOUBLE, Quantity INT, SaleDate DATE"
)

data = [
    (1, "Laptop", "Electronics", 900.0, 2, date(2025, 5, 1)),
    (2, "Phone", "Electronics", 700.0, 1, date(2025, 5, 2)),
    (3, "TV", "Electronics", 500.0, 4, date(2025, 5, 3)),
    (4, "Book", "Books", 80.0, 10, date(2025, 5, 4)),
    (5, "Shoes", "Fashion", 500.0, 3, date(2025, 5, 5)),
]

df_product_sales = spark.createDataFrame(data, schema=PRODUCT_SALES_SCHEMA)

# Persist the rows as Parquet (replacing any previous run's output) ...
df_product_sales.write.mode("overwrite").parquet("parquet_data/product_sales")

# ... then read them back and expose them to Spark SQL as "product_sales".
df_sales_read = spark.read.parquet("parquet_data/product_sales")
df_sales_read.createOrReplaceTempView("product_sales")
df_sales_read.show()

# Steps 5-9 query the "product_sales" view; each result is bound to a name
# first and then printed, so it can be inspected or reused afterwards.

# 5. Select all records
all_sales = spark.sql("SELECT * FROM product_sales")
all_sales.show()

# 6. Products where price > 500
pricey_products = spark.sql("SELECT * FROM product_sales WHERE Price > 500")
pricey_products.show()

# 7. Total sale amount (Price * Quantity, computed per row)
sale_totals = spark.sql("SELECT ProductName, Price, Quantity, (Price * Quantity) AS TotalSale FROM product_sales")
sale_totals.show()

# 8. Number of products sold in each Category (sum of Quantity per category)
sold_per_category = spark.sql("SELECT Category, SUM(Quantity) AS TotalSold FROM product_sales GROUP BY Category")
sold_per_category.show()

# 9. Sort products by total sales descending
ranking_sql = """
    SELECT ProductName, (Price * Quantity) AS TotalSale
    FROM product_sales
    ORDER BY TotalSale DESC
"""
ranked_by_total = spark.sql(ranking_sql)
ranked_by_total.show()

# 10. Create a PySpark DataFrame with dummy product data
# SaleDate is given as real date objects (as in the product_sales rows) so the
# column is typed DATE instead of STRING.
data = [
    (101, 'Mouse', 'Electronics', 25.0, 2, date(2025, 6, 1)),
    (102, 'Pen', 'Stationery', 5.0, 1, date(2025, 6, 1)),
]
columns = ['ProductID', 'ProductName', 'Category', 'Price', 'Quantity', 'SaleDate']
temp_df = spark.createDataFrame(data, columns)

# 11. Register it as a temporary view
temp_df.createOrReplaceTempView("temp_orders")

# 12. Query: quantity > 1
spark.sql("SELECT * FROM temp_orders WHERE Quantity > 1").show()

# 13. Create a global temp view
temp_df.createOrReplaceGlobalTempView("global_orders")

# 14. Query global view (in same or different session)
# Global temp views must be addressed through the "global_temp" qualifier.
spark.sql("SELECT * FROM global_temp.global_orders").show()


# 15. Create a second table customer_details with:
# CustomerID , Name , Gender , City , SignupDate
# 16. Insert at least 3 records into customer_details

# Field names and values are kept apart; each record is zipped with the field
# names to build the same keyword-style Row objects.
customer_fields = ("CustomerID", "Name", "Gender", "City", "SignupDate")
customer_records = [
    (1, 'Aditya', 'M', 'Coimbatore', date(2025, 1, 1)),
    (3, 'Raj', 'M', 'Chennai', date(2025, 2, 15)),
    (5, 'Muneeb', 'M', 'Bangalore', date(2025, 3, 10)),
]
customer_data = [
    Row(**dict(zip(customer_fields, record))) for record in customer_records
]

df_customers = spark.createDataFrame(customer_data)

# Round-trip through Parquet, overwriting any earlier output at this path.
customer_path = "parquet_data/customer_details"
customer_writer = df_customers.write.mode("overwrite")
customer_writer.parquet(customer_path)

# Expose the stored rows to Spark SQL under the name "customer_details".
df_customers_read = spark.read.parquet(customer_path)
df_customers_read.createOrReplaceTempView("customer_details")

all_customers = spark.sql("SELECT * FROM customer_details")
all_customers.show()


# 17. Join on ProductID = CustomerID
# Inner join, so only ids present in both views produce a row.
product_customer_sql = """
    SELECT p.ProductID, p.ProductName, c.Name, c.City
    FROM product_sales p
    JOIN customer_details c
    ON p.ProductID = c.CustomerID
"""
spark.sql(product_customer_sql).show()

# 18. Customers who bought more than 2 products
# Same join as step 17, restricted to rows whose Quantity exceeds 2.
bulk_buyers_sql = """
    SELECT c.Name, p.Quantity
    FROM product_sales p
    JOIN customer_details c
    ON p.ProductID = c.CustomerID
    WHERE p.Quantity > 2
"""
spark.sql(bulk_buyers_sql).show()

# 19. Create view sales_summary
# Temp view adding a per-row Total (Price * Quantity) to product_sales.
spark.sql("""
    CREATE OR REPLACE TEMP VIEW sales_summary AS
    SELECT ProductName, Price, Quantity, (Price * Quantity) AS Total
    FROM product_sales
""")


# 20. Query view where Total > 1000
spark.sql("SELECT * FROM sales_summary WHERE Total > 1000").show()

# 21. Drop the view
spark.sql("DROP VIEW IF EXISTS sales_summary")

# 22. Drop tables
# product_sales and customer_details are registered with
# createOrReplaceTempView, i.e. they are temp views, so they are removed with
# DROP VIEW. The Parquet files under parquet_data/ are left on disk.
spark.sql("DROP VIEW IF EXISTS product_sales")
spark.sql("DROP VIEW IF EXISTS customer_details")

# Also remove the scratch views created in steps 11 and 13. The global temp
# view would otherwise outlive this session for the rest of the application.
spark.sql("DROP VIEW IF EXISTS temp_orders")
spark.sql("DROP VIEW IF EXISTS global_temp.global_orders")

# 23. Drop the database
# NOTE(review): nothing visible in this script creates sales_db; IF EXISTS
# keeps this statement harmless when the database is absent.
spark.sql("DROP DATABASE IF EXISTS sales_db CASCADE")


+---------+-----------+-----------+-----+--------+----------+
|ProductID|ProductName|   Category|Price|Quantity|  SaleDate|
+---------+-----------+-----------+-----+--------+----------+
|        1|     Laptop|Electronics|900.0|       2|2025-05-01|
|        2|      Phone|Electronics|700.0|       1|2025-05-02|
|        3|         TV|Electronics|500.0|       4|2025-05-03|
|        4|       Book|      Books| 80.0|      10|2025-05-04|
|        5|      Shoes|    Fashion|500.0|       3|2025-05-05|
+---------+-----------+-----------+-----+--------+----------+

+---------+-----------+-----------+-----+--------+----------+
|ProductID|ProductName|   Category|Price|Quantity|  SaleDate|
+---------+-----------+-----------+-----+--------+----------+
|        1|     Laptop|Electronics|900.0|       2|2025-05-01|
|        2|      Phone|Electronics|700.0|       1|2025-05-02|
|        3|         TV|Electronics|500.0|       4|2025-05-03|
|        4|       Book|      Books| 80.0|      10|2025-05-04|
|      

DataFrame[]