In [1]:
from pyspark.sql import SparkSession

In [2]:
# Reuse the active SparkSession if there is one; otherwise create it.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

## Database & Table Tasks

1. Create a new database named sales_db.

In [3]:
# IF NOT EXISTS keeps this cell safe to re-run once sales_db has been created.
create_db_sql = "CREATE DATABASE IF NOT EXISTS sales_db"
spark.sql(create_db_sql)

DataFrame[]

2. Set the current database to sales_db

In [6]:
# Make sales_db the current database so later unqualified table names resolve to it.
use_db_sql = "USE sales_db"
spark.sql(use_db_sql)

DataFrame[]

3.  Create a table product_sales with columns: \
 ProductID (INT) \
 ProductName (STRING) \
 Category (STRING) \
 Price (DOUBLE) \
 Quantity (INT) \
 SaleDate (DATE)

In [7]:
# Create the product_sales table with the column types given in the task.
# ProductName and Category are STRING (as specified) rather than VARCHAR(30),
# so names longer than 30 characters are not rejected on insert.
# IF NOT EXISTS keeps the cell safe to re-run.
spark.sql("""
          CREATE TABLE IF NOT EXISTS product_sales(
          ProductID INT,
          ProductName STRING,
          Category STRING,
          Price DOUBLE,
          Quantity INT,
          SaleDate DATE
          ) USING PARQUET
""")

DataFrame[]

4. Insert at least 5 rows into product_sales.

In [14]:
# Load five sample rows into product_sales.
# String values use single quotes (standard SQL string literals), matching the
# customer_details insert; double-quoted text can be parsed as an identifier
# when ANSI double-quoted identifiers are enabled.
# NOTE: INSERT INTO appends, so re-running this cell without dropping the
# table first adds the same rows again.
spark.sql("""
          INSERT INTO product_sales VALUES
          (101, 'Asus Rog', 'Laptop', 120000, 1, CAST('2021-09-21' AS DATE)),
          (102, 'Logitech mouse', 'Mouse', 600, 3, CAST('2022-11-10' AS DATE)),
          (103, 'Samsung 15 inch', 'TV', 100000, 5, CAST('2025-03-20' AS DATE)),
          (104, 'I Phone', 'Mobile', 80000, 10, CAST('2024-08-20' AS DATE)),
          (105, 'RealMe True wireless', 'TWS', 1600, 4, CAST('2023-10-18' AS DATE))
""")


DataFrame[]

## Query Tasks

5. Select all records from product_sales.

In [15]:
# Fetch every column of every row and print the result.
all_sales = spark.sql("SELECT * FROM product_sales")
all_sales.show()

+---------+--------------------+--------+--------+--------+----------+
|ProductID|         ProductName|Category|   Price|Quantity|  SaleDate|
+---------+--------------------+--------+--------+--------+----------+
|      103|     Samsung 15 inch|      TV|100000.0|       5|2025-03-20|
|      104|             I Phone|  Mobile| 80000.0|      10|2024-08-20|
|      105|RealMe True wireless|     TWS|  1600.0|       4|2023-10-18|
|      101|            Asus Rog|  Laptop|120000.0|       1|2021-09-21|
|      102|      Logitech mouse|   Mouse|   600.0|       3|2022-11-10|
+---------+--------------------+--------+--------+--------+----------+



6. Retrieve products where price is above 500.

In [16]:
# Keep only rows priced above 500 and project the identifying columns.
price_filter_sql = """
          SELECT
          ProductID,
          ProductName,
          Price
          FROM product_sales
          WHERE Price > 500
"""
expensive_products = spark.sql(price_filter_sql)
expensive_products.show()

+---------+--------------------+--------+
|ProductID|         ProductName|   Price|
+---------+--------------------+--------+
|      103|     Samsung 15 inch|100000.0|
|      104|             I Phone| 80000.0|
|      105|RealMe True wireless|  1600.0|
|      101|            Asus Rog|120000.0|
|      102|      Logitech mouse|   600.0|
+---------+--------------------+--------+



7. Calculate total sale amount (Price * Quantity) for each product.

In [19]:
# Derive a per-row revenue column: TotalSaleAmount = Price * Quantity.
total_sale_sql = """
          SELECT
          ProductID,
          ProductName,
          Price,
          Quantity,
          Price * Quantity AS TotalSaleAmount
          FROM product_sales
"""
sales_with_total = spark.sql(total_sale_sql)
sales_with_total.show()

+---------+--------------------+--------+--------+---------------+
|ProductID|         ProductName|   Price|Quantity|TotalSaleAmount|
+---------+--------------------+--------+--------+---------------+
|      103|     Samsung 15 inch|100000.0|       5|       500000.0|
|      104|             I Phone| 80000.0|      10|       800000.0|
|      105|RealMe True wireless|  1600.0|       4|         6400.0|
|      101|            Asus Rog|120000.0|       1|       120000.0|
|      102|      Logitech mouse|   600.0|       3|         1800.0|
+---------+--------------------+--------+--------+---------------+



8. Find the number of products sold in each Category.

In [22]:
# Count rows per category, listed from the smallest count upward.
# COUNT(ProductID) tallies rows with a non-null ProductID; it does not add up
# the units in Quantity.
category_counts_sql = """
          SELECT
          Category,
          COUNT(ProductID) AS NumberOfProductsSold
          FROM product_sales
          GROUP BY Category
          ORDER BY NumberOfProductsSold
"""
category_counts = spark.sql(category_counts_sql)
category_counts.show()

+--------+--------------------+
|Category|NumberOfProductsSold|
+--------+--------------------+
|      TV|                   1|
|     TWS|                   1|
|  Mobile|                   1|
|  Laptop|                   1|
|   Mouse|                   1|
+--------+--------------------+



9. Sort products by total sales in descending order.




In [29]:
# Compute each product's revenue, rank it, and return the rows from highest to
# lowest revenue. The outer ORDER BY is what guarantees the row order of the
# result; the ORDER BY inside the window only determines the SaleRank values.
spark.sql("""
          SELECT
          ProductID,
          ProductName,
          Price,
          Quantity,
          Price * Quantity AS TotalSaleAmount,
          RANK() OVER(ORDER BY Price * Quantity DESC) AS SaleRank
          FROM product_sales
          ORDER BY TotalSaleAmount DESC
""").show()

+---------+--------------------+--------+--------+---------------+--------+
|ProductID|         ProductName|   Price|Quantity|TotalSaleAmount|SaleRank|
+---------+--------------------+--------+--------+---------------+--------+
|      104|             I Phone| 80000.0|      10|       800000.0|       1|
|      103|     Samsung 15 inch|100000.0|       5|       500000.0|       2|
|      101|            Asus Rog|120000.0|       1|       120000.0|       3|
|      105|RealMe True wireless|  1600.0|       4|         6400.0|       4|
|      102|      Logitech mouse|   600.0|       3|         1800.0|       5|
+---------+--------------------+--------+--------+---------------+--------+



## Temporary View Tasks

 10. Create a PySpark DataFrame with dummy product data.

In [30]:
# Column names only; Spark infers each column's type from the sample values.
columns = ["ProductID", "ProductName", "Category", "Price", "Quantity", "SaleDate"]

# Sample product rows (SaleDate is supplied as a plain string here).
data = [
    (201, "Dell Inspiron", "Laptop", 90000, 2, "2023-01-15"),
    (202, "HP Wireless Keyboard", "Keyboard", 1500, 5, "2023-04-22"),
    (203, "Sony Bravia 55 inch", "TV", 150000, 3, "2024-12-05"),
    (204, "OnePlus Nord", "Mobile", 35000, 7, "2023-09-30"),
    (205, "JBL Bluetooth Speaker", "Speaker", 4000, 6, "2022-07-19"),
]

# Build the DataFrame from the rows and preview it.
df = spark.createDataFrame(data, schema=columns)
df.show()

+---------+--------------------+--------+------+--------+----------+
|ProductID|         ProductName|Category| Price|Quantity|  SaleDate|
+---------+--------------------+--------+------+--------+----------+
|      201|       Dell Inspiron|  Laptop| 90000|       2|2023-01-15|
|      202|HP Wireless Keyboard|Keyboard|  1500|       5|2023-04-22|
|      203| Sony Bravia 55 inch|      TV|150000|       3|2024-12-05|
|      204|        OnePlus Nord|  Mobile| 35000|       7|2023-09-30|
|      205|JBL Bluetooth Spe...| Speaker|  4000|       6|2022-07-19|
+---------+--------------------+--------+------+--------+----------+



 11. Register it as a temporary view called temp_orders.

---



In [31]:
# Expose the DataFrame to SQL under the name temp_orders, then read it back.
df.createOrReplaceTempView("temp_orders")
temp_orders_df = spark.sql("SELECT * FROM temp_orders")
temp_orders_df.show()

+---------+--------------------+--------+------+--------+----------+
|ProductID|         ProductName|Category| Price|Quantity|  SaleDate|
+---------+--------------------+--------+------+--------+----------+
|      201|       Dell Inspiron|  Laptop| 90000|       2|2023-01-15|
|      202|HP Wireless Keyboard|Keyboard|  1500|       5|2023-04-22|
|      203| Sony Bravia 55 inch|      TV|150000|       3|2024-12-05|
|      204|        OnePlus Nord|  Mobile| 35000|       7|2023-09-30|
|      205|JBL Bluetooth Spe...| Speaker|  4000|       6|2022-07-19|
+---------+--------------------+--------+------+--------+----------+



12. Run a SQL query to filter temp_orders where quantity > 1

In [32]:
# Select the temp_orders rows whose Quantity is greater than 1.
quantity_filter_sql = """
          SELECT
          ProductID,
          ProductName,
          Category,
          Price,
          Quantity
          FROM temp_orders
          WHERE Quantity > 1
"""
multi_unit_orders = spark.sql(quantity_filter_sql)
multi_unit_orders.show()

+---------+--------------------+--------+------+--------+
|ProductID|         ProductName|Category| Price|Quantity|
+---------+--------------------+--------+------+--------+
|      201|       Dell Inspiron|  Laptop| 90000|       2|
|      202|HP Wireless Keyboard|Keyboard|  1500|       5|
|      203| Sony Bravia 55 inch|      TV|150000|       3|
|      204|        OnePlus Nord|  Mobile| 35000|       7|
|      205|JBL Bluetooth Spe...| Speaker|  4000|       6|
+---------+--------------------+--------+------+--------+



## Global View Tasks

13.  Create a global temp view from a PySpark DataFrame named global_orders.

In [33]:
# Register df as a global temporary view named global_orders.
df.createOrReplaceGlobalTempView("global_orders")

# Read the view back to confirm the registration. Global temp views are
# queried through the global_temp database, so the name must be qualified.
spark.sql("SELECT * FROM global_temp.global_orders").show()

+---------+--------------------+--------+------+--------+----------+
|ProductID|         ProductName|Category| Price|Quantity|  SaleDate|
+---------+--------------------+--------+------+--------+----------+
|      201|       Dell Inspiron|  Laptop| 90000|       2|2023-01-15|
|      202|HP Wireless Keyboard|Keyboard|  1500|       5|2023-04-22|
|      203| Sony Bravia 55 inch|      TV|150000|       3|2024-12-05|
|      204|        OnePlus Nord|  Mobile| 35000|       7|2023-09-30|
|      205|JBL Bluetooth Spe...| Speaker|  4000|       6|2022-07-19|
+---------+--------------------+--------+------+--------+----------+



14. Run a SQL query on the global view from another notebook cell/session.

In [35]:
# Query the global temp view through its qualified name (global_temp.<view>)
# and keep rows priced above 1000.
# The statement text carries no trailing semicolon, matching the other
# queries in this notebook; spark.sql() takes a single statement.
result_df = spark.sql("""
                      SELECT *
                      FROM global_temp.global_orders
                      WHERE Price > 1000
""")
result_df.show()

+---------+--------------------+--------+------+--------+----------+
|ProductID|         ProductName|Category| Price|Quantity|  SaleDate|
+---------+--------------------+--------+------+--------+----------+
|      201|       Dell Inspiron|  Laptop| 90000|       2|2023-01-15|
|      202|HP Wireless Keyboard|Keyboard|  1500|       5|2023-04-22|
|      203| Sony Bravia 55 inch|      TV|150000|       3|2024-12-05|
|      204|        OnePlus Nord|  Mobile| 35000|       7|2023-09-30|
|      205|JBL Bluetooth Spe...| Speaker|  4000|       6|2022-07-19|
+---------+--------------------+--------+------+--------+----------+



##  Join Tasks

15. Create a second table customer_details with: \
 CustomerID, \
 Name, \
 Gender, \
 City, \
 SignupDate

In [49]:
# Create the customer_details table; IF NOT EXISTS keeps the cell re-runnable.
create_customers_sql = """
          CREATE TABLE IF NOT EXISTS customer_details (
          CustomerID INT,
          Name VARCHAR(30),
          Gender VARCHAR(6),
          City VARCHAR(30),
          SignupDate DATE) USING PARQUET
"""
spark.sql(create_customers_sql)

DataFrame[]

16. Insert at least 3 records into customer_details.

In [50]:
# Append five sample customers. CustomerID values 101-105 line up with the
# ProductID values inserted into product_sales, which the join cells rely on.
insert_customers_sql = """
          INSERT INTO customer_details VALUES
          (101, 'Ashwin', 'Male', 'Coimbatore', CAST('2022-05-14' AS DATE)),
          (102, 'Ram', 'Male', 'Hyderabad', CAST('2023-01-23' AS DATE)),
          (103, 'Catherine Lee', 'Female', 'Chicago', CAST('2021-11-30' AS DATE)),
          (104, 'David Brown', 'Male', 'Houston', CAST('2022-09-10' AS DATE)),
          (105, 'Eva Green', 'Female', 'San Francisco', CAST('2023-03-15' AS DATE))
"""
spark.sql(insert_customers_sql)

DataFrame[]

17.  Write a SQL join between product_sales and customer_details based on
 ProductID = CustomerID (simulate a match).

In [53]:
# Pair each product row with the customer whose CustomerID equals its
# ProductID (a simulated match). LEFT JOIN keeps every product row even when
# no customer matches.
sales_customer_join_sql = """
          SELECT
          p.ProductName,
          p.Price,
          p.Quantity,
          c.SignupDate,
          p.SaleDate,
          c.Name
          FROM product_sales p
          LEFT JOIN customer_details c
          ON p.ProductID = c.CustomerID
"""
sales_with_customers = spark.sql(sales_customer_join_sql)
sales_with_customers.show()

+--------------------+--------+--------+----------+----------+-------------+
|         ProductName|   Price|Quantity|SignupDate|  SaleDate|         Name|
+--------------------+--------+--------+----------+----------+-------------+
|     Samsung 15 inch|100000.0|       5|2021-11-30|2025-03-20|Catherine Lee|
|             I Phone| 80000.0|      10|2022-09-10|2024-08-20|  David Brown|
|RealMe True wireless|  1600.0|       4|2023-03-15|2023-10-18|    Eva Green|
|            Asus Rog|120000.0|       1|2022-05-14|2021-09-21|       Ashwin|
|      Logitech mouse|   600.0|       3|2023-01-23|2022-11-10|          Ram|
+--------------------+--------+--------+----------+----------+-------------+



 18. List customers who bought more than 2 products

In [55]:
# List customers whose matched product row has Quantity greater than 2.
# The WHERE clause filters on a product_sales column, so customers without a
# matching product can never appear in the result; INNER JOIN states that
# directly instead of a LEFT JOIN whose unmatched rows are always discarded.
spark.sql("""
          SELECT
          c.CustomerID,
          c.Name,
          p.ProductName,
          p.Quantity
          FROM customer_details c
          INNER JOIN product_sales p
          ON c.CustomerID = p.ProductID
          WHERE p.Quantity > 2
""").show()

+----------+-------------+--------------------+--------+
|CustomerID|         Name|         ProductName|Quantity|
+----------+-------------+--------------------+--------+
|       103|Catherine Lee|     Samsung 15 inch|       5|
|       104|  David Brown|             I Phone|      10|
|       105|    Eva Green|RealMe True wireless|       4|
|       102|          Ram|      Logitech mouse|       3|
+----------+-------------+--------------------+--------+



 ## View & Summary Tasks

19. Create a SQL view sales_summary that includes: \
 ProductName, \
 Price, \
 Quantity, \
 Total = Price * Quantity

In [58]:
# Define (or redefine) the sales_summary view, which adds Total = Price * Quantity.
create_view_sql = """
        CREATE OR REPLACE VIEW sales_summary AS
        SELECT
        ProductName,
        Price,
        Quantity,
        Price * Quantity AS Total
        FROM product_sales
"""
spark.sql(create_view_sql)

# Preview the contents of the view.
summary_df = spark.sql("SELECT * FROM sales_summary")
summary_df.show()

+--------------------+--------+--------+--------+
|         ProductName|   Price|Quantity|   Total|
+--------------------+--------+--------+--------+
|     Samsung 15 inch|100000.0|       5|500000.0|
|             I Phone| 80000.0|      10|800000.0|
|RealMe True wireless|  1600.0|       4|  6400.0|
|            Asus Rog|120000.0|       1|120000.0|
|      Logitech mouse|   600.0|       3|  1800.0|
+--------------------+--------+--------+--------+



20. Query the view for records with
Total > 1000

In [59]:
# Return the view rows whose Total exceeds 1000.
large_totals_sql = """
          SELECT *
          FROM sales_summary
          WHERE Total > 1000
"""
large_totals = spark.sql(large_totals_sql)
large_totals.show()

+--------------------+--------+--------+--------+
|         ProductName|   Price|Quantity|   Total|
+--------------------+--------+--------+--------+
|     Samsung 15 inch|100000.0|       5|500000.0|
|             I Phone| 80000.0|      10|800000.0|
|RealMe True wireless|  1600.0|       4|  6400.0|
|            Asus Rog|120000.0|       1|120000.0|
|      Logitech mouse|   600.0|       3|  1800.0|
+--------------------+--------+--------+--------+



## Cleanup Tasks

21. Drop the view sales_summary.

In [60]:
# Remove the view; IF EXISTS makes this a no-op when it is already gone.
drop_view_sql = "DROP VIEW IF EXISTS sales_summary"
spark.sql(drop_view_sql)

DataFrame[]

22. Drop the tables product_sales and customer_details

In [61]:
# Remove both tables; IF EXISTS keeps the cell safe to re-run.
drop_product_sales_sql = "DROP TABLE IF EXISTS product_sales"
drop_customer_details_sql = "DROP TABLE IF EXISTS customer_details"
spark.sql(drop_product_sales_sql)
spark.sql(drop_customer_details_sql)

DataFrame[]

23. Drop the database sales_db

In [62]:
# Switch back to the built-in default database first, so the session is not
# left with a current database that no longer exists after the drop.
spark.catalog.setCurrentDatabase("default")

# Remove the database; IF EXISTS makes this a no-op when it is already gone.
spark.sql("DROP DATABASE IF EXISTS sales_db")

DataFrame[]