Database & Table Tasks



In [1]:
# 1. Create a new database named sales_db
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
spark.sql("CREATE DATABASE IF NOT EXISTS sales_db")


DataFrame[]

In [2]:
# 2.Set the current database to sales_db .
spark.sql("USE sales_db")

DataFrame[]

In [3]:
# 3.Create a table product_sales with columns
spark.sql("""
    CREATE TABLE IF NOT EXISTS product_sales (
        ProductID INT,
        ProductName STRING,
        Category STRING,
        Price DOUBLE,
        Quantity INT,
        SaleDate DATE
    )
    USING PARQUET
""")


DataFrame[]

In [4]:
# 4.Insert at least 5 rows into product_sales .
spark.sql("""
    INSERT INTO product_sales VALUES
    (1, 'iPhone 15', 'Electronics', 89999.99, 5, DATE('2024-06-01')),
    (2, 'MacBook Air', 'Electronics', 124999.00, 2, DATE('2024-06-03')),
    (3, 'Office Chair', 'Furniture', 8999.50, 10, DATE('2024-06-02')),
    (4, 'LED Monitor', 'Electronics', 15000.00, 3, DATE('2024-06-01')),
    (5, 'Wooden Table', 'Furniture', 4500.75, 4, DATE('2024-06-04'))
""")


DataFrame[]

Query Tasks

In [5]:
# Select all records from product_sales .
spark.sql("SELECT * FROM product_sales").show()

+---------+------------+-----------+--------+--------+----------+
|ProductID| ProductName|   Category|   Price|Quantity|  SaleDate|
+---------+------------+-----------+--------+--------+----------+
|        3|Office Chair|  Furniture|  8999.5|      10|2024-06-02|
|        4| LED Monitor|Electronics| 15000.0|       3|2024-06-01|
|        5|Wooden Table|  Furniture| 4500.75|       4|2024-06-04|
|        1|   iPhone 15|Electronics|89999.99|       5|2024-06-01|
|        2| MacBook Air|Electronics|124999.0|       2|2024-06-03|
+---------+------------+-----------+--------+--------+----------+



In [6]:
# 6. Retrieve products where price is above 500.
spark.sql("SELECT * FROM product_sales WHERE Price > 50000").show()

+---------+-----------+-----------+--------+--------+----------+
|ProductID|ProductName|   Category|   Price|Quantity|  SaleDate|
+---------+-----------+-----------+--------+--------+----------+
|        1|  iPhone 15|Electronics|89999.99|       5|2024-06-01|
|        2|MacBook Air|Electronics|124999.0|       2|2024-06-03|
+---------+-----------+-----------+--------+--------+----------+



In [7]:
#7 . Calculate total sale amount ( Price * Quantity ) for each product.
spark.sql("SELECT ProductName, Price * Quantity AS TotalSaleAmount FROM product_sales").show()

+------------+---------------+
| ProductName|TotalSaleAmount|
+------------+---------------+
|Office Chair|        89995.0|
| LED Monitor|        45000.0|
|Wooden Table|        18003.0|
|   iPhone 15|      449999.95|
| MacBook Air|       249998.0|
+------------+---------------+



In [8]:
# 8. Find the number of products sold in each Category .
spark.sql("""SELECT Category, SUM(Quantity) AS TotalProductsSold
              FROM product_sales GROUP BY Category""").show()

+-----------+-----------------+
|   Category|TotalProductsSold|
+-----------+-----------------+
|Electronics|               10|
|  Furniture|               14|
+-----------+-----------------+



In [9]:
# 9. Sort products by total sales in descending order.
spark.sql("""SELECT ProductName, SUM(Quantity) AS TotalSales
              FROM product_sales GROUP BY ProductName
              ORDER BY TotalSales DESC""").show()

+------------+----------+
| ProductName|TotalSales|
+------------+----------+
|Office Chair|        10|
|   iPhone 15|         5|
|Wooden Table|         4|
| LED Monitor|         3|
| MacBook Air|         2|
+------------+----------+



Temporary View Tasks

In [11]:
# 10. Create a PySpark DataFrame with dummy product data.
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType
from datetime import date
product_data = [
    (101, "Keyboard", "Electronics", 1200.0, 3, date(2024, 6, 1)),
    (102, "Mouse", "Electronics", 600.0, 1, date(2024, 6, 2)),
    (103, "Desk Lamp", "Furniture", 1500.0, 2, date(2024, 6, 3)),
    (104, "Notebook", "Stationery", 50.0, 10, date(2024, 6, 4)),
    (105, "Monitor", "Electronics", 7500.0, 1, date(2024, 6, 5))]
schema = StructType([
    StructField("ProductID", IntegerType(), True),
    StructField("ProductName", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("Price", DoubleType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("SaleDate", DateType(), True)])
sample_df = spark.createDataFrame(product_data, schema=schema)
sample_df.show()


+---------+-----------+-----------+------+--------+----------+
|ProductID|ProductName|   Category| Price|Quantity|  SaleDate|
+---------+-----------+-----------+------+--------+----------+
|      101|   Keyboard|Electronics|1200.0|       3|2024-06-01|
|      102|      Mouse|Electronics| 600.0|       1|2024-06-02|
|      103|  Desk Lamp|  Furniture|1500.0|       2|2024-06-03|
|      104|   Notebook| Stationery|  50.0|      10|2024-06-04|
|      105|    Monitor|Electronics|7500.0|       1|2024-06-05|
+---------+-----------+-----------+------+--------+----------+



In [12]:
# 11. Register it as a temporary view called temp_orders .
sample_df.createOrReplaceTempView("temp_orders")

In [13]:
# 12. Run a SQL query to filter temp_orders where quantity > 1.
spark.sql("SELECT * FROM temp_orders WHERE Quantity > 1").show()

+---------+-----------+-----------+------+--------+----------+
|ProductID|ProductName|   Category| Price|Quantity|  SaleDate|
+---------+-----------+-----------+------+--------+----------+
|      101|   Keyboard|Electronics|1200.0|       3|2024-06-01|
|      103|  Desk Lamp|  Furniture|1500.0|       2|2024-06-03|
|      104|   Notebook| Stationery|  50.0|      10|2024-06-04|
+---------+-----------+-----------+------+--------+----------+



Global View Tasks


In [14]:
# 13. Create a global temp view from a PySpark DataFrame named global_orders .
sample_df.createOrReplaceGlobalTempView("global_orders")

In [15]:
# 14. Run a SQL query on the global view from another notebook cell/session.
spark.sql("SELECT * FROM global_temp.global_orders").show()

+---------+-----------+-----------+------+--------+----------+
|ProductID|ProductName|   Category| Price|Quantity|  SaleDate|
+---------+-----------+-----------+------+--------+----------+
|      101|   Keyboard|Electronics|1200.0|       3|2024-06-01|
|      102|      Mouse|Electronics| 600.0|       1|2024-06-02|
|      103|  Desk Lamp|  Furniture|1500.0|       2|2024-06-03|
|      104|   Notebook| Stationery|  50.0|      10|2024-06-04|
|      105|    Monitor|Electronics|7500.0|       1|2024-06-05|
+---------+-----------+-----------+------+--------+----------+



Join Tasks

In [18]:
# 15. Create a second table customer_details with: CustomerID , Name , Gender , City , SignupDate
spark.sql("""
    CREATE TABLE IF NOT EXISTS customer_details (
        CustomerID INT,
        Name STRING,
        Gender STRING,
        City STRING,
        SignupDate DATE
    )
    USING PARQUET
""")

DataFrame[]

In [20]:
# 16. Insert at least 3 records into customer_details .
spark.sql(""" INSERT INTO customer_details VALUES
              (1, 'Alice Johnson', 'Female', 'New York', DATE('2023-01-15')),
              (2, 'Bob Smith', 'Male', 'Los Angeles', DATE('2022-07-20')),
              (3, 'Charlie Brown', 'Male', 'Chicago', DATE('2023-03-12')) """)


DataFrame[]

In [22]:
# 17. Write a SQL join between product_sales and customer_details based on
#ProductID = CustomerID (simulate a match).
spark.sql("""
    SELECT *
    FROM product_sales p
    JOIN customer_details c ON p.ProductID = c.CustomerID
""").show()

+---------+------------+-----------+--------+--------+----------+----------+-------------+------+-----------+----------+
|ProductID| ProductName|   Category|   Price|Quantity|  SaleDate|CustomerID|         Name|Gender|       City|SignupDate|
+---------+------------+-----------+--------+--------+----------+----------+-------------+------+-----------+----------+
|        3|Office Chair|  Furniture|  8999.5|      10|2024-06-02|         3|Charlie Brown|  Male|    Chicago|2023-03-12|
|        1|   iPhone 15|Electronics|89999.99|       5|2024-06-01|         1|Alice Johnson|Female|   New York|2023-01-15|
|        2| MacBook Air|Electronics|124999.0|       2|2024-06-03|         2|    Bob Smith|  Male|Los Angeles|2022-07-20|
+---------+------------+-----------+--------+--------+----------+----------+-------------+------+-----------+----------+



In [25]:
# 18. List customers who bought more than 2 products.
spark.sql("""
          SELECT c.Name, SUM(p.Quantity) as TotalProductsBought
          FROM product_sales p
          JOIN customer_details c ON p.ProductID = c.CustomerID
          GROUP BY c.Name
          HAVING SUM(p.Quantity) > 2
""").show()


+-------------+-------------------+
|         Name|TotalProductsBought|
+-------------+-------------------+
|Charlie Brown|                 10|
|Alice Johnson|                  5|
+-------------+-------------------+



View & Summary Tasks


In [26]:
# 19. Create a SQL view sales_summary that includes:
# ProductName , Price , Quantity , Total = Price * Quantity
spark.sql("""
CREATE OR REPLACE VIEW sales_summary AS
SELECT ProductName, Price, Quantity, (Price * Quantity) AS Total
FROM product_sales
""")

DataFrame[]

In [27]:
# 20. Query the view for records with Total > 1000 .
spark.sql("SELECT * FROM sales_summary WHERE Total > 1000").show()

+------------+--------+--------+---------+
| ProductName|   Price|Quantity|    Total|
+------------+--------+--------+---------+
|Office Chair|  8999.5|      10|  89995.0|
| LED Monitor| 15000.0|       3|  45000.0|
|Wooden Table| 4500.75|       4|  18003.0|
|   iPhone 15|89999.99|       5|449999.95|
| MacBook Air|124999.0|       2| 249998.0|
+------------+--------+--------+---------+



Cleanup Tasks


In [28]:
# 21. Drop the view sales_summary .
spark.sql("DROP VIEW sales_summary")

DataFrame[]

In [29]:
# 22. Drop the tables product_sales and customer_details .
spark.sql("DROP TABLE product_sales")
spark.sql("DROP TABLE customer_details")

DataFrame[]

In [30]:
# 23. Drop the database sales_db .
spark.sql("DROP DATABASE sales_db")

DataFrame[]