**Initialize the Spark Session**

In [1]:
from pyspark.sql import SparkSession
# Build (or reuse, via getOrCreate) a Hive-enabled SparkSession for the
# exercises in this notebook.
spark = (
    SparkSession.builder
    .appName("Spark-SQL-Exercises")
    .enableHiveSupport()
    .getOrCreate()
)

**Database & Table Tasks**

In [5]:
# 1. Create a new database named sales_db (no-op if it already exists).
spark.sql("CREATE DATABASE IF NOT EXISTS sales_db")
# 2. Make sales_db the current database so the unqualified table names
#    below resolve inside it.
spark.sql("USE sales_db")
# 3. Create the product_sales table (no-op if it already exists).
spark.sql("""
    CREATE TABLE IF NOT EXISTS product_sales (
        ProductID INT,
        ProductName STRING,
        Category STRING,
        Price DOUBLE,
        Quantity INT,
        SaleDate DATE
    )
""")
# 4. Load the 5 sample rows into product_sales.
#    INSERT OVERWRITE (rather than INSERT INTO) replaces the table contents,
#    so re-running this cell leaves exactly 5 rows instead of appending
#    duplicates to the table kept by CREATE TABLE IF NOT EXISTS.
spark.sql("""
    INSERT OVERWRITE TABLE product_sales VALUES
    (1, 'Laptop', 'Electronics', 1000.0, 2, CAST('2024-01-10' AS DATE)),
    (2, 'Phone', 'Electronics', 600.0, 1, CAST('2024-02-15' AS DATE)),
    (3, 'Desk', 'Furniture', 200.0, 3, CAST('2024-03-05' AS DATE)),
    (4, 'Chair', 'Furniture', 150.0, 4, CAST('2024-04-20' AS DATE)),
    (5, 'Tablet', 'Electronics', 450.0, 2, CAST('2024-05-10' AS DATE))
""")


DataFrame[]

**Query Tasks**

In [6]:
# Queries for tasks 5-9, executed in order; each result is printed with .show().
query_tasks = [
    # 5. Select all records
    "SELECT * FROM product_sales",
    # 6. Products whose price is above 500
    "SELECT * FROM product_sales WHERE Price > 500",
    # 7. Total sale amount (Price * Quantity) per product
    "SELECT ProductName, Price, Quantity, Price * Quantity AS Total FROM product_sales",
    # 8. Number of products sold (sum of Quantity) by Category
    "SELECT Category, SUM(Quantity) AS TotalSold FROM product_sales GROUP BY Category",
    # 9. Products sorted by total sales, highest first
    "SELECT ProductName, Price, Quantity, Price * Quantity AS Total FROM product_sales ORDER BY Total DESC",
]
for query in query_tasks:
    spark.sql(query).show()


+---------+-----------+-----------+------+--------+----------+
|ProductID|ProductName|   Category| Price|Quantity|  SaleDate|
+---------+-----------+-----------+------+--------+----------+
|        1|     Laptop|Electronics|1000.0|       2|2024-01-10|
|        2|      Phone|Electronics| 600.0|       1|2024-02-15|
|        3|       Desk|  Furniture| 200.0|       3|2024-03-05|
|        4|      Chair|  Furniture| 150.0|       4|2024-04-20|
|        5|     Tablet|Electronics| 450.0|       2|2024-05-10|
+---------+-----------+-----------+------+--------+----------+

+---------+-----------+-----------+------+--------+----------+
|ProductID|ProductName|   Category| Price|Quantity|  SaleDate|
+---------+-----------+-----------+------+--------+----------+
|        1|     Laptop|Electronics|1000.0|       2|2024-01-10|
|        2|      Phone|Electronics| 600.0|       1|2024-02-15|
+---------+-----------+-----------+------+--------+----------+

+-----------+------+--------+------+
|ProductName| Pr

**Temporary View Tasks**

In [8]:
# 10. Build a PySpark DataFrame of ten sample orders.
columns = ["ProductID", "ProductName", "Quantity"]
temp_data = [
    (1, "Laptop", 2), (2, "Phone", 1), (3, "Tablet", 3), (4, "Desktop", 2),
    (5, "Monitor", 4), (6, "Printer", 1), (7, "Scanner", 2), (8, "Keyboard", 3),
    (9, "Mouse", 2), (10, "Headphones", 1),
]
# NOTE: the name `a` is kept because the global-view cell further down uses it.
a = spark.createDataFrame(temp_data, schema=columns)

# 11. Register the DataFrame as the temporary view temp_orders.
a.createOrReplaceTempView("temp_orders")

# 12. SQL query: orders whose quantity is greater than 1.
orders_over_one = spark.sql("SELECT * FROM temp_orders WHERE Quantity > 1")
orders_over_one.show()

+---------+-----------+--------+
|ProductID|ProductName|Quantity|
+---------+-----------+--------+
|        1|     Laptop|       2|
|        3|     Tablet|       3|
|        4|    Desktop|       2|
|        5|    Monitor|       4|
|        7|    Scanner|       2|
|        8|   Keyboard|       3|
|        9|      Mouse|       2|
+---------+-----------+--------+



**Global View Tasks**

In [9]:
# 13. Register the orders DataFrame `a` (built in the temp-view cell) as the
#     global temporary view global_orders.
a.createOrReplaceGlobalTempView("global_orders")
# 14. Query it from another session. Global temporary views are registered
#     under the global_temp database and are visible to every session of the
#     same Spark application, so a session created with newSession() can read
#     the view; querying through `spark` itself would not exercise that.
spark.newSession().sql("SELECT * FROM global_temp.global_orders").show()

+---------+-----------+--------+
|ProductID|ProductName|Quantity|
+---------+-----------+--------+
|        1|     Laptop|       2|
|        2|      Phone|       1|
|        3|     Tablet|       3|
|        4|    Desktop|       2|
|        5|    Monitor|       4|
|        6|    Printer|       1|
|        7|    Scanner|       2|
|        8|   Keyboard|       3|
|        9|      Mouse|       2|
|       10| Headphones|       1|
+---------+-----------+--------+



**Join Tasks**

In [11]:
# 15. Create the customer_details table (no-op if it already exists).
spark.sql("""
    CREATE TABLE IF NOT EXISTS customer_details (
        CustomerID INT,
        Name STRING,
        Gender STRING,
        City STRING,
        SignupDate DATE
    )
""")
# 16. Load the 5 sample customers.
#     INSERT OVERWRITE (rather than INSERT INTO) replaces the table contents,
#     so re-running this cell does not append duplicate customers, which would
#     otherwise multiply the join results below.
#     The Gender values previously misspelled as 'Femlae' are corrected.
spark.sql("""
    INSERT OVERWRITE TABLE customer_details VALUES
    (1,'Aaron','Male','Madurai',CAST('2022-01-01' AS DATE)),
    (2,'Sobana','Female','Bengaluru',CAST('2021-03-15' AS DATE)),
    (3,'Choki','Female','chennai',CAST('2023-06-20' AS DATE)),
    (4,'Mickey','Male','Mumbai',CAST('2024-01-10' AS DATE)),
    (5,'Bubu','Male','Madurai',CAST('2025-01-10' AS DATE))
""")
# 17. Join on ProductID = CustomerID.
#     NOTE(review): product_sales has no customer key; the rows pair up only
#     because both tables use ids 1-5 in the sample data.
spark.sql("""
    SELECT p.ProductID, p.ProductName, c.Name, c.City
    FROM product_sales p
    JOIN customer_details c
    ON p.ProductID = c.CustomerID
""").show()
# 18. Customers who bought more than 2 products (summed Quantity per customer,
#     using the same ProductID = CustomerID pairing as task 17).
spark.sql("""
    SELECT c.Name, SUM(p.Quantity) AS TotalBought
    FROM product_sales p
    JOIN customer_details c
    ON p.ProductID = c.CustomerID
    GROUP BY c.Name
    HAVING TotalBought > 2
""").show()

+---------+-----------+------+---------+
|ProductID|ProductName|  Name|     City|
+---------+-----------+------+---------+
|        1|     Laptop| Aaron|  Madurai|
|        2|      Phone|Sobana|Bengaluru|
|        3|       Desk| Choki|  chennai|
|        4|      Chair|Mickey|   Mumbai|
|        5|     Tablet|  Bubu|  Madurai|
+---------+-----------+------+---------+

+------+-----------+
|  Name|TotalBought|
+------+-----------+
|Mickey|          4|
| Choki|          3|
+------+-----------+



**View & Summary Tasks**

In [12]:
# 19. (Re)create the sales_summary view, exposing each product's
#     Price * Quantity as Total.
create_summary_view = """
    CREATE OR REPLACE VIEW sales_summary AS
    SELECT ProductName, Price, Quantity, Price * Quantity AS Total
    FROM product_sales
"""
spark.sql(create_summary_view)

# 20. Rows of the view whose Total exceeds 1000.
high_value_sales = spark.sql("SELECT * FROM sales_summary WHERE Total > 1000")
high_value_sales.show()

+-----------+------+--------+------+
|ProductName| Price|Quantity| Total|
+-----------+------+--------+------+
|     Laptop|1000.0|       2|2000.0|
+-----------+------+--------+------+



**Cleanup Tasks**

In [14]:
# 21. Drop the sales_summary view (IF EXISTS keeps the cell safe to re-run).
spark.sql("DROP VIEW IF EXISTS sales_summary")
# 22. Drop both tables.
spark.sql("DROP TABLE IF EXISTS product_sales")
spark.sql("DROP TABLE IF EXISTS customer_details")
# 23. Drop the database.
#     Switch back to the default database first: sales_db was made the current
#     database with USE earlier in the notebook, and dropping it while it is
#     still current would leave the session pointing at a database that no
#     longer exists.
spark.sql("USE `default`")
spark.sql("DROP DATABASE IF EXISTS sales_db")

DataFrame[]