In [1]:
import logging
import os
import sys
from time import time

import mysql.connector as connector

## Create logger

In [2]:
# Single source of truth for the log location (previously repeated three times).
LOG_FILE = "../log/window-fxn.log"

# basicConfig cannot create missing directories, so make sure the folder exists
# before the file handler is opened (keeps "Restart & Run All" working on a fresh checkout).
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

logger = logging.getLogger("[Window Functions MySQL]")

# filemode="w" starts every run with an empty log file, replacing the manual
# exists()/remove() pair.  force=True closes and replaces handlers left over from
# an earlier execution of this cell, so re-running it does not keep writing to a
# stale file handle.
# The timestamp uses the 24-hour clock (%H): %I without %p is ambiguous between AM and PM.
logging.basicConfig(
    filename=LOG_FILE,
    filemode="w",
    encoding="utf-8",
    level=logging.DEBUG,
    format="%(asctime)s ==> %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    force=True,
)

## Create Connection

In [3]:
# Open the MySQL session that the rest of the notebook works through.
logger.info("Creating a connection between MySQL and Python")

# Connection settings; the root password is read from the environment so that
# it never appears in the notebook itself.
dbconfig = dict(
    user="root",
    password=os.environ["MYSQL_ROOT_PASSWORD"],
    port=33061,
    host="localhost",
)
connection = connector.connect(**dbconfig)

# Report success both on screen and in the log file.
print("Connection established between MySQL and Python")
logger.info("Connection established between MySQL and Python")

Connection established between MySQL and Python


## Create Cursor Object

In [4]:
# Create the cursor through which every later statement is sent to MySQL.
# Progress is reported twice: on screen (print) and in the log file (logger).
print("Creating cursor object from connection")
logger.info("Creating first cursor object from connection")
cursor = connection.cursor()
print("Cursor object created to communicate with MySQL")
logger.info("Cursor object created to communicate with MySQL")

Creating cursor object from connection
Cursor object created to communicate with MySQL


## Create Database

In [5]:
# Name of the schema used by every query in this notebook.
database_name: str = "db_window_fxn"

# Drop any previous copy first so the notebook always starts from a clean schema.
# The log entry is written before the statement runs, matching the create step below.
drop_database_query: str = f"""DROP DATABASE IF EXISTS {database_name}"""
logger.info(f"Dropping Database {database_name} if it already exists.")
cursor.execute(drop_database_query)

# Re-create the schema and report progress both on screen and in the log.
create_database_query: str = f"""CREATE DATABASE IF NOT EXISTS {database_name}"""
print(f"Creating Database {database_name}.")
logger.info(f"Creating Database {database_name}.")
cursor.execute(create_database_query)
logger.info(f"{database_name} Database created.")
print(f"{database_name} Database created.")

Creating Database db_window_fxn.
db_window_fxn Database created.


## Confirm that database was actually created

In [6]:
# Ask the server for its list of schemas and check that the new one is present.
cursor.execute("SHOW DATABASES;")
databases = cursor.fetchall()

# The schema name is the first column of every returned row.
isCreated = any(database[0] == database_name for database in databases)

if isCreated:
    print(f"Database '{database_name}' was successfully created")
    logger.info(f"Database '{database_name}' was successfully created.")
else:
    # Nothing below can work without the schema: record the failure as an error
    # and stop with a non-zero status instead of carrying on.
    print(f"Database '{database_name}' was not successfully created")
    logger.error(f"Database '{database_name}' was not successfully created.")
    sys.exit(1)

Database 'db_window_fxn' was successfully created


## Set the newly created database as the database to use

In [7]:
# Make the newly created schema the default one for this connection, so the
# statements that follow can refer to tables without a schema prefix.
cursor.execute(f"USE {database_name}")
print(f"Database '{database_name}' set for use.")
logger.info(f"Database '{database_name}' set for use.")

Database 'db_window_fxn' set for use.


## Create tables

In [8]:
# Orders: one row per order, keyed by OrderID.
create_orders_query = """CREATE TABLE IF NOT EXISTS tbl_orders (OrderID INT NOT NULL PRIMARY KEY, ProductID INT, CustomerID INT, SalesPersonID INT, OrderDate Date, ShipDate DATE, OrderStatus VARCHAR(50), ShipAddress VARCHAR(255), BillAddress VARCHAR(255), 
Quantity INT, Sales INT, CreationTime TimeStamp);"""
cursor.execute(create_orders_query)
logger.info("tbl_orders table created.")

# Orders archive: same columns as tbl_orders but WITHOUT a primary key, so the
# same OrderID may appear more than once.
create_orders_archive_query = """CREATE TABLE IF NOT EXISTS tbl_orders_archive (OrderID INT NOT NULL, ProductID INT, CustomerID INT, SalesPersonID INT, OrderDate Date, ShipDate DATE, OrderStatus VARCHAR(50), ShipAddress VARCHAR(255), BillAddress VARCHAR(255), 
Quantity INT, Sales INT, CreationTime TimeStamp);"""
cursor.execute(create_orders_archive_query)
# Fixed copy-paste slip: this entry used to name tbl_orders.
logger.info("tbl_orders_archive table created.")

# Products, keyed by ProductID.
create_products_query = """CREATE TABLE IF NOT EXISTS tbl_products (ProductID INT NOT NULL PRIMARY KEY, Product VARCHAR(50), Category VARCHAR(50), Price INT);"""
cursor.execute(create_products_query)
logger.info("tbl_products table created.")

# Customers, keyed by CustomerID.
create_customers_query = """CREATE TABLE IF NOT EXISTS tbl_customers (CustomerID INT NOT NULL PRIMARY KEY, FirstName VARCHAR(50), LastName VARCHAR(50), Country VARCHAR(50), Score INT);"""
cursor.execute(create_customers_query)
logger.info("tbl_customers table created.")

# Employees, keyed by EmployeeID; ManagerID holds another employee's id (or NULL).
create_employees_query = """CREATE TABLE IF NOT EXISTS tbl_employees (EmployeeID INT NOT NULL PRIMARY KEY, FirstName VARCHAR(50), LastName VARCHAR(50), Department VARCHAR(50), BirthDate Date, Gender CHAR(1), Salary INT, ManagerID INT);"""
cursor.execute(create_employees_query)
logger.info("tbl_employees table created.")

## Insert records into table

In [9]:
# Seed tbl_orders with ten orders (OrderID 1-10).  Some addresses are NULL and
# some are empty strings.
insert_into_orders_query = """INSERT INTO tbl_orders(OrderID, ProductID, CustomerID, SalesPersonID, OrderDate, ShipDate, OrderStatus, ShipAddress, BillAddress, Quantity, Sales, CreationTime) VALUES
(1, 101, 2, 3, '2025-01-01', '2025-01-05', 'Delivered', '9833 Mt. Dias Blv.', '1226 Shoe St.', 1, 10, '2025-01-01 12:34:56.0000000'),  
(2, 102, 3,	3, '2025-01-05', '2025-01-10', 'Shipped', '250 Race Court', NULL, 1, 15, '2025-01-05 23:22:04.0000000'),
(3, 101, 1,	5, '2025-01-10', '2025-01-25', 'Delivered', '8157 W. Book', '8157 W. Book', 2, 20, '2025-01-10 18:24:08.0000000'),
(4, 105, 1, 3, '2025-01-20', '2025-01-25', 'Shipped', '5724 Victory Lane', '',	2, 60, '2025-01-20 05:50:33.0000000'),
(5, 104, 2, 5, '2025-02-01', '2025-02-05', 'Delivered', NULL , NULL , 1, 25, '2025-02-01 14:02:41.0000000'),
(6, 104, 3, 5, '2025-02-05', '2025-02-10', 'Delivered', '1792 Belmont Rd.',NULL , 2, 50, '2025-02-06 15:34:57.0000000'),
(7, 102, 1, 1, '2025-02-15', '2025-02-27', 'Delivered', '136 Balboa Court', '' , 2, 30, '2025-02-16 06:22:01.0000000'),
(8, 101, 4, 3, '2025-02-18', '2025-02-27', 'Shipped', '2947 Vine Lane', '4311 Clay Rd', 3, 90, '2025-02-18 10:45:22.0000000'),
(9, 101, 2, 3, '2025-03-10', '2025-03-15', 'Shipped', '3768 Door Way', '', 2, 20, '2025-03-10 12:59:04.0000000'),
(10, 102, 3, 5, '2025-03-15', '2025-03-20', 'Shipped', NULL , NULL, 0, 60, '2025-03-16 23:25:15.0000000');"""
cursor.execute(insert_into_orders_query)

# Seed tbl_orders_archive.  These rows repeat OrderID values 4 and 6, which the
# table accepts because it has no primary key.
# NOTE(review): the repeated ids look deliberate (duplicate-handling exercises) - confirm.
insert_into_orders_archive_query = """INSERT INTO tbl_orders_archive(OrderID, ProductID, CustomerID, SalesPersonID, OrderDate, ShipDate, OrderStatus, ShipAddress, BillAddress, Quantity, Sales, CreationTime) VALUES
(1, 101, 2, 3, '2025-01-01', '2025-01-05', 'Delivered', '9833 Mt. Dias Blv.', '1226 Shoe St.', 1, 10, '2025-01-01 12:34:56.0000000'),  
(2, 102, 3,	3, '2025-01-05', '2025-01-10', 'Shipped', '250 Race Court', NULL, 1, 15, '2025-01-05 23:22:04.0000000'),
(3, 101, 1,	5, '2025-01-10', '2025-01-25', 'Delivered', '8157 W. Book', '8157 W. Book', 2, 20, '2025-01-10 18:24:08.0000000'),
(4, 105, 1, 3, '2025-01-20', '2025-01-25', 'Shipped', '5724 Victory Lane', '',	2, 60, '2025-01-20 05:50:33.0000000'),
(4, 104, 2, 5, '2025-02-01', '2025-02-05', 'Delivered', NULL , NULL , 1, 25, '2025-02-01 14:02:41.0000000'),
(5, 104, 3, 5, '2025-02-05', '2025-02-10', 'Delivered', '1792 Belmont Rd.',NULL , 2, 50, '2025-02-06 15:34:57.0000000'),
(6, 102, 1, 1, '2025-02-15', '2025-02-27', 'Delivered', '136 Balboa Court', '' , 2, 30, '2025-02-16 06:22:01.0000000'),
(6, 101, 4, 3, '2025-02-18', '2025-02-27', 'Shipped', '2947 Vine Lane', '4311 Clay Rd', 3, 90, '2025-02-18 10:45:22.0000000'),
(6, 101, 2, 3, '2025-03-10', '2025-03-15', 'Shipped', '3768 Door Way', '', 2, 20, '2025-03-10 12:59:04.0000000'),
(7, 102, 3, 5, '2025-03-15', '2025-03-20', 'Shipped', NULL , NULL, 0, 60, '2025-03-16 23:25:15.0000000');"""
cursor.execute(insert_into_orders_archive_query)


# Seed tbl_products with five products (ProductID 101-105).
insert_into_products_query = """INSERT INTO tbl_products(ProductID, Product, Category, Price) VALUES
(101, 'Bottle', 'Accessories', 10),  
(102, 'Tire', 'Accessories', 15),
(103, 'Socks', 'Clothing', 20),  
(104, 'Caps', 'Clothing', 25),
(105, 'Gloves', 'Clothing', 30);"""
cursor.execute(insert_into_products_query)


# Seed tbl_customers with five customers; one LastName and one Score are NULL.
insert_into_customers_query = """INSERT INTO tbl_customers(CustomerID, FirstName, LastName, Country, Score) VALUES
(1, 'Jossef', 'Goldberg', 'Germany', 350),
(2, 'Kevin', 'Brown', 'USA', 900),
(3, 'Mary', NULL, 'USA', 750),
(4, 'Mark', 'Schwarz', 'Germany', 500),
(5, 'Anna', 'Adams', 'USA', NULL);"""
cursor.execute(insert_into_customers_query)


# Seed tbl_employees with five employees; employee 1 has no manager (ManagerID NULL).
insert_into_employees_query = """INSERT INTO tbl_employees(EmployeeID, FirstName, LastName, Department, BirthDate, Gender, Salary, ManagerID) VALUES
(1, 'Frank', 'Lee', 'Marketing', '1988-12-05', 'M', 55000, NULL),
(2, 'Kevin', 'Brown', 'Marketing', '1972-11-25', 'M', 65000, 1),
(3, 'Mary', NULL, 'Sales', '1986-01-05', 'F', 75000, 1), 
(4, 'Michael', 'Ray', 'Sales', '1977-02-10', 'M', 90000, 2), 
(5, 'Carol', 'Baker', 'Sales', '1982-02-11', 'F', 55000, 3);"""

cursor.execute(insert_into_employees_query)


# Commit once, after all five INSERT statements have been executed.
connection.commit()

## Function to Display results
These helper functions display the results of a MySQL query in a tabular format similar to the output of the `mysql` command-line client.

In [59]:
def select_all_query(table_name: str):
    """Return a ``SELECT *`` statement for ``table_name``, limited to 10 rows.

    The table name is interpolated into the SQL text as-is, so it must be a
    trusted identifier.
    """
    return f"SELECT * FROM {table_name} LIMIT 10;"


def display_results(table_column_names: list, results: list, exec_time):
    """Print a result set as an ASCII table in the style of the mysql CLI.

    Args:
        table_column_names: Column headers, one per column.
        results: Sequence of rows; each row is a sequence with one value per
            column.  ``None`` values are printed as ``NULL``.
        exec_time: Value printed as the execution time (in seconds) in the
            footer line.

    Every column is as wide as the longest text that will actually be printed
    in it (header or cell).  Widths are measured on the rendered text, so
    ``None`` counts as the four characters of ``NULL`` and falsy values such
    as ``0`` are measured too; previously both were skipped, which broke the
    alignment of narrow columns containing NULLs.
    """

    def _as_text(value) -> str:
        # Text shown for one cell: SQL NULL for None, str() for anything else.
        return "NULL" if value is None else str(value)

    rendered_rows = [[_as_text(value) for value in row] for row in results]

    # Start from the header widths and widen each column to its longest cell.
    column_widths = [len(name) for name in table_column_names]
    for row in rendered_rows:
        for index, cell in enumerate(row):
            if len(cell) > column_widths[index]:
                column_widths[index] = len(cell)

    # Horizontal rule: one run of dashes per column (width plus one space of
    # padding on each side), separated and terminated by '+'.
    dashes_plus = "".join("+" + "-" * (width + 2) for width in column_widths) + "+"

    print(dashes_plus)
    table_headers = "".join(
        f"| {name:^{width}} " for name, width in zip(table_column_names, column_widths)
    )
    print(table_headers + "|")
    print(dashes_plus)

    for row in rendered_rows:
        table_row = "".join(
            "|" + f"{cell:^{column_widths[index] + 2}}" for index, cell in enumerate(row)
        )
        print(table_row + "|")
    print(dashes_plus)

    num_rows: int = len(results)
    message: str = "row returned" if num_rows == 1 else "rows returned"
    print(f"{num_rows} {message} in set: ({exec_time} sec)")

def execute_display_query_results(query: str = "", table_column_names: list = None, results: list = None):
    """Run ``query`` and print its result set, or print rows that were already fetched.

    Exactly one of the two calling styles is supported:

    * ``query`` alone: the statement is executed on the notebook's ``cursor``,
      timed, and its rows and column names are displayed.
    * ``table_column_names`` and ``results`` without ``query``: the given rows
      are displayed as they are, with an execution time of 0.0.

    Args:
        query: SQL statement to execute.
        table_column_names: Column headers for pre-fetched rows.
        results: Pre-fetched rows.

    Raises:
        ValueError: If ``query`` is combined with ``table_column_names`` or
            ``results``.
    """
    if query and (table_column_names or results):
        # An explicit exception, unlike ``assert``, is not stripped under ``python -O``.
        raise ValueError(
            "You can only pass in the query alone or the table_column_names and results list"
        )

    # Defined up front so the pre-fetched path no longer hits an unbound local.
    exec_time = 0.0
    if query:
        logger.info(f"Executing the query: {query}")
        init_time = time()
        cursor.execute(query)
        # Only execute() is timed; fetching the rows is not included.
        exec_time = time() - init_time
        results = cursor.fetchall()
        table_column_names = cursor.column_names

    # ``or []`` maps the None defaults (which replace the shared mutable
    # ``[]`` defaults) to empty sequences.
    display_results(table_column_names or [], results or [], round(exec_time, 3))

In [11]:
# Preview the first rows of each base table.  One loop replaces four copy-pasted
# print/execute pairs; the printed output is unchanged.
for table_name in ("tbl_orders", "tbl_products", "tbl_customers", "tbl_employees"):
    print(f"\n{table_name}")
    execute_display_query_results(select_all_query(table_name))


tbl_orders
+---------+-----------+------------+---------------+------------+------------+-------------+--------------------+---------------+----------+-------+---------------------+
| OrderID | ProductID | CustomerID | SalesPersonID | OrderDate  |  ShipDate  | OrderStatus |    ShipAddress     |  BillAddress  | Quantity | Sales |    CreationTime     |
+---------+-----------+------------+---------------+------------+------------+-------------+--------------------+---------------+----------+-------+---------------------+
|    1    |    101    |     2      |       3       | 2025-01-01 | 2025-01-05 |  Delivered  | 9833 Mt. Dias Blv. | 1226 Shoe St. |    1     |  10   | 2025-01-01 12:34:56 |
|    2    |    102    |     3      |       3       | 2025-01-05 | 2025-01-10 |   Shipped   |   250 Race Court   |     NULL      |    1     |  15   | 2025-01-05 23:22:04 |
|    3    |    101    |     1      |       5       | 2025-01-10 | 2025-01-25 |  Delivered  |    8157 W. Book    | 8157 W. Book  |    

# Window Functions
Perform calculations (e.g aggregation) on a specific subset of data, without losing the level of details of the rows
- **GroupBy**: Returns a single row for each group. Changes the granularity. **AGGREGATE FXNS** (`COUNT(expr)`, `SUM(expr)`, `AVG(expr)`, `MIN(expr)`, `MAX(expr)`)
- **Window**: Returns a result for each row. The granularity stays the same. **AGGREGATE FXNS** (`COUNT(expr)`, `SUM(expr)`, `AVG(expr)`, `MIN(expr)`, `MAX(expr)`), **RANK FXNS** (`ROW_NUMBER()`, `RANK()`, `DENSE_RANK()`, `CUME_DIST()`, `PERCENT_RANK()`, `NTILE(n)`), **VALUE (Analytics) FXNS** (`LEAD(expr, offset, default)`, `LAG(expr, offset, default)`, `FIRST_VALUE(expr)`, `LAST_VALUE(expr)`)

# Window Function Syntax 
**Window Function** `OVER (` **Partition Clause** · **Order Clause** · **Frame Clause** `)`

Over Clause: Partition Clause, Order Clause, Frame Clause

```sql 
AVG(Sales) OVER(PARTITION BY Category ORDER BY OrderDate ROWS UNBOUNDED PRECEDING)
```
- The **OVER** clause tells SQL that the function used is a window function. It defines a window or subset of data
- The **PARTITION BY** clause divides the result set into partitions (windows), i.e. groups of rows based on the given column(s). The PARTITION BY clause is optional.  
  Without Partition By: ```SUM(Sales) OVER()```. ==> Total Sales across all rows  
  Partition By Single Column:  ```SUM(Sales) OVER(PARTITION BY Product)```. ==> Total sales for each product  
  Partition By Combined-Columns:  ```SUM(Sales) OVER(PARTITION BY Product, OrderStatus)```. ==> Total sales for each combination of Product and OrderStatus

Flexibility of Window: Allows aggregation of data at different granularities within the same query

# Task

## Task 1
Find the total sales across all orders

In [12]:
# Plain aggregate with neither GROUP BY nor OVER: all rows of tbl_orders are
# collapsed into a single row holding the grand total.
select_query = """SELECT SUM(Sales) AS TotalSales FROM tbl_orders;"""
execute_display_query_results(select_query)

+------------+
| TotalSales |
+------------+
|    380     |
+------------+
1 row returned in set: (0.004 sec)


## Task 2
Find the total sales for each product

In [13]:
# GROUP BY productid: one output row per product, with the order-level detail
# collapsed away.
select_query = """SELECT productid, SUM(sales) AS TotalSales FROM tbl_orders GROUP BY productid;"""
execute_display_query_results(select_query)

+-----------+------------+
| productid | TotalSales |
+-----------+------------+
|    101    |    140     |
|    102    |    105     |
|    105    |     60     |
|    104    |     75     |
+-----------+------------+
4 rows returned in set: (0.001 sec)


## Task 3
Find the total sales for each product; additionally, provide details such as the order ID and order date

In [14]:
# Adding the detail columns to GROUP BY does not give a per-product total:
# OrderID is the primary key, so every order forms its own group and
# SUM(Sales) is simply that order's Sales.
select_query = """SELECT OrderID, OrderDate, ProductID, SUM(Sales) AS TotalSales FROM tbl_orders GROUP BY OrderID, OrderDate, ProductID;"""
execute_display_query_results(select_query)

## Group By limits: Can't do aggregations and provide details at same time

+---------+------------+-----------+------------+
| OrderID | OrderDate  | ProductID | TotalSales |
+---------+------------+-----------+------------+
|    1    | 2025-01-01 |    101    |     10     |
|    2    | 2025-01-05 |    102    |     15     |
|    3    | 2025-01-10 |    101    |     20     |
|    4    | 2025-01-20 |    105    |     60     |
|    5    | 2025-02-01 |    104    |     25     |
|    6    | 2025-02-05 |    104    |     50     |
|    7    | 2025-02-15 |    102    |     30     |
|    8    | 2025-02-18 |    101    |     90     |
|    9    | 2025-03-10 |    101    |     20     |
|   10    | 2025-03-15 |    102    |     60     |
+---------+------------+-----------+------------+
10 rows returned in set: (0.001 sec)


In [15]:
# OVER marks SUM as a window function, and a window function returns a result
# for every input row.  With an empty OVER() the window is the whole result
# set, so each row carries the same grand total.
select_query = """SELECT SUM(Sales) OVER() AS TotalSales FROM tbl_orders;"""
execute_display_query_results(select_query)

+------------+
| TotalSales |
+------------+
|    380     |
|    380     |
|    380     |
|    380     |
|    380     |
|    380     |
|    380     |
|    380     |
|    380     |
|    380     |
+------------+
10 rows returned in set: (0.001 sec)


In [16]:
# PARTITION BY productid: the total is computed per product, yet one row per
# order is still returned.
select_query = """SELECT SUM(Sales) OVER(PARTITION BY productid) AS TotalSalesByProduct FROM tbl_orders;"""
execute_display_query_results(select_query)

+---------------------+
| TotalSalesByProduct |
+---------------------+
|         140         |
|         140         |
|         140         |
|         140         |
|         105         |
|         105         |
|         105         |
|         75          |
|         75          |
|         60          |
+---------------------+
10 rows returned in set: (0.001 sec)


In [17]:
# Task 3 solved with a window function: order-level columns are shown next to
# the per-product total, with no GROUP BY.
select_query = """SELECT orderid, orderdate, productid, SUM(Sales) OVER(PARTITION BY productid) AS TotalSalesByProduct FROM tbl_orders;"""
execute_display_query_results(select_query)

+---------+------------+-----------+---------------------+
| orderid | orderdate  | productid | TotalSalesByProduct |
+---------+------------+-----------+---------------------+
|    1    | 2025-01-01 |    101    |         140         |
|    3    | 2025-01-10 |    101    |         140         |
|    8    | 2025-02-18 |    101    |         140         |
|    9    | 2025-03-10 |    101    |         140         |
|    2    | 2025-01-05 |    102    |         105         |
|    7    | 2025-02-15 |    102    |         105         |
|   10    | 2025-03-15 |    102    |         105         |
|    5    | 2025-02-01 |    104    |         75          |
|    6    | 2025-02-05 |    104    |         75          |
|    4    | 2025-01-20 |    105    |         60          |
+---------+------------+-----------+---------------------+
10 rows returned in set: (0.001 sec)


## Task 4:
- Find the total sales across all orders
- Find the total sales for each product
- Find the total sales for each combination of product and order status
- Additionally provide details such as orderid, orderdate 

In [18]:
# Three totals at different granularities in one query:
#   OVER()                                   -> all orders
#   OVER(PARTITION BY productid)             -> per product
#   OVER(PARTITION BY productid, orderstatus) -> per product / order status pair
select_query = """SELECT orderid, orderdate, productid, sales, orderstatus,
SUM(sales) OVER() AS TotalSales,
SUM(Sales) OVER(PARTITION BY productid) AS TotalSalesByProduct, 
SUM(Sales) OVER(PARTITION BY productid, orderstatus) AS TotalSalesByProductsAndStatus 
FROM tbl_orders;"""
execute_display_query_results(select_query)

+---------+------------+-----------+-------+-------------+------------+---------------------+-------------------------------+
| orderid | orderdate  | productid | sales | orderstatus | TotalSales | TotalSalesByProduct | TotalSalesByProductsAndStatus |
+---------+------------+-----------+-------+-------------+------------+---------------------+-------------------------------+
|    1    | 2025-01-01 |    101    |  10   |  Delivered  |    380     |         140         |              30               |
|    3    | 2025-01-10 |    101    |  20   |  Delivered  |    380     |         140         |              30               |
|    8    | 2025-02-18 |    101    |  90   |   Shipped   |    380     |         140         |              110              |
|    9    | 2025-03-10 |    101    |  20   |   Shipped   |    380     |         140         |              110              |
|    7    | 2025-02-15 |    102    |  30   |  Delivered  |    380     |         105         |              30         

In [19]:
# Find the total sales for each combination of product and order status.
# Partitioning by two columns makes each distinct (productid, orderstatus)
# pair its own window.
select_query = """SELECT orderid, orderdate, sales, productid, orderstatus,
SUM(Sales) OVER(PARTITION BY productid, orderstatus) AS TotalSalesByProductsAndStatus 
FROM tbl_orders;"""
execute_display_query_results(select_query)

+---------+------------+-------+-----------+-------------+-------------------------------+
| orderid | orderdate  | sales | productid | orderstatus | TotalSalesByProductsAndStatus |
+---------+------------+-------+-----------+-------------+-------------------------------+
|    1    | 2025-01-01 |  10   |    101    |  Delivered  |              30               |
|    3    | 2025-01-10 |  20   |    101    |  Delivered  |              30               |
|    8    | 2025-02-18 |  90   |    101    |   Shipped   |              110              |
|    9    | 2025-03-10 |  20   |    101    |   Shipped   |              110              |
|    7    | 2025-02-15 |  30   |    102    |  Delivered  |              30               |
|    2    | 2025-01-05 |  15   |    102    |   Shipped   |              75               |
|   10    | 2025-03-15 |  60   |    102    |   Shipped   |              75               |
|    5    | 2025-02-01 |  25   |    104    |  Delivered  |              75               |

## Task 5 
Rank each order based on their sales from highest to lowest, additionally provide details such as orderid and orderdate

In [20]:
# RANK() needs an ORDER BY inside OVER() to be meaningful: the ordering defines
# the ranking.  (MySQL accepts RANK() without ORDER BY, but then all rows are
# peers and every row gets rank 1.)
# Rows with equal sales share a rank and the following rank is skipped.

select_query = """SELECT orderid, orderdate, productid, sales, orderstatus,
RANK() OVER(ORDER BY sales DESC) AS RankSales
FROM tbl_orders;"""
execute_display_query_results(select_query)

+---------+------------+-----------+-------+-------------+-----------+
| orderid | orderdate  | productid | sales | orderstatus | RankSales |
+---------+------------+-----------+-------+-------------+-----------+
|    8    | 2025-02-18 |    101    |  90   |   Shipped   |     1     |
|    4    | 2025-01-20 |    105    |  60   |   Shipped   |     2     |
|   10    | 2025-03-15 |    102    |  60   |   Shipped   |     2     |
|    6    | 2025-02-05 |    104    |  50   |  Delivered  |     4     |
|    7    | 2025-02-15 |    102    |  30   |  Delivered  |     5     |
|    5    | 2025-02-01 |    104    |  25   |  Delivered  |     6     |
|    3    | 2025-01-10 |    101    |  20   |  Delivered  |     7     |
|    9    | 2025-03-10 |    101    |  20   |   Shipped   |     7     |
|    2    | 2025-01-05 |    102    |  15   |   Shipped   |     9     |
|    1    | 2025-01-01 |    101    |  10   |  Delivered  |    10     |
+---------+------------+-----------+-------+-------------+-----------+
10 row

# Frame Clause
 ```sql
AVG(Sales) OVER (PARTITION BY Category ORDER BY OrderDate ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING)
 ```
 - Frame Types: ROWS, RANGE
 - Frame Boundary(Lower Value): CURRENT ROW, N PRECEDING, UNBOUNDED PRECEDING
 - Frame Boundary(Higher Value): CURRENT ROW, N FOLLOWING, UNBOUNDED FOLLOWING  
**Frame Clause can only be used together with order by clause**  
**Lower Value must be BEFORE the higher value**

In [21]:
# Running total per product: within each ProductID partition, ordered by
# orderdate, the frame runs from the first row up to the current row.
select_query = """SELECT orderid, orderdate, productid, sales, orderstatus,
SUM(sales) OVER(PARTITION BY ProductID ORDER BY orderdate ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS TotalSalesByProduct
FROM tbl_orders;"""
execute_display_query_results(select_query)

+---------+------------+-----------+-------+-------------+---------------------+
| orderid | orderdate  | productid | sales | orderstatus | TotalSalesByProduct |
+---------+------------+-----------+-------+-------------+---------------------+
|    1    | 2025-01-01 |    101    |  10   |  Delivered  |         10          |
|    3    | 2025-01-10 |    101    |  20   |  Delivered  |         30          |
|    8    | 2025-02-18 |    101    |  90   |   Shipped   |         120         |
|    9    | 2025-03-10 |    101    |  20   |   Shipped   |         140         |
|    2    | 2025-01-05 |    102    |  15   |   Shipped   |         15          |
|    7    | 2025-02-15 |    102    |  30   |  Delivered  |         45          |
|   10    | 2025-03-15 |    102    |  60   |   Shipped   |         105         |
|    5    | 2025-02-01 |    104    |  25   |  Delivered  |         25          |
|    6    | 2025-02-05 |    104    |  50   |  Delivered  |         75          |
|    4    | 2025-01-20 |    

In [22]:
# Forward-looking frame: within each orderstatus partition, ordered by
# orderdate, sum the current row and up to the next two rows.  Near the end of
# a partition fewer rows are available, so the sum covers fewer rows.
select_query = """SELECT orderid, orderdate, productid, sales, orderstatus,
SUM(sales) OVER(PARTITION BY orderstatus ORDER BY orderdate ROWS BETWEEN CURRENT ROW AND 2 FOLLOWING) AS TotalSalesByOrderStatus
FROM tbl_orders;"""
execute_display_query_results(select_query)

+---------+------------+-----------+-------+-------------+-------------------------+
| orderid | orderdate  | productid | sales | orderstatus | TotalSalesByOrderStatus |
+---------+------------+-----------+-------+-------------+-------------------------+
|    1    | 2025-01-01 |    101    |  10   |  Delivered  |           55            |
|    3    | 2025-01-10 |    101    |  20   |  Delivered  |           95            |
|    5    | 2025-02-01 |    104    |  25   |  Delivered  |           105           |
|    6    | 2025-02-05 |    104    |  50   |  Delivered  |           80            |
|    7    | 2025-02-15 |    102    |  30   |  Delivered  |           30            |
|    2    | 2025-01-05 |    102    |  15   |   Shipped   |           165           |
|    4    | 2025-01-20 |    105    |  60   |   Shipped   |           170           |
|    8    | 2025-02-18 |    101    |  90   |   Shipped   |           170           |
|    9    | 2025-03-10 |    101    |  20   |   Shipped   |       

In [23]:
# The frame spans the whole partition (UNBOUNDED PRECEDING to UNBOUNDED
# FOLLOWING), so every row shows the full per-product total.
select_query = """SELECT orderid, orderdate, productid, sales, orderstatus,
SUM(sales) OVER(PARTITION BY ProductID ORDER BY orderdate ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS TotalSalesByProduct
FROM tbl_orders;"""
execute_display_query_results(select_query)

+---------+------------+-----------+-------+-------------+---------------------+
| orderid | orderdate  | productid | sales | orderstatus | TotalSalesByProduct |
+---------+------------+-----------+-------+-------------+---------------------+
|    1    | 2025-01-01 |    101    |  10   |  Delivered  |         140         |
|    3    | 2025-01-10 |    101    |  20   |  Delivered  |         140         |
|    8    | 2025-02-18 |    101    |  90   |   Shipped   |         140         |
|    9    | 2025-03-10 |    101    |  20   |   Shipped   |         140         |
|    2    | 2025-01-05 |    102    |  15   |   Shipped   |         105         |
|    7    | 2025-02-15 |    102    |  30   |  Delivered  |         105         |
|   10    | 2025-03-15 |    102    |  60   |   Shipped   |         105         |
|    5    | 2025-02-01 |    104    |  25   |  Delivered  |         75          |
|    6    | 2025-02-05 |    104    |  50   |  Delivered  |         75          |
|    4    | 2025-01-20 |    

In [24]:
# Running total per order status: within each orderstatus partition, ordered by
# orderdate, sum from the first row up to the current row.
select_query = """SELECT orderid, orderdate, productid, sales, orderstatus,
SUM(sales) OVER(PARTITION BY orderstatus ORDER BY orderdate ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS TotalSales
FROM tbl_orders;"""
execute_display_query_results(select_query)

+---------+------------+-----------+-------+-------------+------------+
| orderid | orderdate  | productid | sales | orderstatus | TotalSales |
+---------+------------+-----------+-------+-------------+------------+
|    1    | 2025-01-01 |    101    |  10   |  Delivered  |     10     |
|    3    | 2025-01-10 |    101    |  20   |  Delivered  |     30     |
|    5    | 2025-02-01 |    104    |  25   |  Delivered  |     55     |
|    6    | 2025-02-05 |    104    |  50   |  Delivered  |    105     |
|    7    | 2025-02-15 |    102    |  30   |  Delivered  |    135     |
|    2    | 2025-01-05 |    102    |  15   |   Shipped   |     15     |
|    4    | 2025-01-20 |    105    |  60   |   Shipped   |     75     |
|    8    | 2025-02-18 |    101    |  90   |   Shipped   |    165     |
|    9    | 2025-03-10 |    101    |  20   |   Shipped   |    185     |
|   10    | 2025-03-15 |    102    |  60   |   Shipped   |    245     |
+---------+------------+-----------+-------+-------------+------

In [25]:
# Same running total as in the previous cell, restricted to 'Shipped' orders.
# The WHERE filter is applied before the window function is evaluated, so the
# window only sees the rows that pass the filter.
select_query = """SELECT orderid, orderdate, productid, sales, orderstatus,
SUM(sales) OVER(PARTITION BY orderstatus ORDER BY orderdate ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS TotalSales
FROM tbl_orders WHERE orderstatus = 'Shipped';"""
execute_display_query_results(select_query)

# Blank lines between the two result tables.
print("\n")

# The same query restricted to 'Delivered' orders.
select_query = """SELECT orderid, orderdate, productid, sales, orderstatus,
SUM(sales) OVER(PARTITION BY orderstatus ORDER BY orderdate ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS TotalSales
FROM tbl_orders WHERE orderstatus = 'Delivered';"""
execute_display_query_results(select_query)

+---------+------------+-----------+-------+-------------+------------+
| orderid | orderdate  | productid | sales | orderstatus | TotalSales |
+---------+------------+-----------+-------+-------------+------------+
|    2    | 2025-01-05 |    102    |  15   |   Shipped   |     15     |
|    4    | 2025-01-20 |    105    |  60   |   Shipped   |     75     |
|    8    | 2025-02-18 |    101    |  90   |   Shipped   |    165     |
|    9    | 2025-03-10 |    101    |  20   |   Shipped   |    185     |
|   10    | 2025-03-15 |    102    |  60   |   Shipped   |    245     |
+---------+------------+-----------+-------+-------------+------------+
5 rows returned in set: (0.001 sec)


+---------+------------+-----------+-------+-------------+------------+
| orderid | orderdate  | productid | sales | orderstatus | TotalSales |
+---------+------------+-----------+-------+-------------+------------+
|    1    | 2025-01-01 |    101    |  10   |  Delivered  |     10     |
|    3    | 2025-01-10 |  

# 4 Rules of Window Functions
- Window functions can be used only in the SELECT and ORDER BY clauses.
- Nesting window functions is not allowed!
- SQL executes window functions after the WHERE clause.
- A window function can be used together with GROUP BY in the same query ONLY if the same columns are used.

In [26]:
# Demonstrates that a window function may appear in ORDER BY as well as in SELECT:
# every row carries the total sales of its order status, and the rows are sorted by that total.
select_query = """SELECT orderid, orderdate, productid, sales, orderstatus,
SUM(sales) OVER(PARTITION BY orderstatus) AS TotalSales
FROM tbl_orders
ORDER BY SUM(sales) OVER(PARTITION BY orderstatus);"""
execute_display_query_results(select_query)

+---------+------------+-----------+-------+-------------+------------+
| orderid | orderdate  | productid | sales | orderstatus | TotalSales |
+---------+------------+-----------+-------+-------------+------------+
|    1    | 2025-01-01 |    101    |  10   |  Delivered  |    135     |
|    3    | 2025-01-10 |    101    |  20   |  Delivered  |    135     |
|    5    | 2025-02-01 |    104    |  25   |  Delivered  |    135     |
|    6    | 2025-02-05 |    104    |  50   |  Delivered  |    135     |
|    7    | 2025-02-15 |    102    |  30   |  Delivered  |    135     |
|    2    | 2025-01-05 |    102    |  15   |   Shipped   |    245     |
|    4    | 2025-01-20 |    105    |  60   |   Shipped   |    245     |
|    8    | 2025-02-18 |    101    |  90   |   Shipped   |    245     |
|    9    | 2025-03-10 |    101    |  20   |   Shipped   |    245     |
|   10    | 2025-03-15 |    102    |  60   |   Shipped   |    245     |
+---------+------------+-----------+-------+-------------+------

In [27]:
# Display tbl_orders in full for reference.
# select_all_query is defined elsewhere in the notebook; presumably it builds a SELECT * statement for the given table.
execute_display_query_results(select_all_query("tbl_orders"))

+---------+-----------+------------+---------------+------------+------------+-------------+--------------------+---------------+----------+-------+---------------------+
| OrderID | ProductID | CustomerID | SalesPersonID | OrderDate  |  ShipDate  | OrderStatus |    ShipAddress     |  BillAddress  | Quantity | Sales |    CreationTime     |
+---------+-----------+------------+---------------+------------+------------+-------------+--------------------+---------------+----------+-------+---------------------+
|    1    |    101    |     2      |       3       | 2025-01-01 | 2025-01-05 |  Delivered  | 9833 Mt. Dias Blv. | 1226 Shoe St. |    1     |  10   | 2025-01-01 12:34:56 |
|    2    |    102    |     3      |       3       | 2025-01-05 | 2025-01-10 |   Shipped   |   250 Race Court   |     NULL      |    1     |  15   | 2025-01-05 23:22:04 |
|    3    |    101    |     1      |       5       | 2025-01-10 | 2025-01-25 |  Delivered  |    8157 W. Book    | 8157 W. Book  |    2     |  20 

In [28]:
## Rank customers based on their total sales
# With RANK(), we *must* use the ORDER BY clause inside OVER().
# GROUP BY collapses the orders to one row per customer; RANK() then orders those
# rows by the aggregated SUM(Sales), highest first.

select_query = """SELECT customerid, 
SUM(Sales) AS TotalSalesByCustomer,
RANK() OVER(Order By SUM(Sales) DESC) AS RankCustomersBySales 
FROM tbl_orders
GROUP BY customerid;"""
execute_display_query_results(select_query)

+------------+----------------------+----------------------+
| customerid | TotalSalesByCustomer | RankCustomersBySales |
+------------+----------------------+----------------------+
|     3      |         125          |          1           |
|     1      |         110          |          2           |
|     4      |          90          |          3           |
|     2      |          55          |          4           |
+------------+----------------------+----------------------+
4 rows returned in set: (0.001 sec)


In [29]:
# Rank individual orders by sales, highest first (no GROUP BY, so one row per order).
# Orders with equal sales share a rank and the following rank is skipped (2, 2, 4 in the output).
select_query = """SELECT customerid, orderid, orderdate, productid, sales, orderstatus,
RANK() OVER(ORDER BY sales DESC) AS RankSales
FROM tbl_orders;"""
execute_display_query_results(select_query)

+------------+---------+------------+-----------+-------+-------------+-----------+
| customerid | orderid | orderdate  | productid | sales | orderstatus | RankSales |
+------------+---------+------------+-----------+-------+-------------+-----------+
|     4      |    8    | 2025-02-18 |    101    |  90   |   Shipped   |     1     |
|     1      |    4    | 2025-01-20 |    105    |  60   |   Shipped   |     2     |
|     3      |   10    | 2025-03-15 |    102    |  60   |   Shipped   |     2     |
|     3      |    6    | 2025-02-05 |    104    |  50   |  Delivered  |     4     |
|     1      |    7    | 2025-02-15 |    102    |  30   |  Delivered  |     5     |
|     2      |    5    | 2025-02-01 |    104    |  25   |  Delivered  |     6     |
|     1      |    3    | 2025-01-10 |    101    |  20   |  Delivered  |     7     |
|     2      |    9    | 2025-03-10 |    101    |  20   |   Shipped   |     7     |
|     3      |    2    | 2025-01-05 |    102    |  15   |   Shipped   |     

In [30]:
# GROUP BY and a window function in the same query: the window orders by SUM(sales),
# the aggregate computed per customerid group, so each customer gets exactly one rank.
select_query = """SELECT customerid, SUM(sales),
RANK() OVER(ORDER BY SUM(sales) DESC) AS RankCustomersBySales 
FROM tbl_orders
GROUP BY customerid;"""
execute_display_query_results(select_query)

+------------+------------+----------------------+
| customerid | SUM(sales) | RankCustomersBySales |
+------------+------------+----------------------+
|     3      |    125     |          1           |
|     1      |    110     |          2           |
|     4      |     90     |          3           |
|     2      |     55     |          4           |
+------------+------------+----------------------+
4 rows returned in set: (0.001 sec)


In [31]:
# Perform the GROUP BY before the ranking, then rank on a value produced by the GROUP BY.
# Step 1: the plain aggregation, one row per customer.
select_query = """SELECT customerid, SUM(sales)
FROM tbl_orders
GROUP BY customerid;"""
execute_display_query_results(select_query)

print("\n")

# Step 2: the same aggregation with RANK() ordered by the aggregated SUM(sales).
select_query = """SELECT customerid, SUM(sales),
RANK() OVER(ORDER BY SUM(sales) DESC) AS RankCustomersBySales 
FROM tbl_orders
GROUP BY customerid;"""
execute_display_query_results(select_query)

+------------+------------+
| customerid | SUM(sales) |
+------------+------------+
|     2      |     55     |
|     3      |    125     |
|     1      |    110     |
|     4      |     90     |
+------------+------------+
4 rows returned in set: (0.001 sec)


+------------+------------+----------------------+
| customerid | SUM(sales) | RankCustomersBySales |
+------------+------------+----------------------+
|     3      |    125     |          1           |
|     1      |    110     |          2           |
|     4      |     90     |          3           |
|     2      |     55     |          4           |
+------------+------------+----------------------+
4 rows returned in set: (0.0 sec)


# SQL Aggregate Window Function
COUNT, SUM, AVG, MIN, MAX

```sql
AVG(Sales) OVER (PARTITION BY ProductID ORDER BY Sales)
```
Partition By and Order By are Optional

## COUNT
Use Cases
- 1: Overall Analysis
- 2: Category Analysis
- 3: Quality Checks: Identify NULLS
- 4: Quality Checks: Identify Duplicates

In [32]:
# Category analysis with COUNT as a window function: every order row is kept and
# annotated with the number of orders that share its order status.
select_query = """SELECT orderstatus, COUNT(orderstatus) OVER (PARTITION BY orderstatus) AS CountOrderStatus
FROM tbl_orders;"""
execute_display_query_results(select_query)

print("\n")

# Same idea partitioned by product: each row shows how many orders exist for its product.
select_query = """SELECT productid, COUNT(productid) OVER (PARTITION BY productid) AS CountProducts
FROM tbl_orders;"""
execute_display_query_results(select_query)

+-------------+------------------+
| orderstatus | CountOrderStatus |
+-------------+------------------+
|  Delivered  |        5         |
|  Delivered  |        5         |
|  Delivered  |        5         |
|  Delivered  |        5         |
|  Delivered  |        5         |
|   Shipped   |        5         |
|   Shipped   |        5         |
|   Shipped   |        5         |
|   Shipped   |        5         |
|   Shipped   |        5         |
+-------------+------------------+
10 rows returned in set: (0.001 sec)


+-----------+---------------+
| productid | CountProducts |
+-----------+---------------+
|    101    |       4       |
|    101    |       4       |
|    101    |       4       |
|    101    |       4       |
|    102    |       3       |
|    102    |       3       |
|    102    |       3       |
|    104    |       2       |
|    104    |       2       |
|    105    |       1       |
+-----------+---------------+
10 rows returned in set: (0.0 sec)


In [33]:
# GROUP BY counterpart of the per-product window count: the counts are the same,
# but the result is collapsed to one row per product instead of one row per order.
select_query = """SELECT productid, COUNT(*) AS CountProducts
FROM tbl_orders
GROUP BY productid;"""
execute_display_query_results(select_query)

+-----------+---------------+
| productid | CountProducts |
+-----------+---------------+
|    101    |       4       |
|    102    |       3       |
|    105    |       1       |
|    104    |       2       |
+-----------+---------------+
4 rows returned in set: (0.001 sec)


In [34]:
# Plain aggregates over the whole table: count(productid) counts non-NULL productid
# values, COUNT(*) counts rows. Both return 10 here, so no productid is NULL.
select_query = """SELECT count(productid) AS CountProductID, COUNT(*) AS CountProducts
FROM tbl_orders;"""
execute_display_query_results(select_query)

+----------------+---------------+
| CountProductID | CountProducts |
+----------------+---------------+
|       10       |      10       |
+----------------+---------------+
1 row returned in set: (0.001 sec)


In [35]:
# Overall analysis: an empty OVER() makes the whole table one window, so every order
# row is shown together with the total number of orders.
select_query = """SELECT orderid, orderdate, COUNT(*) OVER() AS TotalOrders
FROM tbl_orders;"""
execute_display_query_results(select_query)

+---------+------------+-------------+
| orderid | orderdate  | TotalOrders |
+---------+------------+-------------+
|    1    | 2025-01-01 |     10      |
|    2    | 2025-01-05 |     10      |
|    3    | 2025-01-10 |     10      |
|    4    | 2025-01-20 |     10      |
|    5    | 2025-02-01 |     10      |
|    6    | 2025-02-05 |     10      |
|    7    | 2025-02-15 |     10      |
|    8    | 2025-02-18 |     10      |
|    9    | 2025-03-10 |     10      |
|   10    | 2025-03-15 |     10      |
+---------+------------+-------------+
10 rows returned in set: (0.001 sec)


In [36]:
## Find the total number of orders for each customer
# TotalOrders uses an empty OVER() (whole table); TotalOrdersByCustomers restarts the
# count for every customerid partition. All order rows are kept.
select_query = """SELECT orderid, orderdate,
COUNT(*) OVER() AS TotalOrders,
customerid,
COUNT(*) OVER(PARTITION BY customerid) AS TotalOrdersByCustomers
FROM tbl_orders;"""
execute_display_query_results(select_query)

+---------+------------+-------------+------------+------------------------+
| orderid | orderdate  | TotalOrders | customerid | TotalOrdersByCustomers |
+---------+------------+-------------+------------+------------------------+
|    3    | 2025-01-10 |     10      |     1      |           3            |
|    4    | 2025-01-20 |     10      |     1      |           3            |
|    7    | 2025-02-15 |     10      |     1      |           3            |
|    1    | 2025-01-01 |     10      |     2      |           3            |
|    5    | 2025-02-01 |     10      |     2      |           3            |
|    9    | 2025-03-10 |     10      |     2      |           3            |
|    2    | 2025-01-05 |     10      |     3      |           3            |
|    6    | 2025-02-05 |     10      |     3      |           3            |
|   10    | 2025-03-15 |     10      |     3      |           3            |
|    8    | 2025-02-18 |     10      |     4      |           1            |

In [37]:
## Find the total number of customers, additionally provide all customer details
# First show the raw customer table for reference.
select_query = """SELECT *
FROM tbl_customers;"""
execute_display_query_results(select_query)

print("\n")

# COUNT(*) OVER() attaches the total number of rows to every customer row.
select_query = """SELECT *, COUNT(*) OVER() AS TotalCustomers
FROM tbl_customers;"""
execute_display_query_results(select_query)

print("\n")
# Find the total number of scores for the customers. NULL values must be ignored here,
# so the count is performed on the score column instead of on *.
# This can be used to check for NULLs: a TotalScores lower than TotalCustomers means some scores are missing.
select_query = """SELECT *, 
COUNT(*) OVER() AS TotalCustomers,
COUNT(score) OVER() AS TotalScores
FROM tbl_customers;"""
execute_display_query_results(select_query)

+------------+-----------+----------+---------+-------+
| CustomerID | FirstName | LastName | Country | Score |
+------------+-----------+----------+---------+-------+
|     1      |  Jossef   | Goldberg | Germany |  350  |
|     2      |   Kevin   |  Brown   |   USA   |  900  |
|     3      |   Mary    |   NULL   |   USA   |  750  |
|     4      |   Mark    | Schwarz  | Germany |  500  |
|     5      |   Anna    |  Adams   |   USA   | NULL  |
+------------+-----------+----------+---------+-------+
5 rows returned in set: (0.0 sec)


+------------+-----------+----------+---------+-------+----------------+
| CustomerID | FirstName | LastName | Country | Score | TotalCustomers |
+------------+-----------+----------+---------+-------+----------------+
|     1      |  Jossef   | Goldberg | Germany |  350  |       5        |
|     2      |   Kevin   |  Brown   |   USA   |  900  |       5        |
|     3      |   Mary    |   NULL   |   USA   |  750  |       5        |
|     4      |   Mark 

In [38]:
## Data quality issue: duplicates lead to inaccuracies in analysis
# COUNT() can be used to identify duplicates.
# Check whether the table tbl_orders contains any duplicate order IDs:
# partitioning by orderid counts how often each ID occurs, so any value above 1 marks a duplicate.

select_query = """SELECT orderid, 
COUNT(orderid) OVER(PARTITION BY orderid) AS CheckPrimaryKey
FROM tbl_orders;"""
execute_display_query_results(select_query)

+---------+-----------------+
| orderid | CheckPrimaryKey |
+---------+-----------------+
|    1    |        1        |
|    2    |        1        |
|    3    |        1        |
|    4    |        1        |
|    5    |        1        |
|    6    |        1        |
|    7    |        1        |
|    8    |        1        |
|    9    |        1        |
|   10    |        1        |
+---------+-----------------+
10 rows returned in set: (0.001 sec)


In [39]:
# Run the same duplicate check against tbl_orders_archive, where some order IDs
# occur more than once (CheckPrimaryKey > 1 in the output).
select_query = """SELECT orderid, 
COUNT(orderid) OVER(PARTITION BY orderid) AS CheckPrimaryKey
FROM tbl_orders_archive;"""
execute_display_query_results(select_query)

print("\n")

# Keep only the duplicated IDs. The window result cannot be filtered in the WHERE clause
# of the same query, so it is computed in a derived table (T) and filtered outside.
select_query = """
SELECT * FROM 
(SELECT orderid, 
COUNT(orderid) OVER(PARTITION BY orderid) AS CheckPrimaryKey
FROM tbl_orders_archive) AS T
WHERE CheckPrimaryKey > 1;"""
execute_display_query_results(select_query)

+---------+-----------------+
| orderid | CheckPrimaryKey |
+---------+-----------------+
|    1    |        1        |
|    2    |        1        |
|    3    |        1        |
|    4    |        2        |
|    4    |        2        |
|    5    |        1        |
|    6    |        3        |
|    6    |        3        |
|    6    |        3        |
|    7    |        1        |
+---------+-----------------+
10 rows returned in set: (0.001 sec)


+---------+-----------------+
| orderid | CheckPrimaryKey |
+---------+-----------------+
|    4    |        2        |
|    4    |        2        |
|    6    |        3        |
|    6    |        3        |
|    6    |        3        |
+---------+-----------------+
5 rows returned in set: (0.0 sec)


## SUM

In [40]:
# Find the total sales across all orders
# and the total sales for each product. Provide additional details such as orderid, orderdate.
# Three windows of decreasing granularity are shown side by side:
#   per product, per (product, order status) combination, and the whole table.

select_query = """SELECT orderid, orderdate, sales, productid, orderstatus,
SUM(sales) OVER(PARTITION BY productid) AS TotalSalesByProduct,
SUM(sales) OVER(PARTITION BY productid, orderstatus) AS TotalSalesByProductAndOrderStatus,
SUM(sales) OVER() AS TotalSales
FROM tbl_orders;"""
execute_display_query_results(select_query)

+---------+------------+-------+-----------+-------------+---------------------+-----------------------------------+------------+
| orderid | orderdate  | sales | productid | orderstatus | TotalSalesByProduct | TotalSalesByProductAndOrderStatus | TotalSales |
+---------+------------+-------+-----------+-------------+---------------------+-----------------------------------+------------+
|    1    | 2025-01-01 |  10   |    101    |  Delivered  |         140         |                30                 |    380     |
|    3    | 2025-01-10 |  20   |    101    |  Delivered  |         140         |                30                 |    380     |
|    8    | 2025-02-18 |  90   |    101    |   Shipped   |         140         |                110                |    380     |
|    9    | 2025-03-10 |  20   |    101    |   Shipped   |         140         |                110                |    380     |
|    7    | 2025-02-15 |  30   |    102    |  Delivered  |         105         |          

In [41]:
# Find the percentage contribution of each order to the total sales.
# PercentageContribution divides the row's own sales by the table-wide SUM(sales) OVER()
# and multiplies by 100; the result is sorted with the largest contribution first.
select_query = """SELECT orderid, orderdate, sales, productid, orderstatus,
SUM(sales) OVER(PARTITION BY productid) AS TotalSalesByProduct,
SUM(sales) OVER(PARTITION BY productid, orderstatus) AS TotalSalesByProductAndOrderStatus,
SUM(sales) OVER() AS TotalSales,
(sales / SUM(sales) OVER()) * 100 AS PercentageContribution
FROM tbl_orders
ORDER BY PercentageContribution DESC;"""
execute_display_query_results(select_query)

+---------+------------+-------+-----------+-------------+---------------------+-----------------------------------+------------+------------------------+
| orderid | orderdate  | sales | productid | orderstatus | TotalSalesByProduct | TotalSalesByProductAndOrderStatus | TotalSales | PercentageContribution |
+---------+------------+-------+-----------+-------------+---------------------+-----------------------------------+------------+------------------------+
|    8    | 2025-02-18 |  90   |    101    |   Shipped   |         140         |                110                |    380     |        23.6842         |
|   10    | 2025-03-15 |  60   |    102    |   Shipped   |         105         |                75                 |    380     |        15.7895         |
|    4    | 2025-01-20 |  60   |    105    |   Shipped   |         60          |                60                 |    380     |        15.7895         |
|    6    | 2025-02-05 |  50   |    104    |  Delivered  |         75 

## AVERAGE

In [42]:
# Find the average sales across all orders and the average sales for each product.
# Both averages are rounded to two decimals and repeated on every order row.
select_query = """SELECT orderid, orderdate, sales, productid, orderstatus,
ROUND(AVG(sales) OVER(), 2) AS AverageSales, 
ROUND(AVG(sales) OVER(PARTITION BY productid), 2) AS AverageSalesByProduct
FROM tbl_orders;"""
execute_display_query_results(select_query)

+---------+------------+-------+-----------+-------------+--------------+-----------------------+
| orderid | orderdate  | sales | productid | orderstatus | AverageSales | AverageSalesByProduct |
+---------+------------+-------+-----------+-------------+--------------+-----------------------+
|    1    | 2025-01-01 |  10   |    101    |  Delivered  |    38.00     |         35.00         |
|    3    | 2025-01-10 |  20   |    101    |  Delivered  |    38.00     |         35.00         |
|    8    | 2025-02-18 |  90   |    101    |   Shipped   |    38.00     |         35.00         |
|    9    | 2025-03-10 |  20   |    101    |   Shipped   |    38.00     |         35.00         |
|    2    | 2025-01-05 |  15   |    102    |   Shipped   |    38.00     |         35.00         |
|    7    | 2025-02-15 |  30   |    102    |  Delivered  |    38.00     |         35.00         |
|   10    | 2025-03-15 |  60   |    102    |   Shipped   |    38.00     |         35.00         |
|    5    | 2025-02-

In [43]:
# AVG(COALESCE(Sales, 0)) OVER(PARTITION BY productid) --> handles NULL values:
# COALESCE turns a NULL sale into 0 so that the row still counts in the average.
# Find the average sales for each product.
select_query = """SELECT orderid, orderdate, sales, productid, orderstatus,
ROUND(AVG(COALESCE(sales, 0)) OVER(PARTITION BY productid), 2) AS AverageSalesByProduct
FROM tbl_orders;"""
execute_display_query_results(select_query)

+---------+------------+-------+-----------+-------------+-----------------------+
| orderid | orderdate  | sales | productid | orderstatus | AverageSalesByProduct |
+---------+------------+-------+-----------+-------------+-----------------------+
|    1    | 2025-01-01 |  10   |    101    |  Delivered  |         35.00         |
|    3    | 2025-01-10 |  20   |    101    |  Delivered  |         35.00         |
|    8    | 2025-02-18 |  90   |    101    |   Shipped   |         35.00         |
|    9    | 2025-03-10 |  20   |    101    |   Shipped   |         35.00         |
|    2    | 2025-01-05 |  15   |    102    |   Shipped   |         35.00         |
|    7    | 2025-02-15 |  30   |    102    |  Delivered  |         35.00         |
|   10    | 2025-03-15 |  60   |    102    |   Shipped   |         35.00         |
|    5    | 2025-02-01 |  25   |    104    |  Delivered  |         37.50         |
|    6    | 2025-02-05 |  50   |    104    |  Delivered  |         37.50         |
|   

In [44]:
# Find the average score for customers. Provide additional details.
# Three ways of computing the average are compared on a table with one NULL score:
#   CalculatedAverageScore - total score divided by the number of customers (500 in the output)
#   AverageScoreWithNull   - AVG(score) on the raw column, NULL left in place (625 in the output)
#   AverageScore           - AVG after replacing NULL with 0 (500 in the output)
select_query = """SELECT *,
SUM(score) OVER() AS TotalScore,
(SUM(score) OVER()) / COUNT(customerid) OVER() AS CalculatedAverageScore,
ROUND(AVG(score) OVER(), 2) AS AverageScoreWithNull,
ROUND(AVG(COALESCE(score, 0)) OVER(), 2) AS AverageScore
FROM tbl_customers;"""
execute_display_query_results(select_query)

+------------+-----------+----------+---------+-------+------------+------------------------+----------------------+--------------+
| CustomerID | FirstName | LastName | Country | Score | TotalScore | CalculatedAverageScore | AverageScoreWithNull | AverageScore |
+------------+-----------+----------+---------+-------+------------+------------------------+----------------------+--------------+
|     1      |  Jossef   | Goldberg | Germany |  350  |    2500    |        500.0000        |        625.00        |    500.00    |
|     2      |   Kevin   |  Brown   |   USA   |  900  |    2500    |        500.0000        |        625.00        |    500.00    |
|     3      |   Mary    |   NULL   |   USA   |  750  |    2500    |        500.0000        |        625.00        |    500.00    |
|     4      |   Mark    | Schwarz  | Germany |  500  |    2500    |        500.0000        |        625.00        |    500.00    |
|     5      |   Anna    |  Adams   |   USA   | NULL  |    2500    |        

In [45]:
## Find all orders where sales are higher than the average sales across all orders
# Approach 1: scalar subquery.
# The comparison uses the unrounded average: comparing against a value already rounded
# to two decimals could move an order across the threshold when sales lie within the
# rounding error of the true average.
select_query = """SELECT orderid, sales, productid, quantity FROM 
tbl_orders WHERE Sales > 
(SELECT AVG(COALESCE(sales, 0)) AS AverageSale FROM tbl_orders);"""
execute_display_query_results(select_query)

print("\n")

# Approach 2: window function in a derived table.
# A window function cannot be referenced in the WHERE clause of the same query, so the
# average is computed in the derived table T and filtered outside. The filter compares
# against the unrounded value (RawAverageSale); rounding is applied only to the displayed
# AverageSale column, so the result has the same columns as before.
select_query = """SELECT orderid, sales, productid, quantity, ROUND(RawAverageSale, 2) AS AverageSale FROM 
(SELECT orderid, sales, productid, quantity, AVG(COALESCE(sales, 0)) OVER() AS RawAverageSale 
FROM tbl_orders) AS T
WHERE Sales > RawAverageSale;"""
execute_display_query_results(select_query)

+---------+-------+-----------+----------+
| orderid | sales | productid | quantity |
+---------+-------+-----------+----------+
|    4    |  60   |    105    |    2     |
|    6    |  50   |    104    |    2     |
|    8    |  90   |    101    |    3     |
|   10    |  60   |    102    |    0     |
+---------+-------+-----------+----------+
4 rows returned in set: (0.001 sec)


+---------+-------+-----------+----------+-------------+
| orderid | sales | productid | quantity | AverageSale |
+---------+-------+-----------+----------+-------------+
|    4    |  60   |    105    |    2     |    38.00    |
|    6    |  50   |    104    |    2     |    38.00    |
|    8    |  90   |    101    |    3     |    38.00    |
|   10    |  60   |    102    |    0     |    38.00    |
+---------+-------+-----------+----------+-------------+
4 rows returned in set: (0.0 sec)


## MIN AND MAX

In [46]:
# Find the highest and lowest sales across all orders and the highest & lowest sales
# for each product. Additionally provide details such as order id and order date.
# The empty OVER() windows cover the whole table; the PARTITION BY productid windows
# restart the MIN/MAX for every product.
select_query = """SELECT orderid, orderdate, orderstatus, productid, quantity,sales,
MIN(Sales) OVER() AS LowestSale,
MAX(Sales) OVER() AS HighestSale,
MIN(Sales) OVER(PARTITION BY productid) AS LowestProductSale,
MAX(Sales) OVER(PARTITION BY productid) AS HighestProductSale
FROM tbl_orders;"""
execute_display_query_results(select_query)

+---------+------------+-------------+-----------+----------+-------+------------+-------------+-------------------+--------------------+
| orderid | orderdate  | orderstatus | productid | quantity | sales | LowestSale | HighestSale | LowestProductSale | HighestProductSale |
+---------+------------+-------------+-----------+----------+-------+------------+-------------+-------------------+--------------------+
|    1    | 2025-01-01 |  Delivered  |    101    |    1     |  10   |     10     |     90      |        10         |         90         |
|    3    | 2025-01-10 |  Delivered  |    101    |    2     |  20   |     10     |     90      |        10         |         90         |
|    8    | 2025-02-18 |   Shipped   |    101    |    3     |  90   |     10     |     90      |        10         |         90         |
|    9    | 2025-03-10 |   Shipped   |    101    |    2     |  20   |     10     |     90      |        10         |         90         |
|    2    | 2025-01-05 |   Shipped

In [47]:
# First step towards finding the employees with the highest salary:
# list every employee together with the overall highest salary (no filtering yet).
select_query = """SELECT *, 
MAX(salary) OVER() AS HighestSalary
FROM tbl_employees;"""
execute_display_query_results(select_query)

+------------+-----------+----------+------------+------------+--------+--------+-----------+---------------+
| EmployeeID | FirstName | LastName | Department | BirthDate  | Gender | Salary | ManagerID | HighestSalary |
+------------+-----------+----------+------------+------------+--------+--------+-----------+---------------+
|     1      |   Frank   |   Lee    | Marketing  | 1988-12-05 |   M    | 55000  |   NULL    |     90000     |
|     2      |   Kevin   |  Brown   | Marketing  | 1972-11-25 |   M    | 65000  |     1     |     90000     |
|     3      |   Mary    |   NULL   |   Sales    | 1986-01-05 |   F    | 75000  |     1     |     90000     |
|     4      |  Michael  |   Ray    |   Sales    | 1977-02-10 |   M    | 90000  |     2     |     90000     |
|     5      |   Carol   |  Baker   |   Sales    | 1982-02-11 |   F    | 55000  |     3     |     90000     |
+------------+-----------+----------+------------+------------+--------+--------+-----------+---------------+
5 rows ret

In [48]:
# Find the employees with the highest salary using a scalar subquery:
# keep only the rows whose salary equals MAX(salary) of the whole table.
select_query = """SELECT * FROM 
tbl_employees
WHERE Salary = (SELECT MAX(salary) FROM tbl_employees);"""
execute_display_query_results(select_query)

+------------+-----------+----------+------------+------------+--------+--------+-----------+
| EmployeeID | FirstName | LastName | Department | BirthDate  | Gender | Salary | ManagerID |
+------------+-----------+----------+------------+------------+--------+--------+-----------+
|     4      |  Michael  |   Ray    |   Sales    | 1977-02-10 |   M    | 90000  |     2     |
+------------+-----------+----------+------------+------------+--------+--------+-----------+
1 row returned in set: (0.001 sec)


In [49]:
# Find the employees with the highest salary using a window function.
# The window result is computed in the derived table T and compared with each
# employee's salary in the outer WHERE clause.
select_query = """SELECT * FROM 
(SELECT *, MAX(salary) OVER() AS HighestSalary
FROM tbl_employees) AS T
WHERE salary = highestsalary;"""
execute_display_query_results(select_query)

+------------+-----------+----------+------------+------------+--------+--------+-----------+---------------+
| EmployeeID | FirstName | LastName | Department | BirthDate  | Gender | Salary | ManagerID | HighestSalary |
+------------+-----------+----------+------------+------------+--------+--------+-----------+---------------+
|     4      |  Michael  |   Ray    |   Sales    | 1977-02-10 |   M    | 90000  |     2     |     90000     |
+------------+-----------+----------+------------+------------+--------+--------+-----------+---------------+
1 row returned in set: (0.001 sec)


In [50]:
## Find the deviation of each sale from the minimum and maximum sales amount
# DeviationFromMin = sales - lowest sale; DeviationFromMax = highest sale - sales.
# Both are non-negative because the extremes are taken over the whole table.
select_query = """SELECT orderid, orderdate, orderstatus, productid, quantity,sales,
MIN(Sales) OVER() AS LowestSale,
MAX(Sales) OVER() AS HighestSale,
Sales - MIN(Sales) OVER() AS DeviationFromMin,
MAX(Sales) OVER() - Sales AS DeviationFromMax
FROM tbl_orders;"""
execute_display_query_results(select_query)

+---------+------------+-------------+-----------+----------+-------+------------+-------------+------------------+------------------+
| orderid | orderdate  | orderstatus | productid | quantity | sales | LowestSale | HighestSale | DeviationFromMin | DeviationFromMax |
+---------+------------+-------------+-----------+----------+-------+------------+-------------+------------------+------------------+
|    1    | 2025-01-01 |  Delivered  |    101    |    1     |  10   |     10     |     90      |        0         |        80        |
|    2    | 2025-01-05 |   Shipped   |    102    |    1     |  15   |     10     |     90      |        5         |        75        |
|    3    | 2025-01-10 |  Delivered  |    101    |    2     |  20   |     10     |     90      |        10        |        70        |
|    4    | 2025-01-20 |   Shipped   |    105    |    2     |  60   |     10     |     90      |        50        |        30        |
|    5    | 2025-02-01 |  Delivered  |    104    |    1

# Running & Rolling Total
They aggregate a sequence of members, and the aggregation is updated each time a new member is added.
- Running Total: Aggregates all values from the beginning up to the current point without dropping older data. ```SUM(sales) OVER(ORDER BY MONTH)    default: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW``` (the default frame with ORDER BY is RANGE, not ROWS, so rows that tie on the ORDER BY value share the same total)  
- Rolling Total: Aggregates all values within a fixed window (e.g. 30 days, or the last 3 rows). As new data is added, the oldest data point is dropped. ```SUM(sales) OVER(ORDER BY MONTH ROWS BETWEEN 2 PRECEDING AND CURRENT ROW)```

In [56]:
## Calculate the moving average of sales for each product over time
# AverageSalesByProduct has no ORDER BY, so it is the plain per-product average.
# MovingAverage adds ORDER BY orderdate without an explicit frame, so the database's
# default frame applies; the output shows a cumulative average that grows with each
# order of the product (10, 15, 40, 35 for product 101).
select_query = """SELECT orderid, orderdate, orderstatus, productid, quantity,sales,
ROUND(AVG(Sales) OVER(PARTITION BY productid), 2) AS AverageSalesByProduct,
ROUND(AVG(Sales) OVER(PARTITION BY productid ORDER BY orderdate ASC), 2) AS MovingAverage
FROM tbl_orders;"""
execute_display_query_results(select_query)

+---------+------------+-------------+-----------+----------+-------+-----------------------+---------------+
| orderid | orderdate  | orderstatus | productid | quantity | sales | AverageSalesByProduct | MovingAverage |
+---------+------------+-------------+-----------+----------+-------+-----------------------+---------------+
|    1    | 2025-01-01 |  Delivered  |    101    |    1     |  10   |         35.00         |     10.00     |
|    3    | 2025-01-10 |  Delivered  |    101    |    2     |  20   |         35.00         |     15.00     |
|    8    | 2025-02-18 |   Shipped   |    101    |    3     |  90   |         35.00         |     40.00     |
|    9    | 2025-03-10 |   Shipped   |    101    |    2     |  20   |         35.00         |     35.00     |
|    2    | 2025-01-05 |   Shipped   |    102    |    1     |  15   |         35.00         |     15.00     |
|    7    | 2025-02-15 |  Delivered  |    102    |    2     |  30   |         35.00         |     22.50     |
|   10    

In [58]:
## Calculate the moving average of sales for each product over time, including only the next order
# The frame ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING averages each order with the next
# order of the same product; the last order of a product has no following row and is
# averaged with itself only (20.00 for order 9 in the output).
select_query = """SELECT orderid, orderdate, orderstatus, productid, quantity,sales,
ROUND(AVG(Sales) OVER(PARTITION BY productid), 2) AS AverageSalesByProduct,
ROUND(AVG(Sales) OVER(PARTITION BY productid ORDER BY orderdate ASC ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING), 2) AS RollingAverage
FROM tbl_orders;"""
execute_display_query_results(select_query)

+---------+------------+-------------+-----------+----------+-------+-----------------------+----------------+
| orderid | orderdate  | orderstatus | productid | quantity | sales | AverageSalesByProduct | RollingAverage |
+---------+------------+-------------+-----------+----------+-------+-----------------------+----------------+
|    1    | 2025-01-01 |  Delivered  |    101    |    1     |  10   |         35.00         |     15.00      |
|    3    | 2025-01-10 |  Delivered  |    101    |    2     |  20   |         35.00         |     55.00      |
|    8    | 2025-02-18 |   Shipped   |    101    |    3     |  90   |         35.00         |     55.00      |
|    9    | 2025-03-10 |   Shipped   |    101    |    2     |  20   |         35.00         |     20.00      |
|    2    | 2025-01-05 |   Shipped   |    102    |    1     |  15   |         35.00         |     22.50      |
|    7    | 2025-02-15 |  Delivered  |    102    |    2     |  30   |         35.00         |     45.00      |
|