# Connect to a SQLite Database

In [10]:
%load_ext sql
%sql sqlite:///ecommerce.db

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [12]:
import pandas as pd
import sqlite3
# Connect to DB
conn = sqlite3.connect("ecommerce.db")

# Data

In [13]:
%%sql

CREATE TABLE IF NOT EXISTS orders (
    order_id INT,
    customer_id INT,
    order_date DATE,
    product_id INT,
    quantity INT,
    price_per_unit DECIMAL(10, 2),
    discount DECIMAL(5, 2),
    status TEXT,
    warehouse_id INT
);

 * sqlite:///ecommerce.db
Done.


[]

In [14]:
%%sql

INSERT INTO orders (order_id, customer_id, order_date, product_id, quantity, price_per_unit, discount, status, warehouse_id) VALUES
(1, 101, '2024-10-01', 1, 3, 100.00, 0.1, 'Delivered', 1),
(2, 102, '2024-10-02', 2, 1, 200.00, 0.2, 'Cancelled', 2),
(3, 101, '2024-11-01', 1, 2, 100.00, 0.0, 'Delivered', 1),
(4, 103, '2024-12-01', 3, 4, 50.00, 0.15, 'Shipped', 3),
(5, 104, '2025-01-01', 2, 1, 200.00, 0.05, 'Delivered', 2);

 * sqlite:///ecommerce.db
5 rows affected.


[]

In [21]:
query = """
SELECT * from orders;
"""

df = pd.read_sql_query(query, conn)
df

Unnamed: 0,order_id,customer_id,order_date,product_id,quantity,price_per_unit,discount,status,warehouse_id
0,1,101,2024-10-01,1,3,100,0.1,Delivered,1
1,2,102,2024-10-02,2,1,200,0.2,Cancelled,2
2,3,101,2024-11-01,1,2,100,0.0,Delivered,1
3,4,103,2024-12-01,3,4,50,0.15,Shipped,3
4,5,104,2025-01-01,2,1,200,0.05,Delivered,2
5,1,101,2024-10-01,1,3,100,0.1,Delivered,1
6,2,102,2024-10-02,2,1,200,0.2,Cancelled,2
7,3,101,2024-11-01,1,2,100,0.0,Delivered,1
8,4,103,2024-12-01,3,4,50,0.15,Shipped,3
9,5,104,2025-01-01,2,1,200,0.05,Delivered,2


# Questions

In [5]:
# Retrieve the top 3 customers who have spent the most on completed orders (status = 'Delivered'), considering discounts.

In [15]:
# Run query and fetch into DataFrame
# First WHERE and then GROUP BY, ORDER BY
query = """
SELECT customer_id, 
       SUM(quantity * price_per_unit * (1 - discount)) AS total_sales
FROM orders
WHERE status = 'Delivered'
GROUP BY customer_id
ORDER BY total_sales DESC
LIMIT 3;
"""

df = pd.read_sql_query(query, conn)
df

Unnamed: 0,customer_id,total_sales
0,101,1410.0
1,104,570.0


In [16]:
# Find the warehouse(s) that have processed the highest total revenue (considering discounts) in the last 3 months from today.

In [20]:
query = """
SELECT warehouse_id, 
       SUM(quantity * price_per_unit * (1 - discount)) AS revenue
FROM orders
WHERE status = 'Delivered' 
  AND order_date >= DATE('now', '-3 months')
GROUP BY warehouse_id
ORDER BY revenue DESC
LIMIT 1;
"""

df = pd.read_sql_query(query, conn)
df

Unnamed: 0,warehouse_id,revenue
0,2,570.0


In [22]:
# Find the top 2 products that had the highest total quantity sold in the last 6 months, 
# but exclude orders where a discount of 50% or more was applied.

In [26]:
query = """
SELECT product_id, 
       SUM(quantity) AS total_quantity
FROM orders
WHERE order_date >= DATE('now', '-6 months') AND discount < 0.5
GROUP BY product_id
ORDER BY total_quantity DESC
LIMIT 2;
"""

df = pd.read_sql_query(query, conn)
df

Unnamed: 0,product_id,total_quantity
0,1,15
1,3,12


In [27]:
# Find the percentage of orders that were cancelled out of the total orders placed in the last year, 
# and round the percentage to two decimal places.

In [54]:
# multiplying by 100.0 makes it float and makes division possible

query = """
SELECT  
       SUM(CASE WHEN status = 'Cancelled' THEN 1 ELSE 0 END) * 100.0 / COUNT(*) as cancel_percentage
FROM orders
WHERE 
       order_date >= DATE('now', '-12 months') 
"""

df = pd.read_sql_query(query, conn)
df

Unnamed: 0,cancel_percentage
0,20


In [55]:
# Find the month and year that had the highest total revenue (considering discounts) across all orders.

In [60]:
query = """
SELECT 
    strftime('%Y', order_date) AS year,
    strftime('%m', order_date) AS month,
    SUM(quantity * price_per_unit * (1 - discount)) AS revenue
FROM orders
GROUP BY year, month
ORDER BY revenue DESC
LIMIT 1;
"""

df = pd.read_sql_query(query, conn)
df

Unnamed: 0,year,month,revenue
0,2024,10,1290.0


In [61]:
# Find the cumulative revenue (considering discounts) for each day, ordered by order_date. 
# The cumulative revenue should be calculated as a running total.

In [62]:
query = """
WITH daily_sales AS (
    SELECT 
        order_date,
        SUM(quantity * price_per_unit * (1 - discount)) AS daily_revenue
    FROM orders
    GROUP BY order_date
)
SELECT 
    order_date, 
    daily_revenue,
    SUM(daily_revenue) OVER (ORDER BY order_date) AS cumulative_revenue
FROM daily_sales
ORDER BY order_date;
"""

df = pd.read_sql_query(query, conn)
df

Unnamed: 0,order_date,daily_revenue,cumulative_revenue
0,2024-10-01,810.0,810.0
1,2024-10-02,480.0,1290.0
2,2024-11-01,600.0,1890.0
3,2024-12-01,510.0,2400.0
4,2025-01-01,570.0,2970.0


Breaking It Down:

- 1️⃣ SUM(daily_revenue)
  - This applies the SUM() aggregation, which normally sums values in a group.
  - But in this case, we use it as a window function, meaning it doesn’t collapse rows but instead computes a running total.

- 2️⃣ OVER (ORDER BY order_date)
  - OVER() transforms SUM() into a window function.
  - ORDER BY order_date makes it a running total, summing up daily_revenue in increasing order of order_date.

In [63]:
# Find the second highest revenue-generating product (considering discounts). 
# Ensure that your query works even if there are duplicate revenues for multiple products.

In [76]:
query = """
WITH product_sales AS (
    SELECT 
        product_id, 
        SUM(quantity * price_per_unit * (1 - discount)) AS revenue
    FROM orders 
    GROUP BY product_id
),

ranked_products AS (
    SELECT 
        product_id, 
        revenue,
        DENSE_RANK() OVER (ORDER BY revenue DESC) AS rank
    FROM product_sales
)

SELECT product_id, revenue, rank
FROM ranked_products
WHERE rank = 2;
"""

df = pd.read_sql_query(query, conn)
df

Unnamed: 0,product_id,revenue,rank
0,2,1050.0,2


In [77]:
# Find the customer(s) who placed the most orders in the last 12 months. 
# If multiple customers have the same highest order count, return all of them.

In [84]:
query = """
WITH cust_orders AS (
    SELECT 
        customer_id, 
        COUNT(order_id) AS order_count
    FROM orders
    WHERE order_date >= DATE('now', '-12 months') 
    GROUP BY customer_id
),

ranked_customers AS (
    SELECT 
        customer_id, 
        order_count,
        DENSE_RANK() OVER (ORDER BY order_count DESC) AS cust_rank
    FROM cust_orders
)

SELECT customer_id, order_count, cust_rank
FROM ranked_customers
WHERE cust_rank = 1;
"""

df = pd.read_sql_query(query, conn)
df

Unnamed: 0,customer_id,order_count,cust_rank
0,101,6,1


1️⃣ RANK() (With Gaps in Ranking)
- If two customers tie at Rank 1, the next rank will be 3 (skipping Rank 2).

2️⃣ DENSE_RANK() (Without Gaps in Ranking)
- If two customers tie at Rank 1, the next rank will be 2 (no gap).