In [None]:
import duckdb
import pandas as pd

# Open the on-disk DuckDB database that the query cells below read from.
# NOTE(review): every query visible in this notebook is a SELECT; if nothing
# else writes through this handle, consider read_only=True — confirm first.
con = duckdb.connect(database='pipeline.duckdb')


In [None]:
# Query 1: row count of the users table, returned as a one-row DataFrame
# with a single column named `total_users`.
query_total_users = "SELECT COUNT(*) AS total_users FROM users;"
df_total_users = con.execute(query_total_users).df()  # .df() is an alias of .fetchdf()
df_total_users


In [None]:
# Query 2: sum of orders.total_amount across all orders, returned as a
# one-row DataFrame with a single column named `total_revenue`.
query_total_revenue = "SELECT SUM(total_amount) AS total_revenue FROM orders;"
df_total_revenue = con.execute(query_total_revenue).df()  # .df() is an alias of .fetchdf()
df_total_revenue


In [None]:
# Query 3: Top 10 products by revenue
# Revenue is SUM(orders.total_amount) per (product_id, name). The inner join
# means products without any matching order do not appear.
# The group keys are used as secondary sort keys so that products tied on
# revenue are returned in a stable order; without them, which tied rows
# survive the LIMIT 10 cut-off is not guaranteed to be the same between runs.
query_top_products = """
SELECT p.product_id, p.name, SUM(o.total_amount) AS revenue
FROM products p
JOIN orders o ON p.product_id = o.product_id
GROUP BY p.product_id, p.name
ORDER BY revenue DESC, p.product_id, p.name
LIMIT 10;
"""
df_top_products = con.execute(query_top_products).fetchdf()
df_top_products


In [None]:
# Query 4: Top 10 users by total spend
# Spend is SUM(orders.total_amount) per (user_id, name). The inner join means
# users without any matching order do not appear.
# The group keys are used as secondary sort keys so that users tied on spend
# are returned in a stable order; without them, which tied rows survive the
# LIMIT 10 cut-off is not guaranteed to be the same between runs.
query_top_users = """
SELECT u.user_id, u.name, SUM(o.total_amount) AS spend
FROM users u
JOIN orders o ON u.user_id = o.user_id
GROUP BY u.user_id, u.name
ORDER BY spend DESC, u.user_id, u.name
LIMIT 10;
"""
df_top_users = con.execute(query_top_users).fetchdf()
df_top_users


In [None]:
# Query 5: Monthly revenue and growth
# The CTE buckets orders by calendar month (DATE_TRUNC on order_date) and sums
# total_amount per bucket. `growth` is the absolute change versus the previous
# month in the result; the earliest month has no predecessor, so LAG yields
# NULL there.
# The ORDER BY inside OVER (...) only controls which row LAG looks at; it does
# not order the result set. The trailing ORDER BY month makes the output
# chronological instead of leaving row order up to the engine.
query_monthly_revenue = """
WITH monthly_revenue AS (
    SELECT DATE_TRUNC('month', order_date) AS month,
           SUM(total_amount) AS revenue
    FROM orders
    GROUP BY month
)
SELECT *,
       revenue - LAG(revenue) OVER (ORDER BY month) AS growth
FROM monthly_revenue
ORDER BY month;
"""
df_monthly_revenue = con.execute(query_monthly_revenue).fetchdf()
df_monthly_revenue


In [None]:
# Query 6: revenue per product category, highest first.
# Revenue is SUM(orders.total_amount) over the orders joined to each
# category's products; every category with at least one order is listed.
query_category_revenue = """
SELECT p.category, SUM(o.total_amount) AS revenue
FROM products p
JOIN orders o ON p.product_id = o.product_id
GROUP BY p.category
ORDER BY revenue DESC;
"""
df_category_revenue = con.execute(query_category_revenue).df()  # .df() is an alias of .fetchdf()
df_category_revenue


In [None]:
# Query 7: Number of orders per user (top 10)
# Counts orders.order_id per (user_id, name). The inner join means users
# without any matching order do not appear.
# Order counts are integers, so ties are likely; the group keys are used as
# secondary sort keys so that tied users are returned in a stable order and
# the LIMIT 10 cut-off selects the same rows on every run.
query_orders_per_user = """
SELECT u.user_id, u.name, COUNT(o.order_id) AS total_orders
FROM users u
JOIN orders o ON u.user_id = o.user_id
GROUP BY u.user_id, u.name
ORDER BY total_orders DESC, u.user_id, u.name
LIMIT 10;
"""
df_orders_per_user = con.execute(query_orders_per_user).fetchdf()
df_orders_per_user


In [None]:
# Query 8: Average order value per user (top 10)
# Averages orders.total_amount per (user_id, name). The inner join means
# users without any matching order do not appear.
# The group keys are used as secondary sort keys so that users tied on the
# average are returned in a stable order; without them, which tied rows
# survive the LIMIT 10 cut-off is not guaranteed to be the same between runs.
query_avg_order_value = """
SELECT u.user_id, u.name, AVG(o.total_amount) AS avg_order_value
FROM users u
JOIN orders o ON u.user_id = o.user_id
GROUP BY u.user_id, u.name
ORDER BY avg_order_value DESC, u.user_id, u.name
LIMIT 10;
"""
df_avg_order_value = con.execute(query_avg_order_value).fetchdf()
df_avg_order_value


In [None]:
# Query 9: order count per product category, most popular first.
# Counts orders.order_id over the orders joined to each category's products.
# There is no LIMIT, so all categories with at least one order are returned;
# the most popular one is the first row.
query_popular_category = """
SELECT p.category, COUNT(o.order_id) AS order_count
FROM products p
JOIN orders o ON p.product_id = o.product_id
GROUP BY p.category
ORDER BY order_count DESC;
"""
df_popular_category = con.execute(query_popular_category).df()  # .df() is an alias of .fetchdf()
df_popular_category
