# Task 02 Solution: SQL Analytics and Database Operations

Complete solutions for SQL queries, database operations with SQLAlchemy, and analytics queries.

In [None]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, Column, Integer, String, Float, DateTime, ForeignKey, JSON, text
from sqlalchemy.orm import declarative_base, relationship, Session
from sqlalchemy.pool import StaticPool
from datetime import datetime, timedelta
import sqlite3

## Task 2.1: Setup Database Schema with SQLAlchemy

Create tables for users, orders, and products with relationships.

In [None]:
# Solution: Define database schema
# Shared declarative base: the ORM models below subclass it, and its metadata
# is what create_all() later uses to emit the tables.
Base = declarative_base()

class User(Base):
    """ORM model mapped to the ``users`` table."""
    __tablename__ = 'users'
    
    id = Column(Integer, primary_key=True)
    name = Column(String(100), nullable=False)
    # unique=True: two rows may not share the same email value
    email = Column(String(100), unique=True, nullable=False)
    country = Column(String(50))
    # Callable default: invoked at insert time when no signup_date is supplied.
    # NOTE(review): datetime.utcnow is deprecated since Python 3.12 — consider
    # a timezone-aware replacement when this schema is next revised.
    signup_date = Column(DateTime, default=datetime.utcnow)
    
    # Relationship
    # One-to-many link to Order (mirrored by Order.user). With the
    # 'all, delete-orphan' cascade, deleting a user also deletes its orders,
    # as does removing an order from this collection.
    orders = relationship('Order', back_populates='user', cascade='all, delete-orphan')

class Product(Base):
    """ORM model mapped to the ``products`` table."""
    __tablename__ = 'products'
    
    id = Column(Integer, primary_key=True)
    name = Column(String(200), nullable=False)
    category = Column(String(50))
    # Stored as a float and required on every row
    price = Column(Float, nullable=False)
    
    # Relationship
    # One-to-many link to Order (mirrored by Order.product); no cascade is
    # configured here, unlike User.orders.
    orders = relationship('Order', back_populates='product')

class Order(Base):
    """ORM model mapped to the ``orders`` table; each row references one user and one product."""
    __tablename__ = 'orders'
    
    id = Column(Integer, primary_key=True)
    # Required foreign keys into users.id and products.id
    user_id = Column(Integer, ForeignKey('users.id'), nullable=False)
    product_id = Column(Integer, ForeignKey('products.id'), nullable=False)
    quantity = Column(Integer, nullable=False)
    total_amount = Column(Float, nullable=False)
    order_date = Column(DateTime, nullable=False)
    # Falls back to 'completed' when no status is supplied
    status = Column(String(20), default='completed')
    
    # Relationships
    # Many-to-one sides of User.orders and Product.orders, kept in sync
    # through back_populates.
    user = relationship('User', back_populates='orders')
    product = relationship('Product', back_populates='orders')

# One shared in-memory SQLite engine for the whole notebook.
# StaticPool keeps a single connection, so every session sees the same
# in-memory database; check_same_thread=False lets that connection be used
# from threads other than the one that created it.
engine = create_engine(
    'sqlite:///:memory:',
    echo=False,  # Set to True to see SQL queries
    poolclass=StaticPool,
    connect_args={'check_same_thread': False},
)

# Emit CREATE TABLE for every model registered on Base
Base.metadata.create_all(bind=engine)

print("âœ… Database schema created successfully!")
print("\nTables created:")
# Iterating the tables mapping yields the table names (its keys)
for table in Base.metadata.tables:
    print(f"  - {table}")

## Task 2.2: Populate Database with Sample Data

Insert users, products, and orders.

In [None]:
# Solution: Generate and insert sample data
np.random.seed(42)  # fixed seed: the generated dataset is the same on every run


def _random_order(buyers, catalog, first_day):
    """Build one Order for a randomly chosen buyer and product.

    The random draws are made in a fixed sequence (buyer, product, quantity,
    date offset, status) so the seeded stream keeps producing the same data.
    """
    buyer = np.random.choice(buyers)
    item = np.random.choice(catalog)
    units = np.random.randint(1, 5)
    return Order(
        user_id=buyer.id,
        product_id=item.id,
        quantity=units,
        total_amount=round(item.price * units, 2),
        order_date=first_day + timedelta(days=np.random.randint(0, 365)),
        status=np.random.choice(['completed', 'pending', 'cancelled'], p=[0.8, 0.15, 0.05]),
    )


with Session(engine) as session:
    # Insert users: 100 accounts with a random country and 2024 signup date
    users = [
        User(
            name=f"User_{i}",
            email=f"user{i}@example.com",
            country=np.random.choice(['US', 'UK', 'CA', 'DE', 'FR']),
            signup_date=datetime(2024, 1, 1) + timedelta(days=np.random.randint(0, 365)),
        )
        for i in range(1, 101)
    ]
    session.add_all(users)

    # Insert products: five items in each of five categories, random prices
    product_names = {
        'Electronics': ['Laptop', 'Phone', 'Tablet', 'Headphones', 'Camera'],
        'Clothing': ['T-Shirt', 'Jeans', 'Jacket', 'Shoes', 'Hat'],
        'Books': ['Novel', 'Textbook', 'Magazine', 'Comic', 'Cookbook'],
        'Home': ['Chair', 'Table', 'Lamp', 'Rug', 'Vase'],
        'Sports': ['Ball', 'Racket', 'Bike', 'Weights', 'Mat'],
    }
    categories = list(product_names)  # insertion order of the mapping above
    products = [
        Product(
            name=f"{name} ({category})",
            category=category,
            price=round(np.random.uniform(10, 1000), 2),
        )
        for category, names in product_names.items()
        for name in names
    ]
    session.add_all(products)
    session.flush()  # Get IDs

    # Insert orders: the flush above assigned the ids the orders refer to
    start_date = datetime(2024, 1, 1)
    orders = [_random_order(users, products, start_date) for _ in range(500)]
    session.add_all(orders)

    session.commit()

    print(f"âœ… Inserted {len(users)} users")
    print(f"âœ… Inserted {len(products)} products")
    print(f"âœ… Inserted {len(orders)} orders")

# Verify data
# Count the rows of each table in one pass while the session is open.
with Session(engine) as session:
    user_count, product_count, order_count = (
        session.query(model).count() for model in (User, Product, Order)
    )

# The counts are plain integers, so they can be checked after the session closes
assert user_count == 100, f"Expected 100 users, got {user_count}"
assert product_count == 25, f"Expected 25 products, got {product_count}"
assert order_count == 500, f"Expected 500 orders, got {order_count}"

print("\nâœ… Data insertion verified!")

## Task 2.3: Basic SELECT Queries

Query data using pandas and SQLAlchemy.

In [None]:
# Solution: Basic queries with pandas
# The SQL text is declared first; each query is then run and reported in turn.

# Query 1: All users from US
us_users_sql = """
    SELECT id, name, email, country
    FROM users
    WHERE country = 'US'
    ORDER BY name
"""

# Query 2: Products in Electronics category
electronics_sql = """
    SELECT name, category, price
    FROM products
    WHERE category = 'Electronics'
    ORDER BY price DESC
"""

# Query 3: Recent orders
recent_orders_sql = """
    SELECT id, user_id, product_id, total_amount, order_date, status
    FROM orders
    WHERE order_date > '2024-10-01'
    ORDER BY order_date DESC
    LIMIT 10
"""

df_us_users = pd.read_sql(us_users_sql, con=engine)
print("US Users:")
print(df_us_users.head())
print(f"Total US users: {len(df_us_users)}")

df_electronics = pd.read_sql(electronics_sql, con=engine)
print("\nElectronics Products:")
print(df_electronics)

df_recent_orders = pd.read_sql(recent_orders_sql, con=engine)
print("\nRecent Orders:")
print(df_recent_orders)

# Verify
assert not df_us_users.empty, "Should have US users"
assert len(df_electronics) == 5, "Should have 5 electronics products"
assert len(df_recent_orders) <= 10, "Should have at most 10 orders"
print("\nâœ… Basic queries completed!")

## Task 2.4: JOIN Queries

Combine data from multiple tables using JOINs.

In [None]:
# Solution: JOIN queries
# The SQL text is declared first; each query is then run and reported in turn.

# Query 1: Orders with user and product details (INNER JOIN)
order_details_sql = """
    SELECT 
        o.id as order_id,
        u.name as user_name,
        u.country,
        p.name as product_name,
        p.category,
        o.quantity,
        o.total_amount,
        o.order_date,
        o.status
    FROM orders o
    INNER JOIN users u ON o.user_id = u.id
    INNER JOIN products p ON o.product_id = p.id
    WHERE o.status = 'completed'
    ORDER BY o.order_date DESC
    LIMIT 20
"""

# Query 2: Users with their total order count (LEFT JOIN)
# The status filter sits in the ON clause so users without completed orders
# are still returned, with a count of 0.
user_orders_sql = """
    SELECT 
        u.id,
        u.name,
        u.country,
        COUNT(o.id) as order_count,
        COALESCE(SUM(o.total_amount), 0) as total_spent
    FROM users u
    LEFT JOIN orders o ON u.id = o.user_id AND o.status = 'completed'
    GROUP BY u.id, u.name, u.country
    ORDER BY total_spent DESC
    LIMIT 10
"""

df_order_details = pd.read_sql(order_details_sql, con=engine)
print("Order Details (with JOINs):")
print(df_order_details.head(10))

df_user_orders = pd.read_sql(user_orders_sql, con=engine)
print("\nTop Users by Total Spent:")
print(df_user_orders)

# Verify
assert not df_order_details.empty, "Should have order details"
assert len(df_user_orders) == 10, "Should have 10 users"
assert 'user_name' in df_order_details.columns, "Should have user_name column"
assert 'order_count' in df_user_orders.columns, "Should have order_count column"
print("\nâœ… JOIN queries completed!")

## Task 2.5: Aggregation Queries

Use GROUP BY and aggregate functions.

In [None]:
# Solution: Aggregation queries
# The SQL text is declared first; each query is then run and reported in turn.

# Query 1: Sales by category
category_sales_sql = """
    SELECT 
        p.category,
        COUNT(o.id) as order_count,
        SUM(o.quantity) as total_quantity,
        SUM(o.total_amount) as total_revenue,
        AVG(o.total_amount) as avg_order_value,
        MAX(o.total_amount) as max_order_value
    FROM orders o
    INNER JOIN products p ON o.product_id = p.id
    WHERE o.status = 'completed'
    GROUP BY p.category
    ORDER BY total_revenue DESC
"""

# Query 2: Sales by country
# HAVING drops countries whose completed-order revenue is NULL or zero.
country_sales_sql = """
    SELECT 
        u.country,
        COUNT(DISTINCT u.id) as customer_count,
        COUNT(o.id) as order_count,
        SUM(o.total_amount) as total_revenue,
        AVG(o.total_amount) as avg_order_value
    FROM users u
    LEFT JOIN orders o ON u.id = o.user_id AND o.status = 'completed'
    GROUP BY u.country
    HAVING SUM(o.total_amount) > 0
    ORDER BY total_revenue DESC
"""

# Query 3: Monthly sales trend
monthly_sales_sql = """
    SELECT 
        strftime('%Y-%m', order_date) as month,
        COUNT(*) as order_count,
        SUM(total_amount) as revenue
    FROM orders
    WHERE status = 'completed'
    GROUP BY strftime('%Y-%m', order_date)
    ORDER BY month
"""

df_category_sales = pd.read_sql(category_sales_sql, con=engine)
print("Sales by Category:")
print(df_category_sales.to_string(index=False))

df_country_sales = pd.read_sql(country_sales_sql, con=engine)
print("\nSales by Country:")
print(df_country_sales.to_string(index=False))

df_monthly_sales = pd.read_sql(monthly_sales_sql, con=engine)
print("\nMonthly Sales:")
print(df_monthly_sales.head(10))

# Verify
assert len(df_category_sales) == 5, "Should have 5 categories"
assert not df_country_sales.empty, "Should have country sales"
assert df_category_sales['total_revenue'].sum() > 0, "Should have revenue"
print("\nâœ… Aggregation queries completed!")

## Task 2.6: Window Functions

Use window functions for advanced analytics.

In [None]:
# Solution: Window functions
# The SQL text is declared first; each query is then run and reported in turn.

# Query 1: Rank users by spending within each country
user_rank_sql = """
    SELECT 
        u.name,
        u.country,
        SUM(o.total_amount) as total_spent,
        RANK() OVER (PARTITION BY u.country ORDER BY SUM(o.total_amount) DESC) as country_rank
    FROM users u
    INNER JOIN orders o ON u.id = o.user_id
    WHERE o.status = 'completed'
    GROUP BY u.id, u.name, u.country
    ORDER BY u.country, country_rank
    LIMIT 20
"""

# Query 2: Running total of sales
# The CTE collapses orders to one row per day; the window then accumulates them.
running_total_sql = """
    WITH daily_sales AS (
        SELECT 
            DATE(order_date) as sale_date,
            SUM(total_amount) as daily_revenue
        FROM orders
        WHERE status = 'completed'
        GROUP BY DATE(order_date)
    )
    SELECT 
        sale_date,
        daily_revenue,
        SUM(daily_revenue) OVER (ORDER BY sale_date) as running_total
    FROM daily_sales
    ORDER BY sale_date
    LIMIT 15
"""

# Query 3: Compare each product's sales to category average
product_comparison_sql = """
    WITH product_sales AS (
        SELECT 
            p.id,
            p.name,
            p.category,
            COUNT(o.id) as order_count,
            SUM(o.total_amount) as revenue
        FROM products p
        LEFT JOIN orders o ON p.id = o.product_id AND o.status = 'completed'
        GROUP BY p.id, p.name, p.category
    )
    SELECT 
        name,
        category,
        revenue,
        AVG(revenue) OVER (PARTITION BY category) as category_avg,
        revenue - AVG(revenue) OVER (PARTITION BY category) as diff_from_avg
    FROM product_sales
    WHERE revenue > 0
    ORDER BY category, revenue DESC
"""

df_user_rank = pd.read_sql(user_rank_sql, con=engine)
print("User Rankings by Country:")
print(df_user_rank.head(15))

df_running_total = pd.read_sql(running_total_sql, con=engine)
print("\nRunning Total Sales:")
print(df_running_total)

df_product_comparison = pd.read_sql(product_comparison_sql, con=engine)
print("\nProduct Performance vs Category Average:")
print(df_product_comparison.head(10))

# Verify
running_totals = df_running_total['running_total'] if 'running_total' in df_running_total.columns else None
assert 'country_rank' in df_user_rank.columns, "Should have ranking"
assert running_totals is not None, "Should have running total"
assert running_totals.is_monotonic_increasing, "Running total should increase"
print("\nâœ… Window functions completed!")

## Task 2.7: Common Table Expressions (CTEs)

Use CTEs for complex multi-step queries.

In [None]:
# Solution: Complex CTE queries

# Query 1: Find power users (top 20% by spending) and their favorite category
#
# CTE pipeline:
#   user_spending            - completed-order revenue and order count per user
#   power_users              - the top fifth of those users by total_spent
#   user_category_preference - each power user's categories, ordered by how
#                              many completed orders they placed in them
#
# ROW_NUMBER() (rather than RANK()) selects the favorite category: RANK() gives
# rank 1 to every category tied for the highest count, which made the final
# join return several rows for the same user. The secondary sort on category
# name makes the single chosen row deterministic when counts tie.
df_power_users = pd.read_sql("""
    WITH user_spending AS (
        SELECT 
            u.id,
            u.name,
            SUM(o.total_amount) as total_spent,
            COUNT(o.id) as order_count
        FROM users u
        INNER JOIN orders o ON u.id = o.user_id
        WHERE o.status = 'completed'
        GROUP BY u.id, u.name
    ),
    power_users AS (
        SELECT id, name, total_spent, order_count
        FROM user_spending
        ORDER BY total_spent DESC
        LIMIT (SELECT COUNT(DISTINCT user_id) / 5 FROM orders WHERE status = 'completed')
    ),
    user_category_preference AS (
        SELECT 
            pu.id,
            pu.name,
            p.category,
            COUNT(o.id) as category_orders,
            ROW_NUMBER() OVER (
                PARTITION BY pu.id
                ORDER BY COUNT(o.id) DESC, p.category
            ) as category_rank
        FROM power_users pu
        INNER JOIN orders o ON pu.id = o.user_id
        INNER JOIN products p ON o.product_id = p.id
        WHERE o.status = 'completed'
        GROUP BY pu.id, pu.name, p.category
    )
    SELECT 
        pu.name,
        pu.total_spent,
        pu.order_count,
        ucp.category as favorite_category,
        ucp.category_orders
    FROM power_users pu
    LEFT JOIN user_category_preference ucp ON pu.id = ucp.id AND ucp.category_rank = 1
    ORDER BY pu.total_spent DESC
""", engine)

print("Power Users and Their Preferences:")
print(df_power_users.head(10))

# Query 2: Month-over-month growth analysis
# The CTE aggregates completed orders per calendar month; LAG() then reads the
# previous month's revenue, and NULLIF guards the division against a zero base.
mom_growth_sql = """
    WITH monthly_stats AS (
        SELECT 
            strftime('%Y-%m', order_date) as month,
            COUNT(*) as order_count,
            SUM(total_amount) as revenue
        FROM orders
        WHERE status = 'completed'
        GROUP BY strftime('%Y-%m', order_date)
    )
    SELECT 
        month,
        order_count,
        revenue,
        LAG(revenue) OVER (ORDER BY month) as prev_month_revenue,
        ROUND(
            (revenue - LAG(revenue) OVER (ORDER BY month)) * 100.0 / 
            NULLIF(LAG(revenue) OVER (ORDER BY month), 0),
            2
        ) as growth_rate_pct
    FROM monthly_stats
    ORDER BY month
"""
df_mom_growth = pd.read_sql(mom_growth_sql, con=engine)

print("\nMonth-over-Month Growth:")
print(df_mom_growth.head(12))

# Verify
assert not df_power_users.empty, "Should have power users"
assert not df_mom_growth.empty, "Should have monthly data"
assert 'favorite_category' in df_power_users.columns, "Should have favorite category"
print("\nâœ… CTE queries completed!")

## Task 2.8: CRUD Operations with SQLAlchemy ORM

Demonstrate Create, Read, Update, Delete operations.

In [None]:
# Solution: CRUD operations
# A single session walks through create -> read -> update -> delete, committing
# after each write so the following step sees persisted state.

with Session(engine) as session:
    # CREATE: Add new user
    new_user = User(
        name="Alice Johnson",
        email="alice@example.com",
        country="US",
        signup_date=datetime.utcnow(),
    )
    session.add(new_user)
    session.commit()
    print(f"âœ… Created user: {new_user.name} (ID: {new_user.id})")

    # READ: Query the user
    user = (
        session.query(User)
        .filter(User.email == "alice@example.com")
        .first()
    )
    print(f"âœ… Read user: {user.name} from {user.country}")

    # UPDATE: Change user's country
    user.country = "CA"
    session.commit()
    print(f"âœ… Updated user country to: {user.country}")

    # Verify update
    updated_user = (
        session.query(User)
        .filter(User.id == user.id)
        .first()
    )
    assert updated_user.country == "CA", "Country should be updated"

    # CREATE: Add order for the user
    product = session.query(Product).first()
    new_order = Order(
        user_id=user.id,
        product_id=product.id,
        quantity=2,
        total_amount=product.price * 2,
        order_date=datetime.utcnow(),
        status="completed",
    )
    session.add(new_order)
    session.commit()
    print(f"âœ… Created order: Order #{new_order.id} for ${new_order.total_amount}")

    # READ: Get user with orders (relationship)
    user_with_orders = (
        session.query(User)
        .filter(User.id == user.id)
        .first()
    )
    print(f"âœ… User has {len(user_with_orders.orders)} order(s)")

    # Query with filters: both criteria are ANDed together
    high_value_orders = (
        session.query(Order)
        .filter(Order.total_amount > 1000, Order.status == 'completed')
        .order_by(Order.total_amount.desc())
        .limit(5)
        .all()
    )

    print(f"\nâœ… Found {len(high_value_orders)} high-value orders:")
    for order in high_value_orders:
        print(f"  Order #{order.id}: ${order.total_amount:.2f}")

    # DELETE: Remove the order
    # Keep the id first: it is needed to confirm the row is gone afterwards
    order_id = new_order.id
    session.delete(new_order)
    session.commit()
    print(f"\nâœ… Deleted order: Order #{order_id}")

    # Verify deletion
    deleted_order = (
        session.query(Order)
        .filter(Order.id == order_id)
        .first()
    )
    assert deleted_order is None, "Order should be deleted"

print("\nâœ… CRUD operations completed!")

## Task 2.9: Export Query Results to Parquet

Bridge SQL and file formats by exporting analytics results.

In [None]:
# Solution: Export SQL results to Parquet
import os  # needed for os.path.getsize below; not imported anywhere else in the notebook

# Query comprehensive user analytics
# One row per user. The LEFT JOIN keeps users that have no orders, and the
# CASE expressions restrict each aggregate to the relevant order status.
df_user_analytics = pd.read_sql("""
    SELECT 
        u.id,
        u.name,
        u.email,
        u.country,
        u.signup_date,
        COUNT(o.id) as total_orders,
        COUNT(CASE WHEN o.status = 'completed' THEN 1 END) as completed_orders,
        COUNT(CASE WHEN o.status = 'cancelled' THEN 1 END) as cancelled_orders,
        COALESCE(SUM(CASE WHEN o.status = 'completed' THEN o.total_amount ELSE 0 END), 0) as total_spent,
        COALESCE(AVG(CASE WHEN o.status = 'completed' THEN o.total_amount END), 0) as avg_order_value,
        MIN(o.order_date) as first_order_date,
        MAX(o.order_date) as last_order_date
    FROM users u
    LEFT JOIN orders o ON u.id = o.user_id
    GROUP BY u.id, u.name, u.email, u.country, u.signup_date
    ORDER BY total_spent DESC
""", engine)

print("User Analytics DataFrame:")
print(df_user_analytics.head(10))
print(f"\nTotal users: {len(df_user_analytics)}")

# Export to Parquet
parquet_path = 'user_analytics.parquet'
df_user_analytics.to_parquet(
    parquet_path,
    compression='snappy',
    index=False
)

# Report the on-disk size of the file that was just written
file_size_kb = os.path.getsize(parquet_path) / 1024
print(f"\nâœ… Exported to Parquet: {parquet_path} ({file_size_kb:.2f} KB)")

# Read back and verify
df_verify = pd.read_parquet(parquet_path)
assert len(df_verify) == len(df_user_analytics), "Should preserve all rows"
assert list(df_verify.columns) == list(df_user_analytics.columns), "Should preserve columns"

print("\nâœ… Export verified!")

## Task 2.10: Performance Comparison - SQL vs Pandas

Compare query performance between SQL and pandas operations.

In [None]:
# Solution: Benchmark SQL vs pandas
import time

# Task: Calculate total sales by category
#
# Timing uses time.perf_counter(), the high-resolution monotonic clock meant
# for measuring short intervals; time.time() is a wall clock whose resolution
# can be coarse enough to report 0.0 for a query this small.

# Method 1: SQL aggregation
start_time = time.perf_counter()
df_sql = pd.read_sql("""
    SELECT 
        p.category,
        SUM(o.total_amount) as total_sales
    FROM orders o
    INNER JOIN products p ON o.product_id = p.id
    WHERE o.status = 'completed'
    GROUP BY p.category
    ORDER BY total_sales DESC
""", engine)
sql_time = time.perf_counter() - start_time

# Method 2: Pandas operations
# Pull both tables into DataFrames, then join and aggregate in memory.
start_time = time.perf_counter()
df_orders = pd.read_sql("SELECT * FROM orders WHERE status = 'completed'", engine)
df_products = pd.read_sql("SELECT id, category FROM products", engine)
df_merged = df_orders.merge(df_products, left_on='product_id', right_on='id')
df_pandas = df_merged.groupby('category')['total_amount'].sum()\
    .reset_index()\
    .rename(columns={'total_amount': 'total_sales'})\
    .sort_values('total_sales', ascending=False)
pandas_time = time.perf_counter() - start_time

# Guard the ratio so a zero elapsed time cannot raise ZeroDivisionError
speed_ratio = pandas_time / sql_time if sql_time > 0 else float('inf')

print("Performance Comparison:")
print(f"SQL aggregation:    {sql_time:.4f}s")
print(f"Pandas operations:  {pandas_time:.4f}s")
print(f"Speed ratio:        {speed_ratio:.2f}x")

print("\nSQL Result:")
print(df_sql)

print("\nPandas Result:")
print(df_pandas)

# Verify both methods produce same results
# Sort both by category so rows line up regardless of tie order in total_sales
df_sql_sorted = df_sql.sort_values('category').reset_index(drop=True)
df_pandas_sorted = df_pandas.sort_values('category').reset_index(drop=True)

assert len(df_sql_sorted) == len(df_pandas_sorted), "Should have same number of rows"
assert df_sql_sorted['category'].tolist() == df_pandas_sorted['category'].tolist(), "Categories should match"
# Compare the totals too (with a float tolerance: SQLite and pandas may sum in
# a different order); matching category labels alone do not show equal results.
assert np.allclose(df_sql_sorted['total_sales'], df_pandas_sorted['total_sales']), "Totals should match"

print("\nâœ… Both methods produce identical results!")
print("\nðŸ’¡ Key Insight: SQL is typically faster for aggregations on large datasets")

## Summary

This notebook demonstrated:

1. **Database Schema**: Created tables with relationships using SQLAlchemy
2. **Basic Queries**: SELECT, WHERE, ORDER BY, LIMIT
3. **JOINs**: INNER JOIN and LEFT JOIN for combining tables
4. **Aggregations**: GROUP BY, COUNT, SUM, AVG, MAX
5. **Window Functions**: RANK, SUM OVER, LAG for advanced analytics
6. **CTEs**: Multi-step queries with Common Table Expressions
7. **ORM Operations**: Create, Read, Update, Delete with SQLAlchemy
8. **SQL to Parquet**: Export analytics results for ML workflows
9. **Performance**: SQL vs pandas comparison

**Key Takeaways:**
- Use SQL for complex aggregations and joins
- SQLAlchemy provides both raw SQL and ORM approaches
- Window functions enable sophisticated analytics
- Export SQL results to Parquet for ML pipelines
- SQL is often faster than pandas for aggregations
- CTEs make complex queries more readable

**Best Practices:**
- Always use parameterized queries to prevent SQL injection
- Index frequently filtered/joined columns
- Use `EXPLAIN QUERY PLAN` (SQLite, as used in this notebook) or `EXPLAIN ANALYZE` (e.g. PostgreSQL) to optimize slow queries
- Prefer SQL aggregations over pandas for large datasets
- Use CTEs for readability and maintainability