<a href="https://colab.research.google.com/github/Dee-Nwanjah/SQL-Database-Fundamental-Projects/blob/main/5.)Performance_Optimization_On_Large_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Setup Large Dataset for Performance Testing
import pandas as pd
import numpy as np
import sqlite3
from IPython import get_ipython
import time

# Open the SQLite database file that every later cell queries through `conn`.
conn = sqlite3.connect('performance_test.db')
print("✅ Database connection created!")

# Register the `sql` IPython extension, then point it at the same database file.
ipython_shell = get_ipython()
for magic_name, magic_arg in (
    ('load_ext', 'sql'),
    ('sql', 'sqlite:///performance_test.db'),
):
    ipython_shell.run_line_magic(magic_name, magic_arg)
print("✅ SQL magic loaded!")

print("📊 Creating large dataset for performance testing...")

# Seed NumPy's global RNG so the synthetic data is reproducible.
# The random draws below run in a fixed order (products, then customers, then
# orders, column by column); reordering them would change the generated values.
np.random.seed(42)

# Products: ids 1..1000, each with a random category and a price in [10, 500)
product_ids = range(1, 1001)
products_data = dict(
    product_id=product_ids,
    product_name=[f'Product_{pid}' for pid in product_ids],
    category=np.random.choice(['Electronics', 'Clothing', 'Books', 'Home', 'Sports'], 1000),
    price=np.random.uniform(10, 500, 1000).round(2),
)
products_df = pd.DataFrame(products_data)

# Customers: ids 1..5000, each with a random city and an age from 18 to 69
customer_ids = range(1, 5001)
customers_data = dict(
    customer_id=customer_ids,
    first_name=[f'Customer_{cid}' for cid in customer_ids],
    city=np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'], 5000),
    age=np.random.randint(18, 70, 5000),
)
customers_df = pd.DataFrame(customers_data)

# Orders: 50,000 rows referencing random customers/products, dated on a random
# day between 2023-01-01 and 2024-01-31, with quantity 1..4
order_days = pd.date_range('2023-01-01', '2024-01-31')
orders_data = dict(
    order_id=range(1, 50001),
    customer_id=np.random.randint(1, 5001, 50000),
    product_id=np.random.randint(1, 1001, 50000),
    order_date=np.random.choice(order_days, 50000),
    quantity=np.random.randint(1, 5, 50000),
    total_amount=np.random.uniform(20, 1000, 50000).round(2),
)
# Dates are stored as 'YYYY-MM-DD' text, which is what the SQL filters compare against
orders_df = pd.DataFrame(orders_data).assign(
    order_date=lambda frame: pd.to_datetime(frame['order_date']).dt.strftime('%Y-%m-%d')
)

# Write each frame to its table, replacing any table left over from a previous run
for table_name, table_frame in (
    ('products', products_df),
    ('customers', customers_df),
    ('large_orders', orders_df),
):
    table_frame.to_sql(table_name, conn, if_exists='replace', index=False)

print("✅ Large dataset created!")
print(f"   - Products: {len(products_df):,} records")
print(f"   - Customers: {len(customers_df):,} records")
print(f"   - Orders: {len(orders_df):,} records")

✅ Database connection created!
✅ SQL magic loaded!
📊 Creating large dataset for performance testing...
✅ Large dataset created!
   - Products: 1,000 records
   - Customers: 5,000 records
   - Orders: 50,000 records


In [4]:
 # Performance Testing Function
def time_query(query_name, query, connection=None):
    """Run a SQL query once and report how long it took.

    Parameters
    ----------
    query_name : str
        Label printed ahead of the timing output.
    query : str
        SQL text handed to ``pd.read_sql``.
    connection : optional
        Database connection to run the query on. When omitted, the
        notebook-level ``conn`` is used, which is the original behaviour.

    Returns
    -------
    tuple
        ``(result, execution_time)``: the query result as a DataFrame and the
        elapsed time in seconds.

    Notes
    -----
    The measured interval covers the whole ``pd.read_sql`` call, so it includes
    building the DataFrame as well as the database work. It is a single run,
    so very short queries will show noticeable run-to-run variation.
    """
    print(f"\n⏱️  Testing: {query_name}")
    db = conn if connection is None else connection

    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # short durations; time.time() can be coarse and may jump if the system
    # clock is adjusted.
    started = time.perf_counter()
    result = pd.read_sql(query, db)
    execution_time = time.perf_counter() - started

    print(f"   Execution time: {execution_time:.4f} seconds")
    print(f"   Rows returned: {len(result):,}")
    return result, execution_time

print("✅ Performance testing function ready!")

✅ Performance testing function ready!


In [6]:
# Testing Slow Query
# Baseline measurement: this cell sits before the index-creation cell, so its
# timing serves as the "before" value in the later comparison.
print("🐌 Testing SLOW queries (no indexes)...")

# Slow query 1: Customer aggregation without index
# For orders dated 2024-01-01 or later, compute each customer's order count and
# total spend, keep only customers whose total exceeds 1000, and sort by total
# spend (highest first).
slow_query_1 = """
SELECT
    customer_id,
    COUNT(order_id) as order_count,
    SUM(total_amount) as total_spent
FROM large_orders
WHERE order_date >= '2024-01-01'
GROUP BY customer_id
HAVING total_spent > 1000
ORDER BY total_spent DESC
"""

# time_query returns (DataFrame, seconds). `slow_query_1` and `time1` are reused
# by the with-index cell to compute the before/after comparison.
result1, time1 = time_query("Customer Aggregation (No Index)", slow_query_1)

🐌 Testing SLOW queries (no indexes)...

⏱️  Testing: Customer Aggregation (No Index)
   Execution time: 0.0159 seconds
   Rows returned: 523


In [10]:
# Creating Indexes
print("📊 Creating indexes for performance improvement...")

# One DDL statement per index. IF NOT EXISTS keeps the cell safe to re-run.
# NOTE(review): idx_orders_customer_id indexes the leading column of
# idx_orders_composite, so the single-column index is probably redundant — confirm.
index_statements = (
    "CREATE INDEX IF NOT EXISTS idx_orders_customer_id ON large_orders(customer_id);",
    "CREATE INDEX IF NOT EXISTS idx_orders_order_date ON large_orders(order_date);",
    "CREATE INDEX IF NOT EXISTS idx_orders_product_id ON large_orders(product_id);",
    "CREATE INDEX IF NOT EXISTS idx_orders_composite ON large_orders(customer_id, order_date);",
    "CREATE INDEX IF NOT EXISTS idx_customers_city ON customers(city);",
)

try:
    # Run the statements on the sqlite3 connection, in the order listed above.
    # A failure stops at the failing statement; earlier indexes stay in place.
    for statement in index_statements:
        conn.execute(statement)
    print("✅ Indexes created successfully!")
except Exception as err:
    print(f"❌ Error creating indexes: {err}")

📊 Creating indexes for performance improvement...
✅ Indexes created successfully!


In [11]:
# Testing Fast Query (With Indexes)
print("🚀 Testing FAST queries (with indexes)...")

# Re-run the exact same SQL as the baseline so the timings are comparable.
result2, time2 = time_query("Customer Aggregation (With Index)", slow_query_1)

# Report the change relative to the no-index baseline.
# NOTE: each figure is a single run of a query that takes milliseconds, so the
# percentage can be dominated by run-to-run noise. A negative value means the
# second run was slower than the baseline.
print("\n📈 PERFORMANCE IMPROVEMENT:")
print(f"   Before indexes: {time1:.4f} seconds")
print(f"   After indexes:  {time2:.4f} seconds")
if time1 > 0:
    improvement = ((time1 - time2) / time1) * 100
    print(f"   Speed improvement: {improvement:.1f}%")
else:
    # A coarse clock can measure the baseline as 0.0 s; dividing by it would
    # raise ZeroDivisionError, so report that no percentage is available.
    improvement = float('nan')
    print("   Speed improvement: n/a (baseline too short to measure)")

🚀 Testing FAST queries (with indexes)...

⏱️  Testing: Customer Aggregation (With Index)
   Execution time: 0.0477 seconds
   Rows returned: 523

📈 PERFORMANCE IMPROVEMENT:
   Before indexes: 0.0159 seconds
   After indexes:  0.0477 seconds
   Speed improvement: -200.0%


In [12]:
# Query Optimization Examples
print("\n🔧 QUERY OPTIMIZATION EXAMPLES:")

# BAD: three separate statements, each issued as its own read_sql call
print("\n❌ BAD APPROACH (Multiple queries):")
# perf_counter is a monotonic, high-resolution clock suited to short intervals
start_time = time.perf_counter()

high_value_count = pd.read_sql("SELECT COUNT(*) as count FROM large_orders WHERE total_amount > 500", conn)
low_value_count = pd.read_sql("SELECT COUNT(*) as count FROM large_orders WHERE total_amount <= 500", conn)
avg_value = pd.read_sql("SELECT AVG(total_amount) as avg FROM large_orders", conn)

bad_time = time.perf_counter() - start_time
print(f"   Time: {bad_time:.4f} seconds")

# GOOD: one statement that computes all the figures with conditional counts
print("\n✅ GOOD APPROACH (Single query):")

# Defined before the timer starts so only the query itself is measured.
optimized_query = """
SELECT
    COUNT(CASE WHEN total_amount > 500 THEN 1 END) as high_value_orders,
    COUNT(CASE WHEN total_amount <= 500 THEN 1 END) as low_value_orders,
    ROUND(AVG(total_amount), 2) as avg_order_value,
    COUNT(*) as total_orders
FROM large_orders
"""

start_time = time.perf_counter()
good_result = pd.read_sql(optimized_query, conn)
good_time = time.perf_counter() - start_time

print(f"   Time: {good_time:.4f} seconds")
if bad_time > 0:
    print(f"   Speed improvement: {((bad_time - good_time) / bad_time * 100):.1f}%")
else:
    # Guard the division: a zero baseline would raise ZeroDivisionError.
    print("   Speed improvement: n/a (baseline too short to measure)")
print(good_result)


🔧 QUERY OPTIMIZATION EXAMPLES:

❌ BAD APPROACH (Multiple queries):
   Time: 0.0186 seconds

✅ GOOD APPROACH (Single query):
   Time: 0.0119 seconds
   Speed improvement: 36.1%
   high_value_orders  low_value_orders  avg_order_value  total_orders
0              25687             24313           512.48         50000


In [14]:
# Check Query Execution Plans using pandas
# The EXPLAIN QUERY PLAN prefix asks SQLite to describe how it would execute the
# statement (which tables are scanned or searched, and with which index) rather
# than return the query's own rows.
# The statement being explained: top 10 customers by total spend on orders dated
# 2024-01-01 or later, joining customers to large_orders on customer_id.
explain_query = """
EXPLAIN QUERY PLAN
SELECT
    c.customer_id,
    c.first_name,
    COUNT(o.order_id) as order_count,
    SUM(o.total_amount) as total_spent
FROM customers c
JOIN large_orders o ON c.customer_id = o.customer_id
WHERE o.order_date >= '2024-01-01'
GROUP BY c.customer_id, c.first_name
ORDER BY total_spent DESC
LIMIT 10;
"""

# The plan comes back as an ordinary result set, so pandas can load it into a
# DataFrame; the `detail` column holds the human-readable plan steps.
execution_plan_df = pd.read_sql(explain_query, conn)
# `display` is not imported in the visible cells; presumably it is the one the
# IPython/Jupyter session provides.
display(execution_plan_df)

Unnamed: 0,id,parent,notused,detail
0,10,0,0,SCAN c
1,12,0,0,SEARCH o USING INDEX idx_orders_composite (cus...
2,20,0,0,USE TEMP B-TREE FOR GROUP BY
3,72,0,0,USE TEMP B-TREE FOR ORDER BY


In [17]:
# Memory-Efficient Aggregations
# This shows how to handle large datasets efficiently: large_orders is read in
# fixed-size chunks and the rows are aggregated per customer.
print("\n💾 MEMORY-EFFICIENT PROCESSING:")

# Process data in chunks for very large datasets
def process_orders_in_chunks(chunk_size=10000, connection=None):
    """Aggregate ``large_orders`` per customer while reading it in chunks.

    A single SELECT is streamed through ``pd.read_sql(..., chunksize=...)``.
    Each chunk is reduced to per-customer partial sums and counts straight
    away, so only one chunk of raw rows plus the small partial aggregates are
    held at a time. The raw rows are never concatenated into one frame.
    Streaming one query also avoids LIMIT/OFFSET paging, which has no
    guaranteed row order without ORDER BY and re-scans the skipped rows on
    every page.

    Parameters
    ----------
    chunk_size : int, default 10000
        Maximum number of rows fetched per chunk. Must be at least 1.
    connection : optional
        Database connection to read from. When omitted, the notebook-level
        ``conn`` is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per ``customer_id`` (ascending) with columns ``customer_id``,
        ``chunk_total`` (sum of ``total_amount``) and ``chunk_orders`` (count
        of non-null ``order_id``). An empty frame with those columns is
        returned when the table has no rows.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.

    Notes
    -----
    Totals are built by adding per-chunk sums, so floating-point results can
    differ from a single-pass sum in the last few bits.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    db = conn if connection is None else connection

    # The row count is only used for the progress message below.
    total_orders = int(
        pd.read_sql("SELECT COUNT(*) as count FROM large_orders", db).iloc[0]['count']
    )
    print(f"Processing {total_orders:,} orders in chunks of {chunk_size:,}...")

    chunk_query = """
        SELECT
            customer_id,
            total_amount,
            order_id
        FROM large_orders
    """

    partial_aggregates = []
    chunk_number = 0
    for chunk in pd.read_sql(chunk_query, db, chunksize=chunk_size):
        # Some pandas versions yield a single empty frame for an empty result.
        if chunk.empty:
            continue
        chunk_number += 1
        # Reduce the chunk immediately; the raw rows can then be discarded.
        partial_aggregates.append(
            chunk.groupby('customer_id').agg(
                chunk_total=('total_amount', 'sum'),
                chunk_orders=('order_id', 'count'),
            )
        )
        print(f"   Processed chunk {chunk_number}: {len(chunk)} rows")

    if not partial_aggregates:
        return pd.DataFrame(columns=['customer_id', 'chunk_total', 'chunk_orders'])

    # A customer can appear in several chunks, so add up the partial results.
    final_result = (
        pd.concat(partial_aggregates)
        .groupby(level=0)
        .sum()
        .rename_axis('customer_id')
        .reset_index()
    )

    return final_result

# Run chunked processing
# 15,000 does not divide the 50,000-row table evenly, so the final chunk is
# smaller than the others (see the progress lines in the output).
chunked_results = process_orders_in_chunks(chunk_size=15000)
# The result has one row per customer_id, so its length is the customer count.
print(f"\n✅ Final result: {len(chunked_results)} customers processed")
# Show only the first rows rather than the whole frame.
display(chunked_results.head())


💾 MEMORY-EFFICIENT PROCESSING:
Processing 50,000 orders in chunks of 15,000...
   Processed chunk 1: 15000 rows
   Processed chunk 2: 15000 rows
   Processed chunk 3: 15000 rows
   Processed chunk 4: 5000 rows

✅ Final result: 5000 customers processed


Unnamed: 0,customer_id,chunk_total,chunk_orders
0,1,7925.1,15
1,2,2525.59,5
2,3,1327.78,5
3,4,6630.35,14
4,5,4282.16,7
