# INDEXING AND PERFORMANCE

In [15]:
import pandas as pd
from datetime import datetime, timedelta
from pymongo import MongoClient
import time
import statistics
from pymongo import ASCENDING , TEXT

# Connect to MongoDB
client = MongoClient('mongodb://localhost:27017/')
db = client['eduhub_db']


## Index Creation

In [16]:
#User email lookup (unique index)
try:
    # Create a non-unique ascending index on email
    db.users.create_index([("email", ASCENDING)], name="email_index")
    print("Index created on users.email")
except Exception as e:
    print(f"Error creating index on users.email: {e}")



#Course search by title and category (compound + optional text index)
try:
    db.courses.create_index([("title", ASCENDING), ("category", ASCENDING)])
    print("Index created on courses.title and courses.category")
except Exception as e:
    print(f"Error creating index on courses.title and courses.category: {e}")


# Course search by title 
try:
    db.courses.create_index([("title", TEXT)])
    print("Text index created on courses.title (for full-text search)")
except Exception as e:
    print(f"Error creating text index on courses.title: {e}")



#Assignment queries by due date
try:
    db.assignments.create_index([("dueDate", ASCENDING)])
    print("Index created on assignments.dueDate")
except Exception as e:
    print(f"Error creating index on assignments.dueDate: {e}")



#Enrollment queries by student and course (compound index)
try:
    db.enrollments.create_index([("studentId", ASCENDING), ("courseId", ASCENDING)])
    print("Index created on enrollments.studentId and enrollments.courseId")
except Exception as e:
    print(f"Error creating index on enrollments.studentId and enrollments.courseId: {e}")

Index created on users.email
Index created on courses.title and courses.category
Text index created on courses.title (for full-text search)
Index created on assignments.dueDate
Index created on enrollments.studentId and enrollments.courseId


## Query Optimization


### Defining the Explain Function

In [17]:
#Define explain function to return explain plan execution stats.

def explain_query(collection, query, projection=None):
    try:
        explain_result = collection.find(query, projection).explain()
        exec_stats = explain_result.get('executionStats', {})
        return {
            'total_docs_examined': exec_stats.get('totalDocsExamined', 0),
            'total_keys_examined': exec_stats.get('totalKeysExamined', 0),
            'winning_plan': explain_result.get('queryPlanner', {}).get('winningPlan', {}).get('stage', 'Unknown')
        }
    except Exception as e:
        return {'error': str(e)}


### Defining the Time function

In [18]:
#Define timing function to Time a query function multiple times and return avg, min, max.

def time_query(query_func, iterations=5):
    import time
    try:
        times = []
        result = []
        for _ in range(iterations):
            start = time.perf_counter()
            result = list(query_func())
            end = time.perf_counter()
            times.append((end - start) * 1000)  # milliseconds
        return {
            'avg_time_ms': sum(times) / len(times),
            'min_time_ms': min(times),
            'max_time_ms': max(times),            
        }
    except Exception as e:
        return {'error': str(e)}

### Defining Queries To Test

In [19]:
#Defining queries to test

queries = [
    {
        'name': 'Find user by email',
        'collection': db['users'],
        'query': {'email': 'stephensrobert@example.org'},
        'projection': {'_id': 0, 'userId': 1, 'email': 1}
    },
    {
        'name': 'Courses in category Data Science',
        'collection': db['courses'],
        'query': {'category': 'Web Development'},
        'projection': {'_id': 0, 'courseId': 1, 'title': 1}
    },
    {
        'name': 'Assignments due in next 7 days',
        'collection': db['assignments'],
        'query': {'dueDate': {'$lte': datetime.now() + timedelta(days=7)}},
        'projection': {'_id': 0, 'assignmentId': 1, 'dueDate': 1}
    }
]


### Defining Functions 

In [20]:
# List to store results
query_results = []

# Defining the  function to get execution stats of a query safely using explain()
def safe_explain(collection, query, projection=None):
    try:
        explain_result = collection.find(query, projection).explain()
        exec_stats = explain_result.get('executionStats', {})
        return {
            'total_docs_examined': exec_stats.get('totalDocsExamined', 0),
            'total_keys_examined': exec_stats.get('totalKeysExamined', 0),
            'winning_plan': explain_result.get('queryPlanner', {}).get('winningPlan', {}).get('stage', 'Unknown')
        }
    except Exception as e:
        return {'Error': str(e)}


# Defining the function to get execution stats of a query using time()
def safe_time(query_func, runs=3):
    import time, statistics
    try:
        times = []
        for _ in range(runs):
            start = time.perf_counter()
            _ = list(query_func())
            times.append((time.perf_counter() - start) * 1000)
        return {
            'avg_time_ms': sum(times)/len(times),
            'min_time_ms': min(times),
            'max_time_ms': max(times)
        }
    except Exception as e:
        return {'Error': str(e)}


### Analyzing Query Performance

In [21]:
#Analyze all queries
for stage in ['Before', 'After']:
    if stage == 'After':
        # Add indexes to optimize
        try:
            db['courses'].create_index([('category', 1)])
            db['assignments'].create_index([('dueDate', 1)])
            
        except Exception as e:
            print("Error creating indexes:", e)

    for q in queries:
        collection = q['collection']
        query = q['query']
        projection = q.get('projection')

        explain_stats = safe_explain(collection, query, projection)
        timing_stats = safe_time(lambda: collection.find(query, projection))

        # Combine stats and add to list
        query_results.append({
            'Query Name': q['name'],
            'Optimization': stage,
            **explain_stats,
            **timing_stats
        })

# Convert to DataFrame
df_stats = pd.DataFrame(query_results)

# Split into Before and After DataFrames
df_before = df_stats[df_stats['Optimization'] == 'Before'].reset_index(drop=True)
df_after = df_stats[df_stats['Optimization'] == 'After'].reset_index(drop=True)

print("BEFORE Optimization")
display(df_before)

print("\nAFTER Optimization")
display(df_after)



BEFORE Optimization


Unnamed: 0,Query Name,Optimization,total_docs_examined,total_keys_examined,winning_plan,avg_time_ms,min_time_ms,max_time_ms
0,Find user by email,Before,1,1,PROJECTION_SIMPLE,3.662264,0.624416,7.031625
1,Courses in category Data Science,Before,3,3,PROJECTION_SIMPLE,1.549639,0.61275,3.377333
2,Assignments due in next 7 days,Before,3,3,PROJECTION_SIMPLE,0.291041,0.269916,0.328708



AFTER Optimization


Unnamed: 0,Query Name,Optimization,total_docs_examined,total_keys_examined,winning_plan,avg_time_ms,min_time_ms,max_time_ms
0,Find user by email,After,1,1,PROJECTION_SIMPLE,0.33493,0.269791,0.424125
1,Courses in category Data Science,After,3,3,PROJECTION_SIMPLE,1.435667,0.308542,3.203875
2,Assignments due in next 7 days,After,3,3,PROJECTION_SIMPLE,0.503167,0.236042,0.765792
