# Comprehensive Evaluation - Benchmark Comparison with EDA

This notebook includes:
1. **Exploratory Data Analysis (EDA)** of the movie dataset
2. **Performance comparison** of 4 index structures (KD-Tree, QuadTree, Range Tree, R-Tree)
3. **Visualizations** and statistical analysis

---
# Part 1: Exploratory Data Analysis (EDA)
---

## 1. Import Libraries

In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

print('All libraries imported successfully')

## 2. Load Movie Dataset

In [None]:
# Read the cleaned movie table from the notebook's working directory.
df_movies = pd.read_csv('data_movies_clean.csv')

# Derive a release year when a release date column is available.
# errors='coerce' turns unparseable dates into NaT, which yields a NaN year.
if 'release_date' in df_movies.columns:
    release_dates = pd.to_datetime(df_movies['release_date'], errors='coerce')
    df_movies['release_year'] = release_dates.dt.year

# Report the size and schema of what was loaded.
print(
    f'Loaded {len(df_movies)} movies',
    f'\nDataset shape: {df_movies.shape}',
    f'\nColumns: {list(df_movies.columns)}',
    sep='\n',
)

## 3. Dataset Overview

In [None]:
# Banner on stdout, then the first five rows as the cell's rich output.
print('First 5 movies:', '=' * 100, sep='\n')
df_movies.head(5)

## 4. Basic Statistics

In [None]:
# Banner on stdout, then describe() of the numeric columns as the cell output.
print('Statistical Summary:', '=' * 100, sep='\n')
df_movies.describe()

## 5. Missing Data Analysis

In [None]:
# Null count per column, and the same count as a percentage of all rows.
missing_data = df_movies.isnull().sum()
missing_percent = missing_data / len(df_movies) * 100

# One row per column; keep only columns that actually have gaps and list
# the worst offenders first.
missing_df = (
    pd.DataFrame({
        'Column': missing_data.index,
        'Missing Count': missing_data.values,
        'Missing %': missing_percent.values,
    })
    .loc[lambda frame: frame['Missing Count'] > 0]
    .sort_values('Missing Count', ascending=False)
)

print('Missing Data Summary:')
print('=' * 80)
if missing_df.empty:
    print('No missing values found!')
else:
    display(missing_df)

## 6. Data Distribution Visualizations

In [None]:
# Visualize distributions of key numerical features on a 2x3 grid.


def _plot_distribution(ax, values, title, xlabel, color, bins=50):
    """Draw one histogram panel on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        values: pandas Series of values to bin; NaNs are dropped first.
        title: Panel title.
        xlabel: X-axis label.
        color: Bar fill colour.
        bins: Number of histogram bins.

    Returns:
        True if a histogram was drawn, False if the panel was hidden because
        there was nothing left to plot after dropping NaNs.
    """
    values = values.dropna()
    if values.empty:
        # Hide the panel instead of leaving a blank, unlabeled frame.
        ax.set_visible(False)
        return False
    ax.hist(values, bins=bins, color=color, edgecolor='black')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.grid(alpha=0.3)
    return True


# (column, panel title, x-axis label, bar colour, plot log10 of positive values?)
# Order matches the row-major order of the 2x3 grid.
distribution_panels = [
    ('popularity', 'Popularity Distribution', 'Popularity', 'skyblue', False),
    ('vote_average', 'Vote Average Distribution', 'Vote Average', 'lightcoral', False),
    ('runtime', 'Runtime Distribution', 'Runtime (minutes)', 'lightgreen', False),
    ('budget', 'Budget Distribution (log10)', 'Log10(Budget)', 'gold', True),
    ('release_year', 'Release Year Distribution', 'Year', 'plum', False),
    ('vote_count', 'Vote Count Distribution (log10)', 'Log10(Vote Count)', 'orange', True),
]

fig, axes = plt.subplots(2, 3, figsize=(18, 10))

for ax, (column, title, xlabel, color, use_log10) in zip(axes.flat, distribution_panels):
    if column not in df_movies.columns:
        # Column absent from this dataset: hide its panel.
        ax.set_visible(False)
        continue
    values = df_movies[column]
    if use_log10:
        # log10 is only defined for positive values; zeros and NaNs are excluded.
        values = np.log10(values[values > 0])
    _plot_distribution(ax, values, title, xlabel, color)

plt.tight_layout()
plt.show()

## 7. Correlation Analysis

In [None]:
# Candidate numerical columns for the correlation analysis; only the ones
# present in the loaded dataset are used.
numerical_cols = ['popularity', 'vote_average', 'vote_count', 'runtime', 'budget', 'release_year']
available_cols = [col for col in numerical_cols if col in df_movies.columns]

if len(available_cols) < 2:
    print('Not enough numerical columns for correlation analysis')
else:
    # Listwise deletion: only rows with a value in every selected column are kept.
    df_corr = df_movies[available_cols].dropna()

    if len(df_corr) < 2:
        # With fewer than two complete rows every coefficient would be NaN.
        print('Not enough complete rows for correlation analysis')
    else:
        correlation_matrix = df_corr.corr()
        # Label from the matrix itself so ticks always match the cells drawn.
        labels = list(correlation_matrix.columns)
        corr_values = correlation_matrix.to_numpy()

        # Manual heatmap without seaborn; colour scale fixed to [-1, 1].
        fig, ax = plt.subplots(figsize=(10, 8))
        im = ax.imshow(corr_values, cmap='coolwarm', aspect='auto', vmin=-1, vmax=1)

        # Set ticks and labels
        ax.set_xticks(np.arange(len(labels)))
        ax.set_yticks(np.arange(len(labels)))
        ax.set_xticklabels(labels, rotation=45, ha='right')
        ax.set_yticklabels(labels)

        # Annotate each cell with its coefficient (x is the column, y is the row).
        for row_idx, row in enumerate(corr_values):
            for col_idx, value in enumerate(row):
                ax.text(col_idx, row_idx, f'{value:.2f}',
                        ha='center', va='center', color='black', fontsize=10)

        # Add colorbar
        cbar = plt.colorbar(im, ax=ax)
        cbar.set_label('Correlation Coefficient', rotation=270, labelpad=20)

        ax.set_title('Feature Correlation Matrix', fontsize=14, fontweight='bold', pad=20)
        plt.tight_layout()
        plt.show()

## 8. Top Movies Analysis

In [None]:
# Ranking parameters, named so they can be tuned in one place.
TOP_N = 10             # how many movies each ranking shows
MIN_VOTE_COUNT = 100   # minimum votes required to enter the rating ranking

# Top movies by popularity. Requires both the ranking column and the title.
if {'title', 'popularity'}.issubset(df_movies.columns):
    print(f'Top {TOP_N} Most Popular Movies:')
    print('=' * 100)
    # Optional context columns are appended only when the dataset has them.
    cols_to_show = ['title', 'popularity'] + [
        col for col in ('vote_average', 'release_year') if col in df_movies.columns
    ]

    top_popular = df_movies.nlargest(TOP_N, 'popularity')[cols_to_show]
    display(top_popular)

print('\n')

# Top movies by vote average, restricted to movies with enough votes.
# 'title' is checked here as well: it is always selected below, so a dataset
# without it would otherwise raise KeyError.
if {'title', 'vote_average', 'vote_count'}.issubset(df_movies.columns):
    print(f'Top {TOP_N} Highest Rated Movies (min {MIN_VOTE_COUNT} votes):')
    print('=' * 100)
    high_votes = df_movies[df_movies['vote_count'] >= MIN_VOTE_COUNT]
    if len(high_votes) > 0:
        cols_to_show = ['title', 'vote_average', 'vote_count']
        if 'release_year' in df_movies.columns:
            cols_to_show.append('release_year')
        top_rated = high_votes.nlargest(TOP_N, 'vote_average')[cols_to_show]
        display(top_rated)
    else:
        print(f'No movies with {MIN_VOTE_COUNT}+ votes found')

## 9. 5D Vector Space Analysis

In [None]:
# Load the 5D vectors used for indexing and print per-dimension statistics.
# If the file is missing, vectors_5d is set to None so later cells can skip.
dim_names = ['Popularity', 'Vote Average', 'Runtime', 'Budget', 'Release Year']

try:
    vectors_5d = np.load('movie_5d_vectors.npy')
except FileNotFoundError:
    print('movie_5d_vectors.npy not found')
    vectors_5d = None
else:
    # The statistics below index one column per dimension name, so fail early
    # with a clear message if the array does not have that layout.
    if vectors_5d.ndim != 2 or vectors_5d.shape[1] < len(dim_names):
        raise ValueError(
            f'Expected a 2-D array with at least {len(dim_names)} columns, '
            f'got shape {vectors_5d.shape}'
        )

    print(f'✓ Loaded {len(vectors_5d)} 5D vectors')
    print(f'Vector shape: {vectors_5d.shape}')
    print(f'\nVector dimensions: [popularity, vote_average, runtime, budget, release_year]')

    # Statistics for each dimension
    print('\n5D Vector Statistics:')
    print('=' * 80)
    for i, name in enumerate(dim_names):
        column = vectors_5d[:, i]
        print(f'{name:15s}: min={column.min():10.2f}, max={column.max():10.2f}, mean={column.mean():10.2f}')

## 10. 5D Vector Pairwise Scatter Plots

In [None]:
if vectors_5d is None:
    print('Cannot create scatter plots - vectors not loaded')
else:
    # Scatter four selected dimension pairs, one per panel of a 2x2 grid.
    # Each pair is (x column index, y column index) into vectors_5d.
    dim_names = ['Popularity', 'Vote Avg', 'Runtime', 'Budget', 'Year']
    pairs = [(0, 1), (0, 2), (1, 2), (2, 4)]

    fig, axes = plt.subplots(2, 2, figsize=(14, 12))

    # axes.flat walks the grid row by row, matching the order of `pairs`.
    for ax, (x_dim, y_dim) in zip(axes.flat, pairs):
        x_name = dim_names[x_dim]
        y_name = dim_names[y_dim]
        ax.scatter(vectors_5d[:, x_dim], vectors_5d[:, y_dim], alpha=0.6, s=50, color='steelblue')
        ax.set_xlabel(x_name, fontsize=11, fontweight='bold')
        ax.set_ylabel(y_name, fontsize=11, fontweight='bold')
        ax.set_title(f'{x_name} vs {y_name}', fontsize=12, fontweight='bold')
        ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()

---
# Part 2: Index Structure Performance Evaluation
---

## 11. Load Benchmark Results

In [None]:
# Read the benchmark measurements from the notebook's working directory.
df_benchmark = pd.read_csv('benchmark_results.csv')

# Distinct index structures and dataset sizes present in the results.
index_types = df_benchmark['type'].unique()
dataset_sizes = df_benchmark['size'].unique()

print(f'Loaded {len(df_benchmark)} benchmark records')
print(f'\nIndex types: {index_types}')
print(f'Dataset sizes: {dataset_sizes}')
print('\nBenchmark Results Preview:')
df_benchmark.head(10)

## 12. Load Memory Profiling Results

In [None]:
# Memory profiling results are optional: when the CSV is missing, df_memory
# is set to None and the cells that use it check for that.
try:
    df_memory = pd.read_csv('memory_profiling_results.csv')
except FileNotFoundError:
    print('memory_profiling_results.csv not found')
    df_memory = None
else:
    print(f'Loaded {len(df_memory)} memory profiling records')
    print('\nMemory Profiling Results:')
    display(df_memory)

## 13. Load Performance Summary

In [None]:
# Location of the optional JSON summary.
results_dir = Path('evaluation_results')
summary_file = results_dir.joinpath('performance_summary.json')

# The summary is optional: performance_data is None when the file is absent.
if not summary_file.exists():
    print('performance_summary.json not found')
    performance_data = None
else:
    with summary_file.open('r') as f:
        performance_data = json.load(f)
    print('Performance summary loaded')
    print(f'\nKeys: {list(performance_data.keys())}')

## 14. Visualization - Build Time Comparison

In [None]:
# Build time versus dataset size, one line per index structure.
fig, ax = plt.subplots(1, 1, figsize=(12, 6))

for index_type in df_benchmark['type'].unique():
    # Sort by size so the line runs left to right even if the CSV rows are
    # not ordered; an unsorted x axis would draw a zig-zag.
    data = df_benchmark[df_benchmark['type'] == index_type].sort_values('size')
    ax.plot(data['size'], data['build_time'], marker='o', label=index_type.upper(), linewidth=2, markersize=8)

ax.set_xlabel('Dataset Size', fontsize=12, fontweight='bold')
ax.set_ylabel('Build Time (seconds)', fontsize=12, fontweight='bold')
ax.set_title('Index Build Time Comparison', fontsize=14, fontweight='bold')
ax.legend(fontsize=10)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 15. Visualization - Query Performance

In [None]:
def _plot_query_times(ax, benchmark, time_col, ylabel, title):
    """Plot one query-time metric against dataset size, one line per index type.

    Rows whose ``time_col`` is not strictly positive are skipped, and index
    types with no remaining rows get no line.

    Args:
        ax: Matplotlib axes to draw on.
        benchmark: DataFrame with 'type', 'size' and ``time_col`` columns.
        time_col: Name of the timing column to plot.
        ylabel: Y-axis label.
        title: Panel title.

    Returns:
        Number of lines drawn.
    """
    lines_drawn = 0
    for index_type in benchmark['type'].unique():
        data = benchmark[benchmark['type'] == index_type]
        # Sort by size so each line runs left to right regardless of CSV order.
        valid = data[data[time_col] > 0].sort_values('size')
        if len(valid) > 0:
            ax.plot(valid['size'], valid[time_col], marker='o', label=index_type.upper(), linewidth=2, markersize=8)
            lines_drawn += 1

    ax.set_xlabel('Dataset Size', fontsize=12, fontweight='bold')
    ax.set_ylabel(ylabel, fontsize=12, fontweight='bold')
    ax.set_title(title, fontsize=14, fontweight='bold')
    if lines_drawn:
        # Only build a legend when there is at least one labelled line.
        ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)
    return lines_drawn


fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Range Query
_plot_query_times(axes[0], df_benchmark, 'range_query_time', 'Range Query Time (s)', 'Range Query Performance')

# KNN Query
_plot_query_times(axes[1], df_benchmark, 'knn_time', 'KNN Query Time (s)', 'KNN Query Performance')

plt.tight_layout()
plt.show()

## 16. Visualization - Memory Usage

In [None]:
# Memory usage versus dataset size, one line per index structure.
# Skipped when the memory profiling CSV could not be loaded (df_memory is None).
if df_memory is not None:
    fig, ax = plt.subplots(1, 1, figsize=(12, 6))

    for index_type in df_memory['Index Structure'].unique():
        # Sort by dataset size so the line runs left to right even if the
        # CSV rows are not ordered.
        data = df_memory[df_memory['Index Structure'] == index_type].sort_values('Dataset Size')
        ax.plot(data['Dataset Size'], data['tracemalloc Allocated (MB)'], marker='o', label=index_type.upper(), linewidth=2, markersize=8)

    ax.set_xlabel('Dataset Size', fontsize=12, fontweight='bold')
    ax.set_ylabel('Memory Usage (MB)', fontsize=12, fontweight='bold')
    ax.set_title('Memory Usage Comparison', fontsize=14, fontweight='bold')
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
else:
    print('No memory data available')

## 17. Performance Summary Table

In [None]:
# One summary row per index structure. Query averages use only strictly
# positive timings and are shown as 'N/A' when no such timing exists.
summary_data = []

for index_type in df_benchmark['type'].unique():
    data = df_benchmark[df_benchmark['type'] == index_type]
    range_times = data['range_query_time']
    knn_times = data['knn_time']

    avg_build = data['build_time'].mean()
    avg_range = range_times[range_times > 0].mean()
    avg_knn = knn_times[knn_times > 0].mean()

    row = {
        'Method': f'{index_type.upper()} (standalone)',
        'Avg Build Time (s)': f'{avg_build:.4f}',
    }
    row['Avg Range Query (s)'] = 'N/A' if pd.isna(avg_range) else f'{avg_range:.4f}'
    row['Avg KNN Query (s)'] = 'N/A' if pd.isna(avg_knn) else f'{avg_knn:.4f}'
    summary_data.append(row)

summary_df = pd.DataFrame(summary_data)

banner = '=' * 80
print(banner)
print('PERFORMANCE SUMMARY')
print(banner)
summary_df

## 18. Key Findings

In [None]:
def _lowest_mean_group(frame, group_col, value_col, positive_only=False):
    """Return the group with the lowest mean of ``value_col``, or None.

    Args:
        frame: DataFrame holding ``group_col`` and ``value_col``.
        group_col: Column to group by.
        value_col: Column whose per-group mean is compared.
        positive_only: When True, rows with ``value_col`` <= 0 are dropped
            before averaging.

    Returns:
        The label of the group with the smallest mean, or None when no group
        has a usable mean (no rows left, or every mean is NaN).
    """
    if positive_only:
        frame = frame[frame[value_col] > 0]
    means = frame.groupby(group_col)[value_col].mean().dropna()
    # idxmin() raises on an empty Series, so report "no winner" explicitly.
    return None if means.empty else means.idxmin()


def _finding_label(group):
    """Format a winning group for display; 'N/A' when there is none."""
    return 'N/A' if group is None else str(group).upper()


print('=' * 80)
print('KEY FINDINGS')
print('=' * 80)

fastest_build = _lowest_mean_group(df_benchmark, 'type', 'build_time')
print(f'\n1. Fastest Build Time: {_finding_label(fastest_build)}')

# Only strictly positive KNN timings are averaged.
fastest_knn = _lowest_mean_group(df_benchmark, 'type', 'knn_time', positive_only=True)
print(f'2. Fastest KNN Query: {_finding_label(fastest_knn)}')

if df_memory is not None:
    most_efficient = _lowest_mean_group(df_memory, 'Index Structure', 'tracemalloc Allocated (MB)')
    print(f'3. Most Memory Efficient: {_finding_label(most_efficient)}')

print('\n' + '=' * 80)

---
# Conclusions

This analysis demonstrates:

1. **Data Characteristics**: The movie dataset shows diverse distributions across popularity, ratings, and temporal dimensions
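2. **Index Performance**: In the benchmark runs analysed for this write-up, KD-Tree consistently outperforms the other structures in build time and query speed; re-check the Performance Summary table and Key Findings output above whenever the benchmark results are regenerated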
3. **Memory Efficiency**: Different index structures have varying memory footprints, with trade-offs between speed and space
4. **Scalability**: Performance characteristics vary significantly with dataset size, particularly for Range Tree

The choice of index structure should be based on:
- Application requirements (build vs query time priority)
- Data characteristics (distribution, dimensionality)
- Query patterns (range size, frequency)
- System constraints (memory, latency)