## Section 1: Project Structure Setup

**Key Design**: Implementation logic lives in `.py` files under `src/`, configuration lives in YAML (`config/index_config.yaml`), and this notebook handles inference and evaluation.

## Section 2: Import Implementation Modules

In [None]:
# Add src directory to path
import sys
from pathlib import Path

# Resolve the project layout relative to the notebook's working directory.
project_root = Path.cwd()
src_path = project_root.joinpath('src')

# Put src/ at the front of the import search path, but only once so that
# re-running this cell does not keep prepending duplicates.
if sys.path.count(str(src_path)) == 0:
    sys.path.insert(0, str(src_path))

# Report what was resolved so a wrong working directory is obvious immediately.
print(
    f"Project root: {project_root}",
    f"Source path: {src_path}",
    f"Source path exists: {src_path.exists()}",
    sep="\n",
)

In [None]:
# Import all modules from src package
# Project-local building blocks exposed by the `src` package.
# NOTE(review): in the cells shown in this notebook only IndexBuilder,
# TestQueryGenerator, MetricsCollector and Reporter are referenced directly;
# confirm whether the remaining names are needed here.
from src import (
    IndexInfo, DataStore, Compression, QueryProc, Optimizations,
    CompressionUtils,
    TextPreprocessor,
    InvertedIndex,
    BooleanExprParser, QueryProcessor,
    MetricsCollector, Reporter,
    IndexBuilder, TestQueryGenerator
)

# Helpers for the later cells: yaml reads the config file, numpy is used in the
# summary report, matplotlib and os are used by the plotting cell.
# NOTE(review): `tabulate` is not referenced in the cells visible here.
import yaml
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate
import os

# Reaching this line means every import above resolved.
print("‚úì All modules imported successfully")

## Section 3: Load Configuration

In [None]:
# Load configuration from YAML.
# After this cell `config` is always a dict, so later cells can call
# `config.get(...)` unconditionally.
config_path = project_root / 'config' / 'index_config.yaml'

if config_path.exists():
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(config_path, 'r', encoding='utf-8') as f:
        loaded_config = yaml.safe_load(f)

    if isinstance(loaded_config, dict):
        config = loaded_config
        print("‚úì Configuration loaded successfully")
    else:
        # safe_load returns None for an empty document and a list/scalar for a
        # non-mapping document; neither supports the .get() lookups used later.
        print(f"WARNING: config file at {config_path} is empty or not a mapping; using defaults")
        config = {}
else:
    print(f"‚ö†Ô∏è Config file not found at {config_path}")
    config = {}

In [None]:
# Display loaded configuration
# Also defines data_config / index_config / query_config for the next cell.
# `... or {}` is used instead of `.get(key, {})` because a section that is
# present but empty in the YAML (a bare `data:` line) loads as None, and the
# default passed to .get() only applies when the key is missing altogether.
print("\n" + "="*60)
print("LOADED CONFIGURATION")
print("="*60)

# Data configuration
data_config = (config or {}).get('data') or {}
print("\nüìÅ Data Settings:")
print(f"   Directory: {data_config.get('directory', 'N/A')}")
print(f"   Max Docs: {data_config.get('max_docs', 'Unlimited')}")

# Index configuration
index_config = (config or {}).get('index') or {}
print("\nüìä Index Settings:")
print(f"   Type: {index_config.get('type', 'BOOLEAN')}")
print(f"   Storage: {index_config.get('storage', 'CUSTOM')}")
print(f"   Compression: {index_config.get('compression', 'NONE')}")
print(f"   Optimization: {index_config.get('optimization', 'NULL')}")
print(f"   Query Strategy: {index_config.get('query_strategy', 'TERM_AT_A_TIME')}")

# Query configuration
query_config = (config or {}).get('query') or {}
print("\nüîç Query Settings:")
print(f"   Default Top-K: {query_config.get('default_top_k', 10)}")
print(f"   Max Results: {query_config.get('max_results', 1000)}")

print("\n" + "="*60)

In [None]:
# Extract key parameters for easy access
# A key that is present but null in the YAML yields None from .get(), which
# would break Path(None) / int(None); every lookup below therefore treats a
# null value the same as a missing key.
DATA_DIR = Path(data_config.get('directory') or 'free-news-datasets/News_Datasets')

_max_docs_raw = data_config.get('max_docs')
if _max_docs_raw is None:
    MAX_DOCS = 1_000_000
elif isinstance(_max_docs_raw, str):
    # YAML loaders may hand back "1e6"-style values as strings; int("1e6")
    # raises, so go through float first.
    MAX_DOCS = int(float(_max_docs_raw))
else:
    MAX_DOCS = int(_max_docs_raw)

INDEX_TYPE = index_config.get('type') or 'BOOLEAN'
STORAGE_TYPE = index_config.get('storage') or 'CUSTOM'
COMPRESSION_TYPE = index_config.get('compression') or 'NONE'
OPTIMIZATION_TYPE = index_config.get('optimization') or 'NULL'
QUERY_STRATEGY = index_config.get('query_strategy') or 'TERM_AT_A_TIME'

_top_k_raw = query_config.get('default_top_k')
DEFAULT_TOP_K = 10 if _top_k_raw is None else _top_k_raw

print(f"\n‚úì Configuration parameters extracted")
print(f"  DATA_DIR: {DATA_DIR}")
print(f"  MAX_DOCS: {MAX_DOCS}")
print(f"  INDEX_TYPE: {INDEX_TYPE}")

## Section 4: Initialize Models and Services

In [None]:
# Construct version string from configuration
# Format: SelfIndex-v1.<x><y><z><i><q>   (e.g. SelfIndex-v1.1110T)
#   x = index type, y = storage, z = compression, i = optimization, q = query strategy

# Map type names to codes
index_type_map = {'BOOLEAN': '1', 'WORDCOUNT': '2', 'TFIDF': '3'}
storage_map = {'CUSTOM': '1', 'SQLITE': '2', 'REDIS': '3'}
compression_map = {'NONE': '1', 'VARBYTE_ENCODING': '2', 'GZIP_COMPRESSION': '3'}
optimization_map = {'NULL': '0', 'SKIPPING': 'sp', 'THRESHOLDING': 'th', 'EARLY_STOPPING': 'es'}
strategy_map = {'TERM_AT_A_TIME': 'T', 'DOCUMENT_AT_A_TIME': 'D'}


def _code_for(setting_name, value, code_map, default_code):
    """Return the version-string code for ``value``.

    Args:
        setting_name: Human-readable name of the setting, used in the warning.
        value: Configured value to translate.
        code_map: Mapping from accepted values to their codes.
        default_code: Code used when ``value`` is not a key of ``code_map``.

    Returns:
        ``code_map[value]`` when the value is known, otherwise ``default_code``.
        The fallback is kept for backward compatibility, but it is reported so
        that a typo in the config cannot silently produce a version string
        that describes a different configuration.
    """
    if value in code_map:
        return code_map[value]
    print(
        f"WARNING: unknown {setting_name} {value!r}; "
        f"expected one of {sorted(code_map)}. Falling back to code '{default_code}'."
    )
    return default_code


x = _code_for('index type', INDEX_TYPE, index_type_map, '1')
y = _code_for('storage', STORAGE_TYPE, storage_map, '1')
z = _code_for('compression', COMPRESSION_TYPE, compression_map, '1')
i = _code_for('optimization', OPTIMIZATION_TYPE, optimization_map, '0')
q = _code_for('query strategy', QUERY_STRATEGY, strategy_map, 'T')

VERSION_STRING = f'SelfIndex-v1.{x}{y}{z}{i}{q}'

print(f"‚úì Version string constructed: {VERSION_STRING}")
print(f"  x={x} (IndexInfo: {INDEX_TYPE})")
print(f"  y={y} (DataStore: {STORAGE_TYPE})")
print(f"  z={z} (Compression: {COMPRESSION_TYPE})")
print(f"  i={i} (Optimization: {OPTIMIZATION_TYPE})")
print(f"  q={q} (QueryProc: {QUERY_STRATEGY})")

In [None]:
# Create the IndexBuilder for the version string assembled above
print("\nInitializing IndexBuilder with version:", VERSION_STRING)
builder = IndexBuilder(VERSION_STRING)

# Echo whatever settings the builder exposes through its `config` mapping
print("\nConfiguration:")
for option, setting in builder.config.items():
    print(f"  {option}: {setting}")

In [None]:
# Option 1: Build a new index
# Uncomment to build from data directory

# if DATA_DIR.exists():
#     print(f"Building index from {DATA_DIR}...")
#     index = builder.build_index(DATA_DIR, max_docs=MAX_DOCS)
# else:
#     print(f"‚ö†Ô∏è Data directory not found: {DATA_DIR}")
#     print("Please update DATA_DIR in configuration or skip to Option 2")

In [None]:
# Option 2: Load an existing index
print(f"\nAttempting to load existing index: {VERSION_STRING}")
try:
    index = builder.load_index()
    print(f"‚úì Index loaded successfully")
    print(f"  Number of documents: {index.num_docs}")
    print(f"  Number of unique terms: {len(index.index)}")
    print(f"  Average document length: {index.avg_doc_length:.2f}")
except Exception as e:
    print(f"‚úó Failed to load index: {e}")
    print(f"\nTo build a new index:")
    print(f"1. Ensure data directory exists at: {DATA_DIR}")
    print(f"2. Uncomment 'Option 1: Build a new index' cell above")
    # Keep `index` bound so later cells can test it (`if index:`) instead of
    # stopping with a NameError. An index that already exists (for example one
    # built by the Option 1 cell) is deliberately left untouched.
    if 'index' not in globals():
        index = None

In [None]:
# Initialize query processor
print(f"\nInitializing QueryProcessor...")
try:
    qp = builder.get_query_processor()
    print(f"‚úì Query processor initialized successfully")
    print(f"  Strategy: {builder.config['query_proc']}")
    print(f"  Optimization: {builder.config['optimization']}")
except Exception as e:
    print(f"‚úó Failed to initialize query processor: {e}")
    # Always leave `qp` bound, and never leave a processor from an earlier run
    # (possibly built for a different configuration) in place after a failure:
    # later cells would otherwise query through it and report stale results.
    qp = None

## Section 5: Execute Queries and Inference

In [None]:
# Smoke-test the ranked retrieval path with a fixed two-term query
test_query = "technology innovation"
print(f"\nExecuting test query: '{test_query}'")
print(f"Top {DEFAULT_TOP_K} results:\n")

try:
    results = qp.process_ranked_query(test_query, top_k=DEFAULT_TOP_K)

    if not results:
        print("No results found.")
    else:
        for rank, doc in enumerate(results, start=1):
            title = doc['title']
            # Titles longer than 60 characters are cut and marked with "..."
            if len(title) > 60:
                headline = f"{title[:60]}..."
            else:
                headline = f"{title}"
            print(f"{rank}. {headline}")
            print(f"   Author: {doc['author']}")
            print(f"   Published: {doc['published']}")
            print(f"   Score: {doc['score']:.4f}")
            print()
except Exception as e:
    print(f"‚úó Query execution failed: {e}")

In [None]:
# Exercise the boolean retrieval path with a two-term conjunction
boolean_query = '"artificial" AND "intelligence"'
print(f"\nExecuting boolean query: {boolean_query}")

try:
    result_docs = qp.process_boolean_query(boolean_query)
    match_count = len(result_docs)
    print(f"Found {match_count} documents matching the query")

    # Show the IDs in sorted order; only the first 20 when there are more
    ordered_ids = sorted(result_docs)
    if match_count > 20:
        print(f"Doc IDs: {ordered_ids[:20]}...")
    else:
        print(f"Doc IDs: {ordered_ids}")
except Exception as e:
    print(f"‚úó Boolean query failed: {e}")

In [None]:
# Generate multiple test queries
# Guarded like the other cells in this section: if generation fails (for
# example because the index could not be loaded), `test_queries` is still
# bound to an empty list so the metrics and summary cells keep working.
print("\nGenerating test queries...")
try:
    test_queries = TestQueryGenerator.generate_queries(index, num_queries=30)
    print(f"‚úì Generated {len(test_queries)} test queries")
except Exception as e:
    print(f"Failed to generate test queries: {e}")
    test_queries = []

if test_queries:
    print("\nSample queries:")
    for sample_no, query in enumerate(test_queries[:5], 1):
        print(f"  {sample_no}. {query}")

## Section 6: Generate and Visualize Metrics

In [None]:
# Collect latency, throughput, memory and index-size figures into `metrics`.
# Any failure along the way leaves `metrics` as an empty dict.
print("\nMeasuring performance metrics...")
print("This may take a moment...\n")

metrics_config = config.get('metrics', {})

try:
    # --- Latency -----------------------------------------------------------
    latency_reps = metrics_config.get('latency_repetitions', 1)
    # The same query subset is reused for the throughput run below
    latency_queries = test_queries[:metrics_config.get('throughput_queries', 25)]

    print(f"Measuring query latency ({len(latency_queries)} queries, {latency_reps} repetitions)...")
    latency_metrics = MetricsCollector.measure_query_latency(
        qp,
        latency_queries,
        top_k=DEFAULT_TOP_K,
        repetitions=latency_reps,
    )
    print("‚úì Latency measurement complete")

    # --- Throughput --------------------------------------------------------
    print("Measuring query throughput...")
    throughput = MetricsCollector.measure_throughput(
        qp,
        latency_queries,
        duration=5,
        repetitions=1,
    )
    print("‚úì Throughput measurement complete")

    # --- Memory and on-disk size ------------------------------------------
    print("Measuring memory usage and index size...")
    memory = MetricsCollector.measure_memory()
    index_size = MetricsCollector.measure_index_size(
        VERSION_STRING,
        builder.config['datastore'],
    )
    print("‚úì Memory and size measurement complete")

    # Only publish the results once every measurement has succeeded
    metrics = dict(
        latency=latency_metrics,
        throughput=throughput,
        memory=memory,
        index_size=index_size,
    )

except Exception as e:
    print(f"‚úó Metrics collection failed: {e}")
    metrics = {}

In [None]:
# Print the collected metrics through the Reporter, if there are any
if not metrics:
    print("No metrics available")
else:
    Reporter.print_metrics_report(VERSION_STRING, metrics)

In [None]:
# Two-panel figure: latency bar chart on the left, summary table on the right.
# The figure is also written to plot/metrics_<VERSION_STRING>.png.
if 'latency' not in metrics:
    print("No latency metrics available for visualization")
else:
    latency = metrics['latency']

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    ax1, ax2 = axes

    # ---- Left panel: one bar per latency statistic ----
    bar_spec = [
        ('mean', 'green'),
        ('median', 'blue'),
        ('p95', 'orange'),
        ('p99', 'red'),
        ('min', 'lightgreen'),
        ('max', 'lightcoral'),
    ]
    metrics_to_plot = [stat for stat, _colour in bar_spec]
    colors = [colour for _stat, colour in bar_spec]
    values = [latency.get(stat, 0) for stat in metrics_to_plot]

    bars = ax1.bar(metrics_to_plot, values, color=colors, alpha=0.7, edgecolor='black')
    ax1.set_ylabel('Latency (ms)', fontsize=12)
    ax1.set_title('Query Latency Metrics', fontsize=14, fontweight='bold')
    ax1.grid(axis='y', alpha=0.3)

    # Write each bar's value just above it
    for rect in bars:
        bar_top = rect.get_height()
        ax1.text(
            rect.get_x() + rect.get_width()/2.,
            bar_top,
            f'{bar_top:.2f}',
            ha='center', va='bottom', fontsize=9,
        )

    # ---- Right panel: summary statistics table (axes frame hidden) ----
    ax2.axis('tight')
    ax2.axis('off')

    latency_rows = [
        [label, f"{latency.get(stat, 0):.4f}"]
        for label, stat in (
            ('Mean Latency (ms)', 'mean'),
            ('Median Latency (ms)', 'median'),
            ('P95 Latency (ms)', 'p95'),
            ('P99 Latency (ms)', 'p99'),
            ('Std Dev (ms)', 'std'),
        )
    ]
    other_rows = [
        [label, f"{metrics.get(key, 0):.2f}"]
        for label, key in (
            ('Throughput (q/s)', 'throughput'),
            ('Memory (MB)', 'memory'),
            ('Index Size (MB)', 'index_size'),
        )
    ]
    summary_data = [['Metric', 'Value']] + latency_rows + other_rows

    table = ax2.table(
        cellText=summary_data,
        cellLoc='left',
        loc='center',
        colWidths=[0.6, 0.4],
    )
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1, 2)

    # Dark background with bold white text for the two header cells
    for column in (0, 1):
        header_cell = table[(0, column)]
        header_cell.set_facecolor('#40466e')
        header_cell.set_text_props(weight='bold', color='white')

    plt.suptitle(f'Performance Metrics - {VERSION_STRING}', fontsize=16, fontweight='bold', y=0.98)
    plt.tight_layout()

    # Persist the figure before showing it
    os.makedirs('plot', exist_ok=True)
    plot_path = os.path.join('plot', f'metrics_{VERSION_STRING}.png')
    plt.savefig(plot_path, dpi=150, bbox_inches='tight')
    print(f"‚úì Plot saved to {plot_path}")

    plt.show()

In [None]:
# Create a summary report
# Earlier cells can fail without binding `index`, `test_queries` or `metrics`
# (for example when the index cannot be loaded). Look them up defensively so
# the summary prints whatever is available instead of stopping with a NameError.
summary_index = globals().get('index')
summary_queries = globals().get('test_queries') or []
summary_metrics = globals().get('metrics') or {}

print("\n" + "="*70)
print("INFERENCE & EVALUATION SUMMARY REPORT")
print("="*70)

print("\nüìå Configuration")
print(f"  Version: {VERSION_STRING}")
print(f"  Index Type: {INDEX_TYPE}")
print(f"  Storage: {STORAGE_TYPE}")
print(f"  Compression: {COMPRESSION_TYPE}")
print(f"  Optimization: {OPTIMIZATION_TYPE}")
print(f"  Query Strategy: {QUERY_STRATEGY}")

print("\nüìä Index Statistics")
if summary_index:
    print(f"  Total Documents: {summary_index.num_docs}")
    print(f"  Unique Terms: {len(summary_index.index)}")
    print(f"  Avg Doc Length: {summary_index.avg_doc_length:.2f}")
    # Entries without a 'df' field are skipped; with none left np.mean would
    # return nan and emit a RuntimeWarning, so report N/A instead.
    doc_freqs = [info['df'] for info in summary_index.index.values() if 'df' in info]
    if doc_freqs:
        print(f"  Avg Docs per Term: {np.mean(doc_freqs):.2f}")
    else:
        print("  Avg Docs per Term: N/A")
else:
    print("  Index not available")

print("\n‚ö° Performance Metrics")
if summary_metrics:
    summary_latency = summary_metrics.get('latency', {})
    print(f"  Mean Latency: {summary_latency.get('mean', 0):.4f} ms")
    print(f"  P95 Latency: {summary_latency.get('p95', 0):.4f} ms")
    print(f"  Throughput: {summary_metrics.get('throughput', 0):.2f} queries/second")
    print(f"  Memory Usage: {summary_metrics.get('memory', 0):.2f} MB")
    print(f"  Index Size: {summary_metrics.get('index_size', 0):.2f} MB")
else:
    print("  No metrics available")

print("\nüîç Query Examples Tested")
for example_no, query in enumerate(summary_queries[:3], 1):
    print(f"  {example_no}. {query}")

print("\n" + "="*70)

## Additional: Run Multiple Benchmark Configurations

Uncomment the cells below to run benchmarks with different index configurations.

In [None]:
# # Benchmark different index configurations
# configurations = [
#     ('SelfIndex-v1.1110T', 'Boolean + Custom'),
#     ('SelfIndex-v1.2110T', 'WordCount + Custom'),
#     ('SelfIndex-v1.3110T', 'TF-IDF + Custom'),
# ]

# benchmark_results = {}

# for version, description in configurations:
#     print(f"\n{'='*60}")
#     print(f"Benchmarking: {description}")
#     print(f"Version: {version}")
#     print(f"{'='*60}")
#     
#     try:
#         builder = IndexBuilder(version)
#         index = builder.load_index()  # or build_index()
#         qp = builder.get_query_processor()
#         
#         queries = TestQueryGenerator.generate_queries(index, num_queries=20)
#         
#         benchmark_results[version] = {
#             'latency': MetricsCollector.measure_query_latency(qp, queries[:15]),
#             'throughput': MetricsCollector.measure_throughput(qp, queries[:15]),
#             'memory': MetricsCollector.measure_memory(),
#             'index_size': MetricsCollector.measure_index_size(version, builder.config['datastore'])
#         }
#         print("‚úì Benchmark complete")
#     except Exception as e:
#         print(f"‚úó Benchmark failed: {e}")

# # Compare results
# Reporter.compare_metrics(benchmark_results)