# Setup and Installation

In [None]:
!pip install langchain langchain_openai faiss-cpu chromadb qdrant-client transformers sentence-transformers datasets matplotlib numpy tqdm

import os
import numpy as np
import matplotlib.pyplot as plt
import time
from tqdm import tqdm
from contextlib import contextmanager

# Set up for timing measurements
@contextmanager
def timing(label):
    """Print how long the wrapped ``with`` block took to run.

    Args:
        label: Text printed in front of the elapsed time.

    The elapsed time is printed even if the block raises; the exception
    itself still propagates to the caller.
    """
    # perf_counter() is monotonic and high resolution. time.time() follows the
    # wall clock, which can be adjusted while the block runs and is too coarse
    # for the sub-millisecond searches measured in this notebook.
    start_time = time.perf_counter()
    try:
        yield
    finally:
        elapsed = time.perf_counter() - start_time
        print(f"{label}: {elapsed:.3f} seconds")

# Import necessary packages
import faiss
from langchain.vectorstores import FAISS, Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
# Note: If using a newer version of LangChain, we'll create a custom reranker
from langchain.retrievers import MultiQueryRetriever
from langchain_openai import OpenAI
from datasets import load_dataset

# OpenAI credentials: do not hard-code a key in the notebook. Export
# OPENAI_API_KEY before starting the kernel (or load it from your secrets store).
# Assigning a placeholder here would overwrite a real key that is already set
# and make the OpenAI clients look configured when they are not, so a missing
# or placeholder key is removed instead and reported.
if os.environ.get("OPENAI_API_KEY", "").strip() in ("", "YOUR_OPENAI_API_KEY"):
    os.environ.pop("OPENAI_API_KEY", None)
    print("OPENAI_API_KEY is not set - OpenAI-backed steps will fall back or be skipped.")

print("Setup complete!")

**Part 1: Generate Sample Data**

In [4]:
# =================================================================
# Part 1: Generate Sample Data for Experimentation
# =================================================================

# Preferred corpus: a small slice of ArXiv paper abstracts.
print("Loading sample dataset...")

try:
    # First 100 records of the training split
    dataset = load_dataset("arxiv_dataset", split="train[:100]")

    # One Document per paper: title + abstract as the text, the remaining
    # fields carried along as metadata.
    documents = []
    for item in dataset:
        paper_text = f"{item['title']}\n\n{item['abstract']}"
        paper_metadata = {
            field: item[field]
            for field in ("title", "authors", "categories", "update_date")
        }
        doc = Document(page_content=paper_text, metadata=paper_metadata)
        documents.append(doc)

except Exception as e:
    # Any failure while loading switches to a generated corpus so the rest of
    # the notebook still has documents to work with.
    print(f"Error loading dataset: {e}")
    print("Creating synthetic dataset instead...")

    # Hand-written seed document
    perovskite_review = Document(
        page_content=(
            "Recent Advances in Perovskite Solar Cells: A Comprehensive Review\n\n"
            "This paper presents the latest developments in perovskite solar cell technology, "
            "including improvements in efficiency, stability, and manufacturing processes. "
            "We discuss how the efficiency of perovskite cells has increased from 3.8% in 2009 "
            "to over 25.7% in recent demonstrations, surpassing many conventional silicon technologies. "
            "The paper also addresses remaining challenges including long-term stability, "
            "lead toxicity concerns, and scalable manufacturing techniques."
        ),
        metadata={
            "title": "Recent Advances in Perovskite Solar Cells",
            "authors": "Zhang, J., Williams, R., Johnson, T.",
            "categories": "Physics, Materials Science",
            "update_date": "2023-02-15",
        },
    )
    documents = [perovskite_review]

    # Templated documents: 80 entries cycling through the topics below
    topics = [
        "Solar Energy", "Wind Power", "Hydroelectric Systems", "Nuclear Fusion",
        "Battery Storage", "Smart Grids", "Hydrogen Fuel Cells", "Bioenergy",
        "Geothermal Power", "Quantum Computing", "Artificial Intelligence",
        "Climate Modeling", "Carbon Capture", "Renewable Integration",
    ]

    for i in range(80):
        topic = topics[i % len(topics)]
        month = (i % 12) + 1
        day = (i % 28) + 1
        templated_text = (
            f"Advances in {topic}: New Frontiers and Opportunities\n\n"
            f"This research explores the latest developments in {topic} technology, "
            "with a focus on improving efficiency and reducing costs. "
            "We present novel approaches that have demonstrated significant "
            "improvements over traditional methods and discuss potential "
            "applications in both industrial and consumer settings."
        )
        doc = Document(
            page_content=templated_text,
            metadata={
                "title": f"Advances in {topic}",
                "authors": f"Author{i+1}, A., Author{i+2}, B.",
                "categories": "Energy, Technology",
                "update_date": f"2023-{month:02d}-{day:02d}",
            },
        )
        documents.append(doc)

print(f"Created dataset with {len(documents)} documents")

# Choose an embedding model
# Option A: OpenAI embeddings (requires a valid API key)
try:
    embeddings = OpenAIEmbeddings()
    # Building the client does not call the API, so an invalid or placeholder
    # key would only surface later, in the middle of the experiments. Embed a
    # short probe now so any credential or network problem triggers the
    # fallback below instead.
    embeddings.embed_query("connectivity check")
    print("Using OpenAI embeddings")
except Exception as e:
    # Option B: Local Hugging Face embeddings (free alternative)
    print(f"Error with OpenAI embeddings: {e}")
    print("Falling back to local Hugging Face embeddings")
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

Loading sample dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.25k [00:00<?, ?B/s]

arxiv_dataset.py:   0%|          | 0.00/4.67k [00:00<?, ?B/s]

The repository for arxiv_dataset contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/arxiv_dataset.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
Error loading dataset:                 The dataset arxiv_dataset with config default requires manual data.
                Please follow the manual download instructions:
                     You need to go to https://www.kaggle.com/Cornell-University/arxiv,
and manually download the dataset. Once it is completed,
a zip folder named archive.zip will be appeared in your Downloads folder
or whichever folder your browser chooses to save files to. Extract that folder
and you would get a arxiv-metadata-oai-snapshot.json file
You can then move that file under <path/to/folder>.
The <path/to/folder> can e.g. be "~/manual_data".
arxiv_dataset can then be loaded using the following 

**Part 2: Indexing Strategies - HNSW**

In [None]:
# =================================================================
# Part 2: Indexing Strategies - HNSW
# =================================================================
print("\n=== Exploring HNSW Indexing ===")

# Create a simple function to generate document vectors for experimenting with FAISS directly
def get_vectors(documents, embedding_model, batch_size=32):
    """Embed the text of each document and return the vectors as one array.

    Args:
        documents: Sequence of objects exposing a ``page_content`` string.
        embedding_model: Object providing ``embed_documents(list_of_texts)``;
            if that method is missing, ``embed_query(text)`` is called once
            per document instead.
        batch_size: Number of texts sent to ``embed_documents`` per call.

    Returns:
        ``np.float32`` array with one row per document, in input order.
    """
    texts = [doc.page_content for doc in documents]
    # Batch embedding avoids one round trip per document.
    embed_batch = getattr(embedding_model, "embed_documents", None)
    step = max(1, int(batch_size))

    vectors = []
    for start in tqdm(range(0, len(texts), step), desc="Creating embeddings"):
        batch = texts[start:start + step]
        if embed_batch is not None:
            vectors.extend(embed_batch(batch))
        else:
            vectors.extend(embedding_model.embed_query(text) for text in batch)
    return np.array(vectors, dtype=np.float32)

# Generate vectors
try:
    vectors = get_vectors(documents[:50], embeddings)  # Using a subset for speed
    # Fail with a readable message (and use the fallback below) if the
    # embedding step produced nothing usable.
    if vectors.ndim != 2 or len(vectors) == 0:
        raise ValueError(f"expected a non-empty 2-D embedding matrix, got shape {vectors.shape}")
    dimension = vectors.shape[1]
    print(f"Generated {len(vectors)} vectors of dimension {dimension}")
except Exception as e:
    print(f"Error generating vectors: {e}")
    # Create random vectors for demonstration if real embeddings fail.
    # A fixed seed keeps the numbers in the following cells identical between runs.
    dimension = 384  # Default for many models
    fallback_rng = np.random.default_rng(42)
    vectors = fallback_rng.random((50, dimension)).astype(np.float32)
    print(f"Created random vectors for demonstration: {vectors.shape}")

# Experiment with HNSW parameters
print("\nTesting HNSW with different parameters...")

# Define a sample query vector
query_vector = vectors[0]  # Using the first vector as a query for demonstration
query_matrix = query_vector.reshape(1, -1)  # FAISS expects a 2-D batch of queries

# Test different M values (number of connections)
m_values = [4, 16, 32, 64]
ef_search_values = [16, 64, 128, 256]

# Results storage: one dict per (M, efSearch) combination
results = []

for m in m_values:
    # efSearch only affects querying, so the graph is built once per M and
    # reused for every efSearch value instead of being rebuilt each time.
    try:
        index = faiss.IndexHNSWFlat(dimension, m)
        index.hnsw.efConstruction = 200  # Fixed for comparison

        with timing(f"HNSW Build (M={m})"):
            index.add(vectors)
    except Exception as e:
        print(f"Error building HNSW index (M={m}): {e}")
        continue

    for ef_search in ef_search_values:
        try:
            index.hnsw.efSearch = ef_search

            # Test search performance
            with timing(f"HNSW Search (M={m}, efSearch={ef_search})"):
                D, I = index.search(query_matrix, 5)

            results.append({
                'M': m,
                'efSearch': ef_search,
                'top_indices': I[0],
                'distances': D[0]
            })

            print(f"  M={m}, efSearch={ef_search}: Top indices: {I[0]}, Distances: {D[0]}")

        except Exception as e:
            print(f"Error with HNSW (M={m}, efSearch={ef_search}): {e}")

**Part 3: Indexing Strategies - IVF**

In [None]:
# =================================================================
# Part 3: Indexing Strategies - IVF
# =================================================================
print("\n=== Exploring IVF Indexing ===")

# Test different numbers of clusters
n_clusters_values = [4, 10, 20, 50]
nprobe_values = [1, 5, 10, 20]

for n_clusters in n_clusters_values:
    try:
        # Create quantizer (assigns each vector to its nearest cluster centroid)
        quantizer = faiss.IndexFlatL2(dimension)

        # Create an IVF index
        index = faiss.IndexIVFFlat(quantizer, dimension, n_clusters)

        # Train the index
        with timing(f"IVF Training (clusters={n_clusters})"):
            index.train(vectors)

        # Add vectors to the index
        with timing(f"IVF Add (clusters={n_clusters})"):
            index.add(vectors)

        # Probing more lists than exist is clamped to the number of lists, so
        # cap each requested value and drop duplicates: every printed line
        # then corresponds to a distinct, truthfully labelled search.
        effective_nprobes = sorted({min(nprobe, n_clusters) for nprobe in nprobe_values})

        query_matrix = query_vector.reshape(1, -1)
        for nprobe in effective_nprobes:
            index.nprobe = nprobe

            # Search. NOTE(review): when the probed lists hold fewer than 5
            # vectors, FAISS pads the result with index -1 - confirm on output.
            with timing(f"IVF Search (clusters={n_clusters}, nprobe={nprobe})"):
                D, I = index.search(query_matrix, 5)

            print(f"  Clusters={n_clusters}, nprobe={nprobe}: Top indices: {I[0]}, Distances: {D[0]}")

    except Exception as e:
        print(f"Error with IVF (clusters={n_clusters}): {e}")

**Part 4: Vector Compression Techniques**

In [None]:
# =================================================================
# Part 4: Vector Compression Techniques
# =================================================================
print("\n=== Exploring Vector Compression ===")

try:
    n_vectors = len(vectors)
    query_matrix = query_vector.reshape(1, -1)

    # Original size benchmark
    original_size = vectors.nbytes / 1024
    print(f"Original vectors size: {original_size:.2f} KB")

    # Scalar Quantization (8-bit)
    # The quantizer must be trained before vectors can be added; adding to an
    # untrained index raises.
    index_8bit = faiss.IndexScalarQuantizer(dimension, faiss.ScalarQuantizer.QT_8bit)
    with timing("8-bit Scalar Quantization"):
        index_8bit.train(vectors)
        index_8bit.add(vectors)

    # Scalar Quantization (4-bit)
    index_4bit = faiss.IndexScalarQuantizer(dimension, faiss.ScalarQuantizer.QT_4bit)
    with timing("4-bit Scalar Quantization"):
        index_4bit.train(vectors)
        index_4bit.add(vectors)

    # Product Quantization
    # m must divide dimension evenly, adjust if needed
    m = 8
    while dimension % m != 0:
        m -= 1

    # Each sub-quantizer learns 2**nbits centroids and k-means needs at least
    # that many training vectors, so cap nbits by the sample size
    # (bit_length() - 1 is floor(log2(n))).
    nbits = max(1, min(8, n_vectors.bit_length() - 1))

    index_pq = faiss.IndexPQ(dimension, m, nbits)  # m subquantizers with nbits bits each
    with timing(f"Product Quantization (m={m}, nbits={nbits})"):
        index_pq.train(vectors)
        index_pq.add(vectors)

    # Compare accuracy of different compression methods
    print("\nAccuracy Comparison:")

    # Ground truth (using exact search)
    index_flat = faiss.IndexFlatL2(dimension)
    index_flat.add(vectors)
    D_ref, I_ref = index_flat.search(query_matrix, 5)

    # Results with 8-bit quantization
    D_8bit, I_8bit = index_8bit.search(query_matrix, 5)

    # Results with 4-bit quantization
    D_4bit, I_4bit = index_4bit.search(query_matrix, 5)

    # Results with PQ
    D_pq, I_pq = index_pq.search(query_matrix, 5)

    # "matching" counts how many returned ids also appear in the exact top 5
    print(f"Reference results: {I_ref[0]}")
    print(f"8-bit quant results: {I_8bit[0]} (matching: {np.sum(np.isin(I_8bit[0], I_ref[0]))})")
    print(f"4-bit quant results: {I_4bit[0]} (matching: {np.sum(np.isin(I_4bit[0], I_ref[0]))})")
    print(f"PQ results: {I_pq[0]} (matching: {np.sum(np.isin(I_pq[0], I_ref[0]))})")

    # Estimated size comparison (codes only; trained codebooks are not counted)
    pq_size = n_vectors * index_pq.pq.code_size / 1024  # code_size is bytes per vector
    print("\nMemory Usage Comparison:")
    print(f"Original (32-bit float): {original_size:.2f} KB")
    print(f"8-bit quantization: ~{original_size/4:.2f} KB")
    print(f"4-bit quantization: ~{original_size/8:.2f} KB")
    print(f"Product Quantization: ~{pq_size:.2f} KB")

except Exception as e:
    print(f"Error in compression experiments: {e}")

**Part 5: Semantic Chunking**

In [None]:
# =================================================================
# Part 5: Semantic Chunking
# =================================================================
print("\n=== Exploring Semantic Chunking ===")

# Markdown text with three heading levels (#, ##, ###), used as the input for
# the chunking comparison.
sample_markdown = """# Introduction to Renewable Energy

## Solar Power
Solar power is a clean, renewable energy source that harnesses energy from the sun.
Photovoltaic cells convert sunlight directly into electricity.
Solar thermal systems use the sun's heat for water heating or power generation.

## Wind Energy
Wind turbines convert the kinetic energy of wind into mechanical power.
This mechanical power can be used for specific tasks or converted into electricity.
Offshore wind farms can take advantage of strong, consistent ocean winds.

## Hydroelectric Power
Hydroelectric power captures energy from flowing water.
Large dams create reservoirs, releasing water through turbines to generate electricity.
Run-of-river systems use natural river flow with minimal environmental impact.

### Micro-Hydro Systems
Smaller hydroelectric systems can power individual communities.
These systems have minimal environmental impact and are suitable for remote areas.

## Conclusion
Renewable energy sources provide sustainable alternatives to fossil fuels.
Continued innovation will increase efficiency and reduce costs.
"""

# Wrap the text in a Document so the splitters can carry its metadata along
sample_document = Document(
    page_content=sample_markdown,
    metadata={"title": "Introduction to Renewable Energy"},
)

# Compare different chunking strategies
print("\nComparing different chunking approaches:")

# Basic character splitter
basic_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=0,
    separators=["\n\n", "\n", " ", ""]
)

# Semantic splitter with hierarchy.
# Header separators are anchored to the start of a line: a bare "# " is also a
# substring of "## " and "### ", so it would cut every lower-level header
# between its "#" characters and leave stray "#" fragments in the chunks.
semantic_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=50,
    separators=["\n# ", "\n## ", "\n### ", "\n\n", "\n", " ", ""]
)

# Apply both splitting strategies
basic_chunks = basic_splitter.split_documents([sample_document])
semantic_chunks = semantic_splitter.split_documents([sample_document])

print(f"\nBasic splitting created {len(basic_chunks)} chunks")
print(f"Semantic splitting created {len(semantic_chunks)} chunks")


def preview_chunks(heading, chunks, limit=3):
    """Print ``heading`` followed by the first 100 characters of the first ``limit`` chunks."""
    print(f"\n{heading}")
    for i, chunk in enumerate(chunks[:limit]):
        print(f"Chunk {i+1}: {chunk.page_content[:100]}...")


# Display examples of both chunking methods
preview_chunks("Basic Chunking Example:", basic_chunks)
preview_chunks("Semantic Chunking Example:", semantic_chunks)

**Part 6: Re-ranking with Contextual Compression**

In [None]:
# =================================================================
# Part 6: Re-ranking with Contextual Compression
# =================================================================
print("\n=== Exploring Re-ranking ===")

# Create a simple vector store with our documents for retrieval experiments
try:
    vectorstore = FAISS.from_documents(documents, embeddings)
    print(f"Created FAISS vectorstore with {len(documents)} documents")

    # Define a test query
    test_query = "advances in solar cell efficiency and materials"

    # Standard retrieval
    with timing("Standard retrieval"):
        standard_results = vectorstore.similarity_search(test_query, k=5)

    print("\nStandard Retrieval Results:")
    for i, doc in enumerate(standard_results):
        print(f"{i+1}. {doc.metadata['title']}")

    # LLM-based contextual compression on top of the vector search.
    # LLMChainExtractor (imported at the top of the notebook) is used as the
    # compressor; the previous code referenced an undefined `LLMRerank` class
    # and an undefined `compression_retriever`, so this step always failed.
    try:
        llm = OpenAI(temperature=0)
        compressor = LLMChainExtractor.from_llm(llm)

        compression_retriever = ContextualCompressionRetriever(
            base_compressor=compressor,
            base_retriever=vectorstore.as_retriever(
                search_kwargs={"k": 10}  # Retrieve more docs initially; the LLM narrows them down
            )
        )

        # Get compressed documents (invoke() is the current retriever entry point)
        with timing("LLM re-ranking"):
            reranked_docs = compression_retriever.invoke(test_query)

        print("\nContextually Compressed/Extracted Results:")
        for i, doc in enumerate(reranked_docs):
            print(f"{i+1}. {doc.metadata['title']}")

    except Exception as e:
        print(f"Error with LLM re-ranking (may require API key): {e}")
        print("Skipping LLM-based re-ranking demonstration")

    # Multi-Query Retrieval demonstration
    try:
        # Create a retriever that generates multiple query variations
        multi_query_retriever = MultiQueryRetriever.from_llm(
            retriever=vectorstore.as_retriever(),
            llm=OpenAI()
        )

        # The retriever will generate variations of the query,
        # perform separate searches, and combine the results
        with timing("Multi-query retrieval"):
            multi_query_results = multi_query_retriever.invoke(
                "How do newer types of solar cells compare to silicon?"
            )

        print("\nMulti-Query Retrieval Results:")
        for i, doc in enumerate(multi_query_results[:5]):  # Display top 5
            print(f"{i+1}. {doc.metadata['title']}")

    except Exception as e:
        print(f"Error with multi-query retrieval (may require API key): {e}")
        print("Skipping multi-query retrieval demonstration")

except Exception as e:
    print(f"Error in retrieval experiments: {e}")


**Part 7: Performance Analysis and Visualization**

In [None]:
# =================================================================
# Part 7: Performance Analysis and Visualization
# =================================================================
print("\n=== Performance Analysis and Visualization ===")

try:
    print("Generating performance visualizations...")

    # ---- Figure 1: HNSW parameter impact ----
    # Illustrative (synthetic) curves based on theoretical behaviour, not on
    # the measurements taken earlier in the notebook.
    m_values_extended = [4, 8, 16, 32, 64, 128]
    times_relative = [0.8, 0.85, 1.0, 1.3, 1.8, 2.5]  # Relative search times
    accuracy_relative = [0.6, 0.75, 0.88, 0.95, 0.98, 0.99]  # Relative accuracy

    hnsw_fig, hnsw_ax = plt.subplots(figsize=(10, 6))
    hnsw_ax.plot(m_values_extended, times_relative, 'o-', label='Relative Search Time')
    hnsw_ax.plot(m_values_extended, accuracy_relative, 's-', label='Relative Accuracy')
    hnsw_ax.set_xlabel('M (connections per layer)')
    hnsw_ax.set_ylabel('Relative Performance')
    hnsw_ax.set_title('Impact of HNSW Parameters on Search Performance')
    hnsw_ax.legend()
    hnsw_ax.grid(True)
    hnsw_fig.savefig('hnsw_performance.png')

    # ---- Figure 2: vector compression methods ----
    # Theoretical percentages relative to uncompressed 32-bit floats.
    compression_methods = ['32-bit Float', '8-bit Quant', '4-bit Quant', 'PQ (m=8)']
    memory_usage = [100, 25, 12.5, 3]  # Percentage of original memory
    accuracy = [100, 95, 85, 75]  # Percentage of original accuracy

    width = 0.35
    x = range(len(compression_methods))
    accuracy_positions = [i + width for i in x]    # second bar of each pair
    tick_positions = [i + width/2 for i in x]      # centred between the two bars

    comp_fig, comp_ax = plt.subplots(figsize=(10, 6))
    comp_ax.bar(x, memory_usage, width, label='Memory Usage (%)')
    comp_ax.bar(accuracy_positions, accuracy, width, label='Accuracy (%)')
    comp_ax.set_xlabel('Compression Method')
    comp_ax.set_ylabel('Percentage')
    comp_ax.set_title('Trade-off: Memory Usage vs. Accuracy')
    comp_ax.set_xticks(tick_positions)
    comp_ax.set_xticklabels(compression_methods)
    comp_ax.legend()
    comp_ax.grid(True, axis='y')
    comp_fig.savefig('compression_tradeoff.png')

    print("Visualizations created: hnsw_performance.png and compression_tradeoff.png")

except Exception as e:
    print(f"Error creating visualizations: {e}")

print("\nNotebook execution complete!")

# Note: Some sections of this notebook require API keys and may be skipped if they are not provided.
# The core concepts and techniques are demonstrated regardless of API availability.