Vector Storage using Hugging Face Embeddings

In [2]:
import os
import os.path
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.langchain import LangChainLLM
from langchain_groq import ChatGroq
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.response.pprint_utils import pprint_response

# GROQ_API_KEY must be available in the environment; load_dotenv() reads it
# from a local .env file so the key never has to be written in the notebook.
from dotenv import load_dotenv
load_dotenv()

# Groq chat model built through LangChain, then wrapped for use by LlamaIndex
langchain_llm = ChatGroq(model="llama3-70b-8192", temperature=0.0, max_retries=2)
llm = LangChainLLM(llm=langchain_llm)

# Local embedding model
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Register both models as the global LlamaIndex settings
Settings.llm = llm
Settings.embed_model = embed_model

# Directory where the index is persisted between runs
PERSIST_DIR = "./storage"

# Only treat the storage as reusable when the persisted store files are
# actually present. A bare directory check would send an empty or
# half-written storage directory (e.g. after an interrupted run) down the
# load path, where loading fails instead of rebuilding.
index_is_persisted = all(
    os.path.exists(os.path.join(PERSIST_DIR, store_file))
    for store_file in ("docstore.json", "index_store.json")
)

if not index_is_persisted:
    # Read the documents under ./data and build a fresh index from them
    print("Creating new index...")
    documents = SimpleDirectoryReader("data").load_data()
    index = VectorStoreIndex.from_documents(
        documents,
        show_progress=True
    )
    # Write the index to disk so later runs can take the load path instead
    print(f"Persisting index to {PERSIST_DIR}...")
    index.storage_context.persist(persist_dir=PERSIST_DIR)
else:
    # Reuse the previously persisted index
    print(f"Loading existing index from {PERSIST_DIR}...")
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)

# Filter that drops retrieved nodes scoring below the 0.60 similarity cutoff
postprocessor = SimilarityPostprocessor(similarity_cutoff=0.60)

# Retriever returning the top 4 most similar nodes from the index
retriever = VectorIndexRetriever(index=index, similarity_top_k=4)

# Query engine combining the retriever with the similarity filter
query_engine = RetrieverQueryEngine(retriever=retriever, node_postprocessors=[postprocessor])

# Questions to run against the index
queries = [
    "What is invehicle infotainment?",
    "What are subwoofers?",
    "What are amplifiers?"
]

# Horizontal rule used to frame each query's output
divider = "=" * 50

# Answer each question in turn
for query in queries:
    # Header: blank line, rule, the question, rule
    print("\n" + divider, f"QUERY: {query}", divider, sep="\n")

    response = query_engine.query(query)

    # Answer together with the source nodes it was built from
    print("\nRESPONSE WITH SOURCES:")
    pprint_response(response, show_source=True)

    # Answer text on its own
    print("\nRESPONSE TEXT ONLY:")
    print(response)
    print(divider)

python-dotenv could not parse statement starting at line 1
python-dotenv could not parse statement starting at line 9


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Creating new index...


Parsing nodes: 100%|██████████| 15/15 [00:01<00:00, 11.39it/s]
Generating embeddings: 100%|██████████| 19/19 [00:03<00:00,  5.56it/s]


Persisting index to ./storage...

QUERY: What is invehicle infotainment?

RESPONSE WITH SOURCES:
Final Response: In-vehicle infotainment (IVI) is a technology that
integrates entertainment, multimedia, and driver-assisting
technologies into a single module. It provides a range of features,
including rear seat entertainment, external connectivity, connectivity
to mobile devices, advanced driver assistance systems, and security
systems. IVI systems are designed to provide excellent entertainment
facilities, assist drivers while parking, alert them to congested
traffic routes, and suggest alternative paths. They also provide
internet connectivity inside the car.
______________________________________________________________________
Source Node 1/4
Node ID: 88cb1410-693e-4702-ac47-60401adaeb4b
Similarity: 0.7787228429712563
Text: International Journal of Advanced Computational Engineering and
Networking , ISSN: 2320-2106  Volume- 1, Issue- 7, Sept-2013   In-
Vehicle Infotainment Systems   

Vector Storage using Sentence Window Embeddings

In [1]:
import os
import shutil

from dotenv import load_dotenv
from langchain_groq import ChatGroq
from llama_index.core import (
    Settings,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.langchain import LangChainLLM

# Read environment variables (e.g. the Groq API key) from a local .env file
load_dotenv()

# --- Configuration ---
DATA_DIR = "./data"                        # folder holding the source documents
PERSIST_DIR = "./storage_sentence_window"  # where the sentence window index is persisted
WINDOW_SIZE = 3                            # passed as window_size to SentenceWindowNodeParser
MODEL_NAME = "llama3-70b-8192"             # Groq model used as the LLM
TEMPERATURE = 0.1                          # sampling temperature for the LLM

# --- LLM: Groq chat model wrapped for LlamaIndex, registered globally ---
langchain_llm = ChatGroq(
    model=MODEL_NAME,
    temperature=TEMPERATURE,
    max_retries=2,
)
llm = LangChainLLM(llm=langchain_llm)
Settings.llm = llm

# --- Embeddings: Hugging Face model, registered globally ---
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = embed_model

# The documents are read from DATA_DIR, so it must be an actual directory.
# os.path.exists() would also accept a regular file with that name, and the
# failure would then only surface later inside the directory reader.
if not os.path.isdir(DATA_DIR):
    raise FileNotFoundError(f"Data directory '{DATA_DIR}' not found. Please create it and add documents.")

# Check if storage directory is valid
def is_valid_storage_dir(directory):
    """Report whether ``directory`` contains the persisted index files.

    Args:
        directory: Path of the storage directory to inspect.

    Returns:
        True when ``directory`` exists and holds both ``docstore.json`` and
        ``index_store.json``; False otherwise.
    """
    if os.path.exists(directory):
        for filename in ("docstore.json", "index_store.json"):
            # One missing file is enough to reject the directory
            if not os.path.exists(os.path.join(directory, filename)):
                return False
        return True
    return False

# Create, rebuild, or load the sentence window index.
# Every successful path leaves `index` bound, so code that runs after this
# cell never depends on an `index` left over from another cell or an
# earlier kernel session.
try:
    # A storage directory without the expected files cannot be loaded;
    # remove it so the index is rebuilt below.
    if os.path.exists(PERSIST_DIR) and not is_valid_storage_dir(PERSIST_DIR):
        print(f"Invalid or corrupted storage directory '{PERSIST_DIR}'. Deleting and rebuilding...")
        shutil.rmtree(PERSIST_DIR)

    if not os.path.exists(PERSIST_DIR):
        print(f"Creating sentence window index in '{PERSIST_DIR}'...")
        # Load documents
        documents = SimpleDirectoryReader(DATA_DIR).load_data()
        if not documents:
            raise ValueError(f"No documents found in '{DATA_DIR}'.")

        # Create node parser; the window and the original sentence are stored
        # in node metadata under the "window" and "original_text" keys
        node_parser = SentenceWindowNodeParser.from_defaults(
            window_size=WINDOW_SIZE,
            window_metadata_key="window",
            original_text_metadata_key="original_text"
        )

        # Parse nodes
        nodes = node_parser.get_nodes_from_documents(documents)

        # Create and persist index
        storage_context = StorageContext.from_defaults()
        index = VectorStoreIndex(nodes, storage_context=storage_context)
        index.storage_context.persist(persist_dir=PERSIST_DIR)
        print(f"Index successfully created and saved to '{PERSIST_DIR}'.")
    else:
        # Load the persisted index so `index` is defined on this path too;
        # previously this branch only printed a message and bound nothing.
        storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
        index = load_index_from_storage(storage_context)
        print(f"Valid index found in '{PERSIST_DIR}'. Ready to use.")
except Exception as e:
    # Report the failure in the cell output, then re-raise so the cell still fails
    print(f"Error creating index: {str(e)}")
    raise

  from .autonotebook import tqdm as notebook_tqdm
python-dotenv could not parse statement starting at line 1
python-dotenv could not parse statement starting at line 9


Valid index found in './storage_sentence_window'. Ready to use.
