In [7]:
from langchain_core.documents import Document
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_ibm import WatsonxEmbeddings
from ibm_watsonx_ai.metanames import EmbedTextParamsMetaNames
from langchain.chains import RetrievalQA
from ibm_watsonx_ai.foundation_models import ModelInference
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
from ibm_watsonx_ai.foundation_models.utils.enums import ModelTypes
from ibm_watson_machine_learning.foundation_models.extensions.langchain import WatsonxLLM

# 1. Load a document about AI
loader = WebBaseLoader("https://python.langchain.com/v0.2/docs/introduction/")
documents = loader.load()

# 2. Split the document into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

# 3. Set up the embedding model
embed_params = {
    EmbedTextParamsMetaNames.TRUNCATE_INPUT_TOKENS: 3,
    EmbedTextParamsMetaNames.RETURN_OPTIONS: {"input_text": True},
}

embedding_model = WatsonxEmbeddings(
    model_id="ibm/slate-125m-english-rtrvr-v2",
    url="https://us-south.ml.cloud.ibm.com",
    project_id="skills-network",
    params=embed_params,
)

# 4. Create a vector store
vector_store = Chroma.from_documents(chunks, embedding_model)

# 5. Create a retriever
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# 6. Define a function to search for relevant information
def search_documents(query, top_k=3):
    """Search for documents relevant to a query"""
    # Use the retriever to get relevant documents
    docs = retriever.get_relevant_documents(query)
    
    # Limit to top_k if specified
    return docs[:top_k]

# 7. Test with a few queries
test_queries = [
    "What is LangChain?",
    "How do retrievers work?",
    "Why is document splitting important?"
]

for query in test_queries:
    print(f"\nQuery: {query}")
    results = search_documents(query)
    
    # Print the results
    print(f"Found {len(results)} relevant documents:")
    for i, doc in enumerate(results):
        print(f"\nResult {i+1}: {doc.page_content[:150]}...")
        print(f"Source: {doc.metadata.get('source', 'Unknown')}")


 Query: What is Langchain?
Found 3 relevant documents:

Result 1: ecosystem evolvesCopy pageLangChain is the easy way to start building completely custom agents and applications powered by LLMs....
Source: https://python.langchain.com/v0.2/docs/introduction/

Result 2: ecosystem evolvesCopy pageLangChain is the easy way to start building completely custom agents and applications powered by LLMs....
Source: https://python.langchain.com/v0.2/docs/introduction/

Result 3: ecosystem evolvesCopy pageLangChain is the easy way to start building completely custom agents and applications powered by LLMs....
Source: https://python.langchain.com/v0.2/docs/introduction/

 Query: how do retreivers work?
Found 3 relevant documents:

Result 1: ecosystem evolvesCopy pageLangChain is the easy way to start building completely custom agents and applications powered by LLMs....
Source: https://python.langchain.com/v0.2/docs/introduction/

Result 2: ecosystem evolvesCopy pageLangChain is the easy way to s