In [4]:
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter

# Load the LangChain paper (a PDF) from the course-hosted URL.
# `pdf_document` is the input for both text splitters and the parent-document retriever below.
paper_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/96-FDF8f7coh0ooim7NyEQ/langchain-paper.pdf"
pdf_loader = PyPDFLoader(paper_url)
pdf_document = pdf_loader.load()

# Load content from the LangChain website.
# NOTE(review): `web_document` is not referenced by any later cell visible in this notebook.
web_url = "https://python.langchain.com/v0.2/docs/introduction/"
web_loader = WebBaseLoader(web_url)
web_document = web_loader.load()

# Create two different text splitters so their chunking can be compared.

# splitter_1 splits on a single fixed separator (newline): 300-char chunks, 30-char overlap.
splitter_1 = CharacterTextSplitter(chunk_size=300, chunk_overlap=30, separator="\n")

# splitter_2 tries each separator in order, falling back to the next one only when a
# piece is still larger than chunk_size. The list previously ended in a duplicated ""
# (the second one can never be reached); the whitespace separator " " is what belongs
# there, so that over-long sentences are broken between words before the final
# per-character fallback ("").
splitter_2 = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    separators=["\n\n", "\n", ".", " ", ""],
)

# Split the same PDF pages with both splitters.
chunks_1 = splitter_1.split_documents(pdf_document)
chunks_2 = splitter_2.split_documents(pdf_document)

def display_document_stats(docs, name):
    """Print summary statistics for a list of document chunks.

    Args:
        docs: Sequence of chunk objects, each exposing a ``page_content`` string
            and a ``metadata`` dict. May be empty.
        name: Label shown in the header line of the report.

    Returns:
        None. The report is written to stdout.
    """
    total_chunks = len(docs)
    # Measure each chunk once; the totals and the min/max below reuse this list.
    lengths = [len(doc.page_content) for doc in docs]
    total_chars = sum(lengths)
    avg_chunk_size = total_chars / total_chunks if total_chunks > 0 else 0

    # Collect the union of metadata keys across all documents
    all_metadata_keys = set()
    for doc in docs:
        all_metadata_keys.update(doc.metadata.keys())

    # Print the statistics
    print(f"\n=== {name} Statistics ===")
    print(f"Total number of chunks: {total_chunks}")
    print(f"Average chunk size: {avg_chunk_size:.2f} characters")
    # Sets have no stable iteration order; sort so the report is reproducible across runs.
    print(f"Metadata keys preserved: {', '.join(sorted(all_metadata_keys))}")

    if docs:
        print("\nExample chunk:")
        # Use the chunk at index 5 (the sixth chunk), or the last one if there are fewer.
        example_doc = docs[min(5, total_chunks - 1)]
        preview = example_doc.page_content[:150]
        # Only show an ellipsis when the preview actually cut the content short.
        if len(example_doc.page_content) > 150:
            preview += "..."
        print(f"Content (first 150 chars): {preview}")
        print(f"Metadata: {example_doc.metadata}")

        # Length distribution
        print(f"Min chunk size: {min(lengths)} characters")
        print(f"Max chunk size: {max(lengths)} characters")

# Report the chunk statistics for each splitter, one after the other
display_document_stats(docs=chunks_1, name="Splitter 1")
display_document_stats(docs=chunks_2, name="Splitter 2")

USER_AGENT environment variable not set, consider setting it to identify your requests.



=== Splitter 1 Statistics ===
Total number of chunks: 95
Average chunk size: 263.80 characters
Metadata keys preserved: page_label, source, moddate, total_pages, creationdate, author, title, producer, page, creator

Example chunk:
Content (first 150 chars): comprehensive support within the field of mental health. 
Additionally, the paper discusses the implementation of 
Streamlit to enhance the user ex pe...
Metadata: {'producer': 'PyPDF', 'creator': 'Microsoft Word', 'creationdate': '2023-12-31T03:50:13+00:00', 'author': 'IEEE', 'moddate': '2023-12-31T03:52:06+00:00', 'title': 's8329 final', 'source': 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/96-FDF8f7coh0ooim7NyEQ/langchain-paper.pdf', 'total_pages': 6, 'page': 0, 'page_label': '1'}
Min chunk size: 49 characters
Max chunk size: 299 characters

=== Splitter 2 Statistics ===
Total number of chunks: 57
Average chunk size: 452.74 characters
Metadata keys preserved: page_label, source, moddate, total_pages, crea

In [5]:
from langchain_ollama import OllamaEmbeddings

In [6]:
# Embedding model used for every vector store in this notebook
embeddings = OllamaEmbeddings(model="llama3")

In [7]:
# Pull the raw text out of every chunk produced by splitter_1
texts = [chunk.page_content for chunk in chunks_1]

# Embed all chunk texts, then preview the first five components of the first vector
embedding_result = embeddings.embed_documents(texts)
embedding_result[0][:5]

[0.008193675, -0.009552899, -0.012233876, 0.0048862672, 0.0035001717]

# Vector stores


One of the most common ways to store and search over unstructured data is to embed the text data and store the resulting embedding vectors, and then at query time to embed the unstructured query and retrieve the embedding vectors that are 'most similar' to the embedded query. You can use a [vector store](https://python.langchain.com/v0.1/docs/modules/data_connection/vectorstores/) to store embedded data and perform vector search for you.


In [8]:
from langchain_chroma import Chroma

In [9]:
# Embed the splitter_1 chunks and index them in a Chroma vector store
docsearch = Chroma.from_documents(documents=chunks_1, embedding=embeddings)

In [10]:
# Search the vector store built from chunks_1 with a plain-text query
query = "Langchain"
docs = docsearch.similarity_search(query)
# Show only the text of the first hit
print(docs[0].page_content)

Keywords —Large Language models , LangChain, Chatbot, 
Pretrained models, Mental health, Mental health support. 
I. INTRODUCTION 
The issue of mental health is an international situation, 
affecting people in each particularly developed nations and 
emerging markets. According to the World Health


#### Retrievers
A retriever is an interface that returns documents using an unstructured query. Retrievers are more general than a vector store. A retriever does not need to be able to store documents, only to return (or retrieve) them. You can still use vector stores as the backbone of a retriever. Note that other types of retrievers also exist.

Retrievers accept a string query as input and return a list of Documents as output.

##### **Vector store-backed retrievers** 
Vector store retrievers are retrievers that use a vector store to retrieve documents. They are a lightweight wrapper around the vector store class to make it conform to the retriever interface. They use the search methods implemented by a vector store, such as similarity search and MMR (Maximum marginal relevance), to query the texts in the vector store.

Now that you have constructed the vector store `docsearch`, you can easily construct a retriever from it, as shown in the following code.

In [11]:
# Use the docsearch vector store as a retriever
# This wraps the vector store in the retriever interface (string query in, Documents out)
retriever = docsearch.as_retriever()

# Invoke the retriever with the query "Langchain"
# This will:
# 1. Convert the query text "Langchain" into an embedding vector
# 2. Perform a similarity search in the vector store using this embedding
# 3. Return the most semantically similar documents to the query
docs = retriever.invoke("Langchain")
# Print only the text (page_content) of the first retrieved document.
# Each returned Document also carries a `metadata` dict (source, page numbers, etc.),
# which is not shown here.
# NOTE(review): the name `retriever` is rebound to a ParentDocumentRetriever in a later
# cell, so which object it refers to depends on the cells executed so far.
print(docs[0].page_content)

Keywords —Large Language models , LangChain, Chatbot, 
Pretrained models, Mental health, Mental health support. 
I. INTRODUCTION 
The issue of mental health is an international situation, 
affecting people in each particularly developed nations and 
emerging markets. According to the World Health


##### **Parent document retrievers**
When splitting documents for retrieval, there are often conflicting goals:

- You want small documents so their embeddings can most accurately reflect their meaning. If the documents are too long, then the embeddings can lose meaning.
- You want to have long enough documents to retain the context of each chunk of text.

The `ParentDocumentRetriever` strikes that balance by splitting and storing small chunks of data. During retrieval, this retriever first fetches the small chunks, but then looks up the parent IDs for the data and returns those larger documents.

In [12]:
from langchain_classic.retrievers import ParentDocumentRetriever
from langchain_classic.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [13]:
# Set up two different text splitters for a hierarchical splitting approach:

# 1. Parent splitter creates larger chunks (2000 characters)
# This is used to split documents into larger, more contextually complete sections
parent_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=20, separator='\n')

# 2. Child splitter creates smaller chunks (400 characters)
# This is used to split the parent chunks into smaller pieces for more precise retrieval
child_splitter = CharacterTextSplitter(chunk_size=400, chunk_overlap=20, separator='\n')

# Create a Chroma vector store with:
# - A specific collection name "split_parents" for organization
# - The previously configured Ollama embeddings object (`embeddings`) as the embedding function
vectorstore = Chroma(
    collection_name="split_parents", embedding_function=embeddings
)

# Set up an in-memory storage layer for the parent documents
# This will store the larger chunks that provide context, but won't be directly embedded
store = InMemoryStore()

In [16]:
# Build the hierarchical (parent/child) retriever from the pieces configured above:
#   vectorstore     - Chroma collection holding the embeddings of the small child chunks;
#                     this is what gets searched by similarity
#   docstore        - in-memory store for the larger parent chunks; these are not embedded
#                     and are looked up by ID when needed
#   child_splitter  - produces the small (400-char) chunks used for precise matching
#   parent_splitter - produces the large (2000-char) chunks returned for fuller context
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

In [17]:
# Feed the loaded PDF pages to the parent-document retriever configured with
# `vectorstore` (child chunks) and `store` (parent chunks).
# NOTE(review): re-running this cell on a live kernel presumably adds the same documents
# again - confirm before re-executing without a kernel restart.
retriever.add_documents(pdf_document)

In [18]:
# Number of keys currently held in the parent docstore
sum(1 for _ in store.yield_keys())

16

In [19]:
# Query the vector store directly (bypassing the retriever) to see the small child chunks
sub_docs = vectorstore.similarity_search("Langchain")

In [20]:
# Text of the first child chunk returned by the direct vector-store search
print(sub_docs[0].page_content)

and the chatbot's responses, allowing for a 
dynamic and coherent conversation flow. 
• Chatmodel Class of LangChain: The LangChain


In [21]:
# Same query through the ParentDocumentRetriever, for comparison with the direct search
retrieved_docs = retriever.invoke("Langchain")

In [22]:
# Text of the first document returned by the retriever
print(retrieved_docs[0].page_content)

their mental health questions, kicking off a series of 
interactions with the LangChain framework. This is where the 
magic happens – LangChain acts as the brain behind the 
chatbot, working through various components like chat 
message templates and a memory concept to create a 
personalized and responsive support system.  Each step is 
broken down. 
Step 1. User Interface:  Developed using the Streamlit 
framework, the user interface welcomes users with a 
message explaining the role of the chatbot in providing 
mental health support. It assures users of a safe and 
confidential space to express their concerns.  
Step 2. User Input - Prompt: Users can input mental health-
related questions or seek advice by typing their queries 
into the input box integrated into the Streamlit interface. 
Step 3. Data Transfer to LangChain: Implement the 
functionality that sends the user's input (question) as a 
chat prompt template to the LangChain framework. This 
input serves as the "human messag

##### **RetrievalQA** 

Now that you understand how to retrieve information from a document, you can explore some more exciting applications. For instance, you could have a large language model (LLM) read the paper and summarize it for you, or create a QA bot that answers your questions based on the paper.

Here's an example using LangChain's `RetrievalQA`.

In [27]:
from langchain_ollama import ChatOllama 

# Chat model that generates the answers in the RetrievalQA example
chat_llm = ChatOllama(model="llama3", temperature=0.8, num_predict=256)

In [26]:
from langchain_classic.chains import RetrievalQA

In [29]:
# Assemble the RetrievalQA chain:
#   llm                     - the language model that writes the answer
#   chain_type="stuff"      - all retrieved documents are concatenated and passed to the LLM
#   retriever               - docsearch exposed through the retriever interface
#   return_source_documents - False, so only the generated answer is returned
qa = RetrievalQA.from_chain_type(
    llm=chat_llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    return_source_documents=False,
)

# Ask a question about the paper; the last expression displays the chain's response
query = "What is this paper discussing?"
qa.invoke(query)

{'query': 'What is this paper discussing?',
 'result': 'This paper appears to be discussing mental health challenges, particularly suicide attempts and the remedies that are lacking for individuals struggling with these issues. It touches on the idea that there is a gap in the assistance and care available for people dealing with intellectual health troubles, which can exacerbate their struggles.'}