In [1]:
import os
import openai
import sys
import json
# Put the directory two levels up on the import path; the entry is relative,
# so it depends on the kernel's working directory.
sys.path.append('../..')

In [2]:
# Read the API key from the environment instead of hardcoding it;
# raises KeyError right here if OPENAI_API_KEY is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

## Load Source Text

In [3]:
from langchain_community.document_loaders import TextLoader
# Load the raw source text; the relative path is resolved against the
# kernel's working directory.
loader = TextLoader("data/fruits_and_veggies.txt")
documents = loader.load()

In [4]:
# Inspect the metadata attached to the first loaded document.
documents[0].metadata

{'source': 'data/fruits_and_veggies.txt'}

In [5]:
# Print the raw text so the line breaks of the source are visible.
print(documents[0].page_content)

The Amanita phalloides has a large and imposing epigeous (above ground) fruiting body (basidiocrap). 
A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all white. 
AA. Phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.
Gala apples are a popular variety known for their sweet flavor and crisp texture. 
They have a distinctive reddish-orange skin with yellow striping, making them visually appealing in fruit displays. 
Originally developed in New Zealand in the 1930s, they have since become a favorite in many countries and are widely cultivated for consumption. 
Their versatility makes them perfect for both eating fresh and using in various culinary dishes.
Radishes are small, root vegetables with a sharp, peppery flavor that can range from mild to spicy. 
They are usually round or cylindrical in shape and can come in various colors, including red, white, purple, and black. 
Rich in vitamins and minerals, radishes are often c

## Split the document

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [7]:
# Splitter settings used by the fixed-size baseline below:
# upper bound for each chunk and the overlap between consecutive chunks.
chunk_size = 100
chunk_overlap = 0

In [8]:
# Build the baseline splitter from the settings chosen in the previous cell.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = chunk_size,
    chunk_overlap = chunk_overlap
)

In [9]:
# Cut the loaded documents into fixed-size chunks (the baseline for this notebook).
chunks = text_splitter.split_documents(documents)

In [10]:
# Print every chunk followed by a divider so the cut points are easy to see.
for chunk in chunks:
    print(chunk.page_content)
    print("-----")

The Amanita phalloides has a large and imposing epigeous (above ground) fruiting body (basidiocrap).
-----
A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all white.
-----
AA. Phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.
-----
Gala apples are a popular variety known for their sweet flavor and crisp texture.
-----
They have a distinctive reddish-orange skin with yellow striping, making them visually appealing in
-----
fruit displays.
-----
Originally developed in New Zealand in the 1930s, they have since become a favorite in many
-----
countries and are widely cultivated for consumption.
-----
Their versatility makes them perfect for both eating fresh and using in various culinary dishes.
-----
Radishes are small, root vegetables with a sharp, peppery flavor that can range from mild to spicy.
-----
They are usually round or cylindrical in shape and can come in various colors, including red,
-----
white, purple, and

## Index chunks into a vector db (ChromaDB)

In [11]:
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

In [12]:
# Embed the baseline chunks with OpenAI embeddings and index them in Chroma.
# No collection name or persistence directory is passed, so Chroma's defaults apply.
vectorstore = Chroma.from_documents(documents=chunks, embedding=OpenAIEmbeddings())

### Test Similarity Search

In [13]:
# Sanity-check retrieval: list the indexed chunks closest to the query.
vectorstore.similarity_search("Amanita phalloides")

[Document(metadata={'source': 'data/fruits_and_veggies.txt'}, page_content='The Amanita phalloides has a large and imposing epigeous (above ground) fruiting body (basidiocrap).'),
 Document(metadata={'source': 'data/fruits_and_veggies.txt'}, page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all white.'),
 Document(metadata={'source': 'data/fruits_and_veggies.txt'}, page_content='AA. Phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.'),
 Document(metadata={'source': 'data/fruits_and_veggies.txt'}, page_content='fruit displays.')]

In [14]:
# Same query, but also return the score of every hit.
# Note that providers implement different scores; Chroma here
# returns a distance metric that should vary inversely with similarity,
# so smaller numbers mean closer matches.
vectorstore.similarity_search_with_score("Amanita phalloides")

[(Document(metadata={'source': 'data/fruits_and_veggies.txt'}, page_content='The Amanita phalloides has a large and imposing epigeous (above ground) fruiting body (basidiocrap).'),
  0.15804393589496613),
 (Document(metadata={'source': 'data/fruits_and_veggies.txt'}, page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all white.'),
  0.17331014573574066),
 (Document(metadata={'source': 'data/fruits_and_veggies.txt'}, page_content='AA. Phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.'),
  0.2606315314769745),
 (Document(metadata={'source': 'data/fruits_and_veggies.txt'}, page_content='fruit displays.'),
  0.44001755118370056)]

## Init LLM

In [15]:
from langchain_openai import ChatOpenAI
# Chat model shared by the RAG chain and by the LLM-based similarity scorer further down.
llm = ChatOpenAI(model="gpt-4o")

## Prepare prompt (Augmentation Step)

In [16]:
from langchain_core.prompts import PromptTemplate

# RAG prompt with two placeholders: {question} (the user query) and
# {context} (the retrieved text that the answer must be based on).
template = """
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. 
    If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
    
    Question: {question} 
    
    Context: {context} 
    
    Answer:
"""
prompt = PromptTemplate.from_template(template)

In [None]:
from langchain import hub
# NOTE: this rebinds `prompt`, so running this cell replaces the locally
# defined template from the previous cell with the hub version.
# Pulling from the hub presumably requires network access — TODO confirm.
prompt = hub.pull("rlm/rag-prompt")

## Setup retrieval

In [17]:
# Wrap the vector store as a retriever that returns only the single best match (k=1).
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 1})

In [18]:
# Run the retriever once and confirm that exactly one document comes back (k=1).
retrieved_docs = retriever.invoke("Amanita phalloides")

len(retrieved_docs)

1

In [20]:
# Show the text of the retrieved chunk.
retrieved_docs[0].page_content

'The Amanita phalloides has a large and imposing epigeous (above ground) fruiting body (basidiocrap).'

In [21]:
def format_docs(docs):
    """Concatenate the text of retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        str: The page contents in their original order, separated by a blank line.
    """
    paragraph_break = "\n\n"
    texts = [item.page_content for item in docs]
    return paragraph_break.join(texts)

In [22]:
# Preview the exact context string the prompt will receive for this query.
format_docs(retriever.invoke("Amanita phalloides"))

'The Amanita phalloides has a large and imposing epigeous (above ground) fruiting body (basidiocrap).'

## Build RAG chain

In [24]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
# The incoming question feeds two prompt slots: "context" (retriever hits
# flattened by format_docs) and "question" (passed through unchanged).
# The filled prompt goes to the LLM and the reply is parsed to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [25]:
# Ask the baseline chain; with k=1 it only sees a single fixed-size chunk as context.
rag_chain.invoke("tell me about Amanita phalloides")

'The Amanita phalloides has a large and imposing above-ground fruiting body known as a basidiocarp.'

## Context Aware Text Splitter

In [None]:
# Show the loaded documents again; they are the input of the context-aware splitter below.
documents

In [26]:
def llm_similarity(text1, text2):
    """Ask the LLM how strongly ``text2`` continues the context of ``text1``.

    Args:
        text1: The text accumulated so far.
        text2: The candidate text that may extend ``text1``.

    Returns:
        float: Score in ``[0, 1]``; 0 means the texts are unrelated and should
        be split, 1 means they belong to the same context.

    Raises:
        ValueError: If the model's reply does not contain any number.
    """
    import re  # local import keeps this cell self-contained

    template = """
    Analyze the contextual relationship between the following two texts:
    
    Text 1: {text1}
    Text 2: {text2}
    
    Evaluate whether Text 2 completes or extends the context of Text 1, or if they are separate and unrelated. Assign a float score from 0 to 1, where:
    
    0 = The texts are entirely unrelated and should be split
    1 = The texts are strongly connected and belong to the same context
    
    Consider factors such as:
    
    Thematic continuity
    Logical flow
    Shared subject matter
    Narrative or argumentative progression
    Linguistic cohesion
    Provide only a single float value between 0 and 1 as your response, with up to two decimal places. For example: 0.75
    
    Ensure your answer contains nothing but the float value. Double-check your response before submitting"""

    prompt = PromptTemplate.from_template(template)
    chain = prompt | llm | StrOutputParser()

    reply = chain.invoke({"text1": text1, "text2": text2})

    # The model does not always obey "nothing but the float": replies can carry
    # whitespace, a trailing period or a short label. Take the first number in
    # the reply instead of relying on one specific malformed shape.
    match = re.search(r"[-+]?(?:\d+(?:\.\d+)?|\.\d+)", reply)
    if match is None:
        raise ValueError(f"Could not parse a similarity score from LLM reply: {reply!r}")

    # Clamp so that callers can rely on the documented 0..1 range.
    return min(1.0, max(0.0, float(match.group())))

In [28]:
# Try the scorer on the first two sentences of the source text, which describe the same mushroom.
llm_similarity("""The Amanita phalloides has a large and imposing epigeous (above ground) fruiting body (basidiocrap).""",
               """A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all white.""")

0.85

In [29]:
def merge_metadata(metadatas):
    """Combine several metadata dicts into a single dict.

    For every key the distinct values are kept in first-seen order. A key that
    only ever carries one value keeps that value unchanged (including its type);
    when several different values occur they are joined into one
    space-separated string. Repeated values are recorded once, so merging many
    chunks of the same file yields ``{'source': 'file.txt'}`` rather than the
    path repeated once per chunk.

    Args:
        metadatas: Iterable of metadata dicts. The inputs are not modified.

    Returns:
        dict: The merged metadata.
    """
    merged_metadata = {}
    for metadata in metadatas:
        for key, value in metadata.items():
            if key not in merged_metadata:
                merged_metadata[key] = value
                continue

            existing = merged_metadata[key]
            if value == existing:
                continue

            # Compare as text so non-string values (e.g. page numbers) can be joined too.
            existing_text, value_text = str(existing), str(value)

            # Pad both sides with the separator so only whole entries match; this
            # also recognises a value that is already part of an earlier merge result.
            if f" {value_text} " in f" {existing_text} ":
                continue

            merged_metadata[key] = f"{existing_text} {value_text}"
    return merged_metadata

In [30]:
from langchain_core.documents import Document

def context_text_splitter_with_llm(documents, step_size, chunk_size, max_chunk_size,
                                   similarity_threshold=0.49):
    """Split documents into context-aware chunks using LLM-judged similarity.

    The documents are first cut into small "step" pieces of at most
    ``step_size`` characters. Every output chunk is seeded with
    ``chunk_size // step_size`` consecutive pieces (at least one) and then grows
    one piece at a time for as long as ``llm_similarity`` rates the next piece
    above ``similarity_threshold`` and the grown chunk still fits into
    ``max_chunk_size``. Pieces are always joined with a single space.

    Args:
        documents: Documents accepted by ``RecursiveCharacterTextSplitter.split_documents``.
        step_size: Size of the small pieces the text is cut into first.
        chunk_size: Minimum amount of remaining text needed to start another
            chunk; also determines how many pieces seed each chunk.
        max_chunk_size: Upper bound for the length of a grown chunk, including
            the joining spaces.
        similarity_threshold: A piece is merged only when its score is strictly
            greater than this value.

    Returns:
        list[Document]: The merged chunks in source order. Text left over once
        fewer than ``chunk_size`` characters remain is returned as a final chunk.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=step_size, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)

    step_chunks = [doc.page_content for doc in docs]
    step_metadata = [doc.metadata for doc in docs]
    total = len(step_chunks)

    # Always seed with at least one piece; with chunk_size < step_size the integer
    # division is 0 and the loop would otherwise never consume any input.
    seed_count = max(1, chunk_size // step_size)

    # chars_left[i] is the number of characters in step_chunks[i:], computed once
    # so the "enough text left?" check does not re-join the remainder every round.
    chars_left = [0] * (total + 1)
    for index in range(total - 1, -1, -1):
        chars_left[index] = chars_left[index + 1] + len(step_chunks[index])

    merged_docs = []
    position = 0  # index of the first piece that has not been consumed yet

    while position < total and chars_left[position] >= chunk_size:
        seed_end = min(position + seed_count, total)
        current_chunk = " ".join(step_chunks[position:seed_end])
        current_metadata = merge_metadata(step_metadata[position:seed_end])
        position = seed_end

        while position < total:
            candidate = step_chunks[position]

            # Check the size limit first: it is free, the similarity call is not.
            # The +1 accounts for the joining space.
            if len(current_chunk) + 1 + len(candidate) > max_chunk_size:
                break
            if llm_similarity(current_chunk, candidate) <= similarity_threshold:
                break

            current_chunk += " " + candidate
            current_metadata = merge_metadata([current_metadata, step_metadata[position]])
            position += 1

        merged_docs.append(Document(page_content=current_chunk, metadata=current_metadata))

    # Whatever is too short to start a new chunk becomes the trailing chunk.
    if position < total:
        merged_docs.append(
            Document(
                page_content=" ".join(step_chunks[position:]),
                metadata=merge_metadata(step_metadata[position:]),
            )
        )

    return merged_docs

In [31]:
# Re-chunk the source with the LLM-guided splitter
# (step_size=100, chunk_size=200, max_chunk_size=1200).
# NOTE: this rebinds `chunks`, replacing the fixed-size chunks built earlier.
chunks = context_text_splitter_with_llm(documents, 100, 200, 1200)

In [32]:
# Print the context-aware chunks followed by a divider to compare them with the baseline split.
for chunk in chunks:
    print(chunk.page_content)
    print("-----")

 The Amanita phalloides has a large and imposing epigeous (above ground) fruiting body (basidiocrap).A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all white. AA. Phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.
-----
 Gala apples are a popular variety known for their sweet flavor and crisp texture.They have a distinctive reddish-orange skin with yellow striping, making them visually appealing in fruit displays. Originally developed in New Zealand in the 1930s, they have since become a favorite in many countries and are widely cultivated for consumption. Their versatility makes them perfect for both eating fresh and using in various culinary dishes.
-----
 Radishes are small, root vegetables with a sharp, peppery flavor that can range from mild to spicy.They are usually round or cylindrical in shape and can come in various colors, including red, white, purple, and black. Rich in vitamins and minerals, radishes are of

### Index into Chroma DB

In [33]:
import chromadb

# Initialize ChromaDB client
client = chromadb.Client()

# Drop the collection filled by the earlier indexing pass so that the re-index
# in the next cell does not mix the old fixed-size chunks with the new ones.
# NOTE(review): this assumes Chroma.from_documents wrote to a collection named
# "langchain" and that this freshly created client sees the same store — TODO confirm.
collection_name = "langchain"
client.delete_collection(collection_name)

In [34]:
# Index the context-aware chunks. NOTE: this rebinds `vectorstore`.
vectorstore = Chroma.from_documents(documents=chunks, embedding=OpenAIEmbeddings())

### Test default retrieval

In [35]:
# Rebuild the retriever on top of the new vector store, still returning one hit (k=1).
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 1})

In [36]:
# Rebuild the chain so that it uses the retriever created in the previous cell;
# prompt, model and output parsing are the same as in the first chain.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [37]:
# Same question as before; the single retrieved chunk is now a merged, context-aware one.
rag_chain.invoke("tell me about Amanita phalloides")

'Amanita phalloides, also known as the Death Cap, is one of the most poisonous mushrooms known. It has a large and imposing fruiting body, and some varieties are all white.'

## Bonus Vector Similarity

In [None]:
# Embedding client used by the vector-similarity experiments in this section.
embeddings = OpenAIEmbeddings()

In [None]:
# Embed two single words; embed_documents takes a list, so [0] picks the only vector returned.
v1 = embeddings.embed_documents(texts=["Apple"])[0]
v2 = embeddings.embed_documents(texts=["Orange"])[0]

In [None]:
from scipy.spatial.distance import cosine
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    SciPy's ``cosine`` yields the cosine *distance*, so the similarity is
    obtained by subtracting that distance from 1.
    """
    distance = cosine(vec1, vec2)
    similarity = 1 - distance
    return similarity

In [None]:
# Compare the two word embeddings computed above.
cosine_similarity(v1, v2)

In [None]:
from langchain_core.documents import Document
def context_text_splitter(documents, step_size, chunk_size, max_chunk_size,
                          similarity_threshold=0.79):
    """Split documents into context-aware chunks using embedding similarity.

    The documents are first cut into small "step" pieces of at most
    ``step_size`` characters. Every output chunk is seeded with
    ``chunk_size // step_size`` consecutive pieces (at least one) and then grows
    one piece at a time for as long as the cosine similarity between the
    embeddings of the chunk and of the next piece is above
    ``similarity_threshold`` and the grown chunk still fits into
    ``max_chunk_size``. Pieces are always joined with a single space.

    Args:
        documents: Documents accepted by ``RecursiveCharacterTextSplitter.split_documents``.
        step_size: Size of the small pieces the text is cut into first.
        chunk_size: Minimum amount of remaining text needed to start another
            chunk; also determines how many pieces seed each chunk.
        max_chunk_size: Upper bound for the length of a grown chunk, including
            the joining spaces.
        similarity_threshold: A piece is merged only when its similarity is
            strictly greater than this value.

    Returns:
        list[Document]: The merged chunks in source order. Text left over once
        fewer than ``chunk_size`` characters remain is returned as a final chunk.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=step_size, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)

    step_chunks = [doc.page_content for doc in docs]
    step_metadata = [doc.metadata for doc in docs]
    total = len(step_chunks)

    # Always seed with at least one piece; with chunk_size < step_size the integer
    # division is 0 and the loop would otherwise never consume any input.
    seed_count = max(1, chunk_size // step_size)

    # chars_left[i] is the number of characters in step_chunks[i:], computed once
    # so the "enough text left?" check does not re-join the remainder every round.
    chars_left = [0] * (total + 1)
    for index in range(total - 1, -1, -1):
        chars_left[index] = chars_left[index + 1] + len(step_chunks[index])

    merged_docs = []
    position = 0  # index of the first piece that has not been consumed yet

    while position < total and chars_left[position] >= chunk_size:
        seed_end = min(position + seed_count, total)
        current_chunk = " ".join(step_chunks[position:seed_end])
        current_metadata = merge_metadata(step_metadata[position:seed_end])
        position = seed_end

        while position < total:
            candidate = step_chunks[position]

            # Check the size limit first: it is free, the two embedding calls are not.
            # The +1 accounts for the joining space.
            if len(current_chunk) + 1 + len(candidate) > max_chunk_size:
                break

            similarity_score = cosine_similarity(
                embeddings.embed_query(current_chunk),
                embeddings.embed_query(candidate),
            )
            if similarity_score <= similarity_threshold:
                break

            current_chunk += " " + candidate
            current_metadata = merge_metadata([current_metadata, step_metadata[position]])
            position += 1

        merged_docs.append(Document(page_content=current_chunk, metadata=current_metadata))

    # Whatever is too short to start a new chunk becomes the trailing chunk.
    if position < total:
        merged_docs.append(
            Document(
                page_content=" ".join(step_chunks[position:]),
                metadata=merge_metadata(step_metadata[position:]),
            )
        )

    return merged_docs

In [None]:
# Re-chunk the source with the embedding-based splitter
# (step_size=100, chunk_size=200, max_chunk_size=1200).
# NOTE: this rebinds `chunks` once more.
chunks = context_text_splitter(documents, 100, 200, 1200)

In [None]:
# Print the embedding-based chunks followed by a divider for comparison with the earlier splits.
for chunk in chunks:
    print(chunk.page_content)
    print("-----")