In [48]:
from langchain_openai import AzureChatOpenAI
import os

In [63]:
import getpass

# Never hard-code credentials in a notebook: take the key from the environment and
# prompt for it (without echoing) only when it is missing.
# NOTE: the key that was previously embedded in this cell must be treated as
# compromised and rotated.
if not os.environ.get("AZURE_OPENAI_API_KEY"):
    os.environ["AZURE_OPENAI_API_KEY"] = getpass.getpass("Azure OpenAI API key: ")

# Gateway endpoint used by this notebook; a native Azure resource endpoint
# would look like https://<XXX>.openai.azure.com/
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://api.ai.it.cornell.edu"

# Chat model shared by every chain below.
llm = AzureChatOpenAI(
    azure_deployment="gpt-4o",
    temperature=0.2,
    api_version="2023-06-01-preview",
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [66]:
from langchain_core.messages import HumanMessage

llm.invoke([HumanMessage(content="Hi! I'm Bob")])

AIMessage(content='Hello Bob! How can I assist you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 11, 'total_tokens': 21, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_d54531d9eb', 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'jailbreak': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}], 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {}}, id='run-8903eca2-4d3e-41e6-ae09-67cb4a54134b-0', usage_metadata={'input_tokens': 11, 'output_tokens': 10, 'total_tokens': 21, 'input_token_details': {}, 'output_token_details': {}})

<h2>Load Source Text</h2>

In [68]:
from langchain_community.document_loaders import TextLoader
# Load the source text; the relative path is resolved against the current working directory.
# Alternative path previously noted here: "/workspace/data/knowledge_base/fruits_and_veggies.txt"
loader = TextLoader("cornell.txt")
# `documents` is indexed as a list below; each entry exposes .metadata and .page_content.
documents = loader.load()

In [69]:
documents[0].metadata

{'source': 'cornell.txt'}

In [70]:
print(documents[0].page_content)

Cornell University
20th Century History
In 1967, Cornell experienced a fire in the Residential Club dormitory that killed eight students and one professor. In the late 1960s, Cornell was among the Ivy League universities that experienced heightened student activism related to cultural issues, civil rights, and opposition to U.S. involvement in the Vietnam War. In 1969, armed anti-Vietnam War protesters occupied Willard Straight Hall, an incident that led to a restructuring of the university's governance and forced the resignation of then Cornell president James Alfred Perkins.
Since the 20th century, rankings of universities and colleges, Cornell University and its academic programs have routinely ranked among the best in the world. In 1995, the National Research Council ranked Cornell's Ph.D. programs as sixth-best in the nation. It also ranked the academic quality of 18 individual Cornell Ph.D. programs among the top ten in the nation, which included astrophysics (ninth-best), chemis

<h2>Split the document</h2>

In [71]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [72]:
chunk_size = 100
chunk_overlap = 0

In [73]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = chunk_size,
    chunk_overlap = chunk_overlap
)

In [74]:
chunks = text_splitter.split_documents(documents)

In [75]:
# Print each chunk's text followed by a divider line.
for chunk in chunks:
    print(chunk.page_content, "-----", sep="\n")

Cornell University
20th Century History
-----
In 1967, Cornell experienced a fire in the Residential Club dormitory that killed eight students
-----
and one professor. In the late 1960s, Cornell was among the Ivy League universities that
-----
experienced heightened student activism related to cultural issues, civil rights, and opposition to
-----
U.S. involvement in the Vietnam War. In 1969, armed anti-Vietnam War protesters occupied Willard
-----
Straight Hall, an incident that led to a restructuring of the university's governance and forced
-----
the resignation of then Cornell president James Alfred Perkins.
-----
Since the 20th century, rankings of universities and colleges, Cornell University and its academic
-----
programs have routinely ranked among the best in the world. In 1995, the National Research Council
-----
ranked Cornell's Ph.D. programs as sixth-best in the nation. It also ranked the academic quality of
-----
18 individual Cornell Ph.D. programs among the top ten in 

## Index chunks into a vector db (ChromaDB)

In [99]:
from langchain_openai import AzureOpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

In [100]:
# Embed every chunk and index the vectors in a Chroma collection.
# NOTE(review): the recorded run of this cell failed with HTTP 400 "Invalid model name"
# for text-embedding-ada-002, so `vectorstore` was never created (see the NameError in the
# next cell). Later cells use "text-embedding-3-large" — confirm which embedding model this
# key can access (the error message points at `/v1/models`).
# Alternative model kept for reference:
#vectorstore = Chroma.from_documents(documents=chunks, embedding=AzureOpenAIEmbeddings(model="text-embedding-3-large"))
vectorstore = Chroma.from_documents(documents=chunks, embedding=AzureOpenAIEmbeddings(model="text-embedding-ada-002"))

BadRequestError: Error code: 400 - {'error': {'message': '{"error": "/embeddings: Invalid model name passed in model=text-embedding-ada-002. Call `/v1/models` to view available models for your key."}', 'type': 'None', 'param': 'None', 'code': '400'}}

## Test Similarity Search

In [83]:
vectorstore.similarity_search("Amanita phalloides")

NameError: name 'vectorstore' is not defined

In [None]:
# Note that providers implement different scores; Chroma here
# returns a distance metric that should vary inversely with similarity.
vectorstore.similarity_search_with_score("Amanita phalloides")

## Prepare prompt (Augmentation Step)

In [None]:
from langchain_core.prompts import PromptTemplate

# Prompt for the augmentation step. It has two placeholders: {question} and {context}.
# The instructions cap the answer at three sentences and tell the model to admit
# when it does not know.
template = """
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. 
    If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
    
    Question: {question} 
    
    Context: {context} 
    
    Answer:
"""
# Build a reusable prompt object from the template string.
prompt = PromptTemplate.from_template(template)

## Set up retrieval

In [None]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 1})

In [None]:
	
retrieved_docs = retriever.invoke("Amanita phalloides")

len(retrieved_docs)

In [None]:
retrieved_docs[0].page_content

In [None]:
def format_docs(docs):
    """Concatenate the ``page_content`` of every document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        str: The contents joined with two newlines; an empty string for no documents.
    """
    contents = [entry.page_content for entry in docs]
    separator = "\n\n"
    return separator.join(contents)

In [None]:
format_docs(retriever.invoke("Amanita phalloides"))

## Build RAG chain

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
# RAG pipeline, read top to bottom:
#   1. The chain input (the question string) fans out into a dict: "context" is produced by
#      sending the input through the retriever and flattening the hits with format_docs;
#      "question" is the input passed through unchanged.
#   2. The dict fills the {context} / {question} placeholders of `prompt`.
#   3. The filled prompt is sent to `llm`.
#   4. StrOutputParser reduces the model reply to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
rag_chain.invoke("tell me about Amanita phalloides")

## Context Aware Text Splitter

In [None]:
documents

In [None]:
def llm_similarity(text1, text2):
    """Ask the LLM how strongly ``text2`` continues the context of ``text1``.

    The prompt instructs the model to answer with a single float between 0
    (unrelated, should be split) and 1 (same context). Models do not always
    comply exactly, so the reply is parsed defensively: surrounding whitespace
    is ignored and, if the reply is not a bare number, the first numeric token
    found in it is used.

    Args:
        text1: The text accumulated so far.
        text2: The candidate continuation.

    Returns:
        float: The score parsed from the model's reply.

    Raises:
        ValueError: If the reply contains no number at all.
    """
    import re  # local import so this cell does not depend on the notebook's import cell

    template = """
    Analyze the contextual relationship between the following two texts:
    
    Text 1: {text1}
    Text 2: {text2}
    
    Evaluate whether Text 2 completes or extends the context of Text 1, or if they are separate and unrelated. Assign a float score from 0 to 1, where:
    
    0 = The texts are entirely unrelated and should be split
    1 = The texts are strongly connected and belong to the same context
    
    Consider factors such as:
    
    Thematic continuity
    Logical flow
    Shared subject matter
    Narrative or argumentative progression
    Linguistic cohesion
    Provide only a single float value between 0 and 1 as your response, with up to two decimal places. For example: 0.75
    
    Ensure your answer contains nothing but the float value. Double-check your response before submitting"""

    prompt = PromptTemplate.from_template(template)
    chain = prompt | llm | StrOutputParser()

    reply = chain.invoke({"text1": text1, "text2": text2}).strip()
    try:
        # Expected case: the reply is exactly the number.
        return float(reply)
    except ValueError:
        # Tolerate decorations such as "0.75." or "Score: 0.75" by extracting
        # the first numeric token instead of failing on the whole string.
        match = re.search(r"[-+]?(?:\d+\.\d+|\.\d+|\d+)", reply)
        if match is None:
            raise ValueError(
                f"Could not parse a similarity score from LLM reply: {reply!r}"
            ) from None
        return float(match.group())

In [None]:
llm_similarity("""The Amanita phalloides has a large and imposing epigeous (above ground) fruiting body (basidiocrap).""",
               """A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all white.""")

In [None]:
def merge_metadata(metadatas):
    """Combine several metadata dicts into one.

    For a key seen more than once, distinct values are joined with a single
    space. A value that is already present (either equal to the stored value or
    one of its space-separated parts) is not appended again, so merging many
    pieces of the same file keeps ``source`` as the file name instead of
    repeating it once per piece.

    Args:
        metadatas: Iterable of dicts. The input dicts are not modified.

    Returns:
        dict: The merged metadata. A key that occurs once keeps its original
        value; a key with several distinct values holds their space-joined
        text (non-string values are converted with ``str``).
    """
    merged_metadata = {}
    for metadata in metadatas:
        for key, value in metadata.items():
            if key not in merged_metadata:
                merged_metadata[key] = value
                continue

            existing = merged_metadata[key]
            if value == existing:
                continue

            # Compare and join as text so non-string values do not raise TypeError.
            existing_text, value_text = str(existing), str(value)
            if value_text in existing_text.split(" "):
                continue
            merged_metadata[key] = existing_text + " " + value_text
    return merged_metadata

In [None]:
from langchain_core.documents import Document

def context_text_splitter_with_llm(documents, step_size, chunk_size, max_chunk_size, similarity_threshold=0.49):
    """Split documents into context-aware chunks, using the LLM to judge continuity.

    The documents are first cut into small pieces of about ``step_size``
    characters. Each output chunk is seeded with ``chunk_size // step_size``
    pieces and then grown one piece at a time for as long as
    ``llm_similarity`` scores the next piece above ``similarity_threshold`` and
    the grown chunk still fits in ``max_chunk_size`` characters.

    Args:
        documents: Documents accepted by ``RecursiveCharacterTextSplitter.split_documents``.
        step_size: Target size, in characters, of the small pieces.
        chunk_size: Minimum amount of text used to seed a chunk. Once less than
            this remains, the remainder becomes one final chunk.
        max_chunk_size: Upper bound, in characters, for a grown chunk.
        similarity_threshold: A piece is merged only when its score is strictly
            greater than this value.

    Returns:
        list[Document]: The merged chunks, each with metadata combined by
        ``merge_metadata``.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=step_size, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)

    step_chunks = [doc.page_content for doc in docs]
    step_metadata = [doc.metadata for doc in docs]

    # Always seed with at least one piece: with chunk_size < step_size the seed
    # would otherwise be empty and the loop could fail to consume any input.
    seed_count = max(1, chunk_size // step_size)

    merged_docs = []
    while step_chunks:
        # Too little text left to seed a full chunk; it is emitted as the tail below.
        if sum(len(piece) for piece in step_chunks) < chunk_size:
            break

        # Join with a space so words at piece boundaries are not glued together.
        current_chunk = " ".join(step_chunks[:seed_count])
        current_metadata = merge_metadata(step_metadata[:seed_count])
        step_chunks = step_chunks[seed_count:]
        step_metadata = step_metadata[seed_count:]

        while step_chunks:
            candidate = step_chunks[0]
            # Check the size first (counting the joining space) so no LLM call
            # is spent on a piece that cannot be merged anyway.
            if len(current_chunk) + 1 + len(candidate) > max_chunk_size:
                break
            if not llm_similarity(current_chunk, candidate) > similarity_threshold:
                break

            current_chunk += " " + candidate
            current_metadata = merge_metadata([current_metadata, step_metadata[0]])
            step_chunks.pop(0)
            step_metadata.pop(0)

        merged_docs.append(Document(page_content=current_chunk, metadata=current_metadata))

    if step_chunks:
        merged_docs.append(
            Document(page_content=" ".join(step_chunks), metadata=merge_metadata(step_metadata))
        )

    return merged_docs

In [None]:
chunks = context_text_splitter_with_llm(documents, 100, 200, 1200)

In [None]:
# Show each context-aware chunk with a divider line after it.
for chunk in chunks:
    print(f"{chunk.page_content}\n-----")

## Index into Chroma DB

In [None]:
import chromadb

# Drop the existing collection so re-indexing does not stack the new chunks on
# top of an earlier index.
client = chromadb.Client()

# NOTE(review): "langchain" is assumed to be the collection that
# Chroma.from_documents writes to when no name is given — confirm.
collection_name = "langchain"

# delete_collection raises when the collection does not exist (fresh kernel, or
# the earlier indexing cell failed). Creating it first makes this reset idempotent.
client.get_or_create_collection(collection_name)
client.delete_collection(collection_name)

In [None]:
vectorstore = Chroma.from_documents(documents=chunks, embedding=AzureOpenAIEmbeddings(model="text-embedding-3-large"))

## Set Up Retrieval

In [None]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 1})

In [None]:
# Rebuild the chain so it uses the `retriever` defined just above; `rag_chain` is rebound here.
# Flow: input question -> {"context": retrieved documents flattened by format_docs,
# "question": the input unchanged} -> prompt -> llm -> plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
rag_chain.invoke("tell me about Amanita phalloides")

## Bonus Vector Similarity

In [None]:
embeddings = AzureOpenAIEmbeddings(model="text-embedding-3-large")

In [None]:
v1 = embeddings.embed_documents(texts=["Apple"])[0]
v2 = embeddings.embed_documents(texts=["Orange"])[0]

In [None]:
from scipy.spatial.distance import cosine
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    SciPy's ``cosine`` yields a distance, so the similarity is obtained as
    one minus that distance.
    """
    distance = cosine(vec1, vec2)
    similarity = 1 - distance
    return similarity

In [None]:
cosine_similarity(v1, v2)

In [None]:
from langchain_core.documents import Document
def context_text_splitter(documents, step_size, chunk_size, max_chunk_size, similarity_threshold=0.79):
    """Split documents into context-aware chunks, using embedding similarity.

    The documents are first cut into small pieces of about ``step_size``
    characters. Each output chunk is seeded with ``chunk_size // step_size``
    pieces and then grown one piece at a time for as long as the cosine
    similarity between the embeddings of the chunk and of the next piece is
    above ``similarity_threshold`` and the grown chunk still fits in
    ``max_chunk_size`` characters. Embeddings come from the notebook-level
    ``embeddings`` object.

    Args:
        documents: Documents accepted by ``RecursiveCharacterTextSplitter.split_documents``.
        step_size: Target size, in characters, of the small pieces.
        chunk_size: Minimum amount of text used to seed a chunk. Once less than
            this remains, the remainder becomes one final chunk.
        max_chunk_size: Upper bound, in characters, for a grown chunk.
        similarity_threshold: A piece is merged only when its similarity is
            strictly greater than this value.

    Returns:
        list[Document]: The merged chunks, each with metadata combined by
        ``merge_metadata``.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=step_size, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)

    step_chunks = [doc.page_content for doc in docs]
    step_metadata = [doc.metadata for doc in docs]

    # Always seed with at least one piece: with chunk_size < step_size the seed
    # would otherwise be empty and the loop could fail to consume any input.
    seed_count = max(1, chunk_size // step_size)

    merged_docs = []
    while step_chunks:
        # Too little text left to seed a full chunk; it is emitted as the tail below.
        if sum(len(piece) for piece in step_chunks) < chunk_size:
            break

        # Join with a space so words at piece boundaries are not glued together.
        current_chunk = " ".join(step_chunks[:seed_count])
        current_metadata = merge_metadata(step_metadata[:seed_count])
        step_chunks = step_chunks[seed_count:]
        step_metadata = step_metadata[seed_count:]

        while step_chunks:
            candidate = step_chunks[0]
            # Check the size first (counting the joining space) so no embedding
            # calls are spent on a piece that cannot be merged anyway.
            if len(current_chunk) + 1 + len(candidate) > max_chunk_size:
                break

            similarity_score = cosine_similarity(
                embeddings.embed_query(current_chunk),
                embeddings.embed_query(candidate),
            )
            if not similarity_score > similarity_threshold:
                break

            current_chunk += " " + candidate
            current_metadata = merge_metadata([current_metadata, step_metadata[0]])
            step_chunks.pop(0)
            step_metadata.pop(0)

        merged_docs.append(Document(page_content=current_chunk, metadata=current_metadata))

    if step_chunks:
        merged_docs.append(
            Document(page_content=" ".join(step_chunks), metadata=merge_metadata(step_metadata))
        )

    return merged_docs

In [None]:
chunks = context_text_splitter(documents, 100, 200, 1200)

In [None]:
# Print the embedding-based chunks, each followed by a divider line.
for chunk in chunks:
    print(chunk.page_content, "-----", sep="\n")