In [1]:
import os
from dotenv import load_dotenv
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.utils import filter_complex_metadata
from langchain_mistralai.embeddings import MistralAIEmbeddings
from langchain_mistralai.chat_models import ChatMistralAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

from langchain_core.runnables import (
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import Tuple, List, Optional
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
import os
from langchain_community.graphs import Neo4jGraph
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import TokenTextSplitter
from langchain_openai import ChatOpenAI
from langchain_experimental.graph_transformers import LLMGraphTransformer
from neo4j import GraphDatabase
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_core.runnables import ConfigurableField, RunnableParallel, RunnablePassthrough

In [19]:
# Load variables from a local .env file into the process environment
# (presumably the API keys and Neo4j connection settings - confirm against your .env).
load_dotenv()

# Chat model used for entity extraction and for the final answer.
llm = ChatOpenAI(model_name="gpt-3.5-turbo-0125", temperature=0.7)

# Embedding model handed to the Chroma vector store below.
emb_model = MistralAIEmbeddings(model="mistral-embed")

# Neo4j handle; no connection arguments are passed here, so they are expected
# to come from the environment - TODO confirm.
graph = Neo4jGraph()

# Full-text index named 'entity' over the ids of __Entity__ nodes.
# structured_retriever looks entities up through this index.
graph.query(
    "CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]"
)

# Sample question that is sent through the chain at the end of the notebook.
question = (
    "What is paged attention and how is it different from the regular attention? How paged attention helped LLMs in training? And what is its relationship with vLLM?"
)
# Approach adapted from:
# https://github.com/tomasonjo/blogs/blob/master/llm/enhancing_rag_with_graph.ipynb

In [16]:
class Entities(BaseModel):
    # Structured-output schema for entity extraction: the model is asked to
    # fill `entities` with the names it finds in the input text.
    # structured_retriever then looks each name up in the Neo4j full-text index.
    # NOTE(review): documented with comments on purpose; a class docstring would
    # presumably become part of the schema sent to the model - confirm before adding one.
    entities: List[str] = Field(
        ...,
        description="All the __Entity__ recognizable in a Neo4j graph that appear in the text"
    )

# Two-message prompt for entity extraction; {question} is replaced by the text to scan.
# NOTE(review): the system message limits extraction to organizations and people,
# whereas the Entities field description asks for every graph entity in the text.
# Confirm which of the two is intended for this corpus.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are extracting organization and person entities from the text."),
    (
        "human",
        "Use the given format to extract information from the following "
        "input: {question}",
    ),
])

# The LLM is constrained to answer with an Entities instance.
_entity_extractor = llm.with_structured_output(Entities)
entity_chain = prompt | _entity_extractor

def generate_full_text_query(input: str) -> str:
    """Build a fuzzy full-text query that requires every word of ``input``.

    Lucene special characters are stripped with ``remove_lucene_chars``, the
    remainder is split on whitespace, each word gets the ``~2`` suffix and the
    words are joined with ``AND``.

    Args:
        input: Free text to search for, e.g. an extracted entity name.

    Returns:
        The query string, e.g. ``"paged~2 AND attention~2"`` for
        ``"paged attention"``. Returns ``""`` when no word is left after
        sanitising (empty input or special characters only).
    """
    words = remove_lucene_chars(input).split()
    # Guard the empty case explicitly: indexing the last word of an empty list
    # used to raise IndexError here.
    if not words:
        return ""
    return " AND ".join(f"{word}~2" for word in words)

# Fulltext index query
def structured_retriever(question: str, q_limit: int, o_limit: int) -> str:
    """Collect graph relationships around the entities mentioned in ``question``.

    Entities are extracted with ``entity_chain``; each one is looked up in the
    'entity' full-text index and the non-MENTIONS relationships of the matched
    nodes (both directions) are returned as text.

    Args:
        question: Text from which entities are extracted.
        q_limit: Maximum number of index hits per entity (``$q_limit``).
        o_limit: Maximum number of relationship rows per entity (``$o_limit``).

    Returns:
        One ``source - TYPE -> target`` line per relationship, newline
        separated across all entities; ``""`` when nothing was found.
    """
    extracted = entity_chain.invoke({"question": question})
    lines = []
    for entity in extracted.entities:
        # Skip entities with nothing searchable left after sanitising; they
        # cannot be turned into a usable full-text query.
        if not remove_lucene_chars(entity).split():
            continue
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('entity', $query, {limit:$q_limit})
            YIELD node,score
            CALL {
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
            }
            RETURN output LIMIT $o_limit
            """,
            {"query": generate_full_text_query(entity), "q_limit": q_limit, "o_limit": o_limit},
        )
        lines.extend(el["output"] for el in response)
    # Join once at the end so rows of different entities are also separated by
    # a newline (appending per-entity joins fused the boundary rows together).
    return "\n".join(lines)

In [14]:
# Open the Chroma collection persisted under pdfs/chroma_db; emb_model embeds the queries.
vector_store = Chroma(persist_directory="pdfs/chroma_db", embedding_function=emb_model)

# Unstructured retriever: up to 5 chunks per query, with a relevance threshold of 0.0.
chroma_retriever = vector_store.as_retriever(
    search_kwargs={"k": 5, "score_threshold": 0.0},
    search_type="similarity_score_threshold",
)

In [22]:
def retriever(question: str, q_limit: int=2, o_limit: int=20):
    """Build the combined context block for a question.

    Runs the graph lookup (``structured_retriever``) and the Chroma similarity
    search (``chroma_retriever``) on the same question and returns both results
    in one text block. The search query is printed as a side effect.

    Args:
        question: The query to search for.
        q_limit: Forwarded to ``structured_retriever``.
        o_limit: Forwarded to ``structured_retriever``.

    Returns:
        A string with a "Structured data:" section followed by an
        "Unstructured data:" section whose chunks are joined by "#Document ".
    """
    print(f"Search query: {question}")
    graph_context = structured_retriever(question, q_limit, o_limit)
    matched_docs = chroma_retriever.invoke(question)
    joined_documents = "#Document ".join(doc.page_content for doc in matched_docs)
    combined_context = f"""
        Structured data:
        {graph_context}
        Unstructured data:
        {joined_documents}
        """
    return combined_context

In [23]:
# Build the chat chain.

# Prompt that condenses a chat history plus a follow-up question into a single
# standalone question. Placeholders: {chat_history} and {question}.
# It is used by the chat-history branch of _search_query.
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question,
in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""  # noqa: E501
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
    """Turn (human, ai) text pairs into an alternating list of message objects.

    Args:
        chat_history: Conversation turns as ``(human_text, ai_text)`` tuples,
            oldest first.

    Returns:
        A flat list with a ``HumanMessage`` followed by an ``AIMessage`` for
        every turn, in the original order.
    """
    messages = []
    for turn in chat_history:
        user_text, assistant_text = turn
        messages.extend(
            [HumanMessage(content=user_text), AIMessage(content=assistant_text)]
        )
    return messages

# Condition of the branch: true when the input dict carries a non-empty chat_history.
_has_chat_history = RunnableLambda(
    lambda x: bool(x.get("chat_history"))
).with_config(run_name="HasChatHistoryCheck")

# Chat-history path: format the history, then let the LLM rewrite the follow-up
# into a standalone question and return it as a plain string.
_condense_follow_up = (
    RunnablePassthrough.assign(
        chat_history=lambda x: _format_chat_history(x["chat_history"])
    )
    | CONDENSE_QUESTION_PROMPT
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
)

# Produces the search query: the condensed question when there is chat history,
# otherwise the question exactly as given.
_search_query = RunnableBranch(
    (_has_chat_history, _condense_follow_up),
    RunnableLambda(lambda x: x["question"]),
)

# Answer prompt: the model must answer from the retrieved {context} only.
# The whitespace inside the literal is part of the prompt text sent to the model.
template = """Answer the question based only on the following context:
    {context}
    
    Question: {question}
    """
# NOTE: this rebinds `prompt`, which earlier held the entity-extraction prompt.
# entity_chain was already built from that earlier object, so it is unaffected
# unless the entity-extraction cell is re-run after this one.
prompt = ChatPromptTemplate.from_template(template)

# Answer pipeline: condense -> retrieve -> prompt -> LLM -> plain string.
#
# Input:  a dict with "question" and, optionally, "chat_history".
# Output: the answer text.
#
# _search_query runs first and yields the standalone question as a string.
# That string is used both as the retrieval query and as the {question} shown
# to the model. Forwarding the raw input with RunnablePassthrough() instead
# would render the whole input dict (e.g. "{'question': ...}") into the prompt.
chain = (
    _search_query
    | RunnableParallel(
        {
            "context": RunnableLambda(retriever),
            "question": RunnablePassthrough(),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [24]:
# Run the full pipeline on the sample question (no chat history supplied);
# the returned answer string is displayed as the cell output.
chain.invoke({"question": question})

Search query: What is paged attention and how is it different from the regular attention? How paged attention helped LLMs in training? And what is its relationship with vLLM?
paged attention
regular attention
LLMs
vLLM


'PagedAttention is a new attention algorithm that allows attention keys and values to be stored in non-contiguous paged memory. It differs from regular attention by enabling more efficient memory management and handling of decoding algorithms. PagedAttention helped LLMs in training by reducing memory fragmentation and enabling sharing, allowing for near-zero waste in KV cache memory and flexible sharing of KV. Its relationship with vLLM is that vLLM is a high-throughput LLM serving system that utilizes PagedAttention for efficient memory management.'