In [14]:
from langchain.document_loaders.arxiv import ArxivAPIWrapper, ArxivLoader
from langchain.retrievers import ArxivRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.callbacks import StdOutCallbackHandler
from dotenv import load_dotenv
# Load settings through python-dotenv before any OpenAI client is built
# (presumably this is where the OpenAI API key comes from -- confirm).
load_dotenv()

# Embedding model instance; reused when the vector store is created below.
embeddings = OpenAIEmbeddings()

In [15]:
# generate vector store

def get_arxiv_doc(id: str):
    """Fetch one arXiv paper and return it with trimmed-down metadata.

    The paper is requested through ``ArxivAPIWrapper`` with the content
    capped at 6000 characters and all available metadata loaded. Only the
    first result is used. Its full metadata is printed, then replaced by:

    * ``source`` -- the last ``/``-separated segment of ``entry_id``. In the
      recorded run this keeps the version suffix (``2304.03442v1``).
    * ``title``  -- the value of the ``Title`` metadata field.

    Args:
        id: Query string sent to arXiv, e.g. the paper ID ``"2304.03442"``.
            The name shadows the builtin ``id`` but is kept so that existing
            keyword callers keep working.

    Returns:
        The first document returned for the query, with rewritten metadata.

    Raises:
        ValueError: If the query yields no documents. The earlier version
            silently returned ``None`` in that case, which only failed later
            inside the text splitter with an unrelated-looking error.
    """
    arxiv_client = ArxivAPIWrapper(
        doc_content_chars_max=6000,
        load_all_available_meta=True
    )

    # Take the first hit explicitly instead of returning from inside a loop.
    first_doc = next(iter(arxiv_client.load(query=id)), None)
    if first_doc is None:
        raise ValueError(f"No arXiv document found for query {id!r}")

    print(first_doc.metadata)
    first_doc.metadata = {
        "source": first_doc.metadata["entry_id"].split("/")[-1],
        "title": first_doc.metadata["Title"]
    }
    return first_doc
    
# Splitter producing 500-character chunks with no overlap between them.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=500)

# Only paper 2304.03442 is fetched and chunked here; the second paper
# (get_arxiv_doc("2305.16291")) is currently disabled.
docs = text_splitter.split_documents(
    documents=[get_arxiv_doc("2304.03442")],
)

# Embed every chunk and store it in a Chroma vector store.
db = Chroma.from_documents(docs, embedding=embeddings)


{'Published': '2023-04-07', 'Title': 'Generative Agents: Interactive Simulacra of Human Behavior', 'Authors': "Joon Sung Park, Joseph C. O'Brien, Carrie J. Cai, Meredith Ringel Morris, Percy Liang, Michael S. Bernstein", 'Summary': "Believable proxies of human behavior can empower interactive applications\nranging from immersive environments to rehearsal spaces for interpersonal\ncommunication to prototyping tools. In this paper, we introduce generative\nagents--computational software agents that simulate believable human behavior.\nGenerative agents wake up, cook breakfast, and head to work; artists paint,\nwhile authors write; they form opinions, notice each other, and initiate\nconversations; they remember and reflect on days past as they plan the next\nday. To enable generative agents, we describe an architecture that extends a\nlarge language model to store a complete record of the agent's experiences\nusing natural language, synthesize those memories over time into higher-level\n

In [16]:
from langchain import LLMChain
from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# One chat model is shared by the self-query retriever, the question
# condenser, and the answering chain.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2, max_tokens=300)

# Metadata fields the self-query retriever is told about; both are strings.
# NOTE(review): the recorded output shows stored `source` values carrying a
# version suffix ('2304.03442v1'), so an equality filter on a bare ID such as
# '2304.03442' would presumably match nothing -- confirm.
metadata_info = [
    AttributeInfo(name=field_name, description=field_description, type="string")
    for field_name, field_description in (
        ("source", "arXiv paper ID"),
        ("title", "paper title"),
    )
]

# Debug aid: print what the plain vector-store retriever returns for a probe query.
print(db.as_retriever().get_relevant_documents("voyager AI"))

# Retriever that lets the LLM derive a query and a metadata filter.
# (Disabled alternative: db.as_retriever(search_type="similarity").)
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=db,
    document_contents="arXiv PDF texts",
    metadata_field_info=metadata_info,
    verbose=True,
)

# Buffer memory exposing earlier turns as messages under "chat_history".
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

# Chains for condensing the question and for answering with sources.
question_generator = LLMChain(prompt=CONDENSE_QUESTION_PROMPT, llm=llm)
doc_chain = load_qa_with_sources_chain(llm, chain_type="stuff")

# The chain is assembled from explicit parts (question_generator and
# combine_docs_chain) rather than from the disabled llm=... variant.
qa = ConversationalRetrievalChain(
    retriever=retriever,
    question_generator=question_generator,
    combine_docs_chain=doc_chain,
    memory=memory,
    callbacks=[StdOutCallbackHandler(color="yellow")],
)

[Document(page_content='to applications such as cognitive models [21] and virtual environ-\nments [9, 58], for over four decades researchers and practitioners\nhave envisioned computational agents that can serve as believ-\nable proxies of human behavior. In these visions, computationally-\npowered agents act consistently with their past experiences and\nreact believably to their environments. Such simulations of human\nbehavior could populate virtual spaces and communities with real-', metadata={'source': '2304.03442v1', 'title': 'Generative Agents: Interactive Simulacra of Human Behavior'}), Document(page_content='Human-AI Interaction, agents, generative AI, large language models\nACM Reference Format:\nJoon Sung Park, Joseph C. O’Brien, Carrie J. Cai, Meredith Ringel Morris,\nPercy Liang, and Michael S. Bernstein. 2023. Generative Agents: Interactive\nSimulacra of Human Behavior. In . ACM, New York, NY, USA, 22 pages.\nhttps://doi.org/xx.xx/xx.xx\n1\nINTRODUCTION\nHow might we craft

In [17]:
# Ask a question without naming a paper, then print only the "answer" entry
# of the returned result dict. The recorded run shows no metadata filter
# being applied for this question (filter=None).
result = qa({"question": "what is a skill curriculum?"})
print(result["answer"])



[1m> Entering new  chain...[0m




query='skill curriculum' filter=None limit=None

[1m> Finished chain.[0m
A skill curriculum is a curriculum that focuses on teaching specific skills to individuals. It can be used to train people on how to handle difficult interpersonal situations, test social science theories, power applications such as social robots, and more. It is a way to provide structured learning and development in specific areas of expertise.
SOURCES: 2304.03442v1


In [26]:
# Same question, now naming the paper ID so the self-query retriever can
# filter on `source`; the whole result dict is displayed as the cell output.
# NOTE(review): the recorded chat_history contains turns that no visible cell
# issued, and the execution count jumps from 17 to 26 -- this output depends
# on earlier kernel state and will differ after Restart & Run All.
qa({"question": "what is a skill curriculum in the paper with id 2304.03442v1"})



[1m> Entering new  chain...[0m




query='skill curriculum' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='source', value='2304.03442v1') limit=None

[1m> Finished chain.[0m


{'question': 'what is a skill curriculum in the paper with id 2304.03442v1',
 'chat_history': [HumanMessage(content='what is a skill curriculum?', additional_kwargs={}, example=False),
  AIMessage(content='A skill curriculum is a curriculum that focuses on teaching specific skills to individuals. It can be used to train people on how to handle difficult interpersonal situations, test social science theories, power applications such as social robots, and more. It is a way to provide structured learning and development in specific areas of expertise.\nSOURCES: 2304.03442v1', additional_kwargs={}, example=False),
  HumanMessage(content='can you make up a short code snippet in python of it?', additional_kwargs={}, example=False),
  AIMessage(content="I don't know the answer.", additional_kwargs={}, example=False),
  HumanMessage(content='what is the title of the 2304.03442 paper?', additional_kwargs={}, example=False),
  AIMessage(content="I don't know.", additional_kwargs={}, example=Fals