In [1]:
import os
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone, Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI


In [3]:
import os
import getpass
os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')

OpenAI API Key: ········


In [7]:
from langchain.document_loaders import DirectoryLoader

pdf_loader = DirectoryLoader('../documents/', glob="**/*.pdf")
readme_loader = DirectoryLoader('../documents/', glob="**/*.md")
txt_loader = DirectoryLoader('../documents/', glob="**/*.txt")

In [8]:
loaders = [pdf_loader, readme_loader, txt_loader]

#lets create document 
documents = []
for loader in loaders:
    documents.extend(loader.load())

In [10]:
print (f'Found {len(documents)} document(s) the provided directory')
print (f'The total number of characters is {len(documents[0].page_content)} characters in your document(s)')

You have 2 document(s) in your data
There are 41339 characters in your document


In [11]:
documents[0]


Document(page_content='3 2 0 2\n\ng u A 9\n\n] L C . s c [\n\n4 v 1 2 6 8 0 . 7 0 3 2 : v i X r a\n\nRetentive Network: A Successor to Transformer for Large Language Models\n\nYutao Sun∗ †‡ Li Dong∗ † Shaohan Huang† Shuming Ma† Yuqing Xia† Jilong Xue† Jianyong Wang‡ Furu Wei†⋄\n\n† Microsoft Research\n\n‡ Tsinghua University\n\nhttps://aka.ms/GeneralAI\n\nAbstract\n\nIn this work, we propose Retentive Network (RETNET) as a foundation archi- tecture for large language models, simultaneously achieving training parallelism, low-cost inference, and good performance. We theoretically derive the connection between recurrence and attention. Then we propose the retention mechanism for sequence modeling, which supports three computation paradigms, i.e., parallel, recurrent, and chunkwise recurrent. Specifically, the parallel representation allows for training parallelism. The recurrent representation enables low-cost O(1) infer- ence, which improves decoding throughput, latency, and GPU memory 

In [12]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=40) #chunk overlap seems to work better
documents = text_splitter.split_documents(documents)
print(len(documents))

Created a chunk of size 1161, which is longer than the specified 1000
Created a chunk of size 5132, which is longer than the specified 1000
Created a chunk of size 1031, which is longer than the specified 1000
Created a chunk of size 1172, which is longer than the specified 1000
Created a chunk of size 1010, which is longer than the specified 1000
Created a chunk of size 1398, which is longer than the specified 1000
Created a chunk of size 1017, which is longer than the specified 1000
Created a chunk of size 1186, which is longer than the specified 1000
Created a chunk of size 1232, which is longer than the specified 1000
Created a chunk of size 1318, which is longer than the specified 1000
Created a chunk of size 1158, which is longer than the specified 1000
Created a chunk of size 1399, which is longer than the specified 1000
Created a chunk of size 1111, which is longer than the specified 1000
Created a chunk of size 1338, which is longer than the specified 1000
Created a chunk of s

111


In [13]:
documents[0]

Document(page_content='3 2 0 2\n\ng u A 9\n\n] L C . s c [\n\n4 v 1 2 6 8 0 . 7 0 3 2 : v i X r a\n\nRetentive Network: A Successor to Transformer for Large Language Models\n\nYutao Sun∗ †‡ Li Dong∗ † Shaohan Huang† Shuming Ma† Yuqing Xia† Jilong Xue† Jianyong Wang‡ Furu Wei†⋄\n\n† Microsoft Research\n\n‡ Tsinghua University\n\nhttps://aka.ms/GeneralAI\n\nAbstract', metadata={'source': '../documents/2307.08621.pdf'})

In [14]:
documents[1]

Document(page_content='In this work, we propose Retentive Network (RETNET) as a foundation archi- tecture for large language models, simultaneously achieving training parallelism, low-cost inference, and good performance. We theoretically derive the connection between recurrence and attention. Then we propose the retention mechanism for sequence modeling, which supports three computation paradigms, i.e., parallel, recurrent, and chunkwise recurrent. Specifically, the parallel representation allows for training parallelism. The recurrent representation enables low-cost O(1) infer- ence, which improves decoding throughput, latency, and GPU memory without sacrificing performance. The chunkwise recurrent representation facilitates effi- cient long-sequence modeling with linear complexity, where each chunk is encoded parallelly while recurrently summarizing the chunks. Experimental results on language modeling show that RETNET achieves favorable scaling results, parallel training, low-cost 

In [15]:
embeddings = OpenAIEmbeddings()

In [16]:
from langchain.vectorstores import Chroma
vectorstore = Chroma.from_documents(documents, embeddings)


In [21]:
text = "What makes Retentive Networks good?"
docs = vectorstore.similarity_search(text)

In [24]:
print(docs[0].page_content)


4 Conclusion

In this work, we propose retentive networks (RetNet) for sequence modeling, which enables various representations, i.e., parallel, recurrent, and chunkwise recurrent. RetNet achieves significantly better inference efficiency (in terms of memory, speed, and latency), favorable training parallelization, and competitive performance compared with Transformers. The above advantages make RetNet an ideal successor to Transformers for large language models, especially considering the deployment benefits brought by the O(1) inference complexity. In the future, we would like to scale up RetNet in terms of model size [CDH+22] and training steps. Moreover, retention can efficiently work with structured prompting [HSD+22b] by compressing long-term memory. We will also use RetNet as the backbone architecture to train multimodal large language models [HSD+22a, HDW+23, PWD+23]. In addition, we are interested in deploying RetNet models on various edge devices, such as mobile phones.


In [25]:
from langchain.llms import OpenAI
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k":2})
qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever)


In [26]:
chat_history = []
query = "What makes Retentive Networks good?"
result = qa({"question": query, "chat_history": chat_history})
result["answer"]

' Retentive Networks (RetNet) are good because they enable various representations, have significantly better inference efficiency, have favorable training parallelization, and have competitive performance compared to Transformers.'

In [27]:
chat_history.append((query, result["answer"]))
chat_history

[('What makes Retentive Networks good?',
  ' Retentive Networks (RetNet) are good because they enable various representations, have significantly better inference efficiency, have favorable training parallelization, and have competitive performance compared to Transformers.')]