# Generating text from a vector db
This is a simple proof of concept for a QA from the rollux tutorials

In [1]:
import os
import getpass

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma 

os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')


## Clone github repository locally 

In [2]:
repo_url='https://github.com/SYS-Labs/rollux-tutorial'

In [3]:
from git import Repo

# Cloning the base repository

# if repo already exists, assign Repo(path), if not, run Repo.clone_from
# this conditional is here simply for convenience in case the notebook is run more than once
 
if os.path.exists(f'./repos/{repo_url.split("/")[-1]}'):
    repo = Repo(f'./repos/{repo_url.split("/")[-1]}')
else:
    repo = Repo.clone_from(
        repo_url, to_path=f'./repos/{repo_url.split("/")[-1]}'
    )
repo_path_local = f'./repos/{repo_url.split("/")[-1]}/'



## Loading the raw repository into documents

In [4]:
# allow user to specify the branch to use
# default to main if not specified

repo_branch = 'main'

In [7]:
from langchain.document_loaders import GitLoader

git_loader = GitLoader(repo_path=repo_path_local, branch=repo_branch,file_filter=lambda x: x.endswith('.md'))

In [8]:
code_documents = git_loader.load()

In [9]:
file_types = [doc.metadata['file_type'] for doc in code_documents]
file_types.sort()

# print the number of occurences of each element in the list
from collections import Counter

type_counts = Counter(file_types)
print(type_counts)



Counter({'.md': 17})


In [10]:
# Create a dict with the number of characters per file type, as an easy proxy for token count

characters_by_file_type = {}
for doc in code_documents:
    file_type = doc.metadata['file_type']
    page_content_length = len(doc.page_content)

    if file_type in characters_by_file_type:
        characters_by_file_type[file_type] += page_content_length
    else:
        characters_by_file_type[file_type] = page_content_length

print(characters_by_file_type)


{'.md': 158182}


In [11]:
# sort dictionary by value
characters_by_file_type = dict(sorted(characters_by_file_type.items(), key=lambda item: item[1], reverse=True))
print(characters_by_file_type)

{'.md': 158182}


In [12]:
# relevant_code = [documents for documents in code_documents if documents.metadata['file_type'] not in doc_ignore_types]
relevant_code = code_documents

In [13]:
content = []
for doc in relevant_code:
    content.append(len(doc.page_content))    
print(sorted(content))

[444, 1132, 2046, 3075, 5818, 7140, 7518, 7847, 8055, 8373, 12332, 12377, 12735, 13819, 15575, 17110, 22786]


## As we can see, some of the documents are very long to be included in an LLM prompt
The content will have to be compressed or split before it ever interacts with a language model.

In [14]:
# We split the code using a default code splitter.
# In this case, it is likely to work well enough for the main filetypes in the codebase. 
# Otherwise, separators for other langauges can easily be defined here. 

from langchain.text_splitter import RecursiveCharacterTextSplitter

markdown_splitter= RecursiveCharacterTextSplitter(chunk_size=800)

markdown_splitter.from_language('markdown')


<langchain.text_splitter.RecursiveCharacterTextSplitter at 0x10da55350>

In [17]:
relevant_code_split = markdown_splitter.split(relevant_code) 


In [18]:
lengths = []
for i in range(len(relevant_code_split)):
    if type(relevant_code_split[i]) != list:
        lengths.append(len(relevant_code_split[i].page_content))
    elif type(relevant_code_split[i]) == list:
        print(len(relevant_code_split[i]))

In [21]:
# Just making sure our splitters actually worked

content = [len(d.page_content) for d in relevant_code_split if len(d.page_content) > 700]
print(sorted(content))

[701, 703, 704, 705, 709, 711, 712, 712, 714, 714, 714, 716, 718, 718, 719, 719, 721, 722, 722, 724, 725, 726, 726, 726, 728, 728, 730, 732, 733, 734, 735, 735, 736, 737, 738, 740, 740, 742, 742, 742, 744, 745, 746, 750, 750, 750, 751, 752, 752, 752, 752, 752, 752, 752, 752, 753, 755, 755, 755, 756, 757, 757, 757, 758, 758, 758, 759, 761, 761, 762, 763, 763, 763, 764, 764, 765, 765, 765, 767, 767, 768, 768, 768, 768, 768, 768, 769, 769, 771, 771, 771, 772, 773, 773, 774, 774, 774, 774, 775, 775, 777, 777, 777, 778, 779, 779, 780, 780, 780, 781, 781, 782, 783, 784, 784, 784, 784, 784, 785, 785, 785, 785, 786, 786, 787, 787, 787, 789, 789, 789, 790, 790, 790, 790, 791, 791, 792, 792, 793, 793, 793, 793, 794, 794, 795, 795, 795, 796, 796, 796, 796, 796, 797, 798]


In [22]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

tutorials_db = Chroma.from_documents(relevant_code_split, embedding=embeddings,persist_directory='./chroma_code/rolux-tutorial')

The process above may take a few minutes.

But once we do it once, we can load the persisted database from disk, and use it as normal.

In [26]:
# embeddings = OpenAIEmbeddings()
# persist_directory = "./chroma_code/split/index"
# chroma_code = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

In [23]:
from langchain.callbacks import get_openai_callback
from langchain import OpenAI
from langchain.chat_models import ChatOpenAI

gpt4 = ChatOpenAI(model_name="gpt-4",temperature=0) # Waitlist only
gpt3 = ChatOpenAI(model_name="gpt-3.5-turbo",temperature=0)
davinci = OpenAI(model_name="text-davinci-003",temperature=0)
davinci002 = OpenAI(model_name="text-davinci-002",temperature=0)
curie = OpenAI(model_name="text-curie-001",temperature=0)


In [32]:
base_retriever = tutorials_db.as_retriever()
base_retriever.search_kwargs['distance_metric'] = 'cos'
base_retriever.search_kwargs['fetch_k'] = 15 
base_retriever.search_kwargs['maximal_marginal_relevance'] = True
base_retriever.search_kwargs['k'] = 6 

## RetrievalQA + base retriever (gpt-3.5-turbo)

In [33]:
from langchain.chains import RetrievalQA

code_qa = RetrievalQA.from_chain_type(llm=gpt3,chain_type='stuff', retriever=base_retriever)

In [34]:
code_qa.run('Explain the bridge')



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
For an L1/L2 token pair to work on the Standard Bridge, there has to be a layer of original mint (where the minting and burning of tokens is controlled by the business logic), and a bridged layer where the Standard Bridge controls minting and burning.
The most common configuration is to have L1 as the layer of original mint, and L2 as the bridged layer, this allows for ERC-20 contracts that were written with no knowledge of Optimism to be bridged.

## Conclusion

You should now be able to write applications that use our SDK and bridge to transfer ERC-20 assets between layer 1 and layer 2.

Note that for withdrawals of a commonl

'The bridge is a mechanism that allows for the transfer of assets between two different blockchain networks. In this case, the bridge is used to transfer ERC-20 tokens between layer 1 and layer 2 of the Optimism network. The bridge consists of two layers: the layer of original mint, where the minting and burning of tokens is controlled by the business logic, and the bridged layer, where the Standard Bridge controls minting and burning. The most common configuration is to have layer 1 as the layer of original mint and layer 2 as the bridged layer, which allows for ERC-20 contracts that were written with no knowledge of Optimism to be bridged. To bridge a custom ERC-20 token to Optimism using the Standard Bridge, the contract on the bridged layer has to implement either the legacy `IL2StandardERC20` interface (only if the bridged layer is L2) or the new `IOptimismMintableERC20` interface.'

In [35]:
code_qa.run('How is rollux different from optimism?')



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
As you can see in the different development stacks below, the way you deploy contracts and interact with them on Rollux is almost identical to the way you do it with L1 Syscoin.
The most visible difference is that you have to specify a different endpoint (of course).
The list of other differences is [here](https://community.optimism.io/docs/developers/build/differences/).

## Development stacks

- [Apeworx](#apeworx)
- [Brownie](#brownie)
- [Foundry](#foundry)
- [Hardhat](#hardhat)
- [Remix](#remix)
- [Truffle](#truffle)
- [Waffle](#waffle)

### Hardhat

In [Hardhat](https://hardhat.org/) you use a configuration similar to [thi

'Rollux is based on Optimism Bedrock and is EVM equivalent, meaning it runs a slightly modified version of the same `geth` you run on Syscoin mainnet. Therefore, the differences between Rollux development and Syscoin development are minor. However, there are a few differences between Rollux and Optimism, which you can find listed in the following link: https://community.optimism.io/docs/developers/build/differences/.'