In [1]:
from langchain.chat_models import ChatOpenAI
from langchain import ConversationChain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder, 
)
from langchain.memory import ConversationBufferMemory
from dotenv import load_dotenv

from bootstrap.base_prompts import system_prompt

# Read variables from a local .env file into the process environment
# (presumably the OpenAI API key that ChatOpenAI needs — TODO confirm).
load_dotenv()

%load_ext autoreload
%autoreload 2

In [2]:

# Conversation memory. return_messages=True makes the stored history come
# back as message objects, which is what the MessagesPlaceholder below expects.
memory = ConversationBufferMemory(return_messages=True)

# Template for each user turn: the caller's input is inserted verbatim.
human_message = HumanMessagePromptTemplate.from_template("{input}")

# Prompt layout, in order: system prompt, prior turns, new user message.
chat_prompt = ChatPromptTemplate.from_messages([
    system_prompt,
    MessagesPlaceholder(variable_name="history"),
    human_message,
])

# Chain that ties the chat model, the prompt and the memory together.
convo_chain = ConversationChain(
    memory=memory,
    prompt=chat_prompt,
    llm=ChatOpenAI(model_name='gpt-4-0314'),
)


In [3]:
# First turn: send the rendered system prompt back to the model as a human
# message, framed as a reminder of its instructions.
initial_instructions = f'''
Hi, I'd like to remind you of your instructions:
{system_prompt.format().content}
'''
# `outputs` accumulates every model reply, in the order the turns are run.
outputs = []
output = convo_chain.predict(input=initial_instructions)
outputs.append(output)
print(output)


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIError: HTTP code 502 from API (<html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html>
).


Thank you for the reminder. I'm ready to help you with any coding tasks related to test-driven development in Python. Please provide a task or a problem that you'd like us to work on together.


In [7]:
# Turn: hand the model a sample script as context. Everything between the
# triple quotes is prompt text sent to the model verbatim; the embedded code
# is NOT executed by this notebook.
# NOTE: this rebinds `human_message`, which earlier held the prompt template;
# `chat_prompt` already captured that template object, so the chain is unaffected.
human_message = '''
I would like for you to be able to read langchain docs. here is some code that I found online and edited a bit:

```python
import pathlib
import subprocess
import tempfile
from langchain.docstore.document import Document
import requests
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.llms import OpenAI

def get_github_docs(repo_owner, repo_name):
    with tempfile.TemporaryDirectory() as d:
        subprocess.check_call(
            f"git clone --depth 1 https://github.com/{repo_owner}/{repo_name}.git .",
            cwd=d,
            shell=True,
        )
        git_sha = (
            subprocess.check_output("git rev-parse HEAD", shell=True, cwd=d)
            .decode("utf-8")
            .strip()
        )
        repo_path = pathlib.Path(d)
        doc_files = []
        for extension in ['.md', '.mdx', '.ipynb']:
            
            doc_files.extend(list(repo_path.glob(f"**/*{extension}")))

        for doc_file in doc_files:
            with open(doc_file, "r") as f:
                relative_path = doc_file.relative_to(repo_path)
                github_url = f"https://github.com/{repo_owner}/{repo_name}/blob/{git_sha}/{relative_path}"
                yield Document(page_content=f.read(), metadata={"source": github_url})
                
sources = get_github_docs("hwchase17", "langchain")
# list(sources)

source_chunks = []
splitter = CharacterTextSplitter(separator=" ", chunk_size=1024, chunk_overlap=0)
for source in sources:
    for chunk in splitter.split_text(source.page_content):
        source_chunks.append(Document(page_content=chunk, metadata=source.metadata))
        
search_index = FAISS.from_documents(source_chunks, OpenAIEmbeddings())
chain = load_qa_with_sources_chain(OpenAI(temperature=0, model_name='gpt-4'))
```
'''
# Send the turn through the chain, keep the reply in `outputs`, and show it.
output = convo_chain.predict(input=human_message)
outputs.append(output)
print(output)

Thank you for providing the code. From my understanding, this code does the following:

1. Clone a GitHub repository.

2. Find documents in the repository with specific file extensions (`.md`, `.mdx`, `.ipynb`).

3. Create `Document` objects for each file in the repository and store the file content and GitHub URL as metadata.

4. Split the text content of each `Document` object into smaller chunks using the `CharacterTextSplitter`.

5. Create a search index (`FAISS`) from the document chunks using embeddings generated from `OpenAIEmbeddings`.

6. Load a language model chain (in this case, a QA chain) from the `langchain` library using the `OpenAI` LLM.

To better help you, please specify a specific task, question, or problem you'd like to accomplish or solve using this code.


In [8]:
# Turn: ask the model to propose a plan for adding doc-reading to itself.
human_message = '''
I want to add the ability to read langchain docs to your capabilities. Propose a plan for how you would do this.
'''
# Send the turn through the chain, keep the reply in `outputs`, and show it.
output = convo_chain.predict(input=human_message)
outputs.append(output)
print(output)

To add the ability to read Langchain docs to my capabilities, we can take the following steps:

1. **Retrieve and store documents**: We will need to integrate or adapt the provided code to retrieve and store documents from the Langchain repository. We can store the document contents and relevant metadata such as URLs in the AI's internal memory.

2. **Process and split documents**: We will use the `CharacterTextSplitter` to split the documents into smaller chunks. This will help in handling large documents and extracting relevant information for a given query.

3. **Generate embeddings**: We will generate embeddings for the document chunks using `OpenAIEmbeddings` or any other preferred language model.

4. **Create a search index**: We will create a search index such as the `FAISS` index to store and search the document embedding efficiently.

5. **Develop query function**: Create a function that takes a user query, generates its embedding using the same language model, and searches th

In [9]:
# Turn: a bare "continue" prompt. The chain's memory supplies the prior
# context, so the model decides what to continue from.
human_message = '''
continue
'''
# Send the turn through the chain, keep the reply in `outputs`, and show it.
output = convo_chain.predict(input=human_message)
outputs.append(output)
print(output)

Now that we have gone through the plan to add the ability to read Langchain docs, the next steps would be to implement each part of the plan. I will outline the tasks that need to be done for each step.

TASKS
- [ ] T1: Integrate or adapt the provided code to retrieve and store documents.
- [ ] T2: Process and split documents using `CharacterTextSplitter`.
- [ ] T3: Generate embeddings for the document chunks.
- [ ] T4: Create a search index with `FAISS` or another indexing method.
- [ ] T5: Develop a function to process user queries and search the index.
- [ ] T6: Extract relevant information from the top documents using a language model chain.

Once you are ready to work on a specific task, let me know, and I can help you further with that task.


In [10]:
# Turn: another bare "continue" prompt, again relying on the chain's memory
# for context.
human_message = '''
continue
'''
# Send the turn through the chain, keep the reply in `outputs`, and show it.
output = convo_chain.predict(input=human_message)
outputs.append(output)
print(output)

As you're looking to continue with the implementation, let's consider T1 first, since it is the first task in the plan:

T1: Integrate or adapt the provided code to retrieve and store documents.

For this task, we can build a function that takes a repository owner and repository name as input, and returns a list of documents found. The provided code can be adapted and integrated into this function.

Would you like me to create a function to retrieve and store documents based on the provided code, or do you have any specific questions or additional requirements you'd like me to address for this task?


In [11]:
# Turn: tell the model to carry out task T1 from its own task list.
human_message = '''
Go ahead and use your actions to execute T1
'''
# Send the turn through the chain, keep the reply in `outputs`, and show it.
output = convo_chain.predict(input=human_message)
outputs.append(output)
print(output)

To execute T1, we will create the following function `get_langchain_docs` which takes two required arguments `repo_owner` and `repo_name`. This function is based on the provided code and returns a list of `Document` objects containing the text content and associated metadata.

```python
def get_langchain_docs(repo_owner: str, repo_name: str) -> list[Document]:
    def get_github_docs(repo_owner, repo_name):
        with tempfile.TemporaryDirectory() as d:
            subprocess.check_call(
                f"git clone --depth 1 https://github.com/{repo_owner}/{repo_name}.git .",
                cwd=d,
                shell=True,
            )
            git_sha = (
                subprocess.check_output("git rev-parse HEAD", shell=True, cwd=d)
                .decode("utf-8")
                .strip()
            )
            repo_path = pathlib.Path(d)
            doc_files = []
            for extension in ['.md', '.mdx', '.ipynb']:
                
                doc_files.extend(

In [12]:
# Turn: "Continue" prompt, sent after a reply that ended partway through a
# code listing.
human_message = '''
Continue
'''
# Send the turn through the chain, keep the reply in `outputs`, and show it.
output = convo_chain.predict(input=human_message)
outputs.append(output)
print(output)

                    github_url = f"https://github.com/{repo_owner}/{repo_name}/blob/{git_sha}/{relative_path}"
                    yield Document(page_content=f.read(), metadata={"source": github_url})
                    
    sources = get_github_docs(repo_owner, repo_name)
    return [source for source in sources]

```

With this `get_langchain_docs` function, you can now retrieve and store Langchain documents, given the repository owner and repository name. This function returns a list of `Document` objects containing the document text and associated metadata, such as the source URL. If you have any questions about this function or would like help executing the tasks T2-T6, please let me know.


In [None]:
# Turn: a further "continue" prompt. The cell is labelled `In [None]`, which
# presumably means it had not been executed when the notebook was exported.
human_message = '''
continue
'''
# Send the turn through the chain, keep the reply in `outputs`, and show it.
output = convo_chain.predict(input=human_message)
outputs.append(output)
print(output)