In [16]:
from dotenv import load_dotenv
import os
import requests
import fnmatch
import argparse
import base64
import time

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.docstore.document import Document
from langchain.chains import RetrievalQA
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain

In [18]:
load_dotenv()

GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
repo_owner = "RasaHQ"
repo_name = "rasa"

In [6]:
def get_files_from_github_repo(owner, repo, token):
    url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/main?recursive=1"
    headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github+json"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        content = response.json()
        return content["tree"]
    else:
        raise ValueError(f"Error fetching repo contents: {response.status_code}")
    

def fetch_md_contents(files):
    md_contents = []
    for file in files:
        if file["type"] == "blob" and fnmatch.fnmatch(file["path"], "*.mdx"):
            response = requests.get(file["url"])
            if response.status_code == 200:
                content = response.json()["content"]
                decoded_content = base64.b64decode(content).decode('utf-8')
                print("Fetching Content from ", file['path'])
                md_contents.append(Document(page_content=decoded_content, metadata={"source": file['path']}))
            else:
                print(f"Error downloading file {file['path']}: {response.status_code}")
    return md_contents


def get_source_chunks(files):
    print("In get_source_chunks ...")
    source_chunks = []
    splitter = CharacterTextSplitter(separator=" ", chunk_size=1024, chunk_overlap=0)
    for source in fetch_md_contents(files):
        for chunk in splitter.split_text(source.page_content):
            source_chunks.append(Document(page_content=chunk, metadate=source.metadata))
    return source_chunks


In [7]:
repo_files = get_files_from_github_repo(repo_owner, repo_name, GITHUB_TOKEN)
docs_path = [file for file in repo_files if file["type"] == "blob" and fnmatch.fnmatch(file["path"], "*.mdx")]

In [90]:
len(docs_path[60:])

47

In [None]:
md_contents = fetch_md_contents(docs_path)

In [11]:
len(md_contents)

60

In [93]:
import pickle as pkl

with open('0_60_md_files.pkl', 'wb') as f:
    pkl.dump(md_contents, f)

In [78]:
source_chunks = []
splitter = CharacterTextSplitter(separator="\n", chunk_size=2048, chunk_overlap=0)
for source in md_contents:
    for chunk in splitter.split_text(source.page_content):
        source_chunks.append(Document(page_content=chunk, metadata=source.metadata))

In [61]:
source_chunks[0].metadata

{'source': 'CHANGELOG.mdx'}

In [79]:
CHROMA_DB_PATH = f'./chroma/{os.path.basename(repo_name)}'

chroma_db = None

# if not os.path.exists(CHROMA_DB_PATH):
#     print(f'Creating Chroma DB at {CHROMA_DB_PATH}...')
chroma_db = Chroma.from_documents(source_chunks, OpenAIEmbeddings(), persist_directory=CHROMA_DB_PATH)
chroma_db.persist()

In [36]:
chroma_db = Chroma(persist_directory=CHROMA_DB_PATH, embedding_function=OpenAIEmbeddings())

In [80]:
qa_chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff")
qa = RetrievalQA(combine_documents_chain=qa_chain, retriever=chroma_db.as_retriever())

In [81]:
res = qa.run("How can I load rasa model with an agent class/?")

In [87]:
from IPython.display import display, Markdown
display(Markdown(res.replace('. ', '.\n\n')))

 You can use the `rasa.core.agent.Agent` class to load a trained Rasa model.

The `load` method takes the path to the model as an argument.

For example: `agent = Agent.load('./models/<model_name>.tar.gz')`.