In [None]:
import bs4
import requests
import re
import chromadb
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from dspy.retrieve.chromadb_rm import ChromadbRM
from chromadb.utils import embedding_functions
from redis import Redis


## Crawler

In [None]:
# Wikipedia article to crawl.
URL = "https://en.wikipedia.org/wiki/Go_(game)"
# Slug for the page: the final path segment of URL with every run of
# non-word characters removed.
URL_LOCAL = re.sub(r'\W+', '', URL.rsplit("/", 1)[-1])

In [None]:
# Download the article; fail fast on a hung connection or an HTTP error
# status instead of silently parsing an error page.
response = requests.get(url=URL, timeout=30)
response.raise_for_status()
soup = bs4.BeautifulSoup(response.content, "html.parser")

# parsed maps section id -> {paragraph id -> paragraph text}.
# Paragraph ids are numbered consecutively across the whole article.
parsed = {}
p_counter = 0
# The first <h2> is skipped (presumably the table-of-contents heading -- confirm).
all_titles = soup.find_all("h2")[1:]
for title in all_titles:
    # A heading without a <span id=...> child cannot be keyed; skip it rather
    # than crash on `None["id"]`.
    span = title.span
    if span is None or not span.get("id"):
        continue
    header = span["id"].strip()
    textContent = {}
    # Walk the following siblings in document order and stop at the next <h2>,
    # so every paragraph is attributed to exactly one section (the nearest
    # preceding heading), without rescanning previous siblings per paragraph.
    for sibling in title.find_next_siblings(["h2", "p"]):
        if sibling.name == "h2":
            break
        textContent[p_counter] = sibling.text.strip()
        p_counter += 1
    if textContent:
        parsed[header] = textContent


## Vectorize

In [None]:
# Relative path of the directory holding the persistent Chroma store.
CHROMADB_DIR = "../db/"
# Collection name for the crawled page, derived from the URL slug.
CHROMA_COLLECTION_NAME = f"wiki_{URL_LOCAL}"

In [None]:
# Splitter (default settings) used to cut paragraphs into chunks.
text_splitter = SentenceTransformersTokenTextSplitter()
# Open the on-disk Chroma store and fetch the page's collection, creating it
# on first use.
chroma_client = chromadb.PersistentClient(path=CHROMADB_DIR)
collection = chroma_client.get_or_create_collection(
    name=CHROMA_COLLECTION_NAME,
)

In [None]:
# Total number of paragraphs across all sections; only used for the progress
# read-out. Counting directly (rather than peeking at the last paragraph id)
# keeps this safe for an empty result and for a single paragraph.
num_paragraphs = sum(len(paragraphs) for paragraphs in parsed.values())
if num_paragraphs == 0:
    print("No paragraphs were parsed; nothing to insert.")

done = 0  # paragraphs processed so far
for header, paragraphs in parsed.items():
    for para_id, text in paragraphs.items():
        # Split the paragraph into chunks and upsert them in one batch.
        chunks = text_splitter.create_documents([text])  # takes a list of texts
        if chunks:
            collection.upsert(
                # Chunk ids are "pid_<paragraph id>#<chunk index>".
                ids=[f"pid_{para_id}#{chunk_no}" for chunk_no in range(len(chunks))],
                documents=[chunk.page_content for chunk in chunks],
                metadatas=[{"title": header, "source": URL} for _ in chunks],
            )
        done += 1
        print(f"{int(0.5 + 100.0 * done / num_paragraphs)}% ({collection.count()})", end=" ", flush=True)
        # Wrap the progress read-out after every ten paragraphs.
        if done % 10 == 0:
            print()



### Test retriever

In [None]:
def Retriever(k=3):
    """
    Build a retriever over the persisted Chroma collection of the crawled page.

    This is just a retriever and does not have any language model.

    Args:
        k: Value forwarded to ChromadbRM as its ``k`` argument (presumably the
            number of passages returned per query). Defaults to 3.

    Returns:
        A ChromadbRM bound to CHROMA_COLLECTION_NAME inside CHROMADB_DIR,
        using Chroma's default embedding function.
    """
    default_ef = embedding_functions.DefaultEmbeddingFunction()
    return ChromadbRM(CHROMA_COLLECTION_NAME, CHROMADB_DIR, default_ef, k=k)

In [None]:
# Smoke test: run one query through the retriever and show what comes back.
question = "What is GO?"
retriever = Retriever()
retrieved = retriever(question)
print("vector store:", retrieved)

## Cache

In [None]:
# Redis server on the local machine, default port.
redis_host = "127.0.0.1"
r = Redis(
    host=redis_host,
    port=6379,
    decode_responses=True,  # hand back str instead of bytes
)

In [None]:
# Round-trip check: write a key, then read it back (the value is the cell output).
r.set(name="foo", value="bar")
r.get(name="foo")

### TODO:
- check if given url exists in Redis
  - If yes: Fetch path of persisted vector db
  - If no, or if update=True: scrape the wiki page, preprocess it, persist it to the vector db, and persist the vdb object to storage