<a href="https://colab.research.google.com/github/AnDDoanf/LLM-repo/blob/master/RAG_classes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Other source: <https://edersoncorbari.github.io/friends/>

## Construct Retriever from website

In [5]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Page whose anchor tags are harvested into `url_paths`.
url = "https://edersoncorbari.github.io/friends-scripts/"

# Bound the wait and fail loudly on an HTTP error instead of silently parsing
# an error page and ending up with a misleading (or empty) link list.
response = requests.get(url, timeout=30)
response.raise_for_status()
res = response.content
soup = BeautifulSoup(res, 'html.parser')

# urljoin resolves relative hrefs against `url` exactly like the previous string
# concatenation did, but also leaves absolute hrefs intact and handles
# root-relative ones ("/x") instead of producing a malformed address.
url_paths = [urljoin(url, path['href']) for path in soup.find_all('a', href=True)]

In [17]:
%%capture
# Install the third-party packages imported further down in this cell;
# the %%capture magic on the first line hides this cell's output.
# NOTE(review): versions are not pinned, so a fresh run may pull different releases.
!pip install langchain html2text sentence-transformers faiss-gpu langchain-community playwright langchain-huggingface pypdf
# Fetch the browsers Playwright drives (presumably needed by AsyncChromiumLoader — confirm).
!playwright install

from typing import List
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import AsyncChromiumLoader
from langchain.document_transformers import Html2TextTransformer
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema import Document
import nest_asyncio
import os

nest_asyncio.apply()

class UltimateRAG:
  """Build a FAISS similarity retriever from web pages, ``.txt`` files and ``.pdf`` files.

  Every source is loaded into documents, split with a shared
  ``CharacterTextSplitter`` (separator ``"\\n\\n"``), embedded with a
  HuggingFace sentence-transformer model and indexed in FAISS.

  Attributes:
    chunk_size: Maximum chunk length handed to the splitter (measured with ``len``).
    chunk_overlap: Overlap between consecutive chunks handed to the splitter.
    text_splitter: The splitter shared by all loaders.
    retriever: The retriever built by the last ``create_retriever`` call, or
      ``None`` if none has been built yet.
  """

  # Embedding model used when ``create_retriever`` is not told otherwise.
  DEFAULT_EMBEDDING_MODEL = 'sentence-transformers/all-mpnet-base-v2'

  def __init__(self, chunk_size, chunk_overlap) -> None:
    """Store the chunking parameters and create the shared text splitter."""
    self.chunk_size = chunk_size
    self.chunk_overlap = chunk_overlap
    self.retriever = None
    self.text_splitter = CharacterTextSplitter(
        separator="\n\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False)

  def _split_and_preview(self, documents) -> "List[Document]":
    """Split ``documents`` into chunks and print the first chunk, if any."""
    chunked_documents = self.text_splitter.split_documents(documents)
    # Guarded: a source that yields no documents must not raise IndexError here.
    if chunked_documents:
      print("DEBUG:", chunked_documents[0])
    return chunked_documents

  def url_text_loader(self, url_paths) -> "List[Document]":
    """Load the given URLs and return their content as chunks.

    Pages are loaded with ``AsyncChromiumLoader`` and then transformed with
    ``Html2TextTransformer`` before being split.

    Args:
      url_paths: Iterable of URL strings; ``None`` or an empty iterable means
        "no web sources".

    Returns:
      The chunked documents, or ``[]`` when there is nothing to load.
    """
    urls = list(url_paths) if url_paths is not None else []
    if not urls:
      return []
    docs = AsyncChromiumLoader(urls).load()
    documents = Html2TextTransformer().transform_documents(docs)
    return self._split_and_preview(documents)

  def txt_text_loader(self, dir_path) -> "List[Document]":
    """Read every ``.txt`` file directly inside ``dir_path`` and return chunks.

    Args:
      dir_path: Directory to scan (not recursive); ``""`` or ``None`` means
        "no local sources".

    Returns:
      The chunked documents, or ``[]`` when the directory has no ``.txt`` files.

    Raises:
      OSError: If ``dir_path`` cannot be listed or a file cannot be read.
    """
    if not dir_path:
      return []
    documents = []
    # Sorted so the document order does not depend on the filesystem's listing order.
    for file in sorted(os.listdir(dir_path)):
      if file.endswith(".txt"):
        filepath = os.path.join(dir_path, file)
        # Explicit encoding: the platform default is not guaranteed to be UTF-8.
        with open(filepath, 'r', encoding='utf-8') as f:
          # Keep the origin path so a retrieved chunk can be traced to its file.
          documents.append(Document(page_content=f.read(),
                                    metadata={"source": filepath}))
    return self._split_and_preview(documents)

  def pdf_text_loader(self, dir_path) -> "List[Document]":
    """Load every ``.pdf`` file directly inside ``dir_path`` and return chunks.

    Args:
      dir_path: Directory to scan (not recursive); ``""`` or ``None`` means
        "no local sources".

    Returns:
      The chunked documents, or ``[]`` when the directory has no ``.pdf`` files.

    Raises:
      OSError: If ``dir_path`` cannot be listed.
    """
    if not dir_path:
      return []
    documents = []
    for file in sorted(os.listdir(dir_path)):
      if file.endswith(".pdf"):
        filepath = os.path.join(dir_path, file)
        documents.extend(PyPDFLoader(filepath).load())
    return self._split_and_preview(documents)

  def create_retriever(self, url_paths=None, dir_path="", k=4, model_name=None):
    """Index all requested sources in FAISS and return a similarity retriever.

    Args:
      url_paths: URLs to load; defaults to none.
      dir_path: Directory scanned for both ``.txt`` and ``.pdf`` files;
        defaults to none.
      k: Number of chunks the retriever returns per query.
      model_name: HuggingFace embedding model name; defaults to
        ``DEFAULT_EMBEDDING_MODEL``.

    Returns:
      The retriever, which is also stored on ``self.retriever``.

    Raises:
      ValueError: If the sources produce no chunks at all, since there would
        be nothing to index.
    """
    documents_from_url = self.url_text_loader(url_paths)
    documents_from_txt = self.txt_text_loader(dir_path)
    documents_from_pdf = self.pdf_text_loader(dir_path)
    chunked_documents = documents_from_txt + documents_from_pdf + documents_from_url

    # Fail with an explicit message rather than an obscure error from the index builder.
    if not chunked_documents:
      raise ValueError(
          "No documents to index: provide url_paths and/or a dir_path "
          "containing .txt or .pdf files.")

    embeddings = HuggingFaceEmbeddings(
        model_name=model_name or self.DEFAULT_EMBEDDING_MODEL)
    db = FAISS.from_documents(chunked_documents, embeddings)

    self.retriever = db.as_retriever(
        search_type="similarity",
        search_kwargs={'k': k}
    )
    return self.retriever


In [20]:
# Tunable settings for this run, gathered in one place.
CHUNK_SIZE = 1000              # chunk_size passed to UltimateRAG
CHUNK_OVERLAP = 200            # chunk_overlap passed to UltimateRAG
MAX_URLS = 200                 # only the first MAX_URLS scraped links are indexed
DATA_DIR = "/content/data"     # directory passed as dir_path

ultimate_rag = UltimateRAG(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
retriever = ultimate_rag.create_retriever(
    url_paths=url_paths[:MAX_URLS],
    dir_path=DATA_DIR,
)



DEBUG: page_content="# The One Where Monica Gets a New Roommate (The Pilot-The Uncut Version)\n\n* * *\n\nWritten by: Marta Kauffman & David Crane  \nTranscribed by: guineapig  \nAdditional transcribing by: Eric Aasen  \n(Note: The previously unseen parts of this episode are shown in blue text.)\n\n* * *\n\n****\n\n****[Scene: Central Perk, Chandler, Joey, Phoebe, and Monica are there.]\n\n**Monica:** There's nothing to tell! He's just some guy I work with!\n\n**Joey:** C'mon, you're going out with the guy! There's gotta be something\nwrong with him!\n\n**Chandler:** All right Joey, be nice.  So does he have a hump? A hump and a\nhairpiece?\n\n**Phoebe:** Wait, does he eat chalk?\n\n(They all stare, bemused.)\n\n**Phoebe:** Just, 'cause, I don't want her to go through what I went through\nwith Carl- oh!\n\n**Monica:** Okay, everybody relax. This is not even a date. It's just two\npeople going out to dinner and- not having sex.\n\n**Chandler:** Sounds like a date to me.\n\n[Time Lapse]"

In [21]:
from sys import getsizeof
# NOTE: sys.getsizeof counts only the memory attributed directly to the
# retriever object, not the objects it refers to (such as its vector store),
# so the figure shown is not the memory footprint of the index.
getsizeof(retriever)

56

In [22]:
# Display the retriever's repr to inspect its tags and search_kwargs.
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7992c6c1e6e0>, search_kwargs={'k': 4})