## the notebook uses RAG for a webpage, specifically the LoRA documentation on HuggingFace

In [11]:
import os
import streamlit as st
import pickle
import time
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter # splits bases on multiple args
from langchain_community.document_loaders.url import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from dotenv import load_dotenv

In [14]:
load_dotenv()
openai_api_key = os.getenv("openai_api_key")

## openai llm

In [16]:
llm = OpenAI(temperature=0.9, max_tokens=500, openai_api_key = openai_api_key)

## loader

In [17]:
lora_api_link = "https://huggingface.co/docs/peft/main/en/developer_guides/lora#merge-lora-weights-into-the-base-model"

In [18]:
loader = UnstructuredURLLoader(urls=[lora_api_link])
data = loader.load()
len(data)

1

In [19]:
data[0].metadata

{'source': 'https://huggingface.co/docs/peft/main/en/developer_guides/lora#merge-lora-weights-into-the-base-model'}

## text splitter
provided list of contexts' token length has to be less than the context length of the LLM. so we split data.
- merged split so the chunks' context lengths are close to LLM's max context length
- Overlap chunks will have common data between chunks

In [20]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)

In [21]:
docs = text_splitter.split_documents(data)

In [25]:
len(docs)

25

## embedding

In [28]:
embedding_model = OpenAIEmbeddings(openai_api_key = openai_api_key)

In [36]:
index = FAISS.from_documents(docs, embedding_model)

## query

In [40]:
# uses map reduce by default
# template='Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES"). 
# \nIf you don\'t know the answer, just say that you don\'t know. 
# Don\'t try to make up an answer.\nALWAYS return a "SOURCES" part in your answer

chain = RetrievalQAWithSourcesChain.from_llm(llm = llm, retriever=index.as_retriever())

In [41]:
query = "How merge_an_unload() works?"

In [44]:
import langchain
langchain.debug = True
chain({"question" : query}, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "How merge_an_unload() works?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "Merge LoRA weights into the base model\n\nWhile LoRA is significantly smaller and faster to train, you may encounter latency issues during inference due to separately loading the base model and the LoRA adapter. To eliminate latency, use the merge_and_unload() function to merge the adapter weights with the base model. This allows you to use the newly merged model as a standalone model. The merge_and_unload() function doesn’t keep the adapter weights in memory.\n\nBelow is a diagram that explain

{'answer': ' The function merge_an_unload() merges the adapter weights with the base model to create a standalone model, and unload() is used to return the base model or delete the adapter entirely. ',
 'sources': 'https://huggingface.co/docs/peft/main/en/developer_guides/lora#merge-lora-weights-into-the-base-model'}