<a href="https://colab.research.google.com/github/AnDDoanf/LLM-repo/blob/master/web_retriever_llama2_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%%capture
!pip install langchain torch transformers faiss-gpu bitsandbytes accelerate langchain-community==0.2.1 langchain-core==0.2.1 llama-index-embeddings-langchain llama-index  sentence-transformers langchain_google_community html2text gradio

In [4]:
from langchain.callbacks.base import BaseCallbackHandler
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.retrievers.web_research import WebResearchRetriever
from langchain.embeddings import LlamaCppEmbeddings
import os
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from torch import cuda, bfloat16
import transformers
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding
import gradio as gr

import faiss
from langchain.vectorstores import FAISS
from langchain.docstore import InMemoryDocstore

In [5]:
os.environ["GOOGLE_API_KEY"] = "AIzaSyCzRX6KLC-Vz1Evduq8bAWLiZ_hlyOgqqs"
os.environ["GOOGLE_CSE_ID"] = "d793d7e247bc848a7"

In [6]:
%%capture
class LLMConfig:
  def __init__(self):
    self.model_id = 'meta-llama/Llama-2-7b-chat-hf'
    self.device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
    self.hf_auth = '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'
    self.task = 'text-generation'
    self.temperature = 1
    self.max_new_tokens = 512
    self.repetition_penalty = 1.2

class BuildLLM:
  def __init__(self) -> None:
    self.config = LLMConfig()
    model_id = self.config.model_id
    device = self.config.device
    hf_auth = self.config.hf_auth

    bnb_config = transformers.BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=bfloat16
    )

    model_config = transformers.AutoConfig.from_pretrained(
        model_id,
        use_auth_token=hf_auth
    )

    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        config=model_config,
        quantization_config=bnb_config,
        device_map='auto',
        use_auth_token=hf_auth
    )

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_id,
        use_auth_token=hf_auth
    )

    generate_text = transformers.pipeline(
        model=model,
        tokenizer=tokenizer,
        return_full_text=True,
        task=self.config.task,
        temperature=self.config.temperature,
        max_new_tokens=self.config.max_new_tokens,
        repetition_penalty=self.config.repetition_penalty
    )

    self.llm = HuggingFacePipeline(pipeline=generate_text)
  def get_llm(self):
    return self.llm

In [7]:
%%capture
builder = BuildLLM()
llm = builder.get_llm()

lc_embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

In [10]:
%%capture
def settings():
    # Vectorstore
    embeddings_model = LangchainEmbedding(lc_embed_model)
    embedding_size = len(embeddings_model.get_text_embedding("test"))
    # print(f"Embedding size: {embedding_size}")
    index = faiss.IndexFlatL2(embedding_size)
    vectorstore_public = FAISS(embeddings_model.get_text_embedding, index, InMemoryDocstore({}), {})

    # Search
    from langchain_google_community import GoogleSearchAPIWrapper
    search = GoogleSearchAPIWrapper()

    # Initialize
    web_retriever = WebResearchRetriever.from_llm(
        vectorstore=vectorstore_public,
        llm=llm,
        search=search,
        num_search_results=3
    )

    return web_retriever, llm

In [19]:
# Make retriever and llm
from langchain.prompts import PromptTemplate

web_retriever, llm = settings()

# User input
question = "what is love"

prompt_template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, please think rationally and answer from your own knowledge base.
{context}
{summaries}
Question: {question}
"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question", "summaries"]
)
chain_type_kwargs = {"prompt": PROMPT}
qa_chain = RetrievalQAWithSourcesChain.from_chain_type(llm, retriever=web_retriever, chain_type_kwargs=chain_type_kwargs, return_source_documents=True)


Fetching pages: 100%|##########| 9/9 [00:02<00:00,  3.25it/s]


{'question': 'what is love', 'context': 'what is love', 'summaries': None, 'answer': "Use the following pieces of context to answer the question at the end. \nIf you don't know the answer, please think rationally and answer from your own knowledge base \n\nwhat is love\n\nContent: And a love like this should be our goal as followers of Jesus. In the opening\nverses of this chapter, Paul told us that no matter what we do, if we do not\nhave love, it is all for naught. John echoes this in 1 John 4:7–12. He tells\nus to love one another. And that the one who does not love, does not know God,\nbecause God is love.\n\nI find it very hard to love some people. But love for each other is not\noptional. It is the one thing I am commanded to do (John 15:12, 17). And when\nI choose to obey Jesus’ command, I find that his Spirit, living in me, helps\nme to love like this (Gal. 5:22), even to those I find challenging to love.\nAnd when I love, I more fully experience God’s love (1 John 4:16) and pr

In [30]:
# Gradio interface

header = "Interweb Explorer"
info = "I am an AI that can answer questions by exploring, reading, and summarizing web pages. I can be configured to use different modes: public API or private (no data sharing)."

inputs = gr.Textbox(label="Ask a question:")
answer_box = gr.Textbox(label="Answer")
sources_box = gr.Textbox(label="Sources")
source_documents_box = gr.Textbox(label="Source Documents")
outputs = [
    answer_box,
    sources_box,
    source_documents_box
]

def get_answer(question):
  result = qa_chain({"question": question, "context": question, "summaries": None})
  return {
      answer_box: result["answer"],
      sources_box: result["sources"],
      source_documents_box: "\n".join(list(set([item.metadata['source'] for item in result["source_documents"]])))
  }

iface = gr.Interface(fn=get_answer, inputs=inputs, outputs=outputs, title=header, description=info)
iface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://e96206d1f3b1350072.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


