<a href="https://colab.research.google.com/github/AnDDoanf/LLM-repo/blob/master/web_crawling_RAG_llama2_langchain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Web Crawling

In [1]:
%%capture
!pip install tiktoken

In [2]:
from bs4 import BeautifulSoup
import urllib.request
import string
import requests
import tiktoken

def get_subsub_urls(BASE_URL):
  """Collect the links listed on the site's A-Z index pages.

  One index page is requested per lowercase ASCII letter
  (``BASE_URL + letter + "/"``) and the ``href`` of every ``<a>`` found
  inside a ``<ul>`` on that page is gathered.

  Args:
    BASE_URL: Root URL of the site. The letter is appended directly, so the
      value is expected to end with ``/``.

  Returns:
    The collected hrefs as a list of strings, in first-seen order and
    without duplicates. Anchors with a missing or empty ``href`` and links
    that point back at one of the index pages are skipped.
  """
  sub_urls = [BASE_URL + letter + "/" for letter in string.ascii_lowercase]
  # Everything that must not be emitted (again): the index pages themselves
  # and, as the crawl proceeds, every link that was already collected.
  seen = set(sub_urls)
  subsub_urls = []
  for sub_url in sub_urls:
    # BeautifulSoup reads the whole response while parsing, so the
    # connection can be closed as soon as the soup has been built.
    with urllib.request.urlopen(sub_url) as page:
      soup = BeautifulSoup(page, 'html.parser')
    # find_all('ul') also yields nested lists, so the same anchor can show up
    # more than once; the ``seen`` set keeps only the first occurrence.
    for item in soup.find_all('ul'):
      for anchor in item.find_all('a'):
        href = anchor.get('href')
        # ``get`` returns None for anchors without an href; a None entry would
        # later break the string concatenation done when scraping.
        if not href or href in seen:
          continue
        seen.add(href)
        subsub_urls.append(href)
  return subsub_urls

def scrape_jina_ai(url: str, timeout: float = 60) -> str:
  """Fetch ``url`` through the ``https://r.jina.ai/`` prefix and return the body.

  Args:
    url: Address of the page to fetch; appended verbatim to the prefix.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout ``requests`` waits indefinitely, so a single stalled
      connection would block the whole crawl.

  Returns:
    The response body as text. The HTTP status is not checked, so an error
    page is returned as-is.

  Raises:
    requests.exceptions.Timeout: If the server does not answer in time.
  """
  response = requests.get("https://r.jina.ai/" + url, timeout=timeout)
  return response.text

# Root of the site whose per-letter index pages are crawled.
BASE_URL = "https://iep.utm.edu/"
subsub_urls = get_subsub_urls(BASE_URL)

# Scrape every collected link in order, reporting progress on every tenth one.
web_data = []
for index, subsub_url in enumerate(subsub_urls):
  if not index % 10:
    print(f'Working on {index} url')
  web_data.append(scrape_jina_ai(subsub_url))



def count_tokens(input_string: str) -> int:
    """Return the number of ``cl100k_base`` tokens ``input_string`` encodes to."""
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(input_string))

def calculate_cost_by_inputs(
    input_string: str,
    cost_per_million_tokens: float = 5,
    exchange_rate: float = 25450,
) -> float:
    """Estimate the price of sending ``input_string`` to a per-token-priced model.

    Args:
        input_string: Text whose tokens are counted with ``count_tokens``.
        cost_per_million_tokens: Price charged per one million tokens.
        exchange_rate: Multiplier applied to the computed price before it is
            returned. The default keeps the factor that used to be
            hard-coded here (presumably a currency conversion rate; confirm
            the intended currencies before relying on it).

    Returns:
        ``tokens / 1_000_000 * cost_per_million_tokens * exchange_rate``.
    """
    num_tokens = count_tokens(input_string)
    total_cost = (num_tokens / 1_000_000) * cost_per_million_tokens
    return total_cost * exchange_rate

Working on 0 url
Working on 10 url
Working on 20 url
Working on 30 url
Working on 40 url
Working on 50 url
Working on 60 url
Working on 70 url
Working on 80 url
Working on 90 url
Working on 100 url
Working on 110 url
Working on 120 url
Working on 130 url
Working on 140 url
Working on 150 url
Working on 160 url
Working on 170 url
Working on 180 url
Working on 190 url
Working on 200 url
Working on 210 url
Working on 220 url
Working on 230 url
Working on 240 url
Working on 250 url
Working on 260 url
Working on 270 url
Working on 280 url
Working on 290 url
Working on 300 url
Working on 310 url
Working on 320 url
Working on 330 url
Working on 340 url
Working on 350 url
Working on 360 url
Working on 370 url
Working on 380 url
Working on 390 url
Working on 400 url
Working on 410 url
Working on 420 url
Working on 430 url
Working on 440 url
Working on 450 url
Working on 460 url
Working on 470 url
Working on 480 url
Working on 490 url
Working on 500 url
Working on 510 url
Working on 520 url
Work

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

In [26]:
# import pandas as pd

# df = pd.DataFrame(web_data, columns = ["context"])
# df.to_csv("web_data.csv")

## RAG with Llama-2

In [1]:
import pandas as pd

# CSV holding the scraped pages, read from the mounted Drive folder.
data_path = "/content/drive/MyDrive/web_data.csv"
# The first CSV column is used as the index; only the "context" column is
# kept, as a plain Python list.
data = pd.read_csv(data_path, index_col=0)
data = list(data["context"].values)

In [14]:
%%capture
!pip install langchain langchain_community transformers accelerate bitsandbytes sentence_transformers chromadb langchainhub

import warnings
warnings.filterwarnings("ignore")
from langchain_community.document_loaders import WebBaseLoader
import bs4
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers.string import StrOutputParser
import transformers
from torch import cuda, bfloat16
from langchain.llms import HuggingFacePipeline

In [None]:
def len_func(text):
    """Return the length of ``text``; used as the splitter's length function."""
    size = len(text)
    return size

# Split each page on blank lines, targeting 1000-character chunks with a
# 200-character overlap; lengths are measured with len_func.
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    is_separator_regex=False,
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len_func,
    add_start_index=True,
)

# One Document per chunk; report how many pages went in and chunks came out.
splits = text_splitter.create_documents(data)
print(f"Before split: {len(data)}\nAfter split: {len(splits)}")

In [10]:
%%capture
class LLMConfig:
  """Settings used to load and run the chat model.

  Attributes (all set in ``__init__``):
    model_id: Hugging Face Hub id of the model to load.
    device: ``cuda:<index>`` of the current CUDA device when CUDA is
      available, otherwise ``cpu``.
    hf_auth: Hub access token taken from the ``HF_TOKEN`` (or legacy
      ``HUGGING_FACE_HUB_TOKEN``) environment variable; ``None`` when
      neither is set.
    task, temperature, max_new_tokens, repetition_penalty: generation
      settings.
  """

  def __init__(self):
    # Function-scope import so the class does not depend on a file-level one.
    import os

    self.model_id = 'meta-llama/Llama-2-7b-chat-hf'
    self.device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
    # Access tokens must never be committed to source. Read the token from
    # the environment instead.
    # NOTE: the token that used to be hard-coded here has been published and
    # should be revoked.
    self.hf_auth = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN')
    self.task = 'text-generation'
    self.temperature = 1
    self.max_new_tokens = 512
    self.repetition_penalty = 1.2

class BuildLLM:
  """Load the configured model with 4-bit quantisation and wrap it for LangChain.

  Construction does all the work: it reads an ``LLMConfig``, loads the model
  configuration, the quantised model and the tokenizer, builds a
  ``transformers`` pipeline and stores it, wrapped in ``HuggingFacePipeline``,
  as ``self.llm``.
  """

  def __init__(self) -> None:
    self.config = LLMConfig()
    model_id = self.config.model_id
    hf_auth = self.config.hf_auth

    # 4-bit NF4 weights with double quantisation; computation in bfloat16.
    bnb_config = transformers.BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=bfloat16
    )

    # ``token`` is the current name of the deprecated ``use_auth_token``.
    model_config = transformers.AutoConfig.from_pretrained(
        model_id,
        token=hf_auth
    )

    # device_map='auto' decides placement; config.device is not consulted.
    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_id,
        # NOTE(review): this allows code shipped in the model repository to
        # run locally. Confirm it is needed for the configured model_id.
        trust_remote_code=True,
        config=model_config,
        quantization_config=bnb_config,
        device_map='auto',
        token=hf_auth
    )

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_id,
        token=hf_auth
    )

    generate_text = transformers.pipeline(
        model=model,
        tokenizer=tokenizer,
        return_full_text=True,
        task=self.config.task,
        temperature=self.config.temperature,
        max_new_tokens=self.config.max_new_tokens,
        repetition_penalty=self.config.repetition_penalty
    )

    self.llm = HuggingFacePipeline(pipeline=generate_text)

  def get_llm(self):
    """Return the ``HuggingFacePipeline`` built in ``__init__``."""
    return self.llm


# Building the wrapper loads the model; keep the resulting LangChain LLM.
builder = BuildLLM()
llm = builder.get_llm()

# Embedding model passed to the vector store when it is built.
lc_embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

In [23]:
# Index the chunks in a Chroma store using the embedding model above.
vectorstore = Chroma.from_documents(documents=splits, embedding=lc_embed_model)
# Retriever view of the store, used as the first stage of the chain.
retriever = vectorstore.as_retriever()
# Prompt template pulled from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Join the ``page_content`` of every document, separated by a blank line."""
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)


# The input question is passed through unchanged as "question" and also sent
# to the retriever, whose documents are joined by format_docs into "context".
# Both feed the prompt, then the LLM, and the output is parsed to a string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [24]:
# Run one question through the RAG chain and print what it returns.
question = "What is Plato’s Academy?"
response = rag_chain.invoke(question)
print(response)

Human: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: What is Plato’s Academy? 
Context: Plato’s silence about the Academy adds to the difficulty of labeling his Academy with the English word “school.” Diogenes Laertius refers to Plato’s Academy as a “_hairesis_,” which can be translated as “school” or “sect”  (_Lives_ III.41). The noun “hairesis” comes from the verb “to choose,” and it thereby signifies “a choice of life” as much as “a place of instruction.” The head of the Academy after Plato was called the “scholarch,” but while _scholē_ forms the root of our word “school” and was used to refer to Plato’s Academy (_Lives_ IV.2), it originally had the meaning of “leisure.” The Greek word _diatribē_ can also be translated as “school” from its connotation of spending time together, but no

In [25]:
# Run a definition-style question through the RAG chain and print the result.
question = "Define Aesthetics"
response = rag_chain.invoke(question)
print(response)

Human: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: Define Aesthetics 
Context: Markdown Content:
[Aesthetics](https://iep.utm.edu/aestheti/) is the subject matter concerning, as a paradigm, fine art, but also the special, art-like status sometimes given to applied arts like architecture or industrial design or to objects in nature. It is hard to say precisely what is shared among this motley crew of objects (often referred to as _aesthetic objects_), but the aesthetic attitude is supposed to go some way toward solving this problem. It is, at the very least, the special point of view we take toward an object that results in our having an aesthetic experience (an experience of, for example, beauty, sublimity, or even ugliness). Many aesthetic theories, however, have taken it to play a ce

In [27]:
question = "What is Democracy and Social Ethics?"
# "Democracy and Social Ethics" is a book by Jane Addams.
response = rag_chain.invoke(question)
print(response)

Human: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: What is Democracy and Social Ethics? 
Context: ### d. Democracy

Addams maintained a robust definition of democracy that moved far beyond understanding it merely as a political structure. For Addams, democracy represented both a mode of living and a social morality. She viewed democracy as an acknowledgement that the lives of citizens are bound up with one another and this relationship creates a duty to understand the struggles and circumstances of fellow citizens. Reciprocity of social relations is crucial for providing citizens with the empathetic foundation necessary to energize democracy. Social settlements were experiments in the kind of democracy that Addams endeavored to promote: one of active social engagement. Addams’ definiti

In [21]:
# Clean up: delete the collection behind the vector store built earlier.
vectorstore.delete_collection()