<a href="https://colab.research.google.com/github/AnDDoanf/LLM-repo/blob/master/multi_prompts_chain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install langchain langchain_community bitsandbytes accelerate

In [2]:
%%capture
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain.llms import HuggingFacePipeline
from langchain_core.runnables import RunnableSequence
from torch import cuda, bfloat16
import transformers
from google.colab import userdata

class LLMConfig:
  """Loading and generation settings consumed by BuildLLM.

  Every argument defaults to the value this notebook originally hard-coded, so
  ``LLMConfig()`` behaves exactly as before; pass keywords to override
  individual settings without editing the class.

  Args:
    model_id: Hugging Face model identifier to load.
    device: Device string. When None, the current CUDA device is used if CUDA
      is available, otherwise 'cpu'.
    hf_auth: Hugging Face access token. When None, it is read from the
      'HF_TOKEN' entry of `google.colab.userdata`.
    task: Pipeline task name.
    temperature: Sampling temperature handed to the pipeline.
    max_new_tokens: Maximum number of tokens generated per call.
    repetition_penalty: Repetition penalty handed to the pipeline.
  """

  def __init__(
      self,
      model_id='meta-llama/Llama-2-13b-chat-hf',
      device=None,
      hf_auth=None,
      task='text-generation',
      temperature=1,
      max_new_tokens=512,
      repetition_penalty=1.2,
  ):
    self.model_id = model_id
    if device is None:
      device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
    self.device = device
    # Only touch the Colab secret store when no token was supplied, so the
    # config can also be built where `userdata` is not usable.
    self.hf_auth = userdata.get('HF_TOKEN') if hf_auth is None else hf_auth
    self.task = task
    self.temperature = temperature
    self.max_new_tokens = max_new_tokens
    self.repetition_penalty = repetition_penalty

class BuildLLM:
  """Loads a 4-bit quantized causal LM and exposes it as a LangChain LLM.

  The constructor does all the heavy work (config, model, tokenizer and
  text-generation pipeline); the accessors below only hand out the result.
  """

  def __init__(self, config=None) -> None:
    """Builds the quantized model, tokenizer and generation pipeline.

    Args:
      config: Optional LLMConfig-like object providing `model_id`, `hf_auth`,
        `task`, `temperature`, `max_new_tokens` and `repetition_penalty`.
        When None, a default `LLMConfig()` is created.
    """
    self.config = config if config is not None else LLMConfig()
    model_id = self.config.model_id
    hf_auth = self.config.hf_auth

    # 4-bit NF4 weights with double quantization; compute runs in bfloat16.
    bnb_config = transformers.BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=bfloat16
    )

    # `token` replaces the deprecated `use_auth_token` keyword.
    model_config = transformers.AutoConfig.from_pretrained(
        model_id,
        token=hf_auth
    )

    # Placement is delegated to `device_map='auto'`; `config.device` is not
    # consulted here.
    # NOTE(review): `trust_remote_code=True` permits executing code shipped
    # with the checkpoint -- confirm it is actually required for this model.
    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        config=model_config,
        quantization_config=bnb_config,
        device_map='auto',
        token=hf_auth
    )

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_id,
        token=hf_auth
    )

    # return_full_text=True asks the pipeline to echo the prompt together
    # with the completion.
    generate_text = transformers.pipeline(
        model=model,
        tokenizer=tokenizer,
        return_full_text=True,
        task=self.config.task,
        temperature=self.config.temperature,
        max_new_tokens=self.config.max_new_tokens,
        repetition_penalty=self.config.repetition_penalty
    )

    self.llm = HuggingFacePipeline(pipeline=generate_text)

  def get_llm(self):
    """Returns the HuggingFacePipeline LLM built in the constructor."""
    return self.llm

  def get_chain(self, prompt):
    """Returns a RunnableSequence that pipes `prompt` into the LLM."""
    return RunnableSequence(prompt | self.llm)

In [3]:
from langchain.chains.router import MultiPromptChain
from langchain.chains.router.llm_router import LLMRouterChain,RouterOutputParser
from langchain.prompts import PromptTemplate

# Constructing BuildLLM runs its __init__, which loads the quantized model and
# builds the generation pipeline; `llm` is the LangChain wrapper shared by
# every chain defined in the cells below.
build_llm = BuildLLM()
llm = build_llm.get_llm()



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  warn_deprecated(


In [51]:
from langchain.schema import AgentAction, AgentFinish
from langchain.chains.router.llm_router import RouterOutputParser
from langchain_core.output_parsers.base import BaseLLMOutputParser

class CustomLLMOutputParser(BaseLLMOutputParser):
    """Keeps only the text that follows the last ``[/INST]`` marker of a generation."""

    def parse_result(self, result: list, *, partial: bool = False) -> str:
        """Strips the echoed prompt from the first generation.

        Args:
            result: Generations for one prompt; only ``result[0].text`` is read.
            partial: Accepted for compatibility with the base-class signature;
                it has no effect here.

        Returns:
            The text after the last ``[/INST]`` marker, or the whole text
            unchanged when the marker does not occur.
        """
        marker = "[/INST]"
        text = result[0].text
        marker_at = text.rfind(marker)
        if marker_at == -1:
            # str.rfind returns -1 on a miss; slicing from -1 + len(marker)
            # would silently drop the first six characters of the answer.
            return text
        return text[marker_at + len(marker):]

class CustomRouterOutputParser(RouterOutputParser):
    """Router parser that only looks at the last JSON object in the model output."""

    def parse(self, text: str) -> dict:
        """Re-fences the trailing JSON object and delegates to RouterOutputParser.

        Everything before the last ``{`` is discarded and the remainder is
        prefixed with a ```` ```json ```` fence before being handed to the
        parent parser.

        Args:
            text: Raw model output to parse.

        Returns:
            Whatever ``RouterOutputParser.parse`` returns for the re-fenced
            text (the routing dict in LangChain's implementation).
        """
        brace_at = text.rfind('{')
        if brace_at == -1:
            # No JSON object at all: pass the text through untouched so the
            # parent's error reports the real model output instead of the
            # single trailing character that text[-1:] would yield.
            return super().parse(text)
        processed_text = "```json\n" + text[brace_at:].strip()
        return super().parse(processed_text)

In [52]:
# Instruction delimiters wrapped around every topic prompt.
B_INST, E_INST = "[INST]", "[/INST]"
# Upper bound on answer length that each prompt asks the model to respect.
word_limit = 200

# NOTE: a trailing backslash joins the next line with no separator, so every
# continued line must end with a space before the backslash; otherwise the
# words on either side are fused in the prompt sent to the model.
# "{input}" is concatenated as a plain string so the f-string does not try to
# interpolate it; it stays a placeholder for the prompt template.
physics_template = B_INST + f"""You are a very smart physics professor. \
You are great at answering questions about physics in a concise \
and easy to understand manner. \
When you don't know the answer to a question you admit \
that you don't know.
Please answer in {word_limit} words or less.
DO answer formally.

Please answer the question:
""" + "{input}" + E_INST


math_template = B_INST + f"""You are a very good mathematician.
Your goal is to answer math questions from user.
DO break down hard problems into their component parts, answer the component parts, then put them together to answer the broader question.
When you don't know the answer to a question you admit that you don't know.
DO NOT add anything else.
Please answer in {word_limit} words or less.
DO answer formally.

Please solve this problem:
""" + "{input}" + E_INST

history_template = B_INST + f"""You are a very good historian. \
You have an excellent knowledge of and understanding of people, \
events and contexts from a range of historical periods. \
You have the ability to think, reflect, debate, discuss and \
evaluate the past. You have a respect for historical evidence \
and the ability to make use of it to support your explanations \
and judgements.
Please answer in {word_limit} words or less.
DO answer formally.

Here is a question:
""" + "{input}" + E_INST

# Routing table: one entry per topic, holding the name the router must emit,
# the description shown to the router, and the prompt used to answer.
prompt_infos = [
    dict(name=label, description=blurb, prompt_template=template)
    for label, blurb, template in (
        ("physics", "Good for answering questions about physics", physics_template),
        ("math", "Good for answering math questions", math_template),
        ("history", "Good for answering history questions", history_template),
    )
]

In [53]:
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate

# One answering chain per topic, keyed by the name the router emits.
destination_chains = {}
for p_info in prompt_infos:
    name = p_info["name"]
    prompt_template = p_info["prompt_template"]
    # Plain PromptTemplate rather than ChatPromptTemplate: `llm` is a
    # text-completion model and the templates already carry their own
    # [INST] ... [/INST] wrapper. A chat template is rendered as a human chat
    # message, which puts a stray role prefix in front of "[INST]".
    prompt = PromptTemplate.from_template(prompt_template)
    chain = LLMChain(llm=llm, prompt=prompt, output_parser=CustomLLMOutputParser())
    destination_chains[name] = chain

# "name: description" lines that are substituted into the router prompt.
destinations = [f"{p['name']}: {p['description']}" for p in prompt_infos]
destinations_str = "\n".join(destinations)

In [54]:
# Router prompt. It is formatted twice: first with str.format(destinations=...)
# and later by the prompt template with the user input. Hence the quadrupled
# braces (a literal "{{" after the first pass) and "{{input}}" (which becomes
# the "{input}" placeholder). The backslash in the JSON sketch is written as
# "\\" because "\ " is not a valid escape sequence in a normal string literal.
MULTI_PROMPT_ROUTER_TEMPLATE = """Given a raw text input to a \
language model select the model prompt best suited for the input. \
You will be given the names of the available prompts and a description of what the prompt is best suited for.
You may also summarize the original input if you think that summarizing it will ultimately lead to a better response from the language model.
DO NOT revise the question.

<< FORMATTING >>
Return a markdown code snippet with a JSON object formatted to look like:

```json
{{{{
    "destination": string \\ name of the prompt to use or "DEFAULT"
    "next_inputs": string \\ a potentially modified version of the original input
}}}}
```

REMEMBER: "destination" MUST be one of the candidate prompt \
names specified below OR it can be "DEFAULT" if the input is not \
well suited for any of the candidate prompts.
REMEMBER: "next_inputs" can just be the original input \
if you don't think any modifications are needed.

<< CANDIDATE PROMPTS >>
{destinations}

<< INPUT >>
{{input}}

<< OUTPUT (remember to include the ```json)>>"""

# Fallback chain: the prompt is just the "{input}" placeholder, with no
# [INST] wrapper and no custom output parser.
# NOTE(review): `llm` is a text-completion pipeline, yet the prompt is built
# with ChatPromptTemplate -- confirm the rendered prompt is what is intended.
default_prompt = ChatPromptTemplate.from_template("{input}")
default_chain = LLMChain(llm=llm, prompt=default_prompt)

In [55]:
# First formatting pass: substitute the "name: description" candidate list.
# This pass also collapses the doubled braces, leaving "{input}" as the only
# remaining placeholder.
router_template = MULTI_PROMPT_ROUTER_TEMPLATE.format(
    destinations=destinations_str
)

# The parser attached to the prompt is the one that re-fences the last JSON
# object in the model output before delegating to RouterOutputParser.
# NOTE(review): `partial=True` is not read by CustomRouterOutputParser in this
# notebook -- confirm the parent parser accepts or needs it.
router_prompt = PromptTemplate(
    template=router_template,
    input_variables=["input"],
    output_parser=CustomRouterOutputParser(partial=True),
)

router_chain = LLMRouterChain.from_llm(llm, router_prompt)

In [56]:
# Assemble the router with the per-topic chains and the fallback chain.
# Note that this rebinds `chain`, which the destination-chain loop above also
# used as its loop-local name.
chain = MultiPromptChain(router_chain=router_chain,
                         destination_chains=destination_chains,
                         default_chain=default_chain, verbose=True
                        )

In [58]:
import gc
import torch
# Collect garbage and ask PyTorch to empty its CUDA cache before generating.
gc.collect()
torch.cuda.empty_cache()
# Route the question through the router and display the selected chain's result.
chain.invoke("What are 3 laws of Newton?")



[1m> Entering new MultiPromptChain chain...[0m
physics: {'input': "Newton's three laws of motion"}
[1m> Finished chain.[0m


{'input': "Newton's three laws of motion",
 'text': "  Certainly! Newton's Three Laws of Motion are fundamental principles that describe how objects move and respond to forces. Here is a brief overview of each law, stated as clearly and concisely as possible:\n\nFirst Law (Law of Inertia): An object at rest remains at rest, and an object in motion remains in motion, unless acted upon by an external force. This means that if there are no net forces acting on an object, it will maintain its current state of motion.\n\nSecond Law (Law of Acceleration): The acceleration of an object is directly proportional to the net force acting upon it, and inversely proportional to its mass. This means that the greater the force applied to an object, the more it will accelerate, but the more massive the object, the less it will accelerate.\n\nThird Law (Law of Action and Reaction): For every action, there is an equal and opposite reaction. This means that when one object exerts a force on another objec