<a href="https://colab.research.google.com/github/1028Luo/LLM-Domain-Specific-Assistant/blob/main/RAG_LLAMA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
!pip install llama-index
!pip install llama-index-embeddings-huggingface
!pip install -q bitsandbytes==0.45.0
!pip install -q transformers==4.47.1
!pip install -q huggingface_hub



In [15]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
# load fine-tuned model from hub
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
import bitsandbytes

In [16]:
# Log in to Hugging Face
from google.colab import userdata
from huggingface_hub import login
my_hugging_face_token = userdata.get('huggingface_token')
login(token=my_hugging_face_token)

In [17]:
# load local documents
documents = SimpleDirectoryReader("drive/MyDrive/Colab Notebooks/RAG_data").load_data()
print("length of document: ", len(documents))

# import any embedding model on HF hub (https://huggingface.co/spaces/mteb/leaderboard)
# embedding models are used to convert text (query, doc) into embedding vectors
# they are usually transformers too
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Settings.embed_model = HuggingFaceEmbedding(model_name="thenlper/gte-large") # alternative model

Settings.llm = None
Settings.chunk_size = 512
Settings.chunk_overlap = 25


# store docs in embedding space using the embedding model
index = VectorStoreIndex.from_documents(documents)

# set number of docs to retreive
top_k = 6

# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=top_k,
)

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.5)],
)


length of document:  37
LLM is explicitly disabled. Using MockLLM.


In [20]:
# load model and tokenizer
model_name = "Jiexing1028/llama-3-1b-miniguanaco-Luo"
model = AutoModelForCausalLM.from_pretrained(model_name,
                                            device_map="cuda",
                                            trust_remote_code=False,
                                            revision="main")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-instruct", use_fast=True) # use a different tokenizer for now
print(tokenizer.chat_template)

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

{{- bos_token }}
{%- if custom_tools is defined %}
    {%- set tools = custom_tools %}
{%- endif %}
{%- if not tools_in_user_message is defined %}
    {%- set tools_in_user_message = true %}
{%- endif %}
{%- if not date_string is defined %}
    {%- set date_string = "26 Jul 2024" %}
{%- endif %}
{%- if not tools is defined %}
    {%- set tools = none %}
{%- endif %}

{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
    {%- set system_message = messages[0]['content']|trim %}
    {%- set messages = messages[1:] %}
{%- else %}
    {%- set system_message = "" %}
{%- endif %}

{#- System message + builtin tools #}
{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
{%- if builtin_tools is defined or tools is not none %}
    {{- "Environment: ipython\n" }}
{%- endif %}
{%- if builtin_tools is defined %}
    {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
{%- endif

In [21]:
def chat():
    while True:
        question = input("You: ")
        if question.lower() == 'exit':
            break
        # query documents

        response = query_engine.query(question)

        # reformat response
        context = "Context:\n"
        for i in range(top_k):
            context = context + response.source_nodes[i].text + "\n\n"
            # # prompt with no context
        # intstructions_string = f"""You are called MyGPT, an expert assistant in Robotics and AI, providing detailed and accurate responses to any query about the knowledge that you have learnt and  in your RAG data base.\
        # MyGPT will tailor the length of its responses to match the query, providing clear explanation on papers, projects and concepts in the field of robotics and AI \
        # and should keep the answer concise.

        # Please respond to the following question.
        # """
        # prompt_template = lambda comment: f'''[INST] {intstructions_string} \n{comment} \n[/INST]'''

        # prompt = prompt_template(question)
        # # print(prompt)

        # model.eval()

        # inputs = tokenizer(prompt, return_tensors="pt")
        # outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=280)

        # print(tokenizer.batch_decode(outputs)[0])


        # ##### with Context #####
        # prompt_template_w_context = lambda context, question: f"""[INST]You are called MyGPT, an expert assistant in Robotics and AI, providing detailed and accurate responses to any query about the knowledge that you have learnt and  in your RAG data base.\
        # MyGPT will tailor the length of its responses to match the query, providing clear explanation on papers, projects and concepts in the field of robotics and AI \
        # and should keep the answer concise.

        # {context}
        # Please respond to the following question. Use the context above if it is helpful.

        # {question}
        # [/INST]
        # """


        prompt = [
            {"role": "system", "content": "You are called MyGPT, an expert assistant in Robotics and AI, providing detailed and accurate responses to any query about the knowledge that you have learnt and  in your RAG data base.\
        MyGPT will tailor the length of its responses to match the query, providing clear explanation on papers, projects and concepts in the field of robotics and AI \
        and should keep the answer concise. You can use the provided context if needed."},
            {"role": "system", "content": context},
            {"role": "user", "content": question},
        ]

        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
        )

        generation_args = {
            "max_new_tokens": 500,
            "return_full_text": False,
            "temperature": 0.0,
            "do_sample": False,
            "pad_token_id": tokenizer.pad_token_id
        }
        print('aaa')
        output = pipe(prompt, **generation_args)
        print(output[0]['generated_text'])


In [22]:
chat()

You: hello


Device set to use cuda
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


aaa
Hello, MyGPT! I am here to help you with your knowledge in robotics and AI. I am a
expert assistant in robotics and AI, and I can provide you with detailed and accurate responses to any
query about the knowledge that you have learnt and in your RAG data base. I can tailor the length of my
responses to match the query, and I will keep the answer concise. If you need any context, I can provide
it for you. I can also provide you with a list of related papers, projects, and concepts in the field of
robotics and AI. I can also provide you with a list of related papers, projects, and concepts in the field
of robotics and AI. I can also provide you with a list of related papers, projects, and concepts in the
field of robotics and AI. I can also provide you with a list of related papers, projects, and concepts in the
field of robotics and AI. I can also provide you with a list of related papers, projects, and concepts in the
field of robotics and AI. I can also provide you with a list of r

KeyboardInterrupt: Interrupted by user