<a href="https://colab.research.google.com/github/Condemor-bit/Large-Language-Models-/blob/main/Advanced_RAG_Fusion_Across_Multiple_Files.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This is an advanced RAG system that utilizes LlamaIndex and Zephyr-7b to query multiple documents. It is slower than simple systems; however, it is capable of providing more relevant content.


This notebook was created by Álvaro Morcuende
11/01/2024

In [2]:
#@title 1º) Change the runtime environment to 'T4 GPU' and install the dependencies
#%%capture
!pip install -q --upgrade git+https://github.com/huggingface/transformers
!pip install -q bitsandbytes
!pip install -q accelerate
!pip install -q llama-index
!pip install -q pypdf
!pip install -q docx2txt
!pip install -q llama_hub
#!pip install -q llama-index[local_models]
#!pip install -q llama-index[query_tools]
print("=========================")
print("Proceed to the next cell.")

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.7/270.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.0/143.0 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.9/75.9 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.9/224.9 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━



---



**The optional cell** below mounts Google Drive and checks whether the 'data_rag' folder exists; if it doesn't, the cell creates it. All the documents you want to query must be placed inside this folder.

**Alternatively**, you can create a folder named 'data_rag' yourself and put all the documents that you want to work with inside it.

In [1]:
#@title Mount Google Drive (optional) or upload the files
from google.colab import drive
drive.mount('/content/drive/')
%cd drive/MyDrive
import os

nombre_carpeta = "data_rag"
ruta_actual = os.getcwd()

ruta_nueva_carpeta = os.path.join(ruta_actual, nombre_carpeta)

if not os.path.exists(ruta_nueva_carpeta):
    os.makedirs(ruta_nueva_carpeta)
    print(f'The folder "{nombre_carpeta}" was created in: {ruta_nueva_carpeta}')
else:
    print(f'The folder "{nombre_carpeta}" already exist in: {ruta_nueva_carpeta}')


Mounted at /content/drive/
/content/drive/MyDrive
The folder "data_rag" already exist in: /content/drive/MyDrive/data_rag




---



In [3]:
#@title 2º) Load the model and data

import warnings
warnings.simplefilter("ignore", UserWarning)

import torch
from transformers import BitsAndBytesConfig
from llama_index.prompts import PromptTemplate
from llama_index.llms import HuggingFaceLLM
from llama_index import ServiceContext
from llama_index import set_global_service_context
from llama_hub.llama_packs.query.rag_fusion_pipeline.base import RAGFusionPipelinePack
from llama_index import download_loader
from llama_index import SimpleDirectoryReader


# bitsandbytes settings for loading the model weights in 4-bit.
# This object is handed to the model loader through `model_kwargs` below.
quantization_config = BitsAndBytesConfig(
    # storage format of the weights
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    # dtype used for computation
    bnb_4bit_compute_dtype=torch.float16,
)


def messages_to_prompt(messages):
  """Render a sequence of chat messages as a single Zephyr-style prompt.

  Each message whose ``role`` equals ``'system'``, ``'user'`` or
  ``'assistant'`` becomes ``<|role|>\\n{content}</s>\\n``; messages with any
  other role are skipped. If the rendered text does not begin with a system
  segment, an empty one is prepended. The result always ends with an open
  ``<|assistant|>\\n`` tag for the model to continue from.

  Args:
    messages: iterable of objects exposing ``role`` and ``content``.

  Returns:
    The prompt string.
  """
  known_roles = ("system", "user", "assistant")

  segments = []
  for message in messages:
    # Compare by equality (not by formatting the role) so role objects that
    # merely compare equal to these strings are handled the same way.
    for role in known_roles:
      if message.role == role:
        segments.append(f"<|{role}|>\n{message.content}</s>\n")
        break

  rendered = "".join(segments)

  # The template expects a leading system segment; add a blank one if absent.
  if not rendered.startswith("<|system|>\n"):
    rendered = "<|system|>\n</s>\n" + rendered

  # Leave the assistant turn open so generation continues from here.
  return rendered + "<|assistant|>\n"


# Zephyr-7b-beta wrapped as a LlamaIndex LLM, loaded with the 4-bit
# quantization settings defined above.
llm = HuggingFaceLLM(
    model_name="HuggingFaceH4/zephyr-7b-beta",
    tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
    # Plain (non-chat) queries are wrapped in the same tag layout that
    # `messages_to_prompt` produces: empty system turn, user turn, open assistant turn.
    query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
    context_window=3900,
    max_new_tokens= 1024,#256,
    model_kwargs={"quantization_config": quantization_config},
    # tokenizer_kwargs={},
    # `do_sample=True` is required for temperature/top_k/top_p to take effect;
    # without it generation is greedy and these sampling settings are ignored.
    generate_kwargs={"do_sample": True, "temperature": 0.7, "top_k": 50, "top_p": 0.95},
    messages_to_prompt=messages_to_prompt,
    device_map="auto",
)


# Embedding model and LLM, registered as the global LlamaIndex defaults.
# (chunk_size=512, chunk_overlap=50 were left disabled by the author.)
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
)
set_global_service_context(service_context)

# Load every file found in the "data_rag" folder.
docs = SimpleDirectoryReader(input_dir="data_rag", filename_as_id=True).load_data()

# RAG-fusion pipeline built over the loaded documents.
pack = RAGFusionPipelinePack(docs, llm=llm)

print("=========================", "Proceed to the next cell.", sep="\n")

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Proceed to the next cell.


# *At this point, the dependencies are installed and the model is loaded. Avoid re-running these cells (1º and 2º).*


---





In [4]:
#@title 3º) Interact with the data
# Read a question from the user, run it through the RAG-fusion pack and
# print the answer. The question is padded with one space on each side
# before being handed to the pack.
query = input("How can I help you?: ")
response = pack.run(query=" " + query + " ")
print(response)

How can I help you?: ¿Cómo debería de ser el abordaje de la dislipemia desde la farmacia comunitaria?
Según el protocolo de dislipidemias compartido entre médicos de familia y farmacéuticos comunitarios, el abordaje de la dislipemia desde la farmacia comunitaria debe ser centrado en la prevención y promoción de la salud, con un mayor protagonismo del farmacéutico en la detección y seguimiento de los principales factores de riesgo cardiovascular (FRCV), incluyendo la dislipidemia. El protocolo recomienda la identificación de la FAT del paciente durante la dispensación de medicamentos, utilizando métodos como el test de Haynes-Sackett, el conocimiento del paciente sobre la enfermedad y el tratamiento, y los registros de dispensación con receta electrónica. El farmacéutico debe poder elegir estrategias personalizadas para mejorar la adherencia terapéutica, según el perfil de FAT del paciente, que puede ser confundido, desconfiado o que banaliza la enfermedad. Además, se recomienda la inte