# LangChain e LLM Open-Source Para Sistema de Perguntas e Respostas

## Instalando e Carregando Pacotes

In [1]:
# Para atualizar um pacote, execute o comando abaixo no terminal ou prompt de comando:
# pip install -U nome_pacote

# Para instalar a versão exata de um pacote, execute o comando abaixo no terminal ou prompt de comando:
# !pip install nome_pacote==versão_desejada

# Depois de instalar ou atualizar o pacote, reinicie o jupyter notebook.

# Instala o pacote watermark.
# Esse pacote é usado para gravar as versões de outros pacotes usados neste jupyter notebook.
!pip install -q -U watermark

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install -q accelerate==1.9.0 peft==0.16.0 bitsandbytes==0.46.1 transformers==4.54.0 trl==0.20.0 datasets==4.0.0 langchain==0.3.27 langchain_community==0.3.27

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.9/511.9 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m85.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m60.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# Imports
import torch
import accelerate
import peft
import bitsandbytes
import transformers
import trl
import datasets
import langchain
import langchain_community

In [4]:
# Imports
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import pipeline, TrainingArguments
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import LLMChain
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Versões dos pacotes usados neste jupyter notebook
%reload_ext watermark
%watermark -a "LLM Finetunning"

Author: LLM Finetunning



## Carregando o Dataset Para o Instruction Fine-Tuning

https://huggingface.co/datasets/nlpie/Llama2-MedTuned-Instructions

In [6]:
# Carrega o dataset
dataset = load_dataset("nlpie/Llama2-MedTuned-Instructions")

README.md: 0.00B [00:00, ?B/s]

(…)-00000-of-00001-a8790d88efc2bc45.parquet:   0%|          | 0.00/91.1M [00:00<?, ?B/s]

(…)-00000-of-00001-b543c64b1786c03e.parquet:   0%|          | 0.00/6.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/200252 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/70066 [00:00<?, ? examples/s]

In [7]:
# Selecionamos as linhas para treino do modelo
dados_treino = dataset["train"].select(indices = range(1000))

In [8]:
dados_treino

Dataset({
    features: ['instruction', 'input', 'output', 'source'],
    num_rows: 1000
})

In [9]:
# Selecionamos as linhas para teste do modelo
dados_teste = dataset["train"].select(indices = range(1000, 1200))

In [None]:

dados_teste

Dataset({
    features: ['instruction', 'input', 'output', 'source'],
    num_rows: 200
})

## Compreendendo o Formato dos Dados de Texto

In [10]:
# Vamos visualizar 3 pontos de dados
for i in range(3):
    data = dataset['train'][i]
    print(f"Ponto de Dado {i + 1}:")
    print("Instruction:", data['instruction'])
    print("Input:", data['input'])
    print("Output:", data['output'])
    print("\n-----------------------------\n")

Ponto de Dado 1:
Instruction: In your role as a medical professional, address the user's medical questions and concerns.
Input: My relative suffering from secondary lever cancer ( 4th stage as per Allopathic doctor) and primary is in rectum. He is continuously with 103 to 104 degree F fever. Allpathic doctor suggested chemo only after fever subsidises. Is treatment possible at Lavanya & what is the time scale of recover.
Output: Hi, dairy have gone through your question. I can understand your concern. He has rectal cancer with liver metastasis. It is stage 4 cancer. Surgery is not possible at this stage. Only treatment options are chemotherapy and radiotherapy according to type of cancer. Inspite of all treatment prognosis is poor. Life expectancy is not good. Consult your doctor and plan accordingly. Hope I have answered your question, if you have any doubts then contact me at bit.ly/ Chat Doctor. Thanks for using Chat Doctor. Wish you a very good health.

----------------------------

## Automatizando a Criação dos Prompts Para Treinamento do Modelo

In [11]:
# Define a função que recebe um dicionário chamado sample
def cria_prompt(sample):

    # Define uma string pre_prompt que serve como um modelo para a primeira parte do prompt
    pre_prompt = """[INST]<<SYS>> {instruction}\n"""

    # Concatena o pre_prompt com strings adicionais para formar o prompt completo
    prompt = pre_prompt + "{input}" +"[/INST]"+"\n{output}"

    # Atribui o valor da chave 'instruction' do dicionário sample à variável example_instruction
    example_instruction = sample['instruction']

    # Atribui o valor da chave 'input' do dicionário sample à variável example_input
    example_input = sample['input']

    # Atribui o valor da chave 'output' do dicionário sample à variável example_output
    example_output = sample['output']

    # Cria uma instância de PromptTemplate com o prompt definido anteriormente e as variáveis de entrada
    prompt_template = PromptTemplate(template = prompt,
                                     input_variables = ["instruction", "input", "output"])

    # Utiliza o método format da instância prompt_template para substituir as variáveis
    # no template com os valores específicos
    prompt_unico = prompt_template.format(instruction = example_instruction,
                                          input = example_input,
                                          output = example_output)

    # Retorna o prompt formatado
    return prompt_unico

In [13]:
# Testando a função
prompt = cria_prompt(dados_treino[0])
print(prompt)

[INST]<<SYS>> In your role as a medical professional, address the user's medical questions and concerns.
My relative suffering from secondary lever cancer ( 4th stage as per Allopathic doctor) and primary is in rectum. He is continuously with 103 to 104 degree F fever. Allpathic doctor suggested chemo only after fever subsidises. Is treatment possible at Lavanya & what is the time scale of recover.[/INST]
Hi, dairy have gone through your question. I can understand your concern. He has rectal cancer with liver metastasis. It is stage 4 cancer. Surgery is not possible at this stage. Only treatment options are chemotherapy and radiotherapy according to type of cancer. Inspite of all treatment prognosis is poor. Life expectancy is not good. Consult your doctor and plan accordingly. Hope I have answered your question, if you have any doubts then contact me at bit.ly/ Chat Doctor. Thanks for using Chat Doctor. Wish you a very good health.


In [15]:
# Testando a função
prompt = cria_prompt(dados_teste[0])
print(prompt)

[INST]<<SYS>> In the clinical text, your objective is to identify relationships between medical problems, treatments, and tests. Medical problems are tagged as @problem$, medical tests as @test$, and treatments as @treatment$. Classify the relationship between two entities as one of the following:
Treatment improves medical problem (TrIP)
Treatment worsens medical problem (TrWP)
Treatment causes medical problem (TrCP)
Treatment is administered for medical problem (TrAP)
Treatment is not administered because of medical problem (TrNAP)
Test reveals medical problem (TeRP)
Test conducted to investigate medical problem (TeCP)
Medical problem indicates medical problem (PIP)
No Relations
Include @treatment$ 50 mgs bid , Aricept 10 mgs qhs , @treatment$ 15 mgs bid , Trazodone 100 mgs qhs .[/INST]
No Relations


## Processo de Quantização

In [16]:
# Ativa o carregamento do modelo base com precisão de 4 bits
use_4bit = True
# Define o dtype para o modelo base
bnb_4bit_compute_dtype = "float16"
# Tipo de quantização
bnb_4bit_quant_type = "nf4"
# Desativa a quantização dupla
use_nested_quant = False

# Define o dtype para computação no PyTorch
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Define as configurações
bnb_config = BitsAndBytesConfig(load_in_4bit = use_4bit,
                                bnb_4bit_quant_type = bnb_4bit_quant_type,
                                bnb_4bit_compute_dtype = compute_dtype,
                                bnb_4bit_use_double_quant = use_nested_quant)

In [18]:
# Verifica se a GPU suporta bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("A GPU suporta bfloat16. Acelere o treinamento usando bf16=True")
        print("=" * 80)

## Carregando LLM e Tokenizador

https://huggingface.co/NousResearch/Llama-2-7b-chat-hf

In [19]:
# Nome do LLM
nome_llm = "NousResearch/Llama-2-7b-chat-hf"

In [20]:
# Carrega o tokenizador
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf")

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [21]:
# Carrega o modelo base com a quantização
modelo = AutoModelForCausalLM.from_pretrained(nome_llm,
                                              quantization_config = bnb_config,
                                              device_map = "auto",
                                              use_cache = False)

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [22]:
# Usa o token EOS do tokenizador para o pad ao fim de cada sequência
tokenizer.pad_token = tokenizer.eos_token

In [23]:
# Ativa o padding ao final de cada sentença
tokenizer.padding_side = "right"

## Configurando Adaptadores LoRa

In [24]:
# Parâmetros LoRa
peft_config = LoraConfig(r = 8,
                             lora_alpha = 16,
                             lora_dropout = 0.05,
                             bias = "none",
                             task_type = "CAUSAL_LM")

A quantização representa dados com menos bits, tornando-se uma técnica útil para reduzir o uso de memória e acelerar a inferência, especialmente quando se trata de LLMs.

Depois que um modelo é quantizado, ele normalmente não é treinado DIRETAMENTE para tarefas posteriores porque o treinamento pode ser instável devido à menor precisão dos pesos e ativações. Mas como os métodos PEFT apenas adicionam parâmetros extras treináveis, isso permite treinar um modelo quantizado com um adaptador PEFT na parte superior! Combinar quantização com PEFT pode ser uma boa estratégia para treinar até mesmo os maiores modelos em uma única GPU. Por exemplo, QLoRA é um método que quantiza um modelo em 4 bits e depois o treina com LoRA. Este método permite ajustar um modelo de parâmetros de 65B em uma única GPU de 48GB, por exemplo.

O objetivo do PEFT (Parameter-Efficient Fine-Tuning) é manter a maioria dos parâmetros do modelo pré-treinado fixos e ajustar apenas um pequeno subconjunto de parâmetros para adaptar o modelo a uma tarefa específica.

In [25]:
# Prepara o modelo para treinamento
modelo_llm = prepare_model_for_kbit_training(modelo)

In [26]:
# Junta o modelo quantizado com os adaptadores LoRa
modelo_llm = get_peft_model(modelo_llm, peft_config)

## Parâmetros do Ajuste Fino

In [27]:
output_model = "modelo_ajustado"

In [28]:
# Configuração de TrainingArguments
training_arguments = TrainingArguments(output_dir = output_model,
                                           per_device_train_batch_size = 1,
                                           gradient_accumulation_steps = 4,
                                           optim = "paged_adamw_32bit",
                                           learning_rate = 2e-4,
                                           lr_scheduler_type = "cosine",
                                           save_strategy = "epoch",
                                           logging_steps = 10,
                                           num_train_epochs = 3,
                                           max_steps = 150,
                                           fp16 = True,
                                           report_to = "none")

In [29]:
# Configuração do SFTConfig somente com argumentos próprios (requerido nas versões mais recentes do pacote trl)
sft_config = SFTConfig(packing = True,
                       dataset_text_field = "instruction")  # O campo base usado para criar prompts

In [31]:
# Criação do SFTTrainer
llm_trainer = SFTTrainer(model = modelo_llm,
                         args = training_arguments,
                         train_dataset = dados_treino,
                         eval_dataset = dados_teste,
                         peft_config = peft_config,
                         formatting_func = cria_prompt)

Applying formatting function to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

## Ajuste Fino do LLM

Nota: Se solicitado, faça o cadastro em https://wandb.ai/authorize e use sua API na célula abaixo.

In [32]:
%%time
llm_trainer.train()

Step,Training Loss
10,2.6604
20,2.3851
30,2.0058
40,1.6044
50,1.614
60,1.6467
70,1.4632
80,1.3281
90,1.5188
100,1.3791


CPU times: user 10min 27s, sys: 3min 3s, total: 13min 31s
Wall time: 13min 38s


TrainOutput(global_step=150, training_loss=1.6007692591349283, metrics={'train_runtime': 818.2447, 'train_samples_per_second': 0.733, 'train_steps_per_second': 0.183, 'total_flos': 6783517244375040.0, 'train_loss': 1.6007692591349283})

In [33]:
# Salva o modelo
llm_trainer.save_model("modelo_final")

In [34]:
# Merge
merged_model = modelo_llm.merge_and_unload()

## Construindo o Pipeline de Geração de Texto com LangChain

In [35]:
# Cria o pre-prompt com a instrução
pre_prompt = """[INST] <<SYS>>\nAnalyze the question and answer with the best option.\n"""

In [36]:
# Cria o prompt adicionando o input
prompt = pre_prompt + "Here is my question {context}" + "[\INST]"

In [37]:
# Cria o prompt template com LangChain
prompt = PromptTemplate(template = prompt, input_variables = ["context"])

Os pipelines são uma maneira excelente e fácil de usar modelos para inferência. Esses pipelines são objetos que abstraem a maior parte do código complexo da biblioteca, oferecendo uma API simples dedicada a diversas tarefas, incluindo reconhecimento de entidade nomeada, modelagem de linguagem mascarada, análise de sentimento, extração de recursos e resposta a perguntas.

In [38]:
# Cria o objeto pipeline
pipe = pipeline("text-generation",
                    model = merged_model,
                    tokenizer = tokenizer,
                    max_new_tokens = 512,
                    use_cache = False,
                    do_sample = True,
                    pad_token_id = tokenizer.eos_token_id,
                    top_p = 0.7,
                    temperature = 0.5)

Device set to use cuda:0


In [39]:
# Cria o Hugging Face Pipeline
llm_pipeline = HuggingFacePipeline(pipeline = pipe)

## Criando a LLM Chain

In [40]:
# Cria a memória
memory = ConversationBufferMemory()

In [41]:
# Cria o LLM Chain
chat_llm_chain = LLMChain(llm = llm_pipeline,
                              prompt = prompt,
                              verbose = False,
                              memory = memory)

## Deploy do Modelo e Uso do Sistema de Perguntas e Respostas

In [42]:
contexto = '''###Question: All of the following provisions are included in the Primary health care according to the Alma Ata declaration except:
###Options:
A. Adequate supply of safe drinking water
B. Nutrition
C. Provision of free medicines
D. Basic sanitation'''

In [43]:
%%time
chat_llm_chain.predict(context = contexto)

CPU times: user 1min 54s, sys: 17.1 s, total: 2min 11s
Wall time: 2min 12s


'[INST] <<SYS>>\nAnalyze the question and answer with the best option.\nHere is my question ###Question: All of the following provisions are included in the Primary health care according to the Alma Ata declaration except:\n###Options:\nA. Adequate supply of safe drinking water\nB. Nutrition\nC. Provision of free medicines\nD. Basic sanitation[\\INST]  The best answer is (B) Nutrition.\n\nThe Alma Ata Declaration, adopted in 1978, is a landmark document that sets out the principles of primary health care (PHC). The Declaration emphasizes the importance of PHC as a foundation for achieving good health for all people, particularly in low-income countries. While the Declaration does mention several key provisions that should be included in PHC, nutrition is not explicitly mentioned.\n\nOption (A) is incorrect because safe drinking water is not a provision specifically mentioned in the Alma Ata Declaration. While access to safe drinking water is important for public health, it is not a cor

In [44]:
%reload_ext watermark
%watermark -a "LLM finetunning"

Author: LLM finetunning



In [45]:
%watermark -v -m

Python implementation: CPython
Python version       : 3.11.13
IPython version      : 7.34.0

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.1.123+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit



In [46]:
%watermark --iversions

langchain_community: 0.3.27
peft               : 0.17.0
langchain          : 0.3.27
torch              : 2.6.0+cu124
langchain_core     : 0.3.72
accelerate         : 1.9.0
datasets           : 4.0.0
trl                : 0.21.0
bitsandbytes       : 0.47.0
transformers       : 4.55.0



# Fim