# POSTECH - IA PARA DEVS - TECH CHALLENGE - FASE 3

In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Instalando dependências

In [None]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
!pip install transformers datasets
!pip install triton

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-ghuqyt6k/unsloth_ace0cd9db83b4d278b68e9a8e0221e77
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-ghuqyt6k/unsloth_ace0cd9db83b4d278b68e9a8e0221e77
  Resolved https://github.com/unslothai/unsloth.git to commit c3f4e9a87d964ecee1efd9963f497119edbefaab
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading tyro-0.8.11-py3-none-any.whl.metadata (8.4 kB)
Collecting transformers>=4.45.1 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[

## Carregando as bibliotecas necessárias

In [None]:
# Import order is kept as in the original cell: unsloth prints that it patches the
# environment on import, so it stays ahead of trl/transformers.
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
import json
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TextStreamer, TrainingArguments

# Input and output locations on the mounted Google Drive.
DATA_PATH = "/content/drive/MyDrive/tech_challenge/train.json"  # raw records, parsed one JSON object per line
OUTPUT_PATH_DATASET = "/content/drive/MyDrive/tech_challenge/dataset.json"  # formatted instruction/input/output file

# Options passed to FastLanguageModel.from_pretrained when a model is loaded.
max_seq_length = 2048
dtype = None
load_in_4bit = True
# Model identifiers kept for reference; no other visible cell reads this list.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
]

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


## Criando a função de formatação e limpeza do dataset

In [None]:
def format_dataset_into_model_input(data_list, output_path=None):
    """Filter raw product records and save them in instruction/input/output form.

    Every kept record contributes one entry to three parallel lists: the fixed
    instruction "DESCRIBE THIS PRODUCT.", the product title (input) and the
    product description (output). The three lists are written to disk as a
    single JSON object.

    Records are skipped when their 'content' is missing, None, empty or made
    only of whitespace, because they carry no description to learn from. A
    missing or None 'title' is stored as an empty string.

    Args:
        data_list: Iterable of dicts; the 'title' and 'content' keys are read.
        output_path: Destination JSON file. When omitted, the module-level
            OUTPUT_PATH_DATASET is used.

    Returns:
        None. Nothing is returned on purpose, so calling this as the last
        expression of a cell does not echo the whole dataset.
    """
    if output_path is None:
        output_path = OUTPUT_PATH_DATASET

    instruction = "DESCRIBE THIS PRODUCT."
    inputs = []
    outputs = []

    for data in data_list:
        content = data.get('content')
        # Keep only records that actually have a description.
        if not content or (isinstance(content, str) and not content.strip()):
            continue

        title = data.get('title')
        inputs.append("" if title is None else title)
        outputs.append(content)

    # Same instruction for every example, repeated to keep the lists aligned.
    formatted_data = {
        "instruction": [instruction] * len(outputs),
        "input": inputs,
        "output": outputs
    }

    # Write the result to a JSON file.
    with open(output_path, 'w', encoding='utf-8') as output_file:
        json.dump(formatted_data, output_file, indent=4)

    print(f"Dataset salvo em {output_path}")

## Carregando e transformando o dataset

In [None]:
# Read the raw dataset: one JSON object per line.
# Blank lines are skipped so a stray empty line does not raise a JSONDecodeError,
# and the encoding is explicit so the read does not depend on the platform default.
with open(DATA_PATH, 'r', encoding='utf-8') as file:
    dataset = [json.loads(line) for line in file if line.strip()]

# Filter the records and write the instruction/input/output JSON file.
format_dataset_into_model_input(dataset)

Dataset salvo em /content/drive/MyDrive/tech_challenge/dataset.json


## Carregando o modelo

In [None]:
# Load the base checkpoint and its tokenizer with the options set in the config cell.
base_model_options = dict(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)

==((====))==  Unsloth 2024.9.post3: Fast Llama patching. Transformers = 4.45.1.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

## Configurando o modelo

In [None]:
# Layers that receive LoRA adapters: the attention projections (q/k/v/o)
# and the MLP projections (gate/up/down).
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,                                  # LoRA rank
    target_modules=lora_target_modules,
    lora_alpha=16,                         # LoRA scaling factor
    lora_dropout=0,                        # dropout rate for the LoRA layers
    bias="none",                           # bias handling ("none" here)
    use_gradient_checkpointing="unsloth",  # gradient checkpointing, to save memory
    random_state=3407,                     # random seed, for reproducibility
    use_rslora=False,                      # RSLoRA is not used
    loftq_config=None,                     # no LoftQ configuration
)

Unsloth 2024.9.post3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## Configurando o fine-tuning

In [None]:
# Prompt template with three positional `{}` slots, filled in order with the
# instruction, the input and the response. It is used to build the training text
# and, with an empty response slot, the inference prompts.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token appended to every training example.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Build the training text for a batch of examples.

    Args:
        examples: Batch mapping with parallel "instruction", "input" and
            "output" sequences.

    Returns:
        A dict with a single "text" key holding one filled-in prompt per
        example, each followed by EOS_TOKEN.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
            for instruction, context, response in rows
        ],
    }

from datasets import load_dataset

# Load the formatted JSON file as the "train" split, then add the "text" column
# built by formatting_prompts_func (applied in batches).
formatted_dataset = load_dataset("json", data_files=OUTPUT_PATH_DATASET, split="train")
dataset = formatted_dataset.map(formatting_prompts_func, batched=True)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1498718 [00:00<?, ? examples/s]

## Configurando o treinamento do modelo com o fine-tuning

In [None]:
# Choose the mixed-precision mode once: bf16 when supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=2,   # batch size per device
    gradient_accumulation_steps=4,   # number of gradient accumulation steps
    warmup_steps=5,                  # number of warmup steps
    max_steps=60,                    # maximum number of training steps
    learning_rate=2e-4,              # learning rate
    fp16=not use_bf16,               # train in FP16 only when BF16 is unavailable
    bf16=use_bf16,                   # train in BF16 when available
    logging_steps=1,                 # log metrics at every step
    optim="adamw_8bit",              # optimizer
    weight_decay=0.01,               # weight decay
    lr_scheduler_type="linear",      # learning-rate scheduler type
    seed=3407,                       # random seed
    output_dir="outputs",            # output directory
)

trainer = SFTTrainer(
    model=model,                     # model to train
    tokenizer=tokenizer,             # tokenizer that goes with the model
    train_dataset=dataset,           # training dataset
    dataset_text_field="text",       # dataset column holding the training text
    max_seq_length=max_seq_length,   # maximum sequence length
    dataset_num_proc=2,              # number of processes used to prepare the dataset
    packing=False,                   # sequence packing disabled
    args=training_args,
)

Map (num_proc=2):   0%|          | 0/1498718 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


## Executando o treinamento

In [None]:
# Run the training configured on `trainer`; the value returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,498,718 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.9198
2,3.0705
3,3.0862
4,2.9102
5,2.4733
6,2.6661
7,2.2422
8,2.1131
9,2.1081
10,1.8211


## Salvando o modelo treinado

In [None]:
# Save the trained model and its tokenizer into the same Drive folder.
LORA_MODEL_DIR = "/content/drive/MyDrive/tech_challenge/lora_model"
model.save_pretrained(LORA_MODEL_DIR)
tokenizer.save_pretrained(LORA_MODEL_DIR)

('/content/drive/MyDrive/tech_challenge/lora_model/tokenizer_config.json',
 '/content/drive/MyDrive/tech_challenge/lora_model/special_tokens_map.json',
 '/content/drive/MyDrive/tech_challenge/lora_model/tokenizer.json')

## Carregando o modelo treinado

In [None]:
# Reload the model and tokenizer from the folder they were saved to.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/content/drive/MyDrive/tech_challenge/lora_model",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
# Prepare the model for inference. The trailing semicolon stops the cell from
# echoing the returned model's full module tree.
FastLanguageModel.for_inference(model);

==((====))==  Unsloth 2024.9.post3: Fast Llama patching. Transformers = 4.45.1.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit(
      

## Testando o modelo treinado

In [None]:
# `TextStreamer` comes from transformers and is imported with the other imports
# at the top of the notebook; without that import this cell raises NameError.
prompt = alpaca_prompt.format(
    "DESCRIBE THIS PRODUCT.",
    "Adidas Ultraboost DNA Running Shoes",
    "",  # empty response slot: the prompt ends right after "### Response:"
)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

# Pass a TextStreamer to generate(); the return value is discarded.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

NameError: name 'TextStreamer' is not defined

In [None]:
# `TextStreamer` comes from transformers and is imported with the other imports
# at the top of the notebook; without that import this cell raises NameError.
prompt = alpaca_prompt.format(
    "DESCRIBE THIS PRODUCT.",
    "Samsung Crystal UHD 55\" Smart TV",
    "",  # empty response slot: the prompt ends right after "### Response:"
)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

# Pass a TextStreamer to generate(); the return value is discarded.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)