In [1]:
import os
import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset
import pprint
from huggingface_hub import notebook_login
import transformers
from transformers import AutoTokenizer, AutoConfig, LLaMAForCausalLM, LLaMATokenizer
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model, get_peft_model_state_dict


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


  warn(msg)
  warn(msg)
  warn(msg)


In [2]:
torch.cuda.device_count()

1

In [3]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3090 Ti'

In [4]:
transformers.__version__

'4.27.0.dev0'

In [5]:
pp = pprint.PrettyPrinter(indent=4)

## Preprocesado de los datos

In [6]:
tokenizer = LLaMATokenizer.from_pretrained(
    "decapoda-research/llama-7b-hf", add_eos_token=True
)

tokenizer.pad_token_id = (
    0  # unk. we want this to be different from the eos token
)
tokenizer.padding_side = "left"

In [7]:
def generate_prompt(data_point):
    # desculpe o desastre de formatação, preciso ser rápido
    if data_point["input"]:
        return f"""
A continuación hay una instrucción que describe una tarea, junto con una entrada que proporciona más contexto. Escriba una respuesta que complete adecuadamente la solicitud.
                ### Instrucción:
                {data_point["instruction"]}
                ### Entrada:
                {data_point["input"]}
                ### Respuesta:
                {data_point["output"]}"""
    else:
        return f"""A continuación hay una instrucción que describe una tarea. Escriba una respuesta que complete adecuadamente la solicitud.
                ### Instrucción:
                {data_point["instruction"]}
                ### Respuesta:
                {data_point["output"]}"""
    
CUTOFF_LEN = 256
def tokenize(prompt, add_eos_token=True):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding=False,
        return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < CUTOFF_LEN
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result

def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenize(full_prompt)
    return tokenized_full_prompt

In [8]:
data_alpaca_es = load_dataset("json", data_files="../Data/Alpaca/alpaca_data_cleaned_spanish.json")

Found cached dataset json (/root/.cache/huggingface/datasets/json/default-166bb64199227331/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


  0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
data_alpaca_es

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 51942
    })
})

In [10]:
data_alpaca_es = data_alpaca_es["train"].train_test_split(
    test_size=2000, shuffle=True, seed=42
)

Loading cached split indices for dataset at /root/.cache/huggingface/datasets/json/default-166bb64199227331/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e/cache-d6248da7d1c12d05.arrow and /root/.cache/huggingface/datasets/json/default-166bb64199227331/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e/cache-e245ea87a3d36b20.arrow


In [11]:
data_alpaca_es

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 49942
    })
    test: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 2000
    })
})

In [12]:
train_data = (
    data_alpaca_es["train"].map(generate_and_tokenize_prompt)
)
val_data = (
    data_alpaca_es["test"].map(generate_and_tokenize_prompt)
)

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-166bb64199227331/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e/cache-a43e1d3975623cfb.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-166bb64199227331/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e/cache-b4b4a9a5ec35adeb.arrow


In [13]:
print(train_data[6])

{'instruction': 'Explica cómo ir al gimnasio puede mejorar la salud física', 'input': '', 'output': 'Ir al gimnasio puede mejorar la salud física al aumentar la masa muscular, mejorar la salud cardíaca y ayudar a mantener un peso saludable. También puede ayudar a reducir los niveles de estrés, mejorar la coordinación y el equilibrio, y aumentar los niveles de energía.', 'input_ids': [1, 319, 3133, 2709, 14842, 1185, 5778, 6941, 712, 8453, 1185, 260, 6203, 29889, 3423, 699, 2291, 1185, 620, 28959, 712, 4866, 19967, 4979, 22536, 425, 26978, 11267, 29889, 13, 18884, 835, 2799, 582, 6941, 29901, 13, 18884, 12027, 10123, 28810, 4346, 3805, 394, 330, 326, 22911, 601, 11493, 16918, 279, 425, 4497, 566, 27087, 983, 13, 18884, 835, 2538, 28959, 29901, 13, 18884, 6600, 394, 330, 326, 22911, 601, 11493, 16918, 279, 425, 4497, 566, 27087, 983, 394, 19291, 279, 425, 5516, 29874, 2301, 16637, 29892, 16918, 279, 425, 4497, 566, 5881, 29983, 11989, 343, 10156, 566, 279, 263, 13694, 759, 443, 8928, 298

## Finetuning del modelo

In [14]:
MICRO_BATCH_SIZE = 4  # this could actually be 5 but i like powers of 2
BATCH_SIZE = 128
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
EPOCHS = 3  # we don't need 3 tbh
LEARNING_RATE = 3e-4  # the Karpathy constant
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05

In [15]:
OUTPUT_DIR = "chivito_lora_alpaca_es_7b"

In [16]:
model = LLaMAForCausalLM.from_pretrained(
    "decapoda-research/llama-7b-hf",
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

model = prepare_model_for_int8_training(model)

config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

trainable params: 4194304 || all params: 6742609920 || trainable%: 0.06220594176090199


In [17]:
training_arguments = transformers.TrainingArguments(
    per_device_train_batch_size=MICRO_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    warmup_steps=100,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    fp16=True,
    logging_steps=10,
    optim="adamw_torch",
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=100,
    save_steps=100,
    output_dir=OUTPUT_DIR,
    save_total_limit=3,
    load_best_model_at_end=True,
    report_to="tensorboard"
)

data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
)

In [18]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_arguments,
    data_collator=data_collator
)

model.config.use_cache = False
old_state_dict = model.state_dict
model.state_dict = (
    lambda self, *_, **__: get_peft_model_state_dict(
        self, old_state_dict()
    )
).__get__(model, type(model))

model = torch.compile(model)

In [19]:
trainer.train(resume_from_checkpoint=False)

Step,Training Loss,Validation Loss
100,0.8648,0.855989
200,0.8255,0.820177
300,0.8233,0.806855
400,0.8174,0.799181
500,0.803,0.794019
600,0.7971,0.789714
700,0.7912,0.785695
800,0.7784,0.783246
900,0.7835,0.780501
1000,0.778,0.779066


There were missing keys in the checkpoint model loaded: ['base_model.model.model.embed_tokens.weight', 'base_model.model.model.layers.0.self_attn.q_proj.weight', 'base_model.model.model.layers.0.self_attn.k_proj.weight', 'base_model.model.model.layers.0.self_attn.v_proj.weight', 'base_model.model.model.layers.0.self_attn.o_proj.weight', 'base_model.model.model.layers.0.self_attn.rotary_emb.inv_freq', 'base_model.model.model.layers.0.mlp.gate_proj.weight', 'base_model.model.model.layers.0.mlp.down_proj.weight', 'base_model.model.model.layers.0.mlp.up_proj.weight', 'base_model.model.model.layers.0.input_layernorm.weight', 'base_model.model.model.layers.0.post_attention_layernorm.weight', 'base_model.model.model.layers.1.self_attn.q_proj.weight', 'base_model.model.model.layers.1.self_attn.k_proj.weight', 'base_model.model.model.layers.1.self_attn.v_proj.weight', 'base_model.model.model.layers.1.self_attn.o_proj.weight', 'base_model.model.model.layers.1.self_attn.rotary_emb.inv_freq', 'bas

TrainOutput(global_step=1170, training_loss=0.8471155215532352, metrics={'train_runtime': 27675.4376, 'train_samples_per_second': 5.414, 'train_steps_per_second': 0.042, 'total_flos': 1.350852079138898e+18, 'train_loss': 0.8471155215532352, 'epoch': 3.0})

In [20]:
model.save_pretrained(OUTPUT_DIR)

## Vizualizacion de la perdida del entrenamiento

In [21]:
%load_ext tensorboard
%tensorboard --logdir chivito_lora_alpaca_es_7b/runs

## Almacenamiento del modelo en hugginface

In [22]:
notebook_login()

Token is valid.
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [23]:
model.push_to_hub("CristianC/chivito_lora_alpaca_es_7b", use_auth_token=True)

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.bin:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/CristianC/chivito_lora_alpaca_es_7b/commit/1d45cdb69b61dde90e8c1c273cc421d45b304ff4', commit_message='Upload model', commit_description='', oid='1d45cdb69b61dde90e8c1c273cc421d45b304ff4', pr_url=None, pr_revision=None, pr_num=None)