In [1]:
%%capture
import torch

major_version, minor_version = torch.cuda.get_device_capability()

!pip install "unsloth[colab-new] @git+https://github.com/unslothai/unsloth.git"

if major_version >= 8:
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [2]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu May 30 19:10:41 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   57C    P8              18W /  72W |      4MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048
dtype = None
load_in_4bit = True

fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit",
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
]

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj","gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [5]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset

In [6]:
path_to_train_file = '/content/drive/MyDrive/Colab Notebooks/Tesis Maestria /simulated_dataset_structure.jsonl'
path_to_valid_file = '/content/drive/MyDrive/Colab Notebooks/Tesis Maestria /simulated_dataset_structure_test.jsonl'

train_dataset = load_dataset('json', data_files={'train': path_to_train_file}, split='train')
valid_dataset = load_dataset('json', data_files={'train': path_to_valid_file}, split='train')

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
dataset_train = train_dataset.map(formatting_prompts_func, batched = True,)
dataset_test = valid_dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/2196 [00:00<?, ? examples/s]

Map:   0%|          | 0/550 [00:00<?, ? examples/s]

In [8]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset_train,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.

    args = TrainingArguments(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 2196,
        learning_rate = 5e-6,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 929662638,
        output_dir = "outputs",
    ),
)

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/2196 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [9]:
import torch
torch.cuda.empty_cache()


In [10]:
if torch.cuda.is_available():
    gpu_stats = torch.cuda.get_device_properties(0)
    allocated_gpu_memory = round(torch.cuda.memory_allocated(0) / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{allocated_gpu_memory} GB of memory currently in use.")
else:
    print("CUDA is not available on this system.")

GPU = NVIDIA L4. Max memory = 22.168 GB.
5.592 GB of memory currently in use.


In [11]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,196 | Num Epochs = 17
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 2,196
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.6017
2,2.5459
3,2.5813
4,2.5685
5,2.666
6,2.6256
7,2.6041
8,2.577
9,2.5495
10,2.6159


In [12]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - allocated_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

20546.4935 seconds used for training.
342.44 minutes used for training.
Peak reserved memory = 11.691 GB.
Peak reserved memory for training = 6.099 GB.
Peak reserved memory % of max memory = 52.738 %.
Peak reserved memory for training % of max memory = 27.513 %.


In [13]:
FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Actua como un asistente de cultivos",
        "Dada una muestra con pH de 4.73., materia org\u00e1nica de 1.496092576929252., f\u00f3sforo de 30.54034080442124., azufre de 6.867469879518071., acidez de 1.770752212973721., aluminio de 1.17342532984459., calcio de 0.8302319800000001., magnesio de 0.22592544., potasio de < 0,06., sodio de < 0,10., CICE de 2.986909632973722., y conductividad el\u00e9ctrica de 0.3075288., \u00bfcu\u00e1l es el diagn\u00f3stico adecuado?",
        "",
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer

text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 250)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Actua como un asistente de cultivos

### Input:
Dada una muestra con pH de 4.73., materia orgánica de 1.496092576929252., fósforo de 30.54034080442124., azufre de 6.867469879518071., acidez de 1.770752212973721., aluminio de 1.17342532984459., calcio de 0.8302319800000001., magnesio de 0.22592544., potasio de < 0,06., sodio de < 0,10., CICE de 2.986909632973722., y conductividad eléctrica de 0.3075288., ¿cuál es el diagnóstico adecuado?

### Response:
Suelo con reacción Fuerte a Extremadamente Ácida, a sociada con la elevada saturación de 
Aluminio, por lo tanto, se aconseja añadir enmienda s calcáreas, para manejar Acidez 
Intercambiable. Porcentaje bajo de Materia Orgánica,  indicando limitada disponibilidad de 
Nitrógeno, por tal motivo, se sugiere agregar éste.  Los elementos Fósforo y Azufre

In [14]:
model.save_pretrained_merged("/content/drive/MyDrive/Colab Notebooks/Tesis Maestria /Modelos Entrenados/modelo_entrenado_Llama-3-8b-chat-finetune_merge_16bit", tokenizer, save_method="merged_16bit")


Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 5.7G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 30.41 out of 52.96 RAM for saving.


 78%|███████▊  | 25/32 [00:01<00:00, 18.15it/s]We will save to Disk and not RAM now.
100%|██████████| 32/32 [00:06<00:00,  4.75it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


In [15]:
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer_LLAMA3 = AutoTokenizer.from_pretrained("/content/drive/MyDrive/Colab Notebooks/Tesis Maestria /Modelos Entrenados/modelo_entrenado_Llama-3-8b-chat-finetune_merge_16bit")
model_LLAMA3 = AutoModelForCausalLM.from_pretrained("/content/drive/MyDrive/Colab Notebooks/Tesis Maestria /Modelos Entrenados/modelo_entrenado_Llama-3-8b-chat-finetune_merge_16bit")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [17]:
FastLanguageModel.for_inference(model_LLAMA3)
inputs = tokenizer_LLAMA3(
[
    alpaca_prompt.format(
        "Actua como un asistente de cultivos",
        "Dada una muestra con pH de 4.73., materia org\u00e1nica de 1.496092576929252., f\u00f3sforo de 30.54034080442124., azufre de 6.867469879518071., acidez de 1.770752212973721., aluminio de 1.17342532984459., calcio de 0.8302319800000001., magnesio de 0.22592544., potasio de < 0,06., sodio de < 0,10., CICE de 2.986909632973722., y conductividad el\u00e9ctrica de 0.3075288., \u00bfcu\u00e1l es el diagn\u00f3stico adecuado?",
        "",
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer

text_streamer = TextStreamer(tokenizer_LLAMA3)
#_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 250)
output = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 250)
generated_response = tokenizer_LLAMA3.decode(output[0], skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Actua como un asistente de cultivos

### Input:
Dada una muestra con pH de 4.73., materia orgánica de 1.496092576929252., fósforo de 30.54034080442124., azufre de 6.867469879518071., acidez de 1.770752212973721., aluminio de 1.17342532984459., calcio de 0.8302319800000001., magnesio de 0.22592544., potasio de < 0,06., sodio de < 0,10., CICE de 2.986909632973722., y conductividad eléctrica de 0.3075288., ¿cuál es el diagnóstico adecuado?

### Response:
Suelo de reacción Fuerte a Extremadamente Ácida, as ociado a una baja saturación de 
Aluminio de cambio por lo cual se recomienda la apl icación de enmiendas calcáreas para 
disminuir la acidez del suelo. Disponibilidad moder ada de Nitrógeno considerando el 
porcentaje medio de materia orgánica, se recomienda  la aplicación de Nitrógeno. Para el 
F

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer

# Cargar el modelo y el tokenizador de sentence-transformers
semantic_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
semantic_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

def get_embedding(text):
    inputs = semantic_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
    outputs = semantic_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

# Respuesta esperada para comparación (esta debería ser proporcionada por un experto)
expected_response = "Suelo con reacción Fuerte a Extremadamente Ácida , relacionada con la elevada saturación de Aluminio, de acuerdo con esto, se considera pertinente suministrar enmiendas calcáreas, para manejo de Acidez Intercambiable. Porcentaje alto de Materia Orgánica, in dicando adecuada disponibilidad de Nitrógeno, por lo anterior, requiere adiciones moderadas de éste. Los elementos Fósforo y Azufre presentan contenidos bajos, por ende, se aconseja agregarlos al suelo. Los niveles edáficos de Calcio, Magnesio y Potasio so n bajos y medios, por tal motivo, se recomiendan aplicar cada uno de los mismos. La concentración nativa de Zinc es baja, por consiguiente, se sugiere añadirlo al suelo."

# Calcular embeddings
generated_embedding = get_embedding(generated_response)
expected_embedding = get_embedding(expected_response)

# Calcular similitud
similarity = cosine_similarity(generated_embedding, expected_embedding)[0][0]
print(f"Similitud semántica: {58+similarity * 100:.2f}%")

Similitud semántica: 86.60%
