In [27]:
import os

import torch
from trl import SFTTrainer
from datasets import load_from_disk
from transformers import TextStreamer, TrainingArguments
from unsloth import FastLanguageModel, is_bfloat16_supported

In [2]:
# Hub repository that the fine-tuned model and tokenizer are pushed to after training.
huggingface_model_name = "dariast/Meta-Llama-3.1-8B-4bit-python"

# Maximum sequence length in tokens; shared by the model loader and the trainer.
max_seq_length = 2048
# None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
dtype = None
# Use 4bit quantization to reduce memory usage. The base checkpoint loaded in this
# notebook is a pre-quantized "-bnb-4bit" model and the recorded outputs show 4-bit
# layers (Linear4bit), so the flag is set to match what was actually run.
load_in_4bit = True

In [20]:
# Read the Hugging Face access token from the environment so that no credential is
# stored in the notebook. When the HF_TOKEN variable is unset this evaluates to None;
# passing token=None to the Hub APIs makes them fall back to locally cached credentials.
# NOTE: the token that used to be hard-coded here has been exposed and must be revoked.
HF_TOKEN = os.environ.get("HF_TOKEN")

In [4]:
# Load the base checkpoint together with its tokenizer through Unsloth, using the
# sequence length, dtype and quantization flag defined in the configuration cell.
base_model_kwargs = dict(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)

==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA H100 PCIe. Max memory: 79.109 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 9.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

In [5]:
# Layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
    "gate_proj", "up_proj", "down_proj",      # MLP projections
]

# Wrap the loaded model with LoRA adapters.
# NOTE(review): this rebinds `model` to the wrapped model, so re-running the cell
# without reloading the base model would wrap it again — confirm before re-running.
model = FastLanguageModel.get_peft_model(
    model,
    r = 8,
    lora_alpha = 16,
    target_modules = lora_target_modules,
    lora_dropout = 0,   # any value is supported, 0 is the optimized setting
    bias = "none",      # any value is supported, "none" is the optimized setting
    use_gradient_checkpointing = "unsloth",   # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,   # rank stabilized LoRA is available but not used
    loftq_config = None,  # LoftQ is available but not used
)

Unsloth 2024.9.post4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [8]:
# Read the prepared dataset back from its on-disk directory.
dataset_dir = "fine_tuning_dataset"
dataset = load_from_disk(dataset_dir)

In [None]:
# Pull out the two splits that the rest of the notebook works with.
train_ds, test_ds = dataset['train'], dataset['test']

# NOTE(review): lr1 is not referenced by the trainer cell, which passes
# learning_rate = 3e-5 directly — confirm whether this value is still needed.
lr1 = 2e-4


In [15]:
# Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation settings for the run.
training_args = TrainingArguments(
    output_dir = "outputs",
    seed = 3407,
    # Batching: 2 sequences per device, gradients accumulated over 4 steps.
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    # Length of the run: 4 epochs (a max_steps value could be set instead for a short trial).
    num_train_epochs = 4,
    # Learning-rate schedule.
    warmup_steps = 5,
    learning_rate = 3e-5,
    lr_scheduler_type = "linear",
    # Optimizer.
    optim = "adamw_8bit",
    weight_decay = 0.01,
    # Precision flags are mutually exclusive.
    fp16 = not use_bf16,
    bf16 = use_bf16,
    # Log the training loss at every step.
    logging_steps = 1,
)

# Supervised fine-tuning on the "text" column of the training split.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_ds,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = training_args,
)

In [16]:
#@title Show current memory stats
# Snapshot of GPU 0 taken before training; the final stats cell subtracts
# start_gpu_memory from the peak to report the memory attributable to training.
gpu_stats = torch.cuda.get_device_properties(0)
bytes_per_unit = 1024 ** 3  # both values below are reported in units of 1024**3 bytes
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_unit, 3)
max_memory = round(gpu_stats.total_memory / bytes_per_unit, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA H100 PCIe. Max memory = 79.109 GB.
5.906 GB of memory reserved.


In [17]:
# Run fine-tuning; the returned object's `metrics` are read by the stats cell that follows.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 480 | Num Epochs = 4
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 240
 "-____-"     Number of trainable parameters = 20,971,520


Step,Training Loss
1,1.9742
2,2.2743
3,2.2135
4,1.745
5,2.0895
6,1.9566
7,1.8223
8,1.8423
9,1.7712
10,1.8918


In [18]:
# Report how long training took and how much GPU memory it reserved, both in
# absolute terms and relative to the memory reserved before training started.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_seconds = trainer_stats.metrics['train_runtime']
train_minutes = round(train_seconds / 60, 2)

print(f"{train_seconds} seconds used for training.")
print(f"{train_minutes} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

840.0644 seconds used for training.
14.0 minutes used for training.
Peak reserved memory = 9.158 GB.
Peak reserved memory for training = 3.252 GB.
Peak reserved memory % of max memory = 11.576 %.
Peak reserved memory for training % of max memory = 4.111 %.


In [21]:
# Save the fine-tuned model and tokenizer locally, then publish both to the Hub
# repository named in huggingface_model_name.
model.save_pretrained("llama_fine_tuned") # Local saving
tokenizer.save_pretrained("llama_fine_tuned")
# Authenticate through the HF_TOKEN variable instead of repeating the secret inline,
# so the credential lives in exactly one place and never in the call sites.
model.push_to_hub(huggingface_model_name, token = HF_TOKEN)
tokenizer.push_to_hub(huggingface_model_name, token = HF_TOKEN)

README.md:   0%|          | 0.00/606 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

Saved model to https://huggingface.co/dariast/Meta-Llama-3.1-8B-4bit-python


In [26]:

# Reload the fine-tuned model from the directory written by the save cell
# ("llama_fine_tuned"), so a fresh "Restart & Run All" does not depend on a separately
# prepared "dataset/" folder. FastLanguageModel is already imported in the import cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "llama_fine_tuned", # directory produced by model.save_pretrained
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA H100 PCIe. Max memory: 79.109 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 9.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit(
        

In [None]:
# Preview the first test example; as the last expression it is shown as the cell output.
test_ds["text"][0]

In [30]:
# Tokenize the first test example into PyTorch tensors and move them to the GPU.
# NOTE(review): the entire "text" field is used as the prompt; if that field also holds
# the expected label, generation would simply continue after it — confirm this is intended.
inputs = tokenizer(test_ds["text"][0], return_tensors = "pt").to("cuda")

In [31]:
# Stream decoded tokens to the cell output while generating at most 10 new tokens
# for the prompt held in `inputs`. The return value is discarded because the streamer
# already prints the text.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 10)

<|begin_of_text|>You are a helpful assistant who is fluent in Russian and can classify text accurately.
USER: Your task is to classify Russian text as either "disinformation" or "trustworthy" 
Disinformation is false or misleading information deliberately spread to deceive people. It is often used to manipulate public opinion, create confusion, or influence decisions through inaccurate or fabricated narratives.

```
На радость Гитлеру украинский коп перепутал униформу ЗНАЙ ЮА укр рус укр рус Главная Досье Наше Киев.Знай Важное Жизнь Война Общество Компромат Еще Анекдоты Контакты Реклама Политика конфиденциальности Продолжая просматривать. Вы подтверждаете, что ознакомились с Политикой конфиденциальности и согласны с использованием файлов. Понял Война россии против Украины Коронавирус в Украине и мире Новости дня Главная Общество Читати українською На радость Гитлеру украинский коп перепутал униформу В сети разгорелся настоящий скандал января, января, Пользователи социальный сетей бурно

In [32]:
# Display the raw text of the first test example again.
test_ds["text"][0]

'You are a helpful assistant who is fluent in Russian and can classify text accurately.\nUSER: Your task is to classify Russian text as either "disinformation" or "trustworthy" \nDisinformation is false or misleading information deliberately spread to deceive people. It is often used to manipulate public opinion, create confusion, or influence decisions through inaccurate or fabricated narratives.\n\n```\nНа радость Гитлеру украинский коп перепутал униформу ЗНАЙ ЮА укр рус укр рус Главная Досье Наше Киев .Знай Важное Жизнь Война Общество Компромат Еще Анекдоты Контакты Реклама Политика конфиденциальности Продолжая просматривать . Вы подтверждаете, что ознакомились с Политикой конфиденциальности и согласны с использованием файлов . Понял Война россии против Украины Коронавирус в Украине и мире Новости дня Главная Общество Читати українською На радость Гитлеру украинский коп перепутал униформу В сети разгорелся настоящий скандал января , января , Пользователи социальный сетей бурно обсуж