In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages.
# `%%capture` hides the (long) pip output of this cell.
# NOTE(review): Unsloth is installed from the tip of the GitHub repository, so
# the installed version is not pinned and may differ between runs.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# `--no-deps` installs only the listed packages; xformers and trl are capped
# below the given versions, the remaining packages are unpinned.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [None]:
from unsloth import FastLanguageModel
import torch

# --- Loader configuration -------------------------------------------------
# `max_seq_length` is reused further down when the trainer is built.
max_seq_length = 2048
# Passed through unchanged to the loader.
# NOTE(review): None presumably lets Unsloth pick the dtype — confirm.
dtype = None
# Request 4-bit loading of the checkpoint.
load_in_4bit = True

# Collect the keyword arguments once so the call below stays a one-liner.
load_kwargs = dict(
    model_name = "unsloth/mistral-7b-v0.3-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Returns the base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Mistral patching. Transformers = 4.43.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Unsloth: Will load unsloth/mistral-7b-v0.3-bnb-4bit as a legacy tokenizer.


In [None]:
# Names of the modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
    "embed_tokens", "lm_head",  # Add for continual pretraining
]

# Wrap the loaded model with PEFT/LoRA adapters. The result is bound to the
# same `model` name, which the rest of the notebook uses.
model = FastLanguageModel.get_peft_model(
    model,
    r = 64,  # LoRA rank
    target_modules = lora_target_modules,
    lora_alpha = 32,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = True,
    loftq_config = None,
)

Unsloth: Offloading input_embeddings to disk to save VRAM
Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Unsloth: Casting embed_tokens to float32
Unsloth: Casting lm_head to float32


In [None]:
import pandas as pd

In [None]:
# Read the raw pretraining corpus from the JSON file into a DataFrame.
df = pd.read_json(path_or_buf="/content/pretrain_data.json")

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,data
0,The Air Force's Standardization and Evaluation...
1,The Squadron Standardization/Evaluation (CCV) ...
10,The F-22 Raptor attained Full Operational Capa...
100,Significant design modifications distinguished...
101,"The inaugural F-22, known as Raptor 4001, was ..."


In [None]:
# Discard the original (non-sequential) index and keep only the text column.
# This single statement replaces the former three-cell sequence of
# reset_index() -> rename columns -> drop("index", inplace=True). That sequence
# was not re-runnable: once the drop had run, re-executing the rename cell
# failed because two column names were assigned to a one-column frame.
# `df` is left untouched, so this cell can be re-run safely.
df2 = df.reset_index(drop=True)

In [None]:
import datasets

In [None]:
# Convert the cleaned DataFrame into a `datasets.Dataset` for the trainer.
f22_dataset = datasets.Dataset.from_pandas(df2)
# Print the dataset summary (features and row count) as a quick sanity check.
print(f22_dataset)

Dataset({
    features: ['data'],
    num_rows: 273
})


In [None]:
# Shuffle the rows with a fixed seed so the ordering can be reproduced.
shuffled_dataset = f22_dataset.shuffle(seed=42)

In [None]:
# End-of-sequence marker taken from the loaded tokenizer.
EOS_TOKEN = tokenizer.eos_token


def formatting_func(example):
    """Return the text of one example with the EOS token appended.

    Args:
        example: mapping with a "data" entry holding the document text.

    Returns:
        The value of ``example["data"]`` followed by ``EOS_TOKEN``.
    """
    document_text = example["data"]
    return document_text + EOS_TOKEN

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

# `formatting_func` (defined in an earlier cell) appends the EOS token to a
# document, but it was never applied: the trainer reads the raw "data" column
# through `dataset_text_field`, so documents were trained without an
# end-of-sequence marker. Apply it here, one example at a time, into a new
# dataset so that re-running this cell never appends the token twice.
train_dataset = shuffled_dataset.map(
    lambda example: {"data": formatting_func(example)},
)

trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "data",      # column holding the (EOS-terminated) text
    max_seq_length = max_seq_length,
    dataset_num_proc = 8,

    args = UnslothTrainingArguments(
        # Effective batch size = 2 * 8 = 16 sequences per optimizer step.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,

        warmup_ratio = 0.1,
        num_train_epochs = 2,

        # The embedding matrices (embed_tokens / lm_head) get a 10x smaller
        # learning rate than the remaining trainable parameters.
        learning_rate = 5e-5,
        embedding_learning_rate = 5e-6,

        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 2,
        optim = "adamw_8bit",
        weight_decay = 0.00,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=8):   0%|          | 0/273 [00:00<?, ? examples/s]

In [None]:
# Ask the PEFT-wrapped model for its (trainable, total) parameter counts.
param_counts = model.get_nb_trainable_parameters()
trainable, total = param_counts

# Report both counts and the trainable share as a percentage.
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

Trainable: 436207616 | total: 7684231168 | Percentage: 5.6767%


In [None]:
# Run the training loop; the returned object is kept in `trainer_stats`
# so it can be inspected afterwards.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 273 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 34
 "-____-"     Number of trainable parameters = 436,207,616


Unsloth: Setting lr = 5.00e-06 instead of 5.00e-05 for embed_tokens.
Unsloth: Setting lr = 5.00e-06 instead of 5.00e-05 for lm_head.


Step,Training Loss
2,2.2012
4,1.788
6,1.4883
8,2.1897
10,1.7272
12,1.6297
14,1.9757
16,1.2644
18,1.3913
20,1.7423


In [None]:
# Switch the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Prompt(s) to complete; tokenized as a batch and moved to the GPU.
prompts = ["The Radar Cross Section (RCS) value"]
inputs = tokenizer(prompts, return_tensors = "pt").to("cuda")

# Generate up to 128 new tokens per prompt.
outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)

# Decode the generated token ids back to text; shown as the cell output.
tokenizer.batch_decode(outputs)

["<s>The Radar Cross Section (RCS) value is a measure of an aircraft's visibility to radar. A lower RCS value indicates better stealth capabilities, as it means the aircraft is more difficult to detect. The F-22 has a very low RCS value, making it one of the stealthiest aircraft in the world. The exact value of the F-22's RCS is classified, but it is believed to be around 0.0001 m2 (0.001 sq ft) at certain angles, which is extremely low. This means that the aircraft is very difficult to detect on radar, even at close range."]

In [None]:
import os
from getpass import getpass

# SECURITY: never hardcode access tokens in a notebook. The token that was
# previously committed in this cell must be treated as leaked and revoked in
# the Hugging Face account settings.
# The token is read from the HF_TOKEN environment variable when set; otherwise
# it is prompted for interactively without echoing it into the notebook.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

# Target repository on the Hugging Face Hub (shared by model and tokenizer).
hf_repo_id = "APP04/mistral-7b-v0.3-bnb-4bit_PT"

# Upload the trained adapter weights, then the tokenizer, to the same repo.
model.push_to_hub(hf_repo_id, token = hf_token)
tokenizer.push_to_hub(hf_repo_id, token = hf_token)

README.md:   0%|          | 0.00/586 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

Saved model to https://huggingface.co/APP04/mistral-7b-v0.3-bnb-4bit_PT


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]