<a href="https://colab.research.google.com/github/Ak-Gautam/efficient_llm_fine_tunes/blob/main/Qwen2/lvn_tweet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Training

In [1]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [2]:
!pip show torch

Name: torch
Version: 2.3.1+cu121
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, fsspec, jinja2, networkx, nvidia-cublas-cu12, nvidia-cuda-cupti-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-runtime-cu12, nvidia-cudnn-cu12, nvidia-cufft-cu12, nvidia-curand-cu12, nvidia-cusolver-cu12, nvidia-cusparse-cu12, nvidia-nccl-cu12, nvidia-nvtx-cu12, sympy, triton, typing-extensions
Required-by: accelerate, bitsandbytes, fastai, peft, torchaudio, torchtext, torchvision, trl, xformers


In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 512 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [2]:
model_id = "Qwen/Qwen2-1.5B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.8: Fast Qwen2 patching. Transformers = 4.44.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.8 patched 28 layers with 0 QKV layers, 28 O layers and 28 MLP layers.


### Data

In [4]:
from datasets import load_dataset

# Load the CSV file
dataset = load_dataset('csv', data_files='/content/data_lvn_clean.csv')
dataset = dataset.shuffle(seed=65)

def format_chat_template(row):
    row_json = [{"role": "user", "content": row["human"]},
               {"role": "assistant", "content": row["assistant"]}]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = dataset.map(
    format_chat_template,
    num_proc=4,
)
dataset = dataset['train']

In [5]:
dataset['text'][3]

"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\ncreate a X post<|im_end|>\n<|im_start|>assistant\nPlease install the latest version of Zoom. Zoom won't admit you without its latest update. \n\nSee you Monday at 9:00 a.m. Seattle time. Look in your inbox for meeting invitations. \n\nPS: Please remember to use a headset to reduce echo<|im_end|>\n"

In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 300,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.018,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

max_steps is given, it will override any value given in num_train_epochs


In [7]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
3.393 GB of memory reserved.


In [8]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 671 | Num Epochs = 4
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 300
 "-____-"     Number of trainable parameters = 18,464,768


Step,Training Loss
1,3.6993
2,3.7182
3,3.6033
4,3.3577
5,3.1345
6,2.9556
7,2.7961
8,2.706
9,2.7204
10,2.4981


In [9]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

521.9335 seconds used for training.
8.7 minutes used for training.
Peak reserved memory = 4.209 GB.
Peak reserved memory for training = 0.816 GB.
Peak reserved memory % of max memory = 28.539 %.
Peak reserved memory for training % of max memory = 5.533 %.


In [12]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
    map_eos_token = True, # Maps <|im_end|> to  instead
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"from": "human", "value": "Write a funny tweet"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

['<|im_start|>user\nWrite a funny tweet<|im_end|>\n<|im_start|>assistant\nWhat do you think of my new look?<|im_end|>']

In [13]:
from google.colab import userdata

In [14]:
model.push_to_hub_merged("glouriousgautam/qwen2_1_5b_lvn_chat", tokenizer, save_method = "merged_16bit", token = userdata.get('HF_WRITE'))

Unsloth: You are pushing to hub, but you passed your HF username = glouriousgautam.
We shall truncate glouriousgautam/qwen2_1_5b_lvn_chat to qwen2_1_5b_lvn_chat
Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which will take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 3.1G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.28 out of 12.67 RAM for saving.


100%|██████████| 28/28 [00:00<00:00, 36.38it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving qwen2_1_5b_lvn_chat/pytorch_model.bin...


README.md:   0%|          | 0.00/576 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/glouriousgautam/qwen2_1_5b_lvn_chat


## Running the model
Restart kernal session.

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
model = AutoModelForCausalLM.from_pretrained(
    "glouriousgautam/qwen2_1_5b_lvn_chat",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("glouriousgautam/qwen2_1_5b_lvn_chat")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
messages = [
    {"from": "human", "value": "Write a funny tweet"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


["<|im_start|>user\nWrite a funny tweet<|im_end|>\n<|im_start|>assistant\nThe only way I know to make money on the stock market is to invest in your own education. \n\nThat’s why I started #thecloudseminar for $FREE when I noticed you paid people couldn't participate in live sessions due to lack of access.<|im_end|>"]

## Tokenizer exploration ignore

In [9]:
tokenizer2 = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct")

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [10]:
# 1. Basic Information
print(f"Tokenizer type: {type(tokenizer)}")
print(f"Tokenizer name or path: {tokenizer.name_or_path}")

Tokenizer type: <class 'transformers.models.qwen2.tokenization_qwen2_fast.Qwen2TokenizerFast'>
Tokenizer name or path: 


In [12]:
# 2. Vocabulary
vocab_size = tokenizer.vocab_size
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 151643


In [13]:
# 2. Vocabulary
print(f"Vocabulary size: {tokenizer2.vocab_size}")

Vocabulary size: 151643


In [14]:
# 3. Special Tokens
print(f"Special tokens: {tokenizer.special_tokens_map}")
print(f"All special tokens: {tokenizer.all_special_tokens}")
print(f"Special tokens ids: {tokenizer.all_special_ids}")

Special tokens: {'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>'}
All special tokens: ['<|im_end|>', '<|endoftext|>']
Special tokens ids: [151645, 151643]


In [15]:
# 3. Special Tokens
print(f"Special tokens: {tokenizer2.special_tokens_map}")
print(f"All special tokens: {tokenizer2.all_special_tokens}")
print(f"Special tokens ids: {tokenizer2.all_special_ids}")

Special tokens: {'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}
All special tokens: ['<|im_end|>', '<|endoftext|>', '<|im_start|>']
Special tokens ids: [151645, 151643, 151644]


In [16]:
sample_text = "Hello, how are you?"
tokens = tokenizer.tokenize(sample_text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(f"Sample text: {sample_text}")
print(f"Tokens: {tokens}")
print(f"Token IDs: {token_ids}")

Sample text: Hello, how are you?
Tokens: ['Hello', ',', 'Ġhow', 'Ġare', 'Ġyou', '?']
Token IDs: [9707, 11, 1246, 525, 498, 30]


In [17]:
sample_text = "Hello, how are you?"
tokens = tokenizer2.tokenize(sample_text)
token_ids = tokenizer2.convert_tokens_to_ids(tokens)
print(f"Sample text: {sample_text}")
print(f"Tokens: {tokens}")
print(f"Token IDs: {token_ids}")

Sample text: Hello, how are you?
Tokens: ['Hello', ',', 'Ġhow', 'Ġare', 'Ġyou', '?']
Token IDs: [9707, 11, 1246, 525, 498, 30]


In [20]:
print(f"Full tokenizer configuration: {tokenizer}")

Full tokenizer configuration: Qwen2TokenizerFast(name_or_path='', vocab_size=151643, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


In [22]:
print(tokenizer2)

Qwen2TokenizerFast(name_or_path='Qwen/Qwen2-1.5B-Instruct', vocab_size=151643, model_max_length=32768, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


In [24]:
#print the difference between tokenizer and tokenizer2
for key in tokenizer.__dict__:
    if key not in tokenizer2.__dict__:
        print(key)

original_push_to_hub
push_to_hub
_ollama_modelfile
_system_message


In [25]:
for key in tokenizer2.__dict__:
    if key not in tokenizer.__dict__:
        print(key)