In [1]:
!pip install huggingface_hub
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
import torch.nn as nn
import bitsandbytes as bnb
import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
import pandas as pd

from datasets import Dataset, DatasetDict
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig

In [3]:
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [5]:
train = pd.read_csv('/content/train_df.csv', engine='python')
test = pd.read_csv('/content/test_df.csv', engine='python')

In [6]:
train.drop('Unnamed: 0',axis = 1, inplace = True)
test.drop('Unnamed: 0',axis = 1, inplace = True)

tds = Dataset.from_pandas(train)
vds = Dataset.from_pandas(test)

In [7]:
data = DatasetDict()

data['train'] = tds
data['validation'] = vds

In [13]:
data = data.map(lambda samples: tokenizer(text=[str(pred) for pred in samples['prediction']], padding=True, truncation=True), batched=True)

Map:   0%|          | 0/51100 [00:00<?, ? examples/s]

Map:   0%|          | 0/6814 [00:00<?, ? examples/s]

In [14]:
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

In [15]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [16]:
config = LoraConfig(
    r=16, #attention heads
    lora_alpha=32, #alpha scaling
    # target_modules=["q_proj", "v_proj"], #if you know the
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # set this for CLM or Seq2Seq
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 2179072 || all params: 1545893376 || trainable%: 0.1409587513492263


In [17]:
%time
trainer = transformers.Trainer(
    model=model,
    train_dataset=data['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        max_steps=400,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir='outputs'
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.15 µs


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
1,3.7582
2,3.7004
3,3.7203
4,3.8516
5,3.9422
6,3.6464
7,3.8029
8,3.687
9,3.8331
10,3.8655


TrainOutput(global_step=400, training_loss=2.0479731819033624, metrics={'train_runtime': 1045.5588, 'train_samples_per_second': 6.121, 'train_steps_per_second': 0.383, 'total_flos': 1.034248707588096e+16, 'train_loss': 2.0479731819033624, 'epoch': 0.12524461839530332})

In [18]:
model.save_pretrained("/content/model")
tokenizer.save_pretrained("/content/model")

('/content/model/tokenizer_config.json',
 '/content/model/special_tokens_map.json',
 '/content/model/vocab.json',
 '/content/model/merges.txt',
 '/content/model/added_tokens.json',
 '/content/model/tokenizer.json')

In [19]:
!zip -r /content/model.zip /content/model

  adding: content/model/ (stored 0%)
  adding: content/model/merges.txt (deflated 57%)
  adding: content/model/README.md (deflated 66%)
  adding: content/model/adapter_model.safetensors (deflated 8%)
  adding: content/model/adapter_config.json (deflated 51%)
  adding: content/model/tokenizer.json (deflated 81%)
  adding: content/model/added_tokens.json (deflated 67%)
  adding: content/model/tokenizer_config.json (deflated 83%)
  adding: content/model/special_tokens_map.json (deflated 69%)
  adding: content/model/vocab.json (deflated 61%)


In [20]:
from google.colab import files
files.download("/content/model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [21]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
model.push_to_hub("Ebad/bloom-7b1-lora-tagger-for-tweet-generation",
                  use_auth_token=True,
                  commit_message="basic training",
                  private=True)

adapter_model.safetensors:   0%|          | 0.00/8.73M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Ebad/bloom-7b1-lora-tagger-for-tweet-generation/commit/2e35ce1207724d2b8a402eaa865e15cbb362d718', commit_message='basic training', commit_description='', oid='2e35ce1207724d2b8a402eaa865e15cbb362d718', pr_url=None, pr_revision=None, pr_num=None)