In [1]:
!pip install -q -U transformers peft bitsandbytes datasets accelerate trl pyarrow nibabel

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig
from datasets import load_dataset
from google.colab import userdata
from huggingface_hub import login

secret_hf = userdata.get('HF_TOKEN')
login(secret_hf)

dataset = load_dataset("zeroshot/twitter-financial-news-sentiment", split="train")

model_id = "google/gemma-2b-it"
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/164 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [2]:
def formatting_prompts_func(example):
    text = f"### Instruction: Analyze the sentiment of this financial news.\n### Input: {example['text']}\n### Response: "
    return text

In [3]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM"
)

sft_config = SFTConfig(
    output_dir="./fin-gemma-results",
    max_length=512,
    per_device_train_batch_size=4,
    num_train_epochs=1,
    logging_steps=10,
    report_to="none"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    args=sft_config,
    processing_class=tokenizer,
    formatting_func=formatting_prompts_func
)

trainer.train()

Applying formatting function to train dataset:   0%|          | 0/9543 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/9543 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/9543 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/9543 [00:00<?, ? examples/s]

Step,Training Loss
10,7.063416
20,6.880003
30,6.510105
40,6.182238
50,6.156195
60,5.631929
70,5.495963
80,5.114114
90,4.941853
100,4.627182


TrainOutput(global_step=2386, training_loss=2.666107555047265, metrics={'train_runtime': 4571.4737, 'train_samples_per_second': 2.088, 'train_steps_per_second': 0.522, 'total_flos': 6807415231438848.0, 'train_loss': 2.666107555047265})