In [3]:
!pip install -q peft bitsandbytes transformers trl
# peft for LoRA, bitsandbytes for quantization, trl for fine-tuning

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/102.4 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m60.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━

In [4]:
import torch
from datasets import load_dataset, load_dataset_builder
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from peft import LoraConfig
from trl import SFTTrainer

In [6]:
# sample dataset from huggingface hub: ""
data_name = "mayench18/Llama2-Formatted-Finance-Sentiment"

# loading only train, not validation or test data
# returns Dataset object

training_data = load_dataset(data_name, split="train")
# print(training_data[0:9], sep="\n")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/273 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/104k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
base_model_name = "NousResearch/Llama-2-7b-chat-hf"

# llama 3 8B: "meta-llama/Meta-Llama-3-8B", llama 2 13B: "meta-llama/Llama-2-13b-hf"
# finGPT: "FinGPT/fingpt-mt_llama2-7b_lora"

# insert name for our model here
tuned_model = "llama-2-7b-fin"

# returns LlamaTokenizerFast object
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)

# make padding token the end of sentence token, not sure if needed for example or actual data
llama_tokenizer.pad_token = llama_tokenizer.eos_token

llama_tokenizer.padding_side = "right"
 # usually left, but fp16 quantization creates problems

In [9]:
# model quantization
quant_config = BitsAndBytesConfig(
    load_in_4bit=True, # change to load_in_8bit=True if necessary, paper used 8bit
    bnb_4bit_quant_type="nf4", # sets dtype in 4bit linear layers if using 4bit quant
    bnb_4bit_compute_dtype=torch.float16, # sets computational type to fp16
    bnb_4bit_use_double_quant=False # disable nested quantization
)

# load base model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map={"": 0}
)
base_model.config.use_cache = False  # disable caching of model outputs
base_model.config.pretraining_tp = 1 # pretraining temperature

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]



In [10]:
peft_parameters = LoraConfig(
    lora_alpha=16, # alpha parameter for scaling
    lora_dropout=0.1, # dropout probability for LoRA layers
    r=8, # attention dimension or rank
    bias="none", # not updating biases during training
    task_type="CAUSAL_LM"
)


train_params = TrainingArguments(
    output_dir="./results_modified", # will contain model predictions and checkpoints
    num_train_epochs=2, # 5 epochs used in paper, doing 2 for testing purposes
    per_device_train_batch_size=2, # changed from 4 because running out of memory
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit", # AdamW optimizer
    save_steps=50,# changed from 25
    logging_steps=50, # changed from 25
    learning_rate=2e-4, # keras default is 1e-3, much smaller for fine-tuning
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3, # max gradient norm for gradient clipping
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)


fine_tuner = SFTTrainer(
    model=base_model,
    train_dataset=training_data,
    peft_config=peft_parameters,
    dataset_text_field="prompt", # name of text field in dataset
    tokenizer=llama_tokenizer,
    args=train_params,
)



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [11]:
# actual fine-tuning happens here
fine_tuner.train()

fine_tuner.model.save_pretrained(tuned_model)

Step,Training Loss
50,1.9217
100,1.01
150,0.9635
200,0.9668
250,0.9249
300,0.9912
350,0.9091
400,0.9251
450,0.9025
500,0.9279




In [12]:
# removed eval code, dependent on the structure of the dataset used

In [None]:
# random wallstreetbets quote
query = "Calls on Boeing Hitman, his Q2 earnings are going to be nuts."

text_gen = pipeline(task="text-generation", model=tuned_model, tokenizer=llama_tokenizer, max_length=200)
output = text_gen(f"<s>[INST] {query} [/INST]") # might want to change /inst tags
print(output[0]['generated_text'])