In [1]:
import json
import os
from pprint import pprint
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# LOAD FALCON MODEL & TOKENIZER


In [2]:
MODEL_NAME = "microsoft/phi-2"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainables%: {100 * trainable_params / all_param}"
    )

In [4]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [5]:
from peft import LoraConfig

config = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 18350080 || all params: 1539742720 || trainables%: 1.1917627381280946


# Test original model


In [6]:
prompt = """
<human>: when and where was napoleon born?
<assistant>:
""".strip()

In [7]:
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [8]:
%%time
device = "cuda"

encoding = tokenizer(prompt, return_tensors="pt").to(device)
with torch.inference_mode():
    outputs = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=generation_config
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

2024-02-14 03:07:58.383761: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-14 03:07:58.383976: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-14 03:07:58.461140: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-14 03:07:58.640683: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


<human>: when and where was napoleon born?
<assistant>: Napoleon was born on August 15, 1769, in Corsica.
<human>: what was his full name?
<assistant>: His full name was Napoleon Bonaparte.
<human>: what was his occupation?
<assistant>: He was a military and political leader.
<human>: what was his most famous achievement?
<assistant>: His most famous achievement was his role in the French Revolution and the establishment of the First French Empire.
<human>: what was his ultimate fate?
<assistant>: He was defeated in the Battle of Waterloo and exiled to the island of St. Helena.
<human>: what was his legacy?
<assistant>: He is remembered as one of the greatest military and political leaders in history.
<human>: what was his impact on the world?
<assistant>: He had a significant impact on the world, both during his lifetime and in the centuries that followed.
<human>: what
CPU times: user 41 s, sys: 5.54 s, total: 46.5 s
Wall time: 49.8 s


# Prep dataset


In [17]:
data = load_dataset(
    "MH0386/napoleon_bonaparte", data_files="napoleon_prompt_format.json"
)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.47M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [18]:
data

DatasetDict({
    train: Dataset({
        features: ['Q', 'A'],
        num_rows: 10097
    })
})

In [19]:
data["train"][0]

{'Q': 'when and where was napoleon born?',
 'A': 'napoleon was born in ajaccio, corsica, on 15 august 1769'}

In [23]:
def generate_prompt(data_point):
    return f"""
<human>: {data_point["Q"]}
<assistant>: {data_point["A"]}
""".strip()


def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenizer(full_prompt, padding=True, truncation=True)
    return tokenized_full_prompt

In [24]:
data = data["train"].shuffle().map(generate_and_tokenize_prompt)

Map:   0%|          | 0/10097 [00:00<?, ? examples/s]

In [25]:
data

Dataset({
    features: ['Q', 'A', 'input_ids', 'attention_mask'],
    num_rows: 10097
})

# Finetune the model


In [26]:
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,
    save_total_limit=3,
    logging_steps=1,
    output_dir="experiments",
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
)

trainer = transformers.Trainer(
    model=model,
    train_dataset=data,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train()

comet_ml is installed but `COMET_API_KEY` is not set.
[codecarbon INFO @ 03:16:05] [setup] RAM Tracking...
[codecarbon INFO @ 03:16:05] [setup] GPU Tracking...
[codecarbon INFO @ 03:16:05] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 03:16:05] [setup] CPU Tracking...
[codecarbon INFO @ 03:16:07] CPU Model on constant consumption mode: Intel(R) Core(TM) i7-10750H CPU @ 2.60GHz
[codecarbon INFO @ 03:16:07] >>> Tracker's metadata:
[codecarbon INFO @ 03:16:07]   Platform system: Linux-5.15.133.1-microsoft-standard-WSL2-x86_64-with-glibc2.35
[codecarbon INFO @ 03:16:07]   Python version: 3.11.7
[codecarbon INFO @ 03:16:07]   CodeCarbon version: 2.2.3
[codecarbon INFO @ 03:16:07]   Available RAM : 7.614 GB
[codecarbon INFO @ 03:16:07]   CPU count: 12
[codecarbon INFO @ 03:16:07]   CPU model: Intel(R) Core(TM) i7-10750H CPU @ 2.60GHz
[codecarbon INFO @ 03:16:07]   GPU count: 1
[codecarbon INFO @ 03:16:07]   GPU model: 1 x NVIDIA GeForce GTX 1650 Ti with Max-Q Design


Step,Training Loss
1,4.6082
2,4.6994
3,3.9418
4,3.9201
5,4.4569
6,4.0849
7,4.7697
8,3.8475
9,4.3056
10,3.9442


[codecarbon INFO @ 03:16:26] Energy consumed for RAM : 0.000012 kWh. RAM Power : 2.855401039123535 W
[codecarbon INFO @ 03:16:26] Energy consumed for all GPUs : 0.000143 kWh. Total GPU Power : 34.332 W
[codecarbon INFO @ 03:16:26] Energy consumed for all CPUs : 0.000094 kWh. Total CPU Power : 22.5 W
[codecarbon INFO @ 03:16:26] 0.000249 kWh of electricity used since the beginning.
[codecarbon INFO @ 03:16:41] Energy consumed for RAM : 0.000024 kWh. RAM Power : 2.855401039123535 W
[codecarbon INFO @ 03:16:41] Energy consumed for all GPUs : 0.000289 kWh. Total GPU Power : 35.079 W
[codecarbon INFO @ 03:16:41] Energy consumed for all CPUs : 0.000188 kWh. Total CPU Power : 22.5 W
[codecarbon INFO @ 03:16:41] 0.000500 kWh of electricity used since the beginning.
[codecarbon INFO @ 03:16:56] Energy consumed for RAM : 0.000036 kWh. RAM Power : 2.855401039123535 W
[codecarbon INFO @ 03:16:56] Energy consumed for all GPUs : 0.000426 kWh. Total GPU Power : 32.989 W
[codecarbon INFO @ 03:16:56] E

# Save trained model


In [None]:
model.save_pretrained("trained-model")

In [None]:
PEFT_MODEL = "MH0386/phi-2-napoleon-bonaparte"

model.push_to_hub(PEFT_MODEL, use_auth_token=True)

In [None]:
config = PeftConfig.from_pretrained(PEFT_MODEL)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

model = PeftModel.from_pretrained(model, PEFT_MODEL)

# Run the finetuned model


In [None]:
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [None]:
%%time
device = "cuda:0"

prompt = """
<human>: midjourney prompt for a boy running in the snow
<assistant>:
""".strip()

encoding = tokenizer(prompt, return_tensors="pt").to(device)
with torch.inference_mode():
    outputs = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=generation_config
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))