Based on https://medium.com/@heyamit10/fine-tuning-gemma-2b-a-practical-guide-e4c25de43b2d 

In [1]:
# Disabled override: uncommenting the two lines below would set the TMPDIR
# environment variable to ./mytempdir for this kernel process.
# import os
# os.environ["TMPDIR"] = "./mytempdir"

In [2]:
# List the current directory in long format, including hidden entries.
!ls . -al

total 388
drwxrwxr-x 3 cs29824 cs29824   4096 Feb 19 22:06 .
drwxrwxr-x 5 cs29824 cs29824   4096 Feb 19 16:40 ..
-rw-rw-r-- 1 cs29824 cs29824   1638 Feb 19 06:05 alpaca_trainer.py
-rw-rw-r-- 1 cs29824 cs29824   1540 Feb 19 06:37 assistant_trainer.py
-rw-rw-r-- 1 cs29824 cs29824   1079 Feb 19 05:59 code_alpaca_trainer.py
-rw-rw-r-- 1 cs29824 cs29824  24575 Feb 19 21:31 finetuning_gemma_2b_a_practical_guide.ipynb
-rw-rw-r-- 1 cs29824 cs29824   1916 Feb 19 06:08 gpt_trainer.py
drwxrwxr-x 2 cs29824 cs29824   4096 Feb 19 22:06 mytempdir
-rw-rw-r-- 1 cs29824 cs29824    806 Feb 19 06:02 qa_trainer.py
-rw-rw-r-- 1 cs29824 cs29824 330091 Feb 19 21:23 sft_finetuning_example.ipynb
-rw-rw-r-- 1 cs29824 cs29824   6665 Feb 19 06:45 sft_finetuning_example.py


In [1]:
from transformers import AutoTokenizer

# Load the tokenizer for the google/gemma-2-2b checkpoint; tokenize_data
# below reads this notebook-level `tokenizer` variable.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b")

def tokenize_data(example):
    """Tokenize the ``"text"`` entry of ``example`` with the notebook-level tokenizer.

    The tokenizer is asked to truncate and to pad with the ``"max_length"``
    strategy, both at a limit of 512 tokens.

    Args:
        example: Mapping that provides a ``"text"`` entry to encode.

    Returns:
        Whatever ``tokenizer(...)`` returns for that text.
    """
    encode_options = {"truncation": True, "padding": "max_length", "max_length": 512}
    return tokenizer(example["text"], **encode_options)

In [2]:
from datasets import load_dataset

# Load the sentence-transformers/eli5 dataset; later cells read its "train"
# split and the 'question' / 'answer' fields of each record.
raw_dataset = load_dataset("sentence-transformers/eli5")

In [3]:
def formatting_prompts_func(example):
    """Render one question/answer record as a single-element list of prompt text.

    Args:
        example: Mapping with ``'question'`` and ``'answer'`` entries.

    Returns:
        A list holding exactly one string of the form
        ``"### Question: <question>\\n ### Answer: <answer>"``.
    """
    question, answer = example['question'], example['answer']
    return [f"### Question: {question}\n ### Answer: {answer}"]

In [4]:
def _to_text_record(example):
    """Wrap the formatted prompt for ``example`` in a one-column ``{"text": ...}`` record."""
    return {"text": formatting_prompts_func(example)[0]}


# Replace every original column with a single "text" column holding the prompt.
# NOTE(review): this rebinds raw_dataset, so re-running the cell operates on the
# already-formatted data (presumably failing on the missing 'question' field).
original_columns = raw_dataset["train"].column_names
raw_dataset = raw_dataset.map(_to_text_record, remove_columns=original_columns)


In [5]:
# Tokenize the formatted dataset with tokenize_data, applied in batched mode
# (so example["text"] is presumably a list of strings per call — confirm).
tokenized_dataset = raw_dataset.map(tokenize_data, batched=True)


In [6]:
# Peek at the first formatted record of the training split.
raw_dataset['train'][0]

{'text': "### Question: in football whats the point of wasting the first two plays with a rush - up the middle - not regular rush plays i get those\n ### Answer: Keep the defense honest, get a feel for the pass rush, open up the passing game. An offense that's too one dimensional will fail. And those rushes up the middle can be busted wide open sometimes for big yardage."}

In [7]:
from sklearn.model_selection import train_test_split

# Materialise the tokenized training split as a pandas DataFrame.
df = tokenized_dataset["train"].to_pandas()

# Hold out 20% of the rows, then halve that hold-out into validation and test
# sets; the same fixed seed is used for both splits.
SPLIT_SEED = 42
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=SPLIT_SEED)

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = 'google/gemma-2-2b'
device = 'cuda:0'

# Load the checkpoint with float16 weights, then move the model onto `device`.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
model = model.to(device)

# Reload the matching tokenizer under the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Smoke-test the tokenizer on a short sentence and show the encoded tensors
# (they are not moved to `device` here).
text = "Fine-tuning Gemma 2B is exciting!"
tokens = tokenizer(text, return_tensors="pt")
print(tokens)

2025-02-25 20:49:48.269497: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-25 20:49:48.303294: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740516588.332866  664049 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740516588.340716  664049 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-25 20:49:48.368858: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

{'input_ids': tensor([[     2,  36422, 235290, 110748, 137061, 235248, 235284, 235305,    603,
          17305, 235341]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [9]:
# Prompt used for a quick generation sanity check.
input_text = "What's the derivative of tan(x)?"

# Encode the prompt as PyTorch tensors and move them to `device`.
inputs = tokenizer(input_text, return_tensors="pt").to(device)

# Generate at most two new tokens; gradient tracking is switched off.
with torch.no_grad():
    output_ids = model.generate(max_new_tokens=2, **inputs)

# Decode the first returned sequence back to text, dropping special tokens.
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print a heading line followed by the decoded text.
print("Generated output:", output_text, sep="\n")

Generated output:
What's the derivative of tan(x)?

What


In [11]:
from transformers import pipeline


In [12]:

# Wrap the already-loaded `model` (loaded with torch.float16) and `tokenizer`
# in a text-generation pipeline.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) 

Device set to use cuda:0


In [13]:
# Second text-generation pipeline, built from the checkpoint name on cuda:0.
# No torch_dtype is passed here, unlike the float16 `model` loaded earlier.
# NOTE(review): neither pipeline applies quantization, so the name presumably
# means "library-default precision" — confirm, and consider renaming.
no_quantization_pipe = pipeline("text-generation", model="google/gemma-2-2b", device="cuda:0")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


Open question: why is generation slow on the first call but fast afterwards? For example, generating 2 tokens on the first call took 1 minute, whereas generating 30 tokens on the third call took 0.8 s.

In [18]:
# Query no_quantization_pipe, capping generation at 5 new tokens.
no_quantization_pipe("What's the derivative of tan(x)?", max_new_tokens=5)

[{'generated_text': "What's the derivative of tan(x)?\n\n[Answer 1"}]

In [21]:
# Query no_quantization_pipe with a second prompt, allowing up to 30 new tokens.
no_quantization_pipe("How can I purchase a spear?", max_new_tokens=30)



[{'generated_text': "How can I purchase a spear?\n\n[User 0001]\n\nI'm looking to purchase a spear. I'm not sure what to look for. I"}]

In [17]:
# Send the tan(x) prompt through `pipe` (built on the float16 `model`),
# again capped at 5 new tokens, for comparison with no_quantization_pipe.
pipe("What's the derivative of tan(x)?", max_new_tokens=5)

[{'generated_text': "What's the derivative of tan(x)?\n\n[Answer 1"}]

In [None]:
import wandb
from transformers import TrainingArguments

# Start a Weights & Biases run under the "gemma2b-fine-tune" project;
# report_to="wandb" below selects wandb as the reporting integration.
wandb.init(project="gemma2b-fine-tune")

training_args = TrainingArguments(
    output_dir="./results",
    # `evaluation_strategy` is the deprecated spelling of this argument;
    # current transformers releases expect `eval_strategy`.
    # NOTE(review): requires a transformers version that already has the new
    # name — confirm against the installed version.
    eval_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,  # log every 10 steps
    report_to="wandb",
)