In [None]:
!pip install torch transformers datasets peft bitsandbytes huggingface_hub



In [None]:
import os

import pandas as pd
import torch
from datasets import Dataset
from huggingface_hub import login
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, \
    DataCollatorForLanguageModeling

In [None]:
# Authenticate with the Hugging Face Hub without embedding the secret in the notebook.
# The token is read from the HF_TOKEN environment variable; when that variable is unset,
# login() receives None and falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
def load_data(prompt_path, answer_path):
    """Build a paired prompt/answer dataset from two line-aligned text files.

    Each file is parsed as a single-column, header-less, tab-separated table.
    Row i of the prompt file is paired with row i of the answer file.

    Args:
        prompt_path: Path to the file holding one prompt per row.
        answer_path: Path to the file holding one answer per row.

    Returns:
        A ``Dataset`` with the columns ``"Prompt"`` and ``"Answer"``.

    Raises:
        ValueError: If the two files do not parse to the same number of rows.
            Pairing them anyway would pad the shorter column with NaN and
            silently attach prompts to the wrong (or to no) answers.
    """
    prompt_df = pd.read_csv(prompt_path, sep='\t', header=None, names=["Prompt"], engine='python')
    answer_df = pd.read_csv(answer_path, sep='\t', header=None, names=["Answer"], engine='python')

    # NOTE(review): read_csv keeps its default quoting and blank-line handling here, so a
    # stray double quote or an empty line in only one file can change that file's row
    # count. The check below turns that into an error instead of misaligned pairs.
    if len(prompt_df) != len(answer_df):
        raise ValueError(
            f"Prompt/answer row count mismatch: {prompt_path} has {len(prompt_df)} rows, "
            f"{answer_path} has {len(answer_df)} rows"
        )

    # Column-wise concat pairs the rows by position (both frames carry a default index).
    combined_df = pd.concat([prompt_df, answer_df], axis=1)

    return Dataset.from_pandas(combined_df)


def tokenize_function(examples):
    """Tokenize a batch of prompt/answer pairs as "<prompt>: <answer> <eos>" texts.

    Reads the module-level ``tokenizer`` and ``token_max_length``.

    Args:
        examples: A batch mapping with the keys ``"Prompt"`` and ``"Answer"``, each
            holding a sequence of strings. Falsy entries (``None``, ``""``) are
            treated as empty text.

    Returns:
        Whatever ``tokenizer`` returns for the batch, with every sequence padded or
        truncated to ``token_max_length`` tokens.
    """
    # Use the tokenizer's own end-of-sequence marker rather than a hard-coded "<eos>"
    # literal, so the training text stays correct if the base checkpoint changes.
    eos_token = tokenizer.eos_token or ""
    concatenated_text = [
        f"{prompt or ''}: {answer or ''} {eos_token}"
        for prompt, answer in zip(examples["Prompt"], examples["Answer"])
    ]
    return tokenizer(concatenated_text, padding="max_length", truncation=True, max_length=token_max_length)

In [None]:
# --- Run configuration: later cells read these names, edit values here only ---

# Base checkpoint that gets fine-tuned.
model_name = "google/gemma-2-2b-it"

# Tokenization / batching.
token_max_length = 256  # every example is padded or truncated to this many tokens
batch_size = 8  # per-device train batch size; also reused as the dataset.map() batch size

# Filesystem layout.
wd = "/mnt/data"  # directory holding the train/val source and target text files
output_wd = "/mnt/output"  # training output directory
model_output = "english-tt-fine-tuned-gemma-2-2b"  # sub-directory for the saved model

In [None]:
# --- Data: load the line-aligned prompt/answer files and tokenize them ---
train_dataset = load_data(f'{wd}/train-source.txt', f'{wd}/train-target.txt')
val_dataset = load_data(f'{wd}/val-source.txt', f'{wd}/val-target.txt')

tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenized_train_datasets = train_dataset.map(
    tokenize_function,
    batched=True,
    desc="Tokenizing training data",
    batch_size=batch_size
)

tokenized_val_datasets = val_dataset.map(
    tokenize_function,
    batched=True,
    desc="Tokenizing validation data",
    batch_size=batch_size
)

# --- Model: base checkpoint wrapped with LoRA adapters on the attention projections ---
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.2,
    bias="none",
    target_modules=["q_proj", "v_proj", "o_proj"],
    # Declare the task so PEFT builds its causal-LM wrapper instead of the generic one.
    task_type="CAUSAL_LM",
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    attn_implementation='eager'
)

model = get_peft_model(model, lora_config)

# device_map="auto" has already placed the weights, so the model is not moved again
# with .to(). `device` is still defined for any later cell that wants it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# mlm=False: causal-LM collation (labels are derived from input_ids).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# --- Trainer configuration ---
training_args = TrainingArguments(
    output_dir=f"{output_wd}",
    report_to="none",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=2,
    num_train_epochs=5,
    weight_decay=0.01,
    save_steps=1000,
    save_total_limit=2,
    logging_dir='logs',
    logging_steps=200,
    fp16=True,
    eval_strategy="steps",
    eval_steps=500,
    dataloader_num_workers=8,
    load_best_model_at_end=True,
    logging_first_step=True,
    # Name the label column explicitly. With a PEFT-wrapped model the Trainer may fail
    # to detect it on its own, in which case evaluation computes no loss and the
    # progress table shows "No log" for Validation Loss.
    label_names=["labels"],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_val_datasets,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

{'Prompt': ['Generate tongue twisters about key words: dirty double damask dinner napkin', 'Generate tongue twisters about key words: slits', 'Generate tongue twisters about key words: amy aiming anemic anemones', 'Generate tongue twisters about key words: shop sport short socks', 'Generate tongue twisters about key words: state fish hatchery'], 'Answer': ['The duke dropped the dirty double damask dinner napkin', 'She slits the sheet she sits on.', 'Am I and Amy aiming anemic anemones on my many enemies?', 'Does this shop sport short socks with spots?', 'They hatch fish at the state fish hatchery and sell hatched fish to the fish stick factory.']}
{'Prompt': ['Generate tongue twisters about key words: sixteen times', 'Generate tongue twisters about key words: kitchen cutlery clattered', 'Generate tongue twisters about key words: save stu', 'Generate tongue twisters about key words: dried draped', 'Generate tongue twisters about key words: tweety trains two tree toads'], 'Answer': ['Say

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Tokenizing training data:   0%|          | 0/1912 [00:00<?, ? examples/s]

Tokenizing validation data:   0%|          | 0/106 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
# Run the fine-tuning loop configured on `trainer`. The two prints bracket the call so
# the cell output shows when training started and when it returned.
print("Starting training...")
trainer.train()

print("Finished training")

Starting training...


Step,Training Loss,Validation Loss
500,2.0912,No log


Finished training


In [None]:
# Write the trained model and the tokenizer into the same directory so the pair can be
# reloaded together from that one path.
output_path = "/".join([output_wd, model_output])
trainer.save_model(output_path)
# Kept as the cell's final expression: the tuple of written tokenizer files is displayed.
tokenizer.save_pretrained(output_path)

('/mnt/output/fine-tuned-gemma-2-2b-english-tongue-twisters-v11/tokenizer_config.json',
 '/mnt/output/fine-tuned-gemma-2-2b-english-tongue-twisters-v11/special_tokens_map.json',
 '/mnt/output/fine-tuned-gemma-2-2b-english-tongue-twisters-v11/tokenizer.model',
 '/mnt/output/fine-tuned-gemma-2-2b-english-tongue-twisters-v11/added_tokens.json',
 '/mnt/output/fine-tuned-gemma-2-2b-english-tongue-twisters-v11/tokenizer.json')

In [None]:
# Reload the fine-tuned model and its tokenizer from disk for inference.
# The path is built from the config values (output_wd / model_output) instead of a
# hard-coded '/mnt/output', so it always points at the directory the save cell wrote.
trained_model_name = f"{output_wd}/{model_output}"
trained_model = AutoModelForCausalLM.from_pretrained(trained_model_name)
inference_tokenizer = AutoTokenizer.from_pretrained(trained_model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Prompt for the inference demo; it uses the same
# "Generate tongue twisters about key words: ..." wording as the training prompts.
question = "Generate tongue twisters about key words: cinnamon synonym"

In [None]:
# Training texts were formatted as "<prompt>: <answer> <eos>", so the inference prompt
# must end with the same ":" separator. Without it the model simply continues the last
# prompt word instead of starting an answer.
prompt_text = f"{question}:"

# Tokenize with the tokenizer saved alongside the model (not the training-time global)
# and place the tensors on whatever device the model currently lives on.
inputs = inference_tokenizer(prompt_text, return_tensors="pt").to(trained_model.device)

# torch.cuda.amp.autocast() is deprecated; torch.autocast is the replacement. Autocast is
# only enabled when the model is actually on a CUDA device.
use_cuda_autocast = trained_model.device.type == "cuda"
with torch.autocast(device_type="cuda", enabled=use_cuda_autocast):
    outputs = trained_model.generate(
        **inputs,  # passes attention_mask together with input_ids
        max_length=100,  # total length, prompt tokens included
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        repetition_penalty=2.0,
        # top_k / top_p / temperature only take effect when sampling is switched on.
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        eos_token_id=inference_tokenizer.eos_token_id
    )

# The decoded text contains the prompt followed by the generated continuation.
response = inference_tokenizer.decode(outputs[0], skip_special_tokens=True)

  with torch.cuda.amp.autocast():


In [None]:
# Show the decoded generation as the cell's output.
response

"Generate tongue twisters about key words: cinnamon synonymously smells sweetly in the oven when it's baking. "