<a href="https://www.kaggle.com/code/aisuko/llm-prompt-recovery-with-gemma?scriptVersionId=165643354" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Overview



In [1]:
%%capture
!pip install transformers==4.38.2
!pip install accelerate==0.27.2
!pip install datasets==2.18.0
!pip install peft==0.9.0
!pip install bitsandbytes==0.42.0
!pip install trl==0.7.11

In [2]:
import os
import torch
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
login(token=user_secrets.get_secret("HUGGINGFACE_TOKEN"))

os.environ["WANDB_API_KEY"]=user_secrets.get_secret("WANDB_API_KEY")
os.environ["WANDB_PROJECT"] = "Fine-tuning gemma-2b-it"
os.environ["WANDB_NAME"] = "ft-google-gemma-2b-it-qlora"
os.environ["MODEL_NAME"] = "google/gemma-2b-it"
os.environ["DATASET"] = "/kaggle/input/gemma-rewrite-nbroad/nbroad-v2.csv"

torch.backends.cudnn.deterministic=True
# https://github.com/huggingface/transformers/issues/28731
torch.backends.cuda.enable_mem_efficient_sdp(False)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
!accelerate estimate-memory ${MODEL_NAME} --library_name transformers

Loading pretrained config for `google/gemma-2b-it` from `transformers`...
config.json: 100%|█████████████████████████████| 627/627 [00:00<00:00, 3.15MB/s]
┌────────────────────────────────────────────────────┐
│   Memory Usage for loading `google/gemma-2b-it`    │
├───────┬─────────────┬──────────┬───────────────────┤
│ dtype │Largest Layer│Total Size│Training using Adam│
├───────┼─────────────┼──────────┼───────────────────┤
│float32│   1.95 GB   │ 9.34 GB  │      37.38 GB     │
│float16│  1000.0 MB  │ 4.67 GB  │      18.69 GB     │
│  int8 │   500.0 MB  │ 2.34 GB  │      9.34 GB      │
│  int4 │   250.0 MB  │ 1.17 GB  │      4.67 GB      │
└───────┴─────────────┴──────────┴───────────────────┘


# Checking Data

In [4]:
import pandas as pd

csv_train=pd.read_csv('/kaggle/input/llm-prompt-recovery/train.csv')
csv_train.tail()

Unnamed: 0,id,original_text,rewrite_prompt,rewritten_text
0,-1,The competition dataset comprises text passage...,"Convert this into a sea shanty: """"""The competi...",Here is your shanty: (Verse 1) The text is rew...


In [5]:
csv_test=pd.read_csv('/kaggle/input/llm-prompt-recovery/test.csv')
csv_test.tail()

Unnamed: 0,id,original_text,rewritten_text
0,-1,The competition dataset comprises text passage...,Here is your shanty: (Verse 1) The text is rew...


In [6]:
csv_sub=pd.read_csv('/kaggle/input/llm-prompt-recovery/sample_submission.csv')
csv_sub.tail()

Unnamed: 0,id,rewrite_prompt
0,9559194,Improve that text.


# Loading Tokenizer

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(os.getenv("MODEL_NAME"))
tokenizer

tokenizer_config.json:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/888 [00:00<?, ?B/s]

GemmaTokenizerFast(name_or_path='google/gemma-2b-it', vocab_size=256000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<bos>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<start_of_turn>', '<end_of_turn>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<eos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<bos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	106: AddedToken("<start_of_turn>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	107: AddedToken("<end_of_turn>", rstrip=False, lstr

# Loading Model

In [8]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_enable_fp32_cpu_offload=True,
)

model = AutoModelForCausalLM.from_pretrained(
    os.getenv("MODEL_NAME"),
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

model.config.eos_token_id=tokenizer.eos_token_id
model.gradient_checkpointing_enable() # reducing memory usage
print(model.model.embed_tokens)

def print_trainable_parameters(model):
    trainable_params=0
    all_params=0
    for _, param in model.named_parameters():
        all_params+=param.numel()
        if param.requires_grad:
            trainable_params+=param.numel()
    print(f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {100 * trainable_params/all_params:.2f}")

print_trainable_parameters(model)

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Embedding(256000, 2048, padding_idx=0)
trainable params: 524363776 || all params: 1515268096 || trainable%: 34.61


In [9]:
input_text = "The Weather of Melbourne"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

outputs = model.generate(**input_ids)
print(tokenizer.decode(outputs[0]))

2024-03-06 03:46:23.210473: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-06 03:46:23.210585: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-06 03:46:23.489920: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


<bos>The Weather of Melbourne

The weather in Melbourne is renowned for its unpredictable nature, with four distinct


# Loading Data

In [10]:
train_ds=pd.read_csv(os.getenv("DATASET"), nrows=1000)
train_ds.tail()

Unnamed: 0,id,original_text,rewrite_prompt,rewritten_text
995,NLaUSPgYMK,Selfish Bastard never thought of anyone but hi...,Rewrite the essay in a play format with hilari...,"(Characters: Sultan, Beggar, Homeless Acrobat,..."
996,VuvdlOfYGx,"People of Earth. Please, listen. As you may kn...",Rewrite the story but each leader is killed of...,"People of Earth, listen up. As you know, every..."
997,EWnCGnmVxi,`` I need time to think.'' I almost laugh when...,Rewrite the prompt as if it's describing a cha...,In the historical fiction setting of the novel...
998,ObiwfrAyWJ,"NSFW, possibly. \n \n This was n't exactly wha...",Rewrite the piece with a strong message about ...,"In this era of societal transformation, it is ..."
999,lgSLOZntuR,"In retrospect, it made perfect sense. \n \n Th...",Rewrite the story as if all the characters are...,The kitchen danced with the aroma of freshly-s...


In [11]:
from datasets import load_dataset, Dataset

data=Dataset.from_pandas(train_ds)
data=data.map(lambda samples: tokenizer(samples["original_text"]), batched=True)
data=data.map(lambda samples: tokenizer(samples["rewritten_text"]), batched=True)
data=data.map(lambda samples: tokenizer(samples["rewrite_prompt"]), batched=True)

data=data.train_test_split(test_size=0.1)
data

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'original_text', 'rewrite_prompt', 'rewritten_text', 'input_ids', 'attention_mask'],
        num_rows: 900
    })
    test: Dataset({
        features: ['id', 'original_text', 'rewrite_prompt', 'rewritten_text', 'input_ids', 'attention_mask'],
        num_rows: 100
    })
})

# Freeze Original Weight and Import LoRA

In [12]:
from peft import prepare_model_for_kbit_training

prepared_model=prepare_model_for_kbit_training(
    model, use_gradient_checkpointing=True
)

print_trainable_parameters(prepared_model)
print(prepared_model)

trainable params: 0 || all params: 1515268096 || trainable%: 0.00
GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): Gemma

In [13]:
from peft import LoraConfig, TaskType, get_peft_model

lora_config=LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj','gate_proj','up_proj','down_proj'],
    lora_dropout=0.1,
    bias="none",
    modules_to_save=["lm_head","embed_tokens"], # we added new tokens to tokenizer, this is necesarry
    task_type=TaskType.CAUSAL_LM
)

lora_model=get_peft_model(prepared_model, lora_config)
lora_model.config.use_cache=False
print_trainable_parameters(lora_model)
print(lora_model)

trainable params: 1068187648 || all params: 2583455744 || trainable%: 41.35
PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GemmaForCausalLM(
      (model): GemmaModel(
        (embed_tokens): ModulesToSaveWrapper(
          (original_module): Embedding(256000, 2048, padding_idx=0)
          (modules_to_save): ModuleDict(
            (default): Embedding(256000, 2048, padding_idx=0)
          )
        )
        (layers): ModuleList(
          (0-17): 18 x GemmaDecoderLayer(
            (self_attn): GemmaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Lin

# Training

In [14]:
from transformers import TrainingArguments, set_seed
from trl import SFTTrainer

set_seed(2024)

bs=8
bs_eval=8
ga_steps=16
lr=0.0002
epochs=10

steps_per_epoch=len(data['train'])//(bs*ga_steps)

def preprocess_func(example):
    text = f"Original Essay:\n{example['original_text'][0]}\n\nRewritten Essay:\n{example['rewritten_text'][0]}\n\nInstruction:\n Given are 2 essays, the Rewritten essay was created from the Original essay using the google Gemma model.You are trying to understand how the original essay was transformed into a new version.Analyzing the changes in style, theme, etc., please come up with a prompt that must have been used to guide the transformation from the original to the rewritten essay.Only give me the PROMPT. Start directly with the prompt, that's all I need. Output should be only line ONLY.\n\nResponse: \n{example['rewrite_prompt'][0]}"
    return [text]

args=TrainingArguments(
    output_dir=os.getenv("WANDB_NAME"),
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs_eval,
    evaluation_strategy="steps",
    logging_steps=1,
    eval_steps=steps_per_epoch//2,
    save_steps=steps_per_epoch,
    gradient_accumulation_steps=ga_steps,
    num_train_epochs=epochs,
    lr_scheduler_type="constant",
    optim="paged_adamw_32bit", # val_loss will go nan with paged_adamw_8bit
    learning_rate=lr,
    group_by_length=False,
    fp16=True,
    ddp_find_unused_parameters=False,
    report_to='wandb',
    run_name=os.getenv('WANDB_NAME')
)

trainer=SFTTrainer(
    model=lora_model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=data['train'],
    eval_dataset=data['test'],
    formatting_func=preprocess_func
)

trainer.train()



Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Currently logged in as: [33murakiny[0m ([33mcausal_language_trainer[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.16.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.16.3
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240306_034653-yffgehh7[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mft-google-gemma-2b-it-qlora[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/causal_language_trainer/Fine-tuning%20gemma-2b-it[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/causal_language_trainer/Fine-tuning%20gemma-2b-it/runs/yffgehh7[0m


Step,Training Loss,Validation Loss
3,0.1332,2.537846
6,0.0683,2.585815
9,0.0236,2.735514




TrainOutput(global_step=10, training_loss=0.08935996079817414, metrics={'train_runtime': 162.8397, 'train_samples_per_second': 0.061, 'train_steps_per_second': 0.061, 'total_flos': 155184172892160.0, 'train_loss': 0.08935996079817414, 'epoch': 10.0})

In [15]:
kwargs={
    'model_name': f'{os.getenv("WANDB_NAME")}',
    'finetuned_from': os.getenv('MODEL_NAME'),
#     'tasks': '',
#     'dataset_tags':'',
#     'dataset': os.getenv("DATASET")
}

tokenizer.push_to_hub(os.getenv("WANDB_NAME"))
trainer.push_to_hub(**kwargs)

README.md:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/4.27G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/aisuko/ft-google-gemma-2b-it-qlora/commit/0268156af3afbacd84724f0fd71e5a8efabae8ba', commit_message='End of training', commit_description='', oid='0268156af3afbacd84724f0fd71e5a8efabae8ba', pr_url=None, pr_revision=None, pr_num=None)

In [16]:
# If necessary to merge the adapter, uncomment code below

# from peft import PeftModel

# model=PeftModel.from_pretrained(model, os.getenv("WANDB_NAME"))
# model=model.merge_and_unload()

# model.save_pretrained(os.getenv("WANDB_NAME"), safe_serialization=True, max_shared_size="4GB")

# model.push_to_hub(os.getenv("WANDB_NAME"))

In [17]:
import gc
import re
import datetime
from tqdm import tqdm

start_time=datetime.datetime.now()

def truncate_txt(text, length):
    text_list = text.split()
    
    if len(text_list) <= length:
        return text
    
    return " ".join(text_list[:length])


def gen_prompt(og_text, rewritten_text):
    
    # Truncate the texts to first 200 words for now
    # As we are having memory issues on Mixtral8x7b
    og_text = truncate_txt(og_text, 150)
    rewritten_text = truncate_txt(rewritten_text, 150)
    
    return f"""    
    Original Essay:
    \"""{og_text}\"""
    
    Rewritten Essay:
    \"""{rewritten_text}\"""
    
    Given are 2 essays, the Rewritten essay was created from the Original essay using the google Gemma model.
    You are trying to understand how the original essay was transformed into a new version.
    Analyzing the changes in style, theme, etc., please come up with a prompt that must have been used to guide the transformation from the original to the rewritten essay.
    Start directly with the prompt, that's all I need. Output should be only line ONLY.
    """


device = "cuda"
csv_test['id'] = csv_sub['id'].copy()

pbar = tqdm(total=csv_test.shape[0])

it = iter(csv_test.iterrows())
idx, row = next(it, (None, None))

# https://www.kaggle.com/competitions/llm-prompt-recovery/discussion/481116
DEFAULT_TEXT = "Please improve the following text using the writing style of, maintaining the original meaning but altering the tone, diction, and stylistic elements to match the new style.Enhance the clarity, elegance, and impact of the following text by adopting the writing style of , ensuring the core message remains intact while transforming the tone, word choice, and stylistic features to align with the specified style."

res = []

while idx is not None:
    
    if (datetime.datetime.now() - start_time) > datetime.timedelta(hours=8, minutes=30):
        res.append([row["id"], DEFAULT_TEXT])
        idx, row = next(it, (None, None))
        pbar.update(1)
        continue
        
    torch.cuda.empty_cache()
    gc.collect()
        
    try:        
        messages = [
            {
                "role": "user",
                "content": gen_prompt(row["original_text"], row["rewritten_text"])
            }
        ]
        encoded_input = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to(device)
        
        with torch.no_grad():
            encoded_output = lora_model.generate(encoded_input, max_new_tokens=50, do_sample=True, pad_token_id=tokenizer.eos_token_id)
        
        decoded_output = tokenizer.batch_decode(encoded_output, skip_special_tokens=True)[0]
        decoded_output = result = re.sub(r"[\s\S]*\[\/INST\]", '', decoded_output, 1)
                
        res.append([row["id"], decoded_output])
                            
    except Exception as e:
        print(f"ERROR: {e}")
        res.append([row["id"], DEFAULT_TEXT])
        
    finally:
        idx, row = next(it, (None, None))
        pbar.update(1)

        
pbar.close()

  0%|          | 0/1 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
100%|██████████| 1/1 [01:10<00:00, 70.59s/it]


In [18]:
sub=pd.DataFrame(res, columns=['id','rewrite_prompt'])
sub

Unnamed: 0,id,rewrite_prompt
0,9559194,"user\nOriginal Essay:\n """"""The competition ..."


In [19]:
sub.to_csv("submission.csv", index=False)

# Acknowledge

* https://www.kaggle.com/code/wlifferth/starter-notebook-generating-more-data-with-gemma
* https://www.kaggle.com/code/nischaydnk/gemma-asking-llm-to-generate-prompt
* https://www.kaggle.com/code/aatiffraz/prompt-prediction-w-mixtral-mistral7b-gemma-llama
* https://www.kaggle.com/code/yujansaya/gemma-7b-with-lora-prompt-recovery
* https://www.kaggle.com/code/aisuko/fine-tuning-phi-2-with-qlora