# Fine-tuning Google's Gemma Model with CSV [data](https://drive.google.com/file/d/1Kkllt9AFmIz1K2Oe7AJpNmXTxoODIwht/view?usp=sharing)

In [None]:
# Install the notebook's dependencies quietly (-q), upgrading if already present (-U).
# Every package is pinned to an exact version except huggingface_hub.
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.0
# NOTE(review): unpinned, so a fresh run may resolve a different version — consider pinning.
!pip3 install -q -U huggingface_hub

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.9/150.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.4/103.4 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━

In [None]:
import os
import transformers
import torch
from google.colab import userdata
from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, GemmaTokenizer

In [None]:
from getpass import getpass
# Ask for the Hugging Face token interactively instead of hard-coding it, and
# store it in the process environment where later cells read os.environ['HF_TOKEN'].
os.environ["HF_TOKEN"] = getpass("Your Huggingface Hub Token: ")

In [None]:
# Hub identifier of the base checkpoint that gets fine-tuned.
model_id = "google/gemma-2b"

# Quantization settings: load weights in 4-bit using the NF4 quant type,
# with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [None]:
# Both downloads authenticate with the token kept in the HF_TOKEN environment variable.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=os.environ["HF_TOKEN"],
)

# Load the base model with the quantization config; the device map sends the
# whole model (empty-string key) to device index 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=os.environ["HF_TOKEN"],
    device_map={"": 0},
    quantization_config=bnb_config,
)



tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [None]:
# Disable the Trainer's Weights & Biases integration so training does not try
# to log to (or prompt for) a W&B account. Only truthy values of WANDB_DISABLED
# switch the integration off, so the previous value "false" was a no-op.
# NOTE(review): if W&B logging is actually wanted, delete this line instead.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# LoRA adapter settings for a causal-LM task: rank 8, applied to the listed
# projection modules.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    target_modules=[
        "q_proj",
        "o_proj",
        "k_proj",
        "v_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

In [None]:
def formatting_func(example):
    """Turn dataset rows into question/answer training strings.

    The trainer calls this with a *batch*: ``example['inst']`` and
    ``example['answer']`` are parallel lists, one entry per row. Every row
    must yield its own string; formatting only element ``[0]`` collapses the
    whole batch into a single training example.

    Args:
        example: Mapping with ``'inst'`` and ``'answer'`` entries, either
            parallel lists (batched call) or two scalars (single row).

    Returns:
        A list of prompt strings, one per input row, in input order.
    """
    questions = example['inst']
    answers = example['answer']
    # A single un-batched row carries scalars; wrap them so both call styles
    # share the same code path and always produce a list.
    if not isinstance(questions, (list, tuple)):
        questions, answers = [questions], [answers]
    return [
        f"Question: {question}\n Answer: {answer}"
        for question, answer in zip(questions, answers)
    ]

In [None]:
from datasets import load_dataset
# Read the local CSV file into a dataset exposing a single 'train' split.
data = load_dataset('csv', data_files={'train': 'file.csv'})
# Tokenize the 'inst' column in batches; the tokenizer's outputs are added as
# new columns next to the original ones.
# NOTE(review): only 'inst' is tokenized here (not 'answer'), while the trainer
# is also given a formatting_func — confirm these pre-tokenized columns are
# actually needed.
data = data.map(lambda samples: tokenizer(samples["inst"]), batched=True)

In [None]:
# Display the train split's summary (column names and row count) as a sanity check.
data['train']

Dataset({
    features: ['inst', 'answer', 'input_ids', 'attention_mask'],
    num_rows: 11
})

In [None]:
# Build the supervised fine-tuning trainer: the quantized base model plus the
# LoRA config, fed with text produced by formatting_func from the train split.
trainer = SFTTrainer(
    model=model,
    peft_config=lora_config,
    train_dataset=data["train"],
    formatting_func=formatting_func,
    args=transformers.TrainingArguments(
        output_dir="outputs",
        # 1 sample per device per step, accumulated over 4 steps per optimizer update.
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        warmup_steps=2,
        max_steps=100,  # should be adjusted when the dataset is very small
        optim="paged_adamw_8bit",
        fp16=True,
        # Report the training loss at every step.
        logging_steps=1,
    ),
)



Map:   0%|          | 0/11 [00:00<?, ? examples/s]



In [None]:
# Run fine-tuning; the returned TrainOutput (global step, loss, metrics) is shown as the cell result.
trainer.train()

Step,Training Loss
1,0.5439
2,0.5439
3,0.5344
4,0.5141
5,0.4939
6,0.474
7,0.4535
8,0.4313
9,0.4066
10,0.3797


TrainOutput(global_step=100, training_loss=0.0771272501032945, metrics={'train_runtime': 71.8958, 'train_samples_per_second': 5.564, 'train_steps_per_second': 1.391, 'total_flos': 595117043712000.0, 'train_loss': 0.0771272501032945, 'epoch': 100.0})

In [None]:
# Sample prompt for a post-training smoke test: raw invoice text behind the
# same "Question: " prefix used by formatting_func.
# NOTE(review): the training template continues with "\n Answer: ..." but this
# prompt stops after the question text — confirm whether the "Answer:" cue
# should be appended before generating.
text = "Question: The Car Doctors Ltd. 547 High Street, London Co. Reg. No.: 12345678 VAT No.GB123456789 Email: contact@thecardoctors.co.uk Phone: 07831261234 Website: thecardoctors.co.uk Bill to: John Smith 123 Piccadilly, London +447812345678 United Kingdom Invoice: Invoice Date: Due Date: 2021-009 19/05/2021 02/06/2021 Description Dent repair 6-inch dent above back left tyre Quantity 1 Unit Price VAT Amount each 200.00 20% 200.00 New tyre Standard multi-directional 1 each 60.00 20% 60.00 Tyre replacement Back left tyre - standard multi-directional 1 each 20.00 20% 20.00 Subtotal without VAT VAT 20% of 280.00 Total GBP Amount Paid Amount Due (GBP) 280.00 56.00 336.00 0.00 336.00 Terms & Conditions Vehicle: Black Ford Fiesta - AB12 CDE Payment to be made within 14 days via the payment link below. HH Pay online Bank Barclays Sort Code: 12-34-56 Account No.: 7654321 Account Holder: The Car Doctors Ltd. SWIFT: ABCDEF11 IBAN: GB11 HBUK 1111 1111 1111 11 Go to: https://invoice.sumup.com/s/OEsrHH Dm Or scan the QR code using your phone's camera. & sumup• Page 1 of 1 for Invoice #2021-009"

In [None]:
# Tokenize the prompt into PyTorch tensors and move them to the first CUDA
# device, matching the device index the model was loaded onto.
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

# Generate at most 20 new tokens, then decode the first returned sequence with
# special tokens stripped; the decoded text starts with the prompt itself.
# NOTE(review): 20 new tokens may be too few for a complete answer — confirm.
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Question: The Car Doctors Ltd. 547 High Street, London Co. Reg. No.: 12345678 VAT No.GB123456789 Email: contact@thecardoctors.co.uk Phone: 07831261234 Website: thecardoctors.co.uk Bill to: John Smith 123 Piccadilly, London +447812345678 United Kingdom Invoice: Invoice Date: Due Date: 2021-009 19/05/2021 02/06/2021 Description Dent repair 6-inch dent above back left tyre Quantity 1 Unit Price VAT Amount each 200.00 20% 200.00 New tyre Standard multi-directional 1 each 60.00 20% 60.00 Tyre replacement Back left tyre - standard multi-directional 1 each 20.00 20% 20.00 Subtotal without VAT VAT 20% of 280.00 Total GBP Amount Paid Amount Due (GBP) 280.00 56.00 336.00 0.00 336.00 Terms & Conditions Vehicle: Black Ford Fiesta - AB12 CDE Payment to be made within 14 days via the payment link below. HH Pay online Bank Barclays Sort Code: 12-34-56 Account No.: 7654321 Account Holder: The Car Doctors Ltd. SWIFT: ABCDEF11 IBAN: GB11 HBUK 1111 1111 1111 11 Go to: https://invoice.sumup.com/s/OEsrHH D

In [None]:
# Save the fine-tuned adapter and tokenizer locally.
# Save through trainer.model — the PEFT-wrapped model that SFTTrainer builds
# from peft_config — so only the LoRA adapter weights and adapter config are
# written. Calling save_pretrained on the bare `model` instead dumps the whole
# quantized base model with the injected LoRA layers mixed into its state dict,
# which is multi-gigabyte and does not reload cleanly as a plain Gemma checkpoint.
trainer.model.save_pretrained("gemma-arya")
tokenizer.save_pretrained("gemma-arya")

('gemma-arya/tokenizer_config.json',
 'gemma-arya/special_tokens_map.json',
 'gemma-arya/tokenizer.model',
 'gemma-arya/added_tokens.json',
 'gemma-arya/tokenizer.json')

In [None]:
from transformers import AutoModel, AutoTokenizer, AutoConfig
from huggingface_hub import HfApi

In [None]:
# Local folder holding the saved model/tokenizer files. Kept relative so it
# points at the same "gemma-arya" directory the save cell wrote, rather than
# relying on the Colab-specific /content working directory.
repo_name = "gemma-arya"
# Destination model repository on the Hugging Face Hub.
hub_repo_id = "aryatito/my-gemma-model-csv"

# Push to Hugging Face Hub.
# Authenticate explicitly with the token collected earlier, consistent with the
# from_pretrained calls; this replaces the deprecated `use_auth_token=True`.
api = HfApi(token=os.environ["HF_TOKEN"])
# exist_ok=True makes re-running this cell safe when the repo already exists.
api.create_repo(repo_id=hub_repo_id, repo_type="model", exist_ok=True)
api.upload_folder(
    repo_id=hub_repo_id,
    folder_path=repo_name,
    repo_type="model",
    commit_message="Initial commit",
)

model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/aryatito/my-gemma-model-csv/commit/64ddf56f588e82398fcc4cc75480507b554bf52e', commit_message='Initial commit', commit_description='', oid='64ddf56f588e82398fcc4cc75480507b554bf52e', pr_url=None, pr_revision=None, pr_num=None)