## Prerequisites

Before delving into the fine-tuning process, ensure that you have the following prerequisites in place:

1. **GPU**: [gemma-2b](https://huggingface.co/google/gemma-2b) - can be finetuned on T4(free google colab) while [gemma-7b](https://huggingface.co/google/gemma-7b) requires an A100 GPU.
2. **Python Packages**: Ensure that you have the necessary Python packages installed. You can use the following commands to install them:

Let's begin by checking if your GPU is correctly detected:

In [1]:
!nvidia-smi

Wed Jun 26 09:20:01 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Step 2 - Model loading
We'll load the model using QLoRA quantization to reduce the usage of memory


In [2]:
!pip install -q accelerate==0.27.1 peft==0.8.2 bitsandbytes==0.42.0 transformers==4.38.0 trl==0.7.10 datasets==2.17.0 sentence-splitter

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m279.7/279.7 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.9/150.9 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.0/45.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

eos_equal_to_pad = True

Now we specify the model ID and then we load it with our previously defined quantization configuration.Now we specify the model ID and then we load it with our previously defined quantization configuration.

In [None]:
from huggingface_hub import login
import os
from dotenv import load_dotenv

load_dotenv()
    
hugging_face_token = os.getenv("HUGGING_FACE_TOKEN")
login(hugging_face_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# 허깅페이스에서 모델명을 통해 다운로드 가능 (구글의 Gemma, MS의 phi, Mistral, Llama 등 가능)

# model_id = 'google/gemma-2b-it'
# model_id = 'microsoft/Phi-3-mini-4k-instruct'
model_id = 'microsoft/Phi-3-mini-128k-instruct'
# model_id = 'mistralai/Mistral-7B-Instruct-v0.2'
# model_id = 'microsoft/phi-2'

use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)


# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)
# tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [10]:
def get_completion(query: str, model, tokenizer) -> str:
  device = "cuda:0"

  # phi-2 template
  # user_start = 'Question: '
  # user_end = '\n'
  # bot_start = 'Answer: '

  # phi-3 template
  user_start = '<|user|>Find an answer corresponding to a given question: \n'
  user_end = '<|end|>'
  bot_start = '<|assistant|>'

  # gemma template
  # user_start = '<start_of_turn>user\n'
  # user_end = '<end_of_turn>'
  # bot_start = '<start_of_turn>model\n'

  # Mistral template
  # user_start = '<s>[INST]'
  # user_end = '[/INST]'
  # bot_start = ''

  prompt_template =  user_start \
                    + """{query}""" \
                    + user_end + bot_start
  prompt = prompt_template.format(query=query)
  encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

  model_inputs = encodeds.to(device)

  # generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=False)
  generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, temperature=0.5,  pad_token_id=tokenizer.eos_token_id)
  decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
  return (decoded)

In [11]:
test_case1= 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?'
test_case2 = 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?'
test_case3 = '3 * 40 + 12 = ?'


result = get_completion(query=test_case1, model=model, tokenizer=tokenizer)
print("---------- output ------------")
print(result)
print()

result = get_completion(query=test_case2, model=model, tokenizer=tokenizer)
print("---------- output ------------")
print(result)
print()

result = get_completion(query=test_case3, model=model, tokenizer=tokenizer)
print("---------- output ------------")
print(result)
print()



A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


---------- output ------------
Find an answer corresponding to a given question: 
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May? ### Query

Given the document below, you have to determine if "Yes" or "No", the summary is factually consistent with the document.

Document: Growth into Piyumas. Did you know that according to a well-known proverb that was recorded by Sayyida Hurra that she didn't grow into a piyama but rather a piyuma grew into her? This whole story is recorded in the biography about Sayyida Hurra written by Shaban Muhammad Saydee. Another one was about where she got the name Piyuma. According to the biography, Piyuma meant bulk and the bulky personality of Sayyida Hurra could be seen in how she grew up. Once upon a time Piyuma de said that she couldn't grow any longer and put herself on a leaf and walked out of the tree. But that wasn't true. Piyuma grewed tall whe

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


---------- output ------------
Find an answer corresponding to a given question: 
Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn? # question

In a complex litigation case involving multiple defendants, Defendant A is found liable and ordered to pay damages. However, Defendant A files a motion for relief from judgment, claiming ineffective assistance of counsel. Defendant B, who was not a party to the original judgment, seeks to intervene in the motion for relief to ensure their interests are represented. What is the most compelling argument for Defendant B's standing to intervene?

A. Defendant B has no standing to intervene as they were not a party to the original trial and judgment, and only defendants named in the judgment can seek relief.
B. Defendant B has standing to intervene because the relief from judgment could potentially affect their rights or interests, and they have a sufficient connection to the matter at 

## Step 3 - Load dataset for finetuning

### Lets Load the Dataset (GSM8K train/test data)



In [None]:
from datasets import load_dataset

train_data = load_dataset("openai/gsm8k", 'main', split="train")  # 학습셋은 과제에서 실제로 사용하지 않을 예정
test_data = load_dataset("openai/gsm8k", 'main', split="test")


df = test_data.to_pandas()
df.head(10)



Instruction Fintuning - Prepare the dataset under the format of "prompt" so the model can better understand :
1. the function generate_prompt : take the instruction and output and generate a prompt
2. shuffle the dataset
3. tokenizer the dataset

### Formatting the Dataset

Now, let's format the dataset in the required [gemma instruction formate](https://huggingface.co/google/gemma-7b-it).

> Many tutorials and blogs skip over this part, but I feel this is a really important step.

```
<start_of_turn>user What is your favorite condiment? <end_of_turn>
<start_of_turn>model Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavor to whatever I'm cooking up in the kitchen!<end_of_turn>
```

You can use the following code to process your dataset and create a JSONL file in the correct format:

Split dataset into 80% for training and 20% for validation

In [None]:
# dataset = train_data.train_test_split(test_size=0.2)
# train_data = dataset["train"]
# validation_data = dataset["test"]

In [None]:
import re
from sentence_splitter import SentenceSplitter

splitter = SentenceSplitter(language='en')

def parse_answer(answer: str, answer_prefix=None) -> str:

    if answer_prefix ==None:
        sentences = splitter.split(text=answer)
        answer_text = sentences[-1]
    else:
        answer_text = answer.split(answer_prefix)[-1].strip()

    # find all the numbers (including decimals) in the string
    numbers = re.findall(r"\d+\.?\d*", answer_text.replace(",", ""))

    # return the first number (removing trailing decimal point if present),
    # or an empty string if there were no numbers
    return numbers[-1].rstrip(".") if numbers else ""


def get_completion_custom(query: str, model, tokenizer, do_sample=True, temperature = 0.4) -> str:
  device = "cuda:0"

  # phi-2 template(1)
  # user_start = 'Question: '
  # user_end = '\n'
  # bot_start = 'Answer: '

  # phi-2 template(2)
  # user_start = 'Solve the following math problem: '
  # user_end = '\n'
  # bot_start = 'The solution is: '

  # phi-2 template(3)
  # user_start = 'Instruct: Solve the following problem in detial.\nProblem: '
  # user_end = '\n'
  # bot_start = 'Solution: '

  # phi-2 template(4)
  # user_start = 'Solve the following problem step-by-step. Problem: '
  # user_end = '\n'
  # bot_start = 'Solution: '

  # phi-3 template
  user_start = '<|user|>Find an answer corresponding to a given question: \n'
  user_end = '<|end|>'
  bot_start = '<|assistant|>'

  # gemma template
  # user_start = '<start_of_turn>user\n'
  # user_end = '<end_of_turn>'
  # bot_start = '<start_of_turn>model\n'

  # Mistral template
  # user_start = '<s>[INST]'
  # user_end = '[/INST]'
  # bot_start = ''

  examples = """
  Solve the following problem step-by-step.
  Problem: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
  Solution:
  Step 1: Find the number of clips sold in May.
  Clips sold in May = 48 / 2 = 24 clips
  Step 2: Add the number of clips sold in April and May.
  Total clips sold = 48 + 24 = 72 clips

  Solve the following problem step-by-step.
  Problem: Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?
  Solution:
  Step 1: Convert the minutes to hours.
  50 minutes = 50 / 60 = 0.833 hours
  Step 2: Calculate the earnings.
  Earnings = $12 * 0.833 = $10

  Solve the following problem step-by-step.
  Problem: 3 * 40 + 12 = ?
  Solution:
  Step 1: Multiply 3 by 40.
  3 * 40 = 120
  Step 2: Add 12 to the result.
  120 + 12 = 132
  """
  # few-shot setting
  # prompt_template = examples + user_start + """{query}""" + user_end + bot_start

  prompt_template = user_start + """{query}""" + user_end + bot_start

  prompt = prompt_template.format(query=query)

  encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)

  model_inputs = encodeds.to(device)

  if do_sample == True:
      generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=do_sample, temperature=0.4, pad_token_id=tokenizer.eos_token_id)
  else:
      generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=do_sample, pad_token_id=tokenizer.eos_token_id)

  decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
  return (decoded)

def run_temperature(model, tokenizer, test_data, total_count=50):
  accuracy_results = []

  for temperature in [0.0, 0.2, 0.5, 0.7, 1.0]:
      accuracy = 0
      for idx, example in enumerate(test_data):
          if idx >= total_count:
              break
          ground_truth_answer = parse_answer(example['answer'], "####")
          result = get_completion_custom(query=example['question'], model=model, tokenizer=tokenizer, do_sample=True, temperature=temperature)
          prediction = parse_answer(result)
          if ground_truth_answer in prediction:
              accuracy += 1
          if idx % 5 == 0:
              print("=========================================================================================================")
              print("#####   Processing   #####: ", 100 * (idx / total_count), "%")
              print("#####  Model Output  #####: ", result)
              print()
              print("#####  Ground Truth  #####: ", ground_truth_answer)
              print("#####   Prediction   #####: ", prediction)
              print("#####  Current Acc   #####: ", accuracy / (idx + 1))
              print()

      final_accuracy = accuracy / total_count
      accuracy_results.append({
          'temperature': temperature,
          'accuracy': final_accuracy
      })
      print(f"#####  Final Accuracy (temperature={temperature})   #####: {final_accuracy}")

  return accuracy_results

accuracy_results = run_temperature(model, tokenizer, test_data)

# accuracy = 0
# total_count = 50    # 예제수가 많기 때문에 과제 효율을 위하여 50개 예제만 테스트셋으로 사용
# for idx, example in enumerate(test_data):
#     if idx >= total_count:
#         break
#     ground_truth_answer = parse_answer(example['answer'], "####")
#     result = get_completion_custom(query=example['question'], model=model, tokenizer=tokenizer, do_sample=True, temperature=0.4)   # do_sample 및 temperature를 조정가능함에 따라 결과 달라짐.
#     prediction = parse_answer(result)
#     if (ground_truth_answer) in prediction:
#         accuracy +=1
#     if idx % 5 ==0:
#         print("=========================================================================================================")
#         print("#####   Processing   #####: ", 100*(idx/total_count), "%")
#         print("#####  Model Output  #####: ", result)
#         print()
#         print("#####  Ground Truth  #####: ", ground_truth_answer)
#         print("#####   Prediction   #####: ", prediction)
#         print("#####  Current Acc   #####: ", accuracy/(idx+1))
#         print()

# print()
# print()
# print("#####  Final Accuracy   #####: ",accuracy/total_count )



In [9]:
import pandas as pd
df_results = pd.DataFrame(accuracy_results)
print(df_results)

   temperature  accuracy
0          0.0      0.66
1          0.2      0.72
2          0.5      0.70
3          0.7      0.60
4          1.0      0.74
