<a target="_blank" href="https://colab.research.google.com/github/Blaizzy/LLMOps/blob/ft/phi-2/fine-tuning/phi-2/phi-2-on-slimOrca.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
# Install the notebook's runtime dependencies (data loading, training, logging,
# quantization and plotting).
# NOTE(review): only bitsandbytes carries a version bound; consider pinning the
# other packages so the run is reproducible.
!pip install datasets accelerate torch trl wandb "bitsandbytes>0.37.0" plotly

In [None]:
# Install huggingface_hub straight from its GitHub repository instead of a PyPI release.
!pip install git+https://github.com/huggingface/huggingface_hub.git

In [None]:
!pip uninstall peft
!pip install git+https://github.com/huggingface/peft.git

In [None]:
!pip uninstall transformers
!pip install git+https://github.com/huggingface/transformers

In [1]:
from datasets import load_dataset
from datasets import Dataset
from pprint import pprint

# Load the "Open-Orca/SlimOrca" dataset; later cells access its 'train' split.
dataset = load_dataset("Open-Orca/SlimOrca")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Display the loaded dataset object (bare last expression -> notebook repr).
dataset

In [None]:
# Show one raw conversation (row 1 of the train split).
# Index the row first and the column second: `dataset['train']['conversations'][1]`
# can read the entire `conversations` column just to pick a single element,
# whereas row-first indexing only fetches that one example.
pprint(dataset['train'][1]['conversations'])

In [None]:
import numpy as np
from typing import List

def formatting_prompts_func(example):
    """Convert one SlimOrca conversation into prompt text or chat messages.

    Two input shapes are handled:

    * a numpy array of ``{'from': ..., 'value': ...}`` turns -> returns a single
      string made of ``### <Role>: <text>`` lines, emitted in the fixed order
      system, human, gpt (``gpt`` is labelled ``Assistant``); roles that are
      absent are skipped.
    * a mapping with a ``'conversations'`` list of such turns -> returns
      ``{"messages": [...]}`` holding exactly one system, one user and one
      assistant message; a role that is absent gets an empty string.

    In both cases turns are collapsed into a role -> text dict, so when a role
    occurs more than once only its last turn is kept.
    """
    if isinstance(example, np.ndarray):
        by_role = {turn['from']: turn['value'] for turn in example}
        segments = []
        for role in ('system', 'human', 'gpt'):
            if role not in by_role:
                continue
            # The 'gpt' speaker is presented under the 'Assistant' heading.
            label = 'Assistant' if role == 'gpt' else role
            segments.append(f"### {label.title().capitalize()}: {by_role[role]}\n")
        return "".join(segments)

    by_role = {turn['from']: turn['value'] for turn in example['conversations']}
    return {
        "messages": [
            {"role": "system", "content": by_role.get('system', "")},
            {"role": "user", "content": by_role.get('human', "")},
            {"role": "assistant", "content": by_role.get('gpt', "")},
        ]
    }


In [None]:
# Rewrite every example into OpenAI-style `messages` and drop the original
# `conversations` column; examples are processed one at a time (not batched).
dataset = dataset.map(
    formatting_prompts_func,
    batched=False,
    remove_columns='conversations',
)


In [None]:
# Display the dataset again after the conversion to `messages`.
dataset

In [None]:
# Spot-check the converted messages of row 2 of the train split.
pprint(dataset['train'][2]['messages'])

In [None]:
# Count the number of tokens in the dataset
def get_token_count(sample):
    """Estimate the number of tokens in a list of chat messages.

    No tokenizer is involved: the lengths of every message's 'content' string
    are added up and the total number of characters is divided by 4.

    Args:
        sample: Iterable of message dicts, each with a 'content' string.

    Returns:
        The character total divided by 4 (a float).
    """
    total_chars = 0
    for message in sample:
        total_chars += len(message['content'])
    return total_chars / 4


In [None]:
# Materialize the train split as a pandas DataFrame, then estimate the token
# count of every sample's message list.
df_train = dataset['train'].to_pandas()

token_count = df_train['messages'].apply(get_token_count)

In [None]:
# Summary statistics of the estimated token count per sample.
token_count.describe()

In [None]:
from matplotlib import pyplot as plt
import plotly.graph_objects as go

nbins = 250

# Histogram of the estimated token count per sample, rendered with plotly.
fig = go.Figure(data=[go.Histogram(x=token_count, nbinsx=nbins)])
# Kept as the cell's last expression so the figure is displayed.
fig.update_layout(
    title_text="No. of tokens per sample on SlimOrca",
    xaxis_title_text="Number of tokens",
    yaxis_title_text="Number of samples",
    bargap=0.2,
    bargroupgap=0.1,
)


In [113]:
# Load model directly
import torch
from trl.models.utils import *
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM, setup_chat_format
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Load the tokenizer from the "prince-canuma/Damysus-2.7B-Chat" checkpoint.
# NOTE(review): the model in the next cell is loaded from "microsoft/phi-2", a
# different repository — confirm the two are meant to be paired.
tokenizer = AutoTokenizer.from_pretrained("prince-canuma/Damysus-2.7B-Chat")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# 4-bit quantization settings handed to the model loader below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the weights in 4-bit precision
    bnb_4bit_use_double_quant=True,  # enable double quantization
    bnb_4bit_quant_type="nf4",  # use the NF4 quantization type
    bnb_4bit_compute_dtype=torch.bfloat16  # compute in bfloat16
)
# Load the phi-2 base model with the quantization config above; device
# placement is delegated to `device_map="auto"`.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)

In [None]:
# Run TRL's chat-format setup on the model/tokenizer pair and keep the returned objects.
# NOTE(review): a later cell sets the ChatML template, special tokens and
# embedding size by hand as well — confirm whether both steps are intended.
model, tokenizer = setup_chat_format(model, tokenizer)



In [None]:
# Inspect the tokenizer's table of maximum input sizes (the next cell reads one entry from it).
tokenizer.max_model_input_sizes

In [None]:
# Trim the dataset to samples whose estimated token count is below the model's
# maximum input length.
# The per-checkpoint size table is looked up by a hard-coded key; it is not
# guaranteed to exist on every tokenizer (or to contain that key), so fall back
# to the tokenizer's own `model_max_length` instead of raising.
max_length = (getattr(tokenizer, "max_model_input_sizes", None) or {}).get(
    'Salesforce/codegen-350M-mono', tokenizer.model_max_length
)
df_train = df_train[df_train['messages'].apply(get_token_count) < max_length]

In [None]:
# Recompute the per-sample token estimates on the filtered frame and summarize them.
token_count = df_train['messages'].apply(get_token_count)
token_count.describe()

In [None]:
# Find the sample with the largest estimated token count, run it through the
# tokenizer's chat template, then print the result followed by the raw messages.
max_length_id = token_count.idxmax()
text = df_train.loc[max_length_id, 'messages']
longest_sample = tokenizer.apply_chat_template(text)
print(f"Total number of tokens: {len(longest_sample)}")
print("---------Tokens------------", longest_sample, sep="\n")
print("---------SAMPLE------------")
pprint(text)

In [None]:
# Rebuild a Hugging Face Dataset from the length-filtered DataFrame.
dataset = Dataset.from_pandas(df_train.reset_index(drop=True))

# Shuffle with a fixed seed so the 1000-sample subset is reproducible.
shuffled_dataset = dataset.shuffle(seed=42)

# Keep the first 1000 samples of the shuffled dataset as a small subset.
first_1000_samples = shuffled_dataset.select(range(1000))

# Split the full dataset 80/20 into train and test. The split shuffles by
# default, so it gets an explicit seed as well; without one, every run would
# produce different train/test files.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset

In [None]:
# Write the train split, the test split and the 1000-sample subset to JSON
# files in the current working directory.
dataset['train'].to_json('train_dataset.json', orient='records')
dataset['test'].to_json('test_dataset.json', orient='records')
first_1000_samples.to_json('dataset.json', orient='records')

In [2]:
# Load the train split of "prince-canuma/tinyOrca". This rebinds `dataset`,
# replacing the train/test splits built earlier; the trainer below uses this one.
dataset = load_dataset('prince-canuma/tinyOrca', split='train')

Downloading readme: 100%|██████████| 2.16k/2.16k [00:00<00:00, 7.53MB/s]
Downloading data: 100%|██████████| 839k/839k [00:00<00:00, 941kB/s]
Generating train split: 100%|██████████| 1000/1000 [00:00<00:00, 70067.39 examples/s]


In [None]:
from trl.models.utils import *

resize_to_multiple_of = None
chat_format = ChatMlSpecialTokens()
# Use the ChatML chat template on the tokenizer.
tokenizer.chat_template = chat_format.chat_template

# Register the ChatML bos/eos tokens plus a dedicated pad token.
tokenizer.add_special_tokens(
    {
        "bos_token": chat_format.bos_token,
        "eos_token": chat_format.eos_token,
        "pad_token": '<|pad|>',
    }
)
# Keep the model config and the generation config in sync with the tokenizer ids.
for cfg in (model.config, model.generation_config):
    cfg.bos_token_id = tokenizer.bos_token_id
    cfg.eos_token_id = tokenizer.eos_token_id
    cfg.pad_token_id = tokenizer.pad_token_id
# Not needed for models with pre-existing blank tokens like Gemma
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=resize_to_multiple_of)

In [None]:
# Print the tokenizer's special tokens and both model configs to verify that
# the bos/eos/pad settings applied above are in place.
print(f"Tokenizer special tokens: {tokenizer.special_tokens_map_extended}")
print(f"Model config: {model.config}")
print(f"Model generation config: {model.generation_config}")

In [None]:
import wandb
# Authenticate with Weights & Biases (training below reports to "wandb").
wandb.login()

In [None]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub (a later cell calls `trainer.push_to_hub()`).
login()

In [None]:
import os
# Weights & Biases settings passed through environment variables.
os.environ["WANDB_PROJECT"] = "phi-2-SlimOrca"
# NOTE(review): presumably makes W&B store model checkpoints as artifacts — confirm against the W&B integration docs.
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

In [None]:
from peft import LoraConfig

# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
        lora_alpha=128,  # LoRA scaling factor
        lora_dropout=0.05,  # dropout applied in the LoRA layers
        r=256,  # rank of the low-rank update matrices
        bias="none",  # bias parameters are not trained
        # Names of the modules that receive LoRA adapters.
        target_modules=["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2"],
        # Modules trained in full and saved with the adapter
        # (presumably because new chat tokens were added to the vocabulary — confirm).
        modules_to_save=["embed_tokens", "lm_head"],
        task_type="CAUSAL_LM",
)


In [None]:
from transformers import TrainingArguments

# Hyperparameters for the supervised fine-tuning run.
args = TrainingArguments(
    output_dir="phi-2-slimorca",  # directory for checkpoints and outputs
    max_steps=250,  # stop after a fixed number of optimizer steps
    # fp16=True,
    bf16=True, # For NVIDIA GPUs on Ampere Arch
    per_device_train_batch_size=2,
    # per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,  # 2 x batch size 2 = 4 samples per optimizer step per device
    gradient_checkpointing=True,
    optim="adamw_torch_fused",
    logging_steps=10,
    # evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=100,  # write a checkpoint every 100 steps
    learning_rate=2e-4,
    max_grad_norm=0.3,  # gradient clipping threshold
    # load_best_model_at_end=True,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    push_to_hub=False,  # NOTE(review): a later cell still calls trainer.push_to_hub() explicitly
    report_to="wandb"  # send training logs to Weights & Biases
)

In [None]:
# Collator configured with the string 'assistant' as the response template.
completion_only_collator = DataCollatorForCompletionOnlyLM(
    response_template='assistant', tokenizer=tokenizer
)
# Supervised fine-tuning trainer: quantized base model + LoRA adapters.
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    peft_config=peft_config,
    data_collator=completion_only_collator,
    max_seq_length=1744,
)

In [None]:
# Run fine-tuning with the arguments configured above.
trainer.train()

In [None]:
# Upload the training result to the Hugging Face Hub (requires the login performed earlier).
trainer.push_to_hub()

# Test

In [None]:
# Reload base model and LoRA adapter (if needed)
# model_id = 'prince-canuma/phi-2-slimorca'
# model = AutoModelForCausalLM.from_pretrained(
#     "microsoft/phi-2",
#     device_map="auto",
#     torch_dtype=torch.bfloat16,
# )
# model.load_adapter(model_id)

In [None]:
def prGreen(skk):
    """Print `skk` wrapped in the ANSI escape codes `\\033[92m` ... `\\033[00m`."""
    print("\033[92m{}\033[00m".format(skk))
def generate_response(inputs, max_new_tokens=256, ground_truths=None):
    """Generate and print the model's reply to a chat conversation.

    Uses the notebook-level `model` and `tokenizer`.

    Args:
        inputs: Chat messages (list of dicts with "role" and "content") that
            are rendered with the tokenizer's chat template.
        max_new_tokens: Upper bound on the number of generated tokens.
        ground_truths: Optional reference answer; printed after the response
            when truthy.

    Returns:
        None. The conversation, the decoded response and the optional ground
        truth are printed.
    """
    pprint(inputs)
    # Put the prompt on the model's own device instead of assuming "cuda".
    model_inputs = tokenizer.apply_chat_template(
        inputs, add_generation_prompt=True, return_tensors='pt'
    ).to(model.device)

    # Greedy decoding: sampling-only options such as `top_p` have no effect
    # when `do_sample=False`, so none are passed.
    outputs = model.generate(model_inputs, do_sample=False, max_new_tokens=max_new_tokens)
    input_length = model_inputs.shape[1]
    prGreen("Response")
    # Decode only the newly generated tokens, i.e. everything after the prompt.
    print(tokenizer.batch_decode(outputs[:, input_length:], skip_special_tokens=True)[0])

    if ground_truths:
        prGreen("Ground Truth")
        print(ground_truths)

In [None]:
# Tokenize a sample multiple-choice prompt with the chat template and move the
# tensor to the GPU.
# NOTE(review): `inputs` is not referenced by any later cell visible here — the
# next cell passes the messages to generate_response() directly.
inputs = tokenizer.apply_chat_template(
    [
        {"content":"","role":"system"},
        {"content":"""Given the question: Read the article and select the best
         answer. Article: Can you swim? Do you like swimming? Well, how can you
         learn to swim? I think the best way is to go into the water and learn.
        I'm afraid you'll never learn to swim just by reading books about
        Swimming or looking at others swimming. It's the same with the English
        study. We must practice, practice and practice. Listening and speaking
        are very important for beginners. We can listen to English programs on radio.
        You may just understand a few words. It doesn't matter. Just be relaxed,
        try to catch every word. Somebody may be a good listener, but he is afraid
        to speak because he's afraid of making mistakes. You know we sometimes
        make mistakes when we speak Chinese. Don't be afraid. We must be brave.
        If you really want to learn English well, you must try to speak with
        everyone as long as he knows English. When there's nobody to talk with,
        you can talk to yourself in English. It's interesting and also a good
        way to practice your spoken English. Remember, the more you speak, the
        fewer mistakes you'll make. Reading and writing are more important for
        senior school students. First we must choose the books we're interested
        in. A lot of reading will improve your language sense.
        This is very important. It's easier said than done. Well, let's do
        more practice from now on. I'm sure you'll learn English well in this
        way. ,A, B, C, D,. (10)
        Question: Which is the best title for the passage?
        Options:
            A: How to Learn English.
            B: Easier Said Than Done.
            C: Listen First, Speak Second.
            D: How to learn to Swim.\n
        The answer is:""","role":"user"}
    ], add_generation_prompt=True, return_tensors='pt',
).to('cuda')

In [None]:
# Sample 1: reading-comprehension multiple choice with an empty system prompt;
# the expected answer is passed as `ground_truths` for side-by-side comparison.
generate_response(
    [
        {"content":"","role":"system"},
        {"content":"""Given the question: Read the article and select the best
            answer. Article: Can you swim? Do you like swimming? Well, how can you
            learn to swim? I think the best way is to go into the water and learn.
            I'm afraid you'll never learn to swim just by reading books about
            Swimming or looking at others swimming. It's the same with the English
            study. We must practice, practice and practice. Listening and speaking
            are very important for beginners. We can listen to English programs on radio.
            You may just understand a few words. It doesn't matter. Just be relaxed,
            try to catch every word. Somebody may be a good listener, but he is afraid
            to speak because he's afraid of making mistakes. You know we sometimes
            make mistakes when we speak Chinese. Don't be afraid. We must be brave.
            If you really want to learn English well, you must try to speak with
            everyone as long as he knows English. When there's nobody to talk with,
            you can talk to yourself in English. It's interesting and also a good
            way to practice your spoken English. Remember, the more you speak, the
            fewer mistakes you'll make. Reading and writing are more important for
            senior school students. First we must choose the books we're interested
            in. A lot of reading will improve your language sense.
            This is very important. It's easier said than done. Well, let's do
            more practice from now on. I'm sure you'll learn English well in this
            way. ,A, B, C, D,. (10)
            Question: Which is the best title for the passage?
            Options:
                A: How to Learn English.
                B: Easier Said Than Done.
                C: Listen First, Speak Second.
                D: How to learn to Swim.\n
            The answer is:
        ""","role":"user"}
    ],
    ground_truths="A: How to Learn English.")

In [None]:
# Sample 2: multiple-selection question with an instruction-following system prompt.
generate_response(
    [
        {"content":"You are an AI assistant that follows instruction extremely well. Help as much as you can.","role":"system"},
        {"content":"Fact 1: Natural disasters can cause animals to leave an environment.  Fact 2: If the property is damaged by a fire, natural disaster.  Given the two facts above, what can cause animals to leave an environment?\n\nChoose from:\n(I). light rain\n(II). good weather\n(III). drought\n(IV). mines\n(V). storms\n(VI). fires\n(VII). wind\n(VIII). gentle breezes\nAnswer:","role":"user"}
    ],
    ground_truths="Based on the provided facts, the possible causes for animals to leave an environment are:\n\n(III). drought\n(V). storms\n(VI). fires")

In [None]:
# Sample 3: extractive question answering with a step-by-step system prompt.
generate_response(
    [
         {"content":"You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.","role":"system"},{"content":"Please answer the following question: Extract the answer to the question from the following context. Question: What organization usually holds it's sporting events in Oklahoma City? Context: Oklahoma City is the annual host of the Big 12 Baseball Tournament, the World Cup of Softball, and the annual NCAA Women's College World Series. The city has held the 2005 NCAA Men's Basketball First and Second round and hosted the Big 12 Men's and Women's Basketball Tournaments in 2007 and 2009. The major universities in the area \u2013 University of Oklahoma, Oklahoma City University, and Oklahoma State University \u2013 often schedule major basketball games and other sporting events at Chesapeake Energy Arena and Chickasaw Bricktown Ballpark, although most home games are played at their campus stadiums.\nAnswer:","role":"user"}
    ],
    ground_truths="The organization that usually holds its sporting events in Oklahoma City is the Big 12 Conference."
)