# Training Mistral-7B Instruct model on a public QA dataset

In [None]:
# Model identifier passed to `from_pretrained` when loading the tokenizer and model.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
# Forwarded to `ScalingConfig(use_gpu=...)` when the TorchTrainer is built.
use_gpu = True
# Number of Ray Train workers; also used when deriving `steps_per_epoch`.
num_workers = 1
# CPU count requested per worker through `resources_per_worker`.
cpus_per_worker = 2

In [None]:
import ray

# Initialise Ray with a runtime environment that pip-installs the packages
# listed below.
ray.init(
    runtime_env={
        "pip": [
            "datasets",
            "evaluate",
            # Latest combination of accelerate==0.19.0 and transformers==4.29.0
            # seems to have issues with DeepSpeed process group initialization,
            # and will result in a batch_size validation problem.
            # TODO(jungong) : get rid of the pins once the issue is fixed.
            # NOTE(review): a later cell pip-installs transformers from git
            # (4.37.0.dev0 according to its recorded output), so the pins below
            # do not match the driver environment. transformers 4.26.0
            # presumably also predates Mistral support -- confirm and update.
            "accelerate==0.16.0",
            "transformers==4.26.0",
            "torch>=1.12.0",
            "deepspeed==0.12.3",
        ],
    },
)

2024-01-17 19:23:33,945	INFO worker.py:1724 -- Started a local Ray instance.


0,1
Python version:,3.10.12
Ray version:,2.9.0


In [None]:
# NOTE(review): this shuts Ray down right after the `ray.init(...)` cell, so
# the runtime_env configured there presumably no longer applies to later
# cells -- confirm this is intentional.
ray.shutdown()

### Loading Dataset

In [None]:
# from datasets import load_dataset

# print("Loading tiny_shakespeare dataset")
# current_dataset = load_dataset("tiny_shakespeare")
# current_dataset

## Finetuning Mistral 7B Instruct Model

In [None]:
!pip install git+https://github.com/huggingface/transformers trl accelerate torch bitsandbytes peft datasets

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-8ap6ab64
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-8ap6ab64
  Resolved https://github.com/huggingface/transformers to commit f4f57f9dfa68948a383c352a900d588f63f6290a
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting tokenizers<0.19,>=0.14 (from transformers==4.37.0.dev0)
  Using cached tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25ldone
[?25h  Created wheel for transformers: filename=transformers-4

In [None]:
import torch
from trl import SFTTrainer
# from google.colab import drive
from random import randrange
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training

[2024-01-17 19:23:46,390] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [None]:
import pandas as pd

In [None]:
from ray import train
from ray.train.huggingface.transformers import (
    prepare_trainer,
    RayTrainReportCallback
)

## Loading Dataset and Formatting

In [None]:
# Fetch the "train" split of the Dolly 15k instruction dataset.
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

# Fine-tuning only uses question/answer pairs without supporting context,
# so keep just the rows whose `context` field is the empty string.
dataset = dataset.filter(lambda row: row["context"] == "")

# A prompting formatting function
def create_prompt_instruction(sample):
   """Render one dataset row as an "instruction reconstruction" prompt.

   The row's ``response`` is placed under ``### Input`` and its
   ``instruction`` under ``### Response:``, i.e. the prompt asks for the
   instruction that could have produced the given response.

   Args:
      sample: Mapping that provides the ``'response'`` and ``'instruction'``
         keys.

   Returns:
      The formatted prompt string. Non-blank lines after the first keep the
      three-space indentation of the literal below.
   """
   return f"""### Instruction:
   Use the input below to create an instruction, which could have been used to generate the input using an LLM.

   ### Input
   {sample['response']}

   ### Response:
   {sample['instruction']}
   """


In [None]:
import ray.data

# Wrap the filtered Hugging Face dataset as a Ray Dataset, keyed by split name.
ray_datasets = dict(train=ray.data.from_huggingface(dataset))

# ray_datasets = {
#     "train": ray.data.from_huggingface(dataset["train"]),
#     "validation": ray.data.from_huggingface(dataset["validation"])
# }

ray_datasets

{'train': MaterializedDataset(
    num_blocks=1,
    num_rows=10544,
    schema={
       instruction: string,
       context: string,
       response: string,
       category: string
    }
 )}

In [None]:
# Token length used by `tokenize`: passed as `max_length` together with
# truncation and "max_length" padding.
block_size = 512

In [None]:
from transformers import AutoTokenizer

def split_text(batch: pd.DataFrame) -> pd.DataFrame:
    """Re-split a batch of text into one cleaned line per row.

    All values of the ``text`` column are concatenated (no separator), the
    result is split on newlines, and each piece is stripped of surrounding
    whitespace. Pieces that are empty after stripping, or whose last
    character is ``":"``, are discarded.

    Args:
        batch: DataFrame with a ``text`` column of strings.

    Returns:
        A new DataFrame with a single ``text`` column holding the kept lines.
    """
    joined = "".join(batch["text"])
    kept_lines = []
    for raw_line in joined.split("\n"):
        line = raw_line.strip()
        # Skip blank lines and lines that end with a colon.
        if not line or line.endswith(":"):
            continue
        kept_lines.append(line)
    return pd.DataFrame(kept_lines, columns=["text"])


def tokenize(batch: pd.DataFrame) -> dict:
    """Tokenize the ``text`` column of a batch into fixed-length model inputs.

    A slow (``use_fast=False``) tokenizer for ``model_name`` is loaded on
    every call, with its EOS token reused as the padding token. Each text is
    truncated and padded to ``block_size`` tokens and returned as NumPy
    arrays; ``labels`` is a copy of ``input_ids``.

    Args:
        batch: DataFrame with a ``text`` column.

    Returns:
        A plain dict built from the tokenizer output plus the ``labels`` key.
    """
    tok = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    tok.pad_token = tok.eos_token
    texts = list(batch["text"])
    encoded = tok(
        texts,
        padding="max_length",
        max_length=block_size,
        truncation=True,
        return_tensors="np",
    )
    # Labels are a copy of the input ids.
    encoded["labels"] = encoded["input_ids"].copy()
    return dict(encoded)

def build_prompt_text(batch: pd.DataFrame) -> pd.DataFrame:
    """Render every row of a raw batch into a single ``text`` prompt column.

    Args:
        batch: DataFrame of dataset rows; each row is passed to
            ``create_prompt_instruction`` as a dict.

    Returns:
        A DataFrame with one ``text`` column, one prompt per input row.
    """
    prompts = [create_prompt_instruction(row) for row in batch.to_dict("records")]
    return pd.DataFrame({"text": prompts})


# The source rows expose instruction/context/response/category columns but no
# "text" column, so they are first rendered into prompts for `tokenize`.
# `split_text` is deliberately not applied here: it splits on newlines and
# drops lines ending in ":", which would strip headers such as
# "### Response:" out of every prompt.
processed_datasets = {
    key: ds.map_batches(build_prompt_text, batch_format="pandas")
    .map_batches(tokenize, batch_format="pandas")
    .random_shuffle(seed=42)
    for key, ds in ray_datasets.items()
}
processed_datasets

{'train': RandomShuffle
 +- MapBatches(tokenize)
    +- MapBatches(split_text)
       +- Dataset(
             num_blocks=1,
             num_rows=10544,
             schema={
                instruction: string,
                context: string,
                response: string,
                category: string
             }
          )}

In [None]:
# print(create_prompt_instruction(dataset[0]))

### Instruction: 
   Use the input below to create an instruction, which could have been used to generate the input using an LLM. 

   ### Input 
   Tope

   ### Response:
   Which is a species of fish? Tope or Rope
   


In [None]:
def train_func(config):
    """Per-worker training loop run by Ray Train's ``TorchTrainer``.

    Builds DeepSpeed ZeRO stage-3 ``TrainingArguments``, loads the tokenizer
    and causal LM named by the module-level ``model_name``, wraps this
    worker's dataset shard(s) in a Hugging Face ``Trainer``, registers
    ``RayTrainReportCallback`` and runs training.

    Args:
        config: Dict supplied through ``train_loop_config``. Recognised keys:
            ``batch_size`` (default 4), ``epochs`` (default 2),
            ``warmup_steps`` (default 0), ``learning_rate`` (default 2e-5),
            ``weight_decay`` (default 0.01) and ``steps_per_epoch``
            (required).

    Raises:
        ValueError: If ``steps_per_epoch`` is missing from ``config``.
    """
    # Local imports: these names are not imported anywhere else in the
    # notebook, and keeping them here makes the worker function self-contained.
    import os

    from transformers import AutoModelForCausalLM, Trainer, default_data_collator
    from transformers.utils.logging import disable_progress_bar, enable_progress_bar

    # Use the actual number of CPUs assigned by Ray
    os.environ["OMP_NUM_THREADS"] = str(
        train.get_context().get_trial_resources().bundles[-1].get("CPU", 1)
    )
    # Enable tf32 for better performance
    torch.backends.cuda.matmul.allow_tf32 = True

    batch_size = config.get("batch_size", 4)
    epochs = config.get("epochs", 2)
    warmup_steps = config.get("warmup_steps", 0)
    learning_rate = config.get("learning_rate", 0.00002)
    weight_decay = config.get("weight_decay", 0.01)
    steps_per_epoch = config.get("steps_per_epoch")
    if steps_per_epoch is None:
        # Fail early with a clear message instead of a TypeError in the
        # `steps_per_epoch * epochs` computation below.
        raise ValueError("train_loop_config must provide 'steps_per_epoch'")

    # DeepSpeed configuration; "auto" values are filled in from TrainingArguments.
    deepspeed = {
        "fp16": {
            "enabled": "auto",
            "initial_scale_power": 8,
        },
        "bf16": {"enabled": "auto"},
        "optimizer": {
            "type": "AdamW",
            "params": {
                "lr": "auto",
                "betas": "auto",
                "eps": "auto",
            },
        },
        "zero_optimization": {
            "stage": 3,
            "offload_optimizer": {
                "device": "cpu",
                "pin_memory": True,
            },
            "offload_param": {
                "device": "cpu",
                "pin_memory": True,
            },
            "overlap_comm": True,
            "contiguous_gradients": True,
            "reduce_bucket_size": "auto",
            "stage3_prefetch_bucket_size": "auto",
            "stage3_param_persistence_threshold": "auto",
            "gather_16bit_weights_on_model_save": True,
            "round_robin_gradients": True,
        },
        "gradient_accumulation_steps": "auto",
        "gradient_clipping": "auto",
        "steps_per_print": 10,
        "train_batch_size": "auto",
        "train_micro_batch_size_per_gpu": "auto",
        "wall_clock_breakdown": False,
    }

    print("Preparing training arguments")
    training_args = TrainingArguments(
        "output",
        logging_steps=1,
        save_strategy="steps",
        save_steps=steps_per_epoch,
        max_steps=steps_per_epoch * epochs,
        # Keep the Trainer/DeepSpeed batch-size accounting in line with the
        # batches produced by `iter_torch_batches` below.
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_steps=warmup_steps,
        label_names=["input_ids", "attention_mask"],
        push_to_hub=False,
        report_to="none",
        disable_tqdm=True,  # declutter the output a little
        fp16=True,
        gradient_checkpointing=True,
        deepspeed=deepspeed,
    )
    disable_progress_bar()

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    print("Loading model")

    # Resolve the architecture from the checkpoint instead of hard-coding a
    # GPT-J class, which does not match the Mistral `model_name`.
    model = AutoModelForCausalLM.from_pretrained(model_name, use_cache=False)
    model.resize_token_embeddings(len(tokenizer))

    print("Model loaded")

    enable_progress_bar()

    train_ds = train.get_dataset_shard("train")
    # Only a "train" dataset is registered with the trainer in this notebook,
    # so treat the validation shard as optional (missing -> None or KeyError).
    try:
        eval_ds = train.get_dataset_shard("validation")
    except KeyError:
        eval_ds = None

    train_ds_iterable = train_ds.iter_torch_batches(batch_size=batch_size)
    eval_ds_iterable = (
        eval_ds.iter_torch_batches(batch_size=batch_size)
        if eval_ds is not None
        else None
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds_iterable,
        eval_dataset=eval_ds_iterable,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
    )

    # Report metrics/checkpoints to Ray Train and let Ray adapt the Trainer
    # to the Ray Data iterables before starting the run.
    trainer.add_callback(RayTrainReportCallback())
    trainer = prepare_trainer(trainer)
    trainer.train()

In [None]:
# Location handed to `RunConfig(storage_path=...)` when the trainer is built.
# NOTE(review): the value looks like a placeholder -- replace before running.
storage_path="Path/to/storage/models"     # TODO: Alternatively, set up NFS

In [None]:
from ray.train.torch import TorchTrainer
from ray.train import RunConfig, ScalingConfig

# Per-device batch size; together with the worker count it determines how
# many optimisation steps make up one pass over the training data (floor
# division, so a trailing partial batch is not counted).
batch_size = 16
train_ds_size = processed_datasets["train"].count()
steps_per_epoch = train_ds_size // (batch_size * num_workers)

# NOTE(review): the trainer is only constructed here; no `trainer.fit()` call
# is visible in this notebook.
trainer = TorchTrainer(
    train_loop_per_worker=train_func,
    train_loop_config=dict(
        epochs=1,
        batch_size=batch_size,  # per device
        steps_per_epoch=steps_per_epoch,
    ),
    datasets=processed_datasets,
    scaling_config=ScalingConfig(
        num_workers=num_workers,
        use_gpu=use_gpu,
        resources_per_worker=dict(GPU=1, CPU=cpus_per_worker),
    ),
    run_config=RunConfig(storage_path=storage_path),
)

## Evaluation

In [None]:
# Restore the fine-tuned PEFT model from "mistral_instruct_qa" in bfloat16,
# letting `device_map="auto"` decide placement.
finetuned_model = AutoPeftModelForCausalLM.from_pretrained(
    "mistral_instruct_qa",
    device_map="auto",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)

# The tokenizer is read from the same "mistral_instruct_qa" location.
tokenizer = AutoTokenizer.from_pretrained("mistral_instruct_qa")

In [None]:
# `load_from_disk` is not imported by any earlier cell, so bring it in here.
from datasets import load_from_disk

# Import saved dataset split from the drive (saved during fine-tuning).
# Google Drive is mounted only when running inside Colab; elsewhere
# `google.colab` is unavailable and the mount step is skipped.
try:
    from google.colab import drive
except ImportError:
    drive = None
if drive is not None:
    drive.mount('/content/drive')


# Load dataset and prepare evaluation prompts
dataset = load_from_disk('/path/to/your/saved/dataset/split')

test_dataset = dataset['test']

# Dataset repr recorded by the original author:
# Dataset({
#     features: ['instruction', 'context', 'response', 'category'],
#     num_rows: 3164
# })


# Prepare evaluation prompts: same template as training, but the
# "### Response:" section is left open for the model to complete.
# NOTE(review): continuation lines here are indented four spaces while
# `create_prompt_instruction` uses three -- confirm which layout the
# fine-tuned model was actually trained on.
prompts = [
    f"""### Instruction:
    Use the input below to create an instruction, which could have been used to generate the input using an LLM.

    ### Input
    {sample['response']}

    ### Response:
    """ for sample in test_dataset
]

# Get references / Ground Truth the model will be evaluated against.
# These must come from the same rows, in the same order, as `prompts`, so
# iterate the test split rather than the container returned by load_from_disk.
references = [sample['instruction'] for sample in test_dataset]

In [None]:
import evaluate
from vllm import LLM, SamplingParams
# Load the ROUGE metric; it is used later to score the generated responses
# against `references`.
rouge = evaluate.load('rouge')

In [None]:
# Sampling parameters
# NOTE(review): neither `max_tokens` nor `seed` is set, so vLLM's defaults
# apply; with temperature 0.8 the generations are presumably not reproducible
# between runs -- confirm the defaults suit this evaluation.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# My finetuned model is pushed to this hf repository path
hf_finetuned_model_path = "path/to/your/Finetuned-mistral-7b-instruct-model-v01"

# An LLM instance
llm = LLM(model=hf_finetuned_model_path)

# Generate responses
outputs = llm.generate(prompts, sampling_params)

# Keep the text of the first completion of every request, in the order the
# requests were returned by `generate`.
finetuned_model_responses = [output.outputs[0].text for output in outputs]

In [None]:
# Score the generated responses against the reference instructions.
finetuned_model_evaluation = rouge.compute(
    predictions=finetuned_model_responses,
    references=references,
)

# Print 'rouge1', 'rouge2', and 'rougeL', separated by a dashed rule.
report_sections = (
    ("Rouge-1 Evaluation:", "rouge1"),
    ("Rouge-2 Evaluation:", "rouge2"),
    ("Rouge-L Evaluation:", "rougeL"),
)
for position, (heading, metric_key) in enumerate(report_sections):
    if position:
        print("--"*20)
    print(heading)
    print(finetuned_model_evaluation[metric_key])