# Lesson 2: Centralized LLM Fine-tuning

In [11]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Welcome to Lesson 2!

To access the `requirements.txt` and `utils` files for this course, go to `File` and click `Open`.

#### 1. Import packages and utilities

In [12]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
import torch
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from utils.utils import *
from utils.LLM import LLM_pretrained, LLM_cen_partial
from utils.LLM import get_fireworks_api_key,load_env

> Note: Throughout this course, we use Hydra, a framework for managing configuration files.

In [13]:
# Build the Hydra config object for the "centralized" experiment.
cfg = get_config("centralized")

# Pretty-print the resolved config so the dataset, model and training
# settings used by the rest of the notebook are visible.
print_config(cfg)

dataset:
  name: medalpaca/medical_meadow_medical_flashcards
model:
  name: EleutherAI/pythia-70m
  quantization: 4
  gradient_checkpointing: true
  use_fast_tokenizer: true
  lora:
    peft_lora_r: 16
    peft_lora_alpha: 64
    target_modules: null
train:
  save_every_round: 5
  seq_length: 512
  padding_side: left
  training_arguments:
    learning_rate: 0.0005
    per_device_train_batch_size: 2
    gradient_accumulation_steps: 1
    logging_steps: 1
    max_steps: 5
    report_to: null
    save_steps: 200
    save_total_limit: 10
    gradient_checkpointing: true
    lr_scheduler_type: cosine



#### 2. Load the dataset
  
**Note:** For more information about the dataset, see the [medAlpaca paper](https://arxiv.org/abs/2304.08247).

In [14]:
# Fetch the full training split of the dataset named in the config.
trainset_full = load_dataset(cfg.dataset.name, split="train")

# test_size=0.9 sends 90% of the rows to the "test" half, so the "train" half
# kept below is the remaining 10%. The fixed seed makes the split repeatable.
train_test = trainset_full.train_test_split(seed=1234, test_size=0.9)

# Run the course helper over the kept subset; the summary printed below lists
# the resulting 'instruction' and 'response' columns.
train_dataset = format_dataset(train_test["train"])

print(train_dataset)

Dataset({
    features: ['instruction', 'response'],
    num_rows: 3395
})


In [15]:
# Pick one record from the formatted training set to use as a running example.
example_index = 9
data_point = train_dataset[example_index]

# A bare trailing expression lets the notebook display the record.
data_point

{'instruction': 'At what level does the inferior vena cava (IVC) perforate the diaphragm?',
 'response': 'The IVC perforates the diaphragm at the level of T8.'}

#### 3. Asking the LLM 


* First attempt 

Ask a pre-trained LLM a question in a specific domain.

In [16]:
# Instantiate the pre-trained LLM wrapper imported from utils.LLM.
# NOTE(review): the captured ValueError further down suggests this wrapper
# calls the Fireworks API and needs an API key — confirm in utils/LLM.py.
llm_pretrained = LLM_pretrained()

* Write a prompt to ask the LLM.

In [18]:
import os
from getpass import getpass

import fireworks.client

# Never hard-code credentials in a notebook: they end up in version control
# and in shared copies. Read the key from the FIREWORKS_API_KEY environment
# variable and, if it is not set, prompt for it without echoing the value.
# Run this cell before any `llm_pretrained.eval(...)` call; otherwise the
# client raises "ValueError: No API key provided".
_fireworks_api_key = (
    os.environ.get("FIREWORKS_API_KEY") or getpass("Fireworks API key: ")
).strip()
if not _fireworks_api_key:
    raise ValueError(
        "No Fireworks API key provided. Set the FIREWORKS_API_KEY "
        "environment variable or enter the key when prompted."
    )
fireworks.client.api_key = _fireworks_api_key


In [17]:
# A general-knowledge question, to see how the pre-trained model answers
# outside the medical domain.
prompt = "How to predict the weather"

# Query the model, then print its stored response; both calls run with
# verbose output turned off.
llm_pretrained.eval(prompt, verbose=False)
llm_pretrained.print_response(verbose=False)

ValueError: No API key provided. You can set your API key in code using 'fireworks.client.api_key = <API-KEY>', or you can set the environment variable FIREWORKS_API_KEY=<API-KEY>).

* Evaluate pre-trained model on Medical Q&A

In [None]:
# Ask the pre-trained model the medical question from the sampled record.
llm_pretrained.eval(data_point["instruction"], verbose=True)
llm_pretrained.print_response()

# Show the reference answer from the dataset next to the model's answer.
ex_response = format_string(data_point["response"])
print(f"Expected output:\n\t{ex_response}")

* Second attempt:

Ask a fine-tuned LLM a question in a specific domain.

* Set the model.

In [None]:
# Build the model described by the `model` section of the config.
model = get_model(cfg.model)

# Report how many of the model's parameters will actually be updated during
# fine-tuning, both as raw counts and as a percentage of the total.
trainable, all_parameters = model.get_nb_trainable_parameters()
print(f"Trainable parameters: {trainable}")
print(f"All parameters: {all_parameters}")
print(f"Trainable (%): {100 * trainable / all_parameters:.3f}")

* Define the tokenizer.

In [None]:
# One helper call returns the three pieces the trainer needs: the tokenizer,
# the data collator, and the function that formats records into prompts.
tokenizer, data_collator, format_prompts_fn = (
    get_tokenizer_and_data_collator_and_propt_formatting(
        cfg.model.name,
        cfg.model.use_fast_tokenizer,
        cfg.train.padding_side,
    )
)

* Define the finetune_centralised function.

In [None]:
# Directory used both as the trainer's output_dir and for the final checkpoint.
save_centralized = "./my_centralized_model"


def finetune_centralised():
    """Fine-tune the notebook-level ``model`` on ``train_dataset`` and save it.

    Builds ``TrainingArguments`` from ``cfg.train.training_arguments``, runs an
    ``SFTTrainer`` over the training set, then writes the model to
    ``save_centralized`` with ``model.save_pretrained``.

    Relies on names defined in earlier cells: ``cfg``, ``model``,
    ``tokenizer``, ``data_collator``, ``format_prompts_fn`` and
    ``train_dataset``. Returns ``None``.
    """
    # The course notebooks do not come with a GPU. "cpu" is deliberately not
    # hard-coded, so the same code uses a GPU when this notebook is downloaded
    # and run on a machine that has one.
    train_args = TrainingArguments(
        **cfg.train.training_arguments,
        use_cpu=not torch.cuda.is_available(),
        output_dir=save_centralized,
    )

    # Assemble the supervised fine-tuning trainer.
    sft_trainer = SFTTrainer(
        model=model,
        args=train_args,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        formatting_func=format_prompts_fn,
        max_seq_length=cfg.train.seq_length,
    )

    # Run the training loop.
    sft_trainer.train()

    # Persist the fine-tuned checkpoint.
    model.save_pretrained(save_centralized)

* Let's do some fine-tuning.

In [None]:
# Run the centralized fine-tuning defined above. With the "centralized" config
# printed earlier, training is capped at max_steps=5.
finetune_centralised()

> Note: The previous fine-tuning was performed on a smaller model. To fine-tune a larger model (e.g., a 7 billion parameter model), you can use the same code with a different configuration. Refer to the code snippet below:


```
 cfg = get_config("centralized_full")
```

It is recommended to re-run all prior cells with this configuration. However, please ensure that your machine has at least one GPU.

* Evaluate the Centrally fine-tuned LLM.

In [None]:
# Instantiate the centrally fine-tuned LLM wrapper imported from utils.LLM.
llm_cen = LLM_cen_partial()

# Re-select the same record used for the pre-trained model so the two
# answers can be compared directly.
example_index = 9
data_point = train_dataset[example_index]

# Ask the fine-tuned model the question and print its answer.
llm_cen.eval(data_point["instruction"], verbose=True)
llm_cen.print_response()

# Show the reference answer from the dataset.
ex_response = format_string(data_point["response"])
print(f"Expected output:\n\t{ex_response}")

#### 4. Visualize results of prompting with pretrained LLM and fine-tuned LLM

In [None]:
# Plot the results for the two named runs side by side.
# NOTE(review): '7b/pretrained' and '7b/cen_10' are presumably pre-computed
# evaluation results for the 7B model shipped with the course utilities —
# confirm against utils/utils.py.
visualize_results(results=['7b/pretrained', '7b/cen_10'])

#### Extra! 

#### Generate the data structure for systematic evaluation

We used this code to evaluate the performance of the fine-tuned 7B LLM you tested in the previous code cell.

In [None]:
# Launch evaluation code
# from utils.utils import inference, evaluate

# Step 1: generate answers
# To run inference on the pre-trained model:
# inference(base_model_name_path=cfg.model.name, run_name="pretrained")

# To run inference on the centralised finetuned model:
# inference(
#           base_model_name_path=cfg.model.name, 
#           peft_path="path/to/your/checkpoint/directory",
#           run_name="centralised_finetuned",
# )


# Step 2: evaluation --- accuracy value will be printed
# evaluate(run_name="pretrained")
# evaluate(run_name="centralised_finetuned")