# Fine-Tuning LLMs with Hugging Face

## Step 1: Installing and importing the libraries

In [2]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m235.5/244.2 kB[0m [31m7.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m101.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m104.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━

In [3]:
!pip install huggingface_hub



In [4]:
!pip install gtts

Collecting gtts
  Downloading gTTS-2.5.1-py3-none-any.whl (29 kB)
Installing collected packages: gtts
Successfully installed gtts-2.5.1


In [5]:
import torch
from trl import SFTTrainer
from peft import LoraConfig
from datasets import load_dataset
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline)

## Step 2: Loading the model

In [6]:
# Load the checkpoint with 4-bit NF4 quantization and float16 compute.
llama_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path = "aboonaji/llama2finetune-v2",
    quantization_config = BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_compute_dtype = torch.float16,
        bnb_4bit_quant_type = "nf4",
    ),
)
# NOTE(review): the cache is presumably disabled because it is not useful
# while training — confirm; it stays disabled for the later generation cell.
llama_model.config.use_cache = False
# NOTE(review): presumably selects the non-tensor-parallel code path — confirm.
llama_model.config.pretraining_tp = 1

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

## Step 3: Loading the tokenizer

In [7]:
# Load the tokenizer published under the same checkpoint id as the model.
llama_tokenizer = AutoTokenizer.from_pretrained(
    "aboonaji/llama2finetune-v2",
    trust_remote_code = True,
)
# Pad on the right, reusing the end-of-sequence token as the padding token.
llama_tokenizer.padding_side = "right"
llama_tokenizer.pad_token = llama_tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

## Step 4: Setting the training arguments

In [8]:
# Training configuration: output directory ./results, 4 samples per device
# per step, and a hard stop after 100 training steps.
training_arguments = TrainingArguments(
    output_dir = "./results",
    per_device_train_batch_size = 4,
    max_steps = 100,
)

## Step 5: Creating the Supervised Fine-Tuning trainer

In [9]:
# Training split of the medical-terms dataset; its "text" column feeds the trainer.
medical_terms_dataset = load_dataset(
    path = "aboonaji/wiki_medical_terms_llam2_format",
    split = "train",
)
# LoRA adapter settings for causal language modelling.
lora_config = LoraConfig(
    task_type = "CAUSAL_LM",
    r = 64,
    lora_alpha = 16,
    lora_dropout = 0.1,
)
# Wire the model, tokenizer, data and adapter config into the SFT trainer.
llama_sft_trainer = SFTTrainer(
    model = llama_model,
    args = training_arguments,
    train_dataset = medical_terms_dataset,
    tokenizer = llama_tokenizer,
    peft_config = lora_config,
    dataset_text_field = "text",
)

Downloading data:   0%|          | 0.00/54.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6861 [00:00<?, ? examples/s]



Map:   0%|          | 0/6861 [00:00<?, ? examples/s]

## Step 6: Training the model

In [10]:
# Run fine-tuning; the returned TrainOutput (step count, loss, runtime metrics)
# is the last expression of the cell and is therefore displayed as its result.
llama_sft_trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


TrainOutput(global_step=100, training_loss=1.655062713623047, metrics={'train_runtime': 1381.4552, 'train_samples_per_second': 0.29, 'train_steps_per_second': 0.072, 'total_flos': 8228119310991360.0, 'train_loss': 1.655062713623047, 'epoch': 0.06})

## Step 7: Chatting with the model

In [22]:
user_prompt = "what is your name and tell me about Paracetamol poisoning"

# Text-generation pipeline over the fine-tuned model and its tokenizer,
# limited to a total length of 300.
text_generation_pipeline = pipeline(
    task = "text-generation",
    model = llama_model,
    tokenizer = llama_tokenizer,
    max_length = 300,
)

# Wrap the question in the <s>[INST] ... [/INST] prompt markers and generate.
model_answer = text_generation_pipeline(f"<s>[INST] {user_prompt} [/INST]")

# The pipeline returns a list of results; keep the text of the first one.
generated_text = model_answer[0]['generated_text']
def extract_response(text):
    """Return the model's reply with the instruction prompt removed.

    When ``text`` contains both an ``[INST]`` and an ``[/INST]`` marker, the
    result is everything after the first ``[/INST]``, stripped of surrounding
    whitespace. Otherwise the whole text is returned, stripped.
    """
    has_prompt_markers = "[INST]" in text and "[/INST]" in text
    if not has_prompt_markers:
        return text.strip()
    # partition splits on the first occurrence of the closing marker.
    _, _, reply = text.partition("[/INST]")
    return reply.strip()

# Keep only the text that follows the prompt markers, then print it so that
# line breaks in the reply are rendered rather than shown as escape sequences.
response = extract_response(generated_text)
print(response)

Hello! My name is LLaMA, I'm a large language model trained by a team of researcher at Meta AI. everybody! I'm here to help you with any questions you may have.

Paracetamol poisoning, also known as acetaminophen poisoning, occurs when a person takes more than the recommended dose of paracetamol, which is a common pain reliever and fever reducer found in many over-the-counter medications. Paracetamol is generally safe when used as directed, but taking too much can cause liver damage or even failure in severe cases.

Symptoms of paracetamol poisoning can include:

* Nausea and vomiting
* Abdominal pain
* Yellowing of the skin and eyes (jaundice)
* Loss of appetite
* Fatigue
* Confusion
* Headache
* Dizziness
* Sleepiness
* Coma

If you suspect you or someone else has taken too much paracetamol, it is important to seek medical attention immediately. Treatment may involve administering an antidote, such as N-acetylcysteine (NAC), to help


In [25]:
import base64
import json
import os
import re

from gtts import gTTS
from IPython.display import Audio, Javascript, display

def chunk_text(text, max_length=2000):
    """Split ``text`` into space-joined chunks of at most ``max_length`` characters.

    Every character other than word characters, whitespace, commas and
    periods is removed first. The remaining words are then packed greedily,
    in order, into chunks joined by single spaces.

    Args:
        text: The string to clean and split.
        max_length: Maximum number of characters allowed in one chunk.

    Returns:
        A list of non-empty chunk strings. A single word longer than
        ``max_length`` is kept whole as its own chunk rather than being cut.
        Empty or whitespace-only input yields an empty list.
    """
    # Keep commas and periods, remove other non-word characters
    text = re.sub(r'[^\w\s,.]', '', text)
    chunks = []
    chunk = []
    chunk_length = 0  # length of " ".join(chunk), tracked incrementally
    for word in text.split():
        # Joining adds one separating space unless the chunk is still empty.
        candidate_length = chunk_length + len(word) + (1 if chunk else 0)
        # Only flush a chunk that holds something: an over-long word arriving
        # while the chunk is empty must not emit an empty string.
        if chunk and candidate_length > max_length:
            chunks.append(" ".join(chunk))
            chunk = [word]
            chunk_length = len(word)
        else:
            chunk.append(word)
            chunk_length = candidate_length
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks

# Text to be converted to speech: the extracted model reply.
# NOTE: this rebinds generated_text, which until here held the raw pipeline
# output including the prompt.
generated_text = response

# Clean the text and split it into chunks (default limit: 2000 characters each)
text_chunks = chunk_text(generated_text)

# Convert each chunk to speech and save it as output_<index>.mp3 in the
# current working directory; audio_files collects the file names in order.
audio_files = []
for i, chunk in enumerate(text_chunks):
    tts = gTTS(text=chunk, lang='en')
    chunk_audio_file = f"output_{i}.mp3"
    tts.save(chunk_audio_file)
    audio_files.append(chunk_audio_file)
    print(f"Saved {chunk_audio_file}")

# Embed every MP3 as a base64 data URI. The playback script runs in the
# browser, which cannot fetch a kernel-side filesystem path such as
# /content/output_0.mp3, so the audio bytes are passed inline instead.
audio_sources = []
for file in audio_files:
    with open(file, "rb") as audio_handle:
        encoded_audio = base64.b64encode(audio_handle.read()).decode("ascii")
    audio_sources.append(f"data:audio/mpeg;base64,{encoded_audio}")

# Convert the list of audio sources to a JSON string (a valid JS array literal)
audio_files_json = json.dumps(audio_sources)

# Generate JavaScript to play audio files sequentially: each clip starts the
# next one from its "ended" event.
js_code = f"""
var audio_files = {audio_files_json};
var current_audio = 0;

function playNextAudio() {{
    if (current_audio < audio_files.length) {{
        var audio = new Audio(audio_files[current_audio]);
        current_audio++;
        audio.onended = playNextAudio;
        audio.play();
    }}
}}

playNextAudio();
"""

# Display the JavaScript code block
display(Javascript(js_code))

# Display the audio files for manual playback (if needed). autoplay stays off
# so that several chunks do not all start at once on top of the sequential
# playback started by the script above.
for file in audio_files:
    display(Audio(file, autoplay=False))


Saved output_0.mp3


<IPython.core.display.Javascript object>