# Init

In [1]:
!python -m pip install --upgrade pip
!pip3 install -q torch==2.2.1 torchvision torchaudio   xformers --index-url https://download.pytorch.org/whl/cu121

Collecting pip
  Using cached pip-24.2-py3-none-any.whl.metadata (3.6 kB)
Using cached pip-24.2-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-24.2
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m757.3/757.3 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.8/20.8 MB[0m [31m98.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m92.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m78.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [2]:
import torch
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install --no-deps -q packaging ninja einops flash-attn xformers trl peft \
    accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps -q trl peft accelerate bitsandbytes
!pip install datasets
!pip install hyperopt
!pip install optuna
pass

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-jwiswj15/unsloth_394c6c1cbf9a404ca41fdc8eb59be454
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-jwiswj15/unsloth_394c6c1cbf9a404ca41fdc8eb59be454
  Resolved https://github.com/unslothai/unsloth.git to commit 12b437e12204532f82542c12ac1ab00d19e3ebbf
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading tyro-0.8.8-py3-none-any.whl.metadata (8.4 kB)
Collecting transformers>=4.43.2 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[c

In [3]:
# !python -m xformers.info
# !python -m bitsandbytes
# !nvidia-smi

In [1]:
from huggingface_hub import login, logout
token="hf_"
login()  # you can put `token` parameter for non-blocking login

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Data

In [19]:
from datasets import load_dataset
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer
from peft import LoraConfig
from trl import SFTTrainer
from transformers import TrainingArguments

dataset = load_dataset("ruslanmv/ai-medical-chatbot", split="train")  # Question, context, answer
dataset_sampled = dataset.select(range(100))

## Look into dataset & its structure, meaning, edge cases, ...
 - Modeling without optimization is about importing needed parts and running it
 - More effort needs to be and should be spent on understanding and preparing data in many cases

In [3]:
# Each datapoint has three kinds of information: Question, Context, and Answer.
# I'd like context-aware model so that it can have different answer with same question.
# However, to make the model act like that requires pertinent data. Here, I won't deal with that, but for actual project, investigation on it can be conducted.
dataset_sampled[0]

{'Description': 'Q. What does abutment of the nerve root mean?',
 'Patient': 'Hi doctor,I am just wondering what is abutting and abutment of the nerve root means in a back issue. Please explain. What treatment is required for\xa0annular bulging and tear?',
 'Doctor': 'Hi. I have gone through your query with diligence and would like you to know that I am here to help you. For further information consult a neurologist online -->'}

In [7]:
# Umm Doctors answer can be quite long. So my model needs to be capable of handling rather long sequence.
dataset_sampled[1]

{'Description': 'Q. What should I do to reduce my weight gained due to genetic hypothyroidism?',
 'Patient': 'Hi doctor, I am a 22-year-old female who was diagnosed with hypothyroidism (genetic) when I was 12. Over the past five years, I have become around 50 pounds overweight and all of my attempts to lose have seemed to fail so I have given up, but my weight has stayed the same. There is so much information put there about losing weight with hypothyroidism but it all seems to conflict. I am so unsure as to what type of exercise and diet I should follow as a result but I still would like to lose weight, but most importantly have my body feel better. What can I do? I am currently on Levothyroxine, Buspar, and Benedryl.',
 'Doctor': 'Hi. You have really done well with the hypothyroidism problem. Your levels are normal with less medications which are very good. As it is genetically induced, it is very difficult to lose weight. My advice to you is, you should focus on maintaining normal l

In [9]:
dataset_sampled[2]

{'Description': 'Q. I have started to get lots of acne on my face, particularly on my forehead. Please help me.',
 'Patient': "Hi doctor! I used to have clear skin but since I moved to a new place, I started to have lots of acne on my face particularly on my forehead. I thought it would disappear once I went back home, but it only got worse. I did some research and assumed that it was caused by drinking too much cow's milk, but it has been since since I stopped and they would still not go away. I also noticed that I get deep acne whenever I'm nearing my period, along with the usual small red bumps. I bought an acne soap and have been using it for a month now but I'm not sure if it works. I hope you can help me because it has been affecting my mental state lately :((((",
 'Doctor': 'Hi there Acne has multifactorial etiology. Only acne soap does not improve if ypu have grade 2 or more grade acne. You need to have oral and topical medications. This before writing medicines i need to confi

In [14]:
dataset_sampled[7]

{'Description': 'Q. Kindly suggest a homeopathic medicine to stop hairfall and promote hair growth.',
 'Patient': 'Hello doctor,I am 24 years old, and for the past nine years, I am facing hair fall problem nowadays my 60 % of hair is falling on my top and front of my head. I checked my thyroid and hemoglobin many times, but their reports are good. I use many home remedies and hair oils, but when I stop using it again, it starts falling. The allopathic doctor says use Minoxidil 5 %, Finasteride, and hair serum. But Finasteride has many side effects. Can you please tell me is any medicine available in homeopathy which stops hair fall and promotes hair growth if yes can you please tell me the name.',
 'Doctor': 'Hello. I checked the attached photo (attachment removed to protect patient identity) and read your description. It seems you have been suffering from hair loss problem for a longtime. Do you eat healthy food like vegetables and fruits every day? Sometimes lack of nutrition is also

In [None]:
# Another thing I'd like to do is some kind of topic modeling for the dataset
# We can check for what topic model performance well and not; and see whether this can be improved with supplemental information via RAG
# Let it remain as our future task and focus on the training as of now.

## Data Preparation
 - Data Prep is another salient part of ML
    - In traditional ML, it can be about converting data format, normalizing values to more stable learning, handling of categorical variables, and/or reducing its dimensionality at a cost of transparency/explainability
    - In LLM, it more has to do with appropriate formatting and prompting, while other data operations will be done by model. Anyway it's way better to notice the roles different types of layer playing.
        - Llama herds were trained with Alpaca-format prompt so I'm gonna use it to give consistency between pre-training and fine-tuning

In [16]:
# Device map
device_map = 'auto'
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)  # Input Processor
tokenizer.pad_token = tokenizer.eos_token

In [17]:
alpaca_prompt = """ Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### instruction:
{}

### Context:
{}

### output:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instruction  = examples["Description"]
    inputs       = examples["Patient"]
    outputs      = examples["Doctor"]
    texts = []
    for instruction, input, output in zip(instruction, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}
pass

# Apply prompt formatting
dataset_with_v1_tokenizer = dataset_sampled.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

# Training 1. OOM at batch_size of 2 even with QLoRA and Gradient Checkpointing

 - Option 1. Reduce batch size and adjust steps for gradient accumulation to cover it
 - Option 2. Try more memomry-efficient way

 I'm gonna pursue the second path using Unsloth Library

In [9]:
# Quantization Config; nf4 & bf16 are orginal setting of QLoRA paper
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",             # Load model weights as quantized 4-bit normal float
    bnb_4bit_compute_dtype=torch.float16,  # Computations at fp16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    use_cache=False,  # False, if you gonna use gradient checkpoint
    device_map=device_map
)

# PEFT config
lora_args = dict(
    r=32,         # Rank; decomposition level
    lora_alpha=16,     # Scaler; impact of adaptor to pre-trained model
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["k_proj", "q_proj", "v_proj", "up_proj", "down_proj", "gate_proj"],
    modules_to_save=["embed_tokens", "input_layernorm", "post_attention_layernorm", "norm"],

)

peft_config = LoraConfig(**lora_args)


# Training Args
trainer_args = dict(
    output_dir = "./results",
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 2,
    optim = "adamw_torch",
    save_steps = 10,
    logging_steps = 1,
    learning_rate = 2e-4,
    max_grad_norm = 0.3,
    max_steps = 1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    fp16=True,
    group_by_length=True,
    gradient_checkpointing=True,  # To activate this, `use_cache` parameter of pre-trained model needs to be `False`
    #report_to="wandb",
)

training_arguments = TrainingArguments(**trainer_args)

# Trainer
max_seq_length = 512
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="context",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

# Train :)
trainer.train()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.96 GiB. GPU 0 has a total capacity of 14.75 GiB of which 1.28 GiB is free. Process 14210 has 13.47 GiB memory in use. Of the allocated memory 13.25 GiB is allocated by PyTorch, and 109.73 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# Training 2. Unsloth

In [4]:
import torch
from unsloth import FastLanguageModel

max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None           # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True    # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",          # Phi-3 2x faster!d
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


## Base Model

In [20]:
# Load base model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.25. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


## Wrap the base model to be trained with adapter (LoRA)

In [21]:
# Get a Peftmodel that's wrapping up base model, Llama 3.1 8B in our case.
# Peftmodel is prepared to be trained.
model = FastLanguageModel.get_peft_model(
    model,   # Base model to be wrapped
    r = 16,  # Rank; Degree of Decomposition; recommended: 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,  # Impact of LoRA weights to original parameters (W)
    lora_dropout = 0, # Whatever, but optimized at 0
    bias = "none",    # Whatever, but optimized at "none"
    use_gradient_checkpointing = "unsloth", # Use "unsloth" or "True" for very long context
    random_state = 3407,
    use_rslora = False,  # RSLoRA
    loftq_config = None, # LoftQ
)

## Dataset
 - As we changed model from Llama 3 to Llama 3.1, there's possibility for processor/tokenizer to be changed. They can be same but I wanna be lazy now so just write another code to make sure to reflect the model change

In [22]:
alpaca_prompt = """ Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### instruction:
{}

### Context:
{}

### output:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instruction  = examples["Description"]
    inputs       = examples["Patient"]
    outputs      = examples["Doctor"]
    texts = []
    for instruction, input, output in zip(instruction, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}
pass

# Apply prompt formatting
dataset_with_v2_tokenizer = dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/256916 [00:00<?, ? examples/s]

In [26]:
# ========== test ==========
print("[Input]")
print(dataset_with_v2_tokenizer['Description'][13])
print(dataset_with_v2_tokenizer['Patient'][13].replace("\n", " "))
print()
print("[Expected Output]")
dataset_with_v2_tokenizer['Doctor'][13]

[Input]
Q. I have stomach pain and bloating after taking Augmentin 625. Why?
Age - 33 years Height - 164 cms Weight - 67 Kg I am suffering from sore throat so I took Augmentin 625 before 12-13 days back but found no improvement but I started suffering from Stomach pain and excessive gases. Now I am taking below medicines as per prescribed by ENT specialist that are 1.Allegra M tablet 2.Rinifol capsule 3.Esoz D 40 capsule 4.Acinil O surup for ten days.Two days remaning to finish course. Stomach pain and blown stomach due to gas is still there.

[Expected Output]


'Hello. I welcome you to icliniq. Well it is not uncommon to have Gastric upsets after a course of antibiotic. And antibiotic like augmentin are notorious to cause this. This occur secondary to effect of augmentin on decreasing the amount of lactobacillus microorganism which are normally present in the human gut and help digest the food. You should take yougurt, Kefir (fermented yougurt), Kefir. and there are certain diets with probiotic activity you can start taking it. Plus take Tab Ecotec twice daily for 4 weeks, and Tab Levosulpiride 75 mg once daily half hour before meals.for 2 weeks. You can continue with acinil O syp. while rest of your current medication can be stopped. I hope this will correct your microbiota and will bring the same healthiness as before. good day Antibiotic induced dysbiosis Small intestinal bacterial overgrowth Antibiotic induced dysbiosis Let me know in one week time how you respond to above regimen, if persistent of symptoms I would consider testing and gi

## Train the model

In [27]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset_with_v2_tokenizer,
    dataset_text_field = "text",             # Where all information is condensed through our processing
    max_seq_length = max_seq_length,         # Maximum length of output sequence
    dataset_num_proc = 2,                    # Num data that will be assigned per processor
    packing = False,                         # Activate if for short sequences to boost learning speed 5 times
    args = TrainingArguments(
        per_device_train_batch_size = 2,     # Batch size per device. Adjust if memory lacks
        gradient_accumulation_steps = 4,     # How many times you want to stack gradients and then update them at once
        warmup_steps = 5,                    # Warming-up stage where learning rate is gradually increased
        # num_train_epochs = 1,              # Training epoch
        max_steps = 60,                      # Max step, which overrides training epoch if any
        learning_rate = 2e-4,                # LR
        fp16 = not is_bfloat16_supported(),  # Use fp16 if bf16 is not supported
        bf16 = is_bfloat16_supported(),      # Use bf16 if supported
        logging_steps = 1,                   #
        optim = "adamw_8bit",                # Optimizer of adamw_8bit
        weight_decay = 0.01,                 # Weight decay
        lr_scheduler_type = "linear",        # Learning rate will be changed linearly
        seed = 3407,                         # Set seed for reproducibility
        output_dir = "outputs",              #
    ),
)

Map (num_proc=2):   0%|          | 0/256916 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [28]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 256,916 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.8091
2,2.734
3,2.7775
4,3.0227
5,2.6989
6,2.5394
7,2.6519
8,2.4365
9,2.3956
10,2.2438


## Save model

In [29]:
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")
# model.push_to_hub("your_name/lora_model", token = "...")  # Save on HF
# tokenizer.push_to_hub("your_name/lora_model", token = "...")  # Save on HF

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')