In [None]:
import os

In [None]:
def detect_environment():
    """Return Environment variable (kaggle, colab, local)
    """
    if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ:
        return 'kaggle'
    elif 'COLAB_GPU' in os.environ:
        return 'colab'
    else:
        return 'local'

def create_workspace_dirs(workspace_path: str):
    """Setup environment specific workspace directories.

       Parameters:
       -----------
        - workspace_path (str): Path to root workspace environment.

       Results:
       --------
        - CONFIG_PATH - path to all config fiels.
        - DATASETS_PATH - path to all datasets.
        - MODELS_PATH  - path to all models.
    """


    DATASETS_PATH: str  = os.path.join(workspace_path, 'datasets')
    MODELS_PATH: str    = os.path.join(workspace_path, 'models')
    CONFIG_PATH: str    = os.path.join(workspace_path, 'config')

    # create workspace directories
    [os.makedirs(path_var, exist_ok=True) for path_var in [workspace_path,CONFIG_PATH, DATASETS_PATH,MODELS_PATH]]

    print(f"Active working directories:\n\n{CONFIG_PATH=}\n{DATASETS_PATH=}\n{MODELS_PATH=}")

In [None]:
# Set the development environment variable
DEV_ENV: str = detect_environment()

# set development workspace directories
workspace_path_map: dict = {
    "colab": "/content/workspace/",
    "kaggle": "/kaggle/working/workspace/",
    "local": "set_path_to_local/workspace/" # update this value to local drive
                 }

create_workspace_dirs(workspace_path_map.get(DEV_ENV))


Active working directories:

CONFIG_PATH='/content/workspace/config'
DATASETS_PATH='/content/workspace/datasets'
MODELS_PATH='/content/workspace/models'


In [None]:
CONFIG_PATH='/content/workspace/config'
DATASETS_PATH='/content/workspace/datasets'
MODELS_PATH='/content/workspace/models'
BENCHMARK_PATH= '/content/working/workspace/benchmarks'
FINETUNED_MODELS_PATH = os.path.join(MODELS_PATH,'finetuned-model')

os.makedirs(BENCHMARK_PATH, exist_ok=True)
os.makedirs(FINETUNED_MODELS_PATH, exist_ok=True)

## Setup API secrets

In [None]:
if  DEV_ENV.__contains__("kaggle"):
   from kaggle_secrets import UserSecretsClient
   user_secrets = UserSecretsClient()

   # get secret keys
   wandb_api_key = user_secrets.get_secret("wandb-kenya-clinical-reasoning-key")
   huggingface_api_key = user_secrets.get_secret("huggingface-kenya-clinical-reasoning-key")

if DEV_ENV.__contains__("colab"):
    from google.colab import userdata as user_secrets

    # get secret keys dd
    #wandb_api_key = user_secrets.get("wandb-kenya-clinical-reasoning-key")
    #huggingface_api_key = user_secrets.get("huggingface-kenya-clinical-reasoning-key")


# 2. Install and import modules
---

In [None]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=d06f84b4d90273dcd4dccb8121abf6e525fef332cbb6107e730ec20763eb027f
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
# Do this only in Colab notebooks! Otherwise use pip install unsloth
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting xformers==0.0.29.post3
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting trl==0.15.2
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting cut_cross_entropy
  Downloading cut_cross_entropy-25.1.1-py3-none-any.whl.metadata (9.3 kB)
Collecting unsloth_zoo
  Downloading unsloth_zoo-2025.5.8-py3-none-any.whl.metadata (8.0 kB)
Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl (43.4 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m43.4/43.4 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.15.2-py3-none-any.whl (318 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m318.9/3

In [None]:
!pip install weave

Collecting weave
  Downloading weave-0.51.47-py3-none-any.whl.metadata (25 kB)
Collecting diskcache==5.6.3 (from weave)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting emoji>=2.12.1 (from weave)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting gql[aiohttp,requests] (from weave)
  Downloading gql-3.5.3-py2.py3-none-any.whl.metadata (9.4 kB)
Collecting uuid-utils>=0.9.0 (from weave)
  Downloading uuid_utils-0.11.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.7 kB)
Collecting graphql-core<3.2.7,>=3.2 (from gql[aiohttp,requests]->weave)
  Downloading graphql_core-3.2.6-py3-none-any.whl.metadata (11 kB)
Collecting backoff<3.0,>=1.11.1 (from gql[aiohttp,requests]->weave)
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Downloading weave-0.51.47-py3-none-any.whl (512 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m 

In [None]:
import os
import yaml
import glob
import pandas as pd
from datetime import datetime

# Finetuning modules
from unsloth import FastLanguageModel
import torch
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer, SFTConfig
from transformers import TrainingArguments, DataCollatorForSeq2Seq, set_seed
from unsloth import is_bfloat16_supported
from datasets import load_dataset, Dataset
import warnings
warnings.filterwarnings('ignore')

set_seed(42)

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!


In [None]:
def save_versioned_config(config, config_dir='./', prefix='config_v'):
    """
    Save a configuration with versioning.

    Parameters:
        config: Configuration dictionary to save
        config_dir: Directory to save config in
        prefix: Prefix for the config filename

    Returns:
        Path to the saved config file
    """
    # Create version string with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{prefix}{timestamp}.yaml"
    filepath = os.path.join(config_dir, filename)

    # Save the configuration
    with open(filepath, 'w') as f:
        yaml.dump(config, f, default_flow_style=False)

    print(f"Saved configuration to: {filepath}")
    return filepath

def load_latest_config(config_dir='./', prefix='config_v'):
    """
    Find and load the most recent versioned YAML configuration file.

    Args:
        config_dir: Directory containing config files
        prefix: Prefix of the config filenames

    Returns:
        The loaded configuration as a dictionary
    """
    # Find all config files matching the pattern
    config_files = glob.glob(f"{config_dir}{prefix}*.yaml")

    if not config_files:
        raise FileNotFoundError(f"No configuration files found matching {prefix}*.yaml")

    # Sort files by modification time (newest first)
    latest_file = max(config_files, key=os.path.getmtime)

    print(f"Loading latest configuration from: {latest_file}")

    # Load and return the configuration
    with open(latest_file, 'r') as f:
        return yaml.safe_load(f)


def load_config(path: str):

    # Load and return the configuration
    with open(path, 'r') as f:
        return yaml.safe_load(f)

# 3. Setup Config
---


In [None]:
# Create dict YAML configuration
config_dict = {
    "model": {
        "base_model": "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
        "load_in_4bit": True,
        "max_seq_length": 2048,
        "dtype": None
    },
    "lora": {
        "r": 16,
        "alpha": 32,
        "dropout": 0.1,
        "target_modules": [
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj"
        ]
    },
    "training": {
        "learning_rate": 5e-5,
        "batch_size": 2,
        "epochs": 3,
        "optim": "adamw_torch",
        "lr_scheduler": 'cosine',
        "gradient_accumulation_steps": 1,
        "warmup_ratio": 0.03,
        "weight_decay": 0.01,
        "output_dir": FINETUNED_MODELS_PATH
    },
  "evaluation": {
      "strategy": 'steps',
      "eval_steps": 50,
      "save_steps": 50,
      "save_total_limit": 3,
  },
  "tokenizer":{
    "chat_template":"llama-3.2"
    }
}

# save as yaml config
latest_config_version_path: str = save_versioned_config(config=config_dict,
                                                        config_dir=CONFIG_PATH,
                                                        prefix='finetune_config_v'
                                                       )

# load configurations
finetune_config: dict  = load_config(path=latest_config_version_path)

finetune_config

Saved configuration to: /content/workspace/config/finetune_config_v20250522_134537.yaml


{'evaluation': {'eval_steps': 50,
  'save_steps': 50,
  'save_total_limit': 3,
  'strategy': 'steps'},
 'lora': {'alpha': 32,
  'dropout': 0.1,
  'r': 16,
  'target_modules': ['q_proj',
   'k_proj',
   'v_proj',
   'o_proj',
   'gate_proj',
   'up_proj']},
 'model': {'base_model': 'unsloth/Llama-3.2-1B-Instruct-bnb-4bit',
  'dtype': None,
  'load_in_4bit': True,
  'max_seq_length': 2048},
 'tokenizer': {'chat_template': 'llama-3.2'},
 'training': {'batch_size': 2,
  'epochs': 3,
  'gradient_accumulation_steps': 1,
  'learning_rate': 5e-05,
  'lr_scheduler': 'cosine',
  'optim': 'adamw_torch',
  'output_dir': '/content/workspace/models/finetuned-model',
  'warmup_ratio': 0.03,
  'weight_decay': 0.01}}

# Dataset preprocessing
---



**Tasks To Do:**
1. **Load raw datasets**
 - Load train data - `os.path.join(DATASETS_PATH,"train_dataset.csv")`
 - Load validation data - `os.path.join(DATASETS_PATH,"validation_dataset.csv")`
 - Load test data - `os.path.join(DATASETS_PATH,"test_dataset.csv")`
2. **Convert Pandas format to HuggingFace dataset format**
  -  

In [None]:
# Template for gemma
template = """<bos><start_of_turn>user
<<PROMPT>><end_of_turn>
<start_of_turn>model
<<CLINICIAN<end_of_turn>"""

def expand_df(df, template):
    rows = []

    for _, row in df.iterrows():
        base = row.to_dict()

        text_clinician = row['Clinician']#template.replace("<<CLINICIAN>>", row['Clinician'], 1).replace("<<PROMPT>>", row['Prompt'], 1)
        base['text'] = text_clinician
        rows.append(base.copy())

        for tool_col in ['GPT4.0', 'LLAMA', 'GEMINI']:
            tool_value = row[tool_col]
            if pd.notna(tool_value):
                new_row = row.to_dict()
                text_tool = tool_value #template.replace("<<CLINICIAN>>", tool_value, 1).replace("<<PROMPT>>", row['Prompt'], 1)
                new_row['text'] = text_tool
                rows.append(new_row)

    return pd.DataFrame(rows)

In [None]:
def format_data_llama_3(dataset, prompt_column_name: str, response_column_name: str, system_prompt: str=None):
    """Format data for Llama 3's chat template."""

    formatted_data = []

    if system_prompt is None:
       system_prompt = "You are a clinical reasoning assistant trained to help diagnose medical conditions. Provide detailed, evidence-based assessments. Always consider patient safety first."

    for row in dataset:
        # Format according to Llama 3 chat template
        formatted_text = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n{row[prompt_column_name]}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n{row[response_column_name]}<|eot_id|>"
        formatted_data.append({"text": formatted_text})

    return Dataset.from_list(formatted_data)

In [None]:
# load training dataset with selected columns
df_train_raw = pd.read_csv(os.path.join(DATASETS_PATH,"train_dataset.csv"),
                           usecols=['Master_Index', 'Prompt','Clinician' ]
                          )

df_validation_raw = pd.read_csv(os.path.join(DATASETS_PATH,"validation_dataset.csv"),
                           usecols=['Master_Index', 'Prompt','Clinician' ]
                          )

df_test_raw = pd.read_csv(os.path.join(DATASETS_PATH,"test_dataset.csv"),
                           usecols=['Master_Index', 'Prompt']
                          )

df_online_test_raw = pd.read_csv(os.path.join(DATASETS_PATH,"online_test.csv"),
                           usecols=['Master_Index', 'Prompt']
                          )


df_train_raw.head(2)


Unnamed: 0,Master_Index,Prompt,Clinician
0,ID_VBWWP,i am a nurse with 18 years of experience in ge...,summary a 4 year old with 5 superficial burns ...
1,ID_XMBBY,i am a nurse with 17 years of experience in ge...,summary 6 year old present with vomiting and a...


In [None]:
df_train_temp = pd.read_csv(os.path.join(DATASETS_PATH,"train_dataset.csv"),
                           usecols=['Master_Index', 'Prompt','Clinician', 'GPT4.0', 'LLAMA', 'GEMINI' ]
                          )
df_train_formatted = expand_df(df_train_temp, template)
df_train_formatted.head()

Unnamed: 0,Master_Index,Prompt,Clinician,GPT4.0,LLAMA,GEMINI,text
0,ID_VBWWP,i am a nurse with 18 years of experience in ge...,summary a 4 year old with 5 superficial burns ...,given your vast experience as a nurse in uasin...,1 immediate treatment protocol for second degr...,here s a response addressing the questions reg...,summary a 4 year old with 5 superficial burns ...
1,ID_VBWWP,i am a nurse with 18 years of experience in ge...,summary a 4 year old with 5 superficial burns ...,given your vast experience as a nurse in uasin...,1 immediate treatment protocol for second degr...,here s a response addressing the questions reg...,given your vast experience as a nurse in uasin...
2,ID_VBWWP,i am a nurse with 18 years of experience in ge...,summary a 4 year old with 5 superficial burns ...,given your vast experience as a nurse in uasin...,1 immediate treatment protocol for second degr...,here s a response addressing the questions reg...,1 immediate treatment protocol for second degr...
3,ID_VBWWP,i am a nurse with 18 years of experience in ge...,summary a 4 year old with 5 superficial burns ...,given your vast experience as a nurse in uasin...,1 immediate treatment protocol for second degr...,here s a response addressing the questions reg...,here s a response addressing the questions reg...
4,ID_XMBBY,i am a nurse with 17 years of experience in ge...,summary 6 year old present with vomiting and a...,clinical summary ‚Ä¢ a 6 year old girl with know...,based on the symptoms and signs you ve describ...,based on the presentation the 6 year old girl ...,summary 6 year old present with vomiting and a...


In [None]:
df_train_new = df_train_formatted[["Master_Index", "Prompt", "text"]].rename(columns={"text": "Clinician"})
df_train_new.head()


Unnamed: 0,Master_Index,Prompt,Clinician
0,ID_VBWWP,i am a nurse with 18 years of experience in ge...,summary a 4 year old with 5 superficial burns ...
1,ID_VBWWP,i am a nurse with 18 years of experience in ge...,given your vast experience as a nurse in uasin...
2,ID_VBWWP,i am a nurse with 18 years of experience in ge...,1 immediate treatment protocol for second degr...
3,ID_VBWWP,i am a nurse with 18 years of experience in ge...,here s a response addressing the questions reg...
4,ID_XMBBY,i am a nurse with 17 years of experience in ge...,summary 6 year old present with vomiting and a...


In [None]:
df_validation_raw.head(2)

Unnamed: 0,Master_Index,Prompt,Clinician
0,ID_OHZDT,i am a nurse with 8 years of experience in gen...,summary a patient with burns on chest and face...
1,ID_DGQWO,i am a nurse with 20 years of experience in pr...,summary a 27 year old female admitted after ac...


In [None]:
# Create formatted dataset
# formated_train_dataset = format_data_llama_3(dataset=Dataset.from_pandas(df_train_raw),
formated_train_dataset = format_data_llama_3(dataset=Dataset.from_pandas(df_train_new),
prompt_column_name= "Prompt",
response_column_name= "Clinician",
system_prompt=None
)

formated_validation_dataset = format_data_llama_3(dataset=Dataset.from_pandas(df_validation_raw),
prompt_column_name= "Prompt",
response_column_name= "Clinician",
system_prompt=None
)

print(f"Example formatted text: {formated_train_dataset[0]}")

Example formatted text: {'text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\nYou are a clinical reasoning assistant trained to help diagnose medical conditions. Provide detailed, evidence-based assessments. Always consider patient safety first.<|eot_id|><|start_header_id|>user<|end_header_id|>\ni am a nurse with 18 years of experience in general nursing working in a sub county hospitals and nursing homes in uasin gishu county in kenya a 4 year old child presents to the emergency department with second degree burns on the forearm after accidentally touching a hot stove the child was playing in the kitchen when they reached out to touch the stove the burns cover about 5 of the total body surface area the child is alert and crying with redness blisters and swelling on the affected area the burns appear to be superficial to moderate in severity the child is in mild pain and there is no indication of airway or breathing distress no other injuries are noted questions 1 what

# Setup Finetune: Model Configuration
---
**Base models**
1. "unsloth/Llama-3.2-1B-bnb-4bit",          
2. "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
3. "meta-llama/Llama-3.2-1B"
4. "unsloth/Llama-3.2-1B-Instruct"

In [None]:
# load configurations
finetune_config: dict  = load_config(path=latest_config_version_path)

finetune_config.get('training')

{'batch_size': 2,
 'epochs': 3,
 'gradient_accumulation_steps': 1,
 'learning_rate': 5e-05,
 'lr_scheduler': 'cosine',
 'optim': 'adamw_torch',
 'output_dir': '/content/workspace/models/finetuned-model',
 'warmup_ratio': 0.03,
 'weight_decay': 0.01}

In [None]:
model, tokenizer = FastLanguageModel.from_pretrained(
                    model_name = finetune_config.get('model').get('base_model'),
                    max_seq_length = finetune_config.get('model').get('max_seq_length',2048),
                    dtype = finetune_config.get('model').get('dtype', torch.bfloat16),
                    load_in_4bit =finetune_config.get('model').get('load_in_4bit', True),
                 )

==((====))==  Unsloth 2025.5.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = finetune_config.get('lora').get('r', 16),                      # Suggested 8, 16, 32, 64, 128
    lora_alpha = finetune_config.get('lora').get('alpha', 32),         # Suggested lora_alpha = 2 x r
    target_modules = finetune_config.get('lora').get('target_modules'),
    lora_dropout = finetune_config.get('lora').get('dropout',0),       # Supports any, but = 0 is optimized
    bias = "none",                                                     # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth",                            # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,                                                # We support rank stabilized LoRA
    loftq_config = None,                                               # And LoftQ
)

model.print_trainable_parameters()

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.5.7 patched 16 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


trainable params: 8,650,752 || all params: 1,244,465,152 || trainable%: 0.6951


## Create model specific chat template


In [None]:
tokenizer = get_chat_template(
    tokenizer,
    chat_template = finetune_config.get('tokenizer').get('chat_template'),
)

## Training Setup

In [None]:
# set training args, comment out paramenters to use default config values.
training_args = SFTConfig(
    output_dir = finetune_config.get("training").get("output_dir"),
    num_train_epochs= finetune_config.get("training").get("epochs"),
    per_device_train_batch_size= finetune_config.get("training").get("batch_size"),
    gradient_accumulation_steps= finetune_config.get("training").get("gradient_accumulation_steps"),
    optim= finetune_config.get("training").get("optim"),
    learning_rate= finetune_config.get("training").get("learning_rate"),
    lr_scheduler_type= finetune_config.get("training").get("lr_scheduler"),
    warmup_ratio= finetune_config.get("training").get("warmup_ratio"),
    weight_decay= finetune_config.get("training").get("weight_decay"),
    fp16=True,
    logging_steps=10,
    gradient_checkpointing=True,
    # evaluation_strategy=finetune_config.get("evaluation").get("strategy"),
    eval_steps=finetune_config.get("evaluation").get("eval_steps"),
    save_steps=finetune_config.get("evaluation").get("save_steps"),
    save_total_limit=finetune_config.get("evaluation").get("save_total_limit"),
)

## Train and save model checkpoints

In [None]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
4.846 GB of memory reserved.


In [None]:
#import os
#os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Create trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset= formated_train_dataset,
    eval_dataset=  formated_validation_dataset,
    dataset_text_field="text",
    args= training_args,
    packing=True,
    max_seq_length=2048,
    report_to="none"
)


# Finetune model
trainer_stats = trainer.train()

# Save model
trainer.save_model()

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/1376 [00:00<?, ? examples/s]

Unsloth: Hugging Face's packing is currently buggy - we're disabling it for now!


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/18 [00:00<?, ? examples/s]

Unsloth: Hugging Face's packing is currently buggy - we're disabling it for now!


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,376 | Num Epochs = 3 | Total steps = 2,064
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 1 x 1) = 2
 "-____-"     Trainable parameters = 8,650,752/1,244,465,152 (0.70% trained)


Step,Training Loss
10,1.5266
20,1.6935
30,1.4776


KeyboardInterrupt: 

In [None]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
4.846 GB of memory reserved.


# Model Evaluation
---

In [None]:
"""
This metric computes the average ROUGE-L F1 score between the predicted clinician response
and the expert reference response. ROUGE-L captures the longest common subsequence between
two texts, making it suitable for evaluating structured clinical answers.

Required columns:
- solution: must contain a column named "reference"
- submission: must contain a column named "prediction"
"""

import pandas as pd
from rouge_score import rouge_scorer

class ParticipantVisibleError(Exception):
    pass

def rougel_f1_score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    '''
    This metric computes the average ROUGE-L F1 score between the predicted clinician response
    and the expert reference response. ROUGE-L captures the longest common subsequence between
    two texts, making it suitable for evaluating structured clinical answers.

    Parameters:
    -----------
    - solution (pd.DataFrame): DataFrame must contain a column named "reference"
    - submission (pd.DataFrame): DataFrame must contain a column named "prediction"
    - row_id_column_name (str): ID column name to align records from both dataframes.

    Returns: Average F1 score

    PyTest:
    -------
    >>> import pandas as pd
    >>> row_id_column_name = "id"
    >>> y_true = pd.DataFrame({"id": [0], "Clinician": ["The cat sat on the mat."]})
    >>> y_pred = pd.DataFrame({"id": [0], "Clinician": ["The cat sat."]})
    >>> round(rougel_f1_score(y_true.copy(), y_pred.copy(), row_id_column_name), 4)
    0.6667
    '''

    # set expected y true and y hat column names
    _reference_column_name: str = "Clinician"
    _prediction_column_name: str = "Clinician"

    if row_id_column_name not in solution.columns or row_id_column_name not in submission.columns:
        raise ParticipantVisibleError(f"Missing ID column '{row_id_column_name}' in either solution or submission.")

    # Align rows by ID
    solution = solution.set_index(row_id_column_name).sort_index()
    submission = submission.set_index(row_id_column_name).sort_index()

    # Check for matching IDs
    if not solution.index.equals(submission.index):
        raise BaseException(f"Submission and solution {row_id_column_name} do not match.")

    # Validate column names
    if _reference_column_name not in solution.columns:
        raise BaseException(f"Solution dataframe/file must contain a '{_reference_column_name}' column.")

    if _prediction_column_name not in submission.columns:
        raise BaseException(f"Submission dataframe/file must contain a '{_prediction_column_name}' column.")

    references = solution[_reference_column_name].astype(str)
    predictions = submission[_prediction_column_name].astype(str)

    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

    scores = []
    for ref, pred in zip(references, predictions):
        result = scorer.score(ref, pred)
        scores.append(result["rougeL"].fmeasure)

    # Safety: check for empty evaluation
    if not scores:
        raise BaseException("No predictions to score.")

    average_f1 = sum(scores) / len(scores)

    # Final check: must return a valid float
    if not pd.notnull(average_f1) or not (0 <= average_f1 <= 1):
        raise BaseException("Metric computed an invalid score.")

    return float(average_f1)

# Model Benchmark inference and submission
---

In [None]:
max_seq_length = 8192 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = FINETUNED_MODELS_PATH, #"software-together/model-v2-16bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "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",
)

==((====))==  Unsloth 2025.5.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:

from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.2",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k

In [None]:
import time

In [None]:
# study how to do this better
instr_prompt2 ={

  "instruction": "You are a medical professor and practitioner with over 100 years of experience, specialized in: Maternal and Child Health, Child Health, and Adult Health. You will be given clinical case scenarios by nurses, including nurse background, patient demographics, presenting symptoms, clinical findings, and a clinical question.\n\nRespond using this format:\n\n1. Categorize the scenario by nursing specialty:\n   - Maternal and Child Health\n   - Child Health\n   - Adult Health\n\n2. Provide Differential Diagnoses (DDX) using SNOMED CT:\n   Format: 'Code | Diagnosis Description (entity type)'\n\n3. Follow this Clinical Structure:\n   - Summary: Recap the case\n   - Clinical Reasoning: Explain how the symptoms and findings lead to the DDX\n   - Differential Diagnosis: Include SNOMED CT codes\n   - Management Plan: What to do next (tests, treatments, referrals)\n   - Answers to Scenario Questions: Direct, clear responses\n\n4. Communication Style:\n   - Use a human, compassionate, professional tone\n   - Avoid excessive jargon; be clear and supportive\n   - Provide answers as an expert guiding junior clinicians or nurses\n\nObjective: Respond to each case using deep clinical reasoning, structured planning, and SNOMED CT for medical standardization.",

  "input": "i am a nurse with 17 years of experience in general nursing working in a national referral hospital in uasin gishu county in kenya. a 6-year-old girl presented with vomiting and abdominal pain. she is a known diabetic but has not been taking insulin due to lack of funds. on examination, she is confused, has kussmaul breathing, dry tongue, fruity breath, blurred vision. vitals: T: 37, P: 120, RR: 48, SpO2: 90%. what is the diagnosis? what is the most immediate management? what education is needed? what investigations are required?",

  "output": "summary Child Health\n\nSNOMED CT Differential Diagnoses:\n- 420422005 | Diabetic ketoacidosis (disorder)\n- 46635009 | Type 1 diabetes mellitus (disorder)\n\nSummary: A 6-year-old girl presents with symptoms highly consistent with diabetic ketoacidosis due to insulin non-adherence.\n\nClinical Reasoning: The patient‚Äôs vomiting, Kussmaul respirations, fruity breath, dehydration, and confusion point to metabolic acidosis caused by DKA.\n\nDifferential Diagnosis:\n- Diabetic ketoacidosis (420422005)\n- Metabolic acidosis (302866003)\n- Hypovolemia (271327008)\n\nManagement Plan:\n1. Initiate oxygen therapy immediately\n2. Start IV fluids (0.9% normal saline bolus 10‚Äì20 mL/kg)\n3. Begin insulin therapy once fluid resuscitation is underway\n4. Monitor potassium levels and electrolytes\n5. Admit for close monitoring\n\nAnswers to Questions:\n1. Diagnosis: Diabetic Ketoacidosis\n2. Immediate Management: Oxygen, fluids, insulin, monitoring\n3. Education: Importance of insulin adherence, early DKA signs, sick day management, connect to financial/social support\n4. Investigations: Blood glucose, ketones, venous blood gas, electrolytes, renal function, ECG"

}


In [None]:
instr_prompt = {
  "instruction": "You are a medical professor and practitioner with over 100 years of experience across Maternal and Child Health, Child Health, and Adult Health. Given a clinical case scenario from a nurse, categorize the nursing specialty, provide SNOMED CT differential diagnoses, and return a structured expert response.",
}

In [None]:
def llm_inference(model,
                  user_prompts:pd.DataFrame,
                  system_prompt: str,
                  prompt_column:str='Prompt',
                  record_id_column: str ='Master_Index',
                  args:dict=None
                  )->pd.DataFrame:
    """Runs inference against the model and returns a dataframe with clinician results.
    """

    results = []

    for idx, row in user_prompts.iterrows():
        prompt = row['Prompt']
        master_index = row['Master_Index']

        messages = [
          {"role": "system", "content": system_prompt},
          {"role": "user", "content": prompt}
          ]

        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

        outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1,  temperature = 1.5)

        text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        result_text = text.split("assistant")[1]
        #outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True, temperature = 1.5, min_p = 0.1)
        #result_text = tokenizer.batch_decode(outputs)
        #print("result_text: ",result_text)

        results.append({
            "Master_Index": master_index,
            "Clinician": result_text
        })

        time.sleep(0.1)
        print(f"Running so far with {idx} of {len(user_prompts)}: {master_index}")

    return pd.DataFrame(results)

In [None]:
df_validation_predictions = llm_inference(model=model,
                                     user_prompts=df_validation_raw[['Master_Index','Prompt']],
                                     system_prompt=instr_prompt
                                     )

Running so far with 0 of 18: ID_OHZDT
Running so far with 1 of 18: ID_DGQWO
Running so far with 2 of 18: ID_PPJVQ
Running so far with 3 of 18: ID_TOCCU
Running so far with 4 of 18: ID_IYFIZ
Running so far with 5 of 18: ID_CVWHT
Running so far with 6 of 18: ID_RYKSB
Running so far with 7 of 18: ID_PTREU
Running so far with 8 of 18: ID_CEGVK
Running so far with 9 of 18: ID_CSMTR
Running so far with 10 of 18: ID_WTBDF
Running so far with 11 of 18: ID_ZQORV
Running so far with 12 of 18: ID_GLHTL
Running so far with 13 of 18: ID_ZOSDW
Running so far with 14 of 18: ID_NVGKJ
Running so far with 15 of 18: ID_AUQGT
Running so far with 16 of 18: ID_RZPUS
Running so far with 17 of 18: ID_BGWLU


In [None]:
# Validation scoring
rougel_f1_score(solution=df_validation_raw,
                submission=df_validation_predictions,
                row_id_column_name="Master_Index"
                )

0.16160767596599318

## Test Predictions

In [None]:
model_name = config_dict.get("model").get('base_model').replace("/","-")
run_timestamp = datetime.now().strftime("%Y%m%dT%H%M%S")

In [None]:
df_test_predictions = llm_inference(model=model,
                                     user_prompts=df_test_raw[['Master_Index','Prompt']],
                                     system_prompt=instr_prompt2
                                     )

df_test_predictions.to_csv(f"{BENCHMARK_PATH}/local_test_{model_name}_{run_timestamp}.csv", index=False)

Running so far with 0 of 38: ID_AOANH
Running so far with 1 of 38: ID_BKPAH
Running so far with 2 of 38: ID_BNHVT
Running so far with 3 of 38: ID_CHHIO
Running so far with 4 of 38: ID_CZVOY
Running so far with 5 of 38: ID_DIXUW
Running so far with 6 of 38: ID_EQRHR
Running so far with 7 of 38: ID_GERAA
Running so far with 8 of 38: ID_GMPJA
Running so far with 9 of 38: ID_HXDLY
Running so far with 10 of 38: ID_IKDJT
Running so far with 11 of 38: ID_JOBAE
Running so far with 12 of 38: ID_KCMKY
Running so far with 13 of 38: ID_KOKFK
Running so far with 14 of 38: ID_LIESO
Running so far with 15 of 38: ID_MANPW
Running so far with 16 of 38: ID_MDILB
Running so far with 17 of 38: ID_MWOML
Running so far with 18 of 38: ID_OFZHH
Running so far with 19 of 38: ID_OGGLZ
Running so far with 20 of 38: ID_OICRH
Running so far with 21 of 38: ID_OLABW
Running so far with 22 of 38: ID_ONSYX
Running so far with 23 of 38: ID_ONVGJ
Running so far with 24 of 38: ID_QRYLO
Running so far with 25 of 38: ID_RH

In [None]:
df_online_test_predictions = llm_inference(model=model,
                                     user_prompts=df_online_test_raw[['Master_Index','Prompt']],
                                     system_prompt=instr_prompt2
                                     )

df_online_test_predictions.to_csv(f"{BENCHMARK_PATH}/online_test_{model_name}_{run_timestamp}.csv", index=False)

Running so far with 0 of 100: ID_CUAOY
Running so far with 1 of 100: ID_OGSAY
Running so far with 2 of 100: ID_TYHSA
Running so far with 3 of 100: ID_CZXLD
Running so far with 4 of 100: ID_ZJQUQ
Running so far with 5 of 100: ID_HYSCV
Running so far with 6 of 100: ID_DXHPF
Running so far with 7 of 100: ID_GDFDN
Running so far with 8 of 100: ID_UFAFI
Running so far with 9 of 100: ID_KMBGG
Running so far with 10 of 100: ID_GCHQJ
Running so far with 11 of 100: ID_FBVXH
Running so far with 12 of 100: ID_GFQXW
Running so far with 13 of 100: ID_KQFSM
Running so far with 14 of 100: ID_OTEWX
Running so far with 15 of 100: ID_KTTZQ
Running so far with 16 of 100: ID_DFFBJ
Running so far with 17 of 100: ID_ZQLND
Running so far with 18 of 100: ID_LXBDD
Running so far with 19 of 100: ID_PWETS
Running so far with 20 of 100: ID_VJVBS
Running so far with 21 of 100: ID_OZCVT
Running so far with 22 of 100: ID_HBKUL
Running so far with 23 of 100: ID_ZVYUH
Running so far with 24 of 100: ID_SHIKK
Running so

In [None]:
df_online_test_predictions.head()

Unnamed: 0,Master_Index,Clinician
0,ID_CUAOY,\n\nsummary a 25 year old female presents to t...
1,ID_OGSAY,\n\nsummary patient is a three year old male w...
2,ID_TYHSA,\n\nsummary child Health the child has a weakn...
3,ID_CZXLD,\n\nas a community nurse working in a dispensa...
4,ID_ZJQUQ,\n\ngood assessment by the patient the patient...


In [None]:
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

In [None]:
prompt_test_message = 'i am a nurse with 18 years of experience in general nursing working in a sub county hospitals and nursing homes in uasin gishu county in kenya a 4 year old child presents to the emergency department with second degree burns on the forearm after accidentally touching a hot stove the child was playing in the kitchen when they reached out to touch the stove the burns cover about 5 of the total body surface area the child is alert and crying with redness blisters and swelling on the affected area the burns appear to be superficial to moderate in severity the child is in mild pain and there is no indication of airway or breathing distress no other injuries are noted questions 1 what is the immediate treatment protocol for second degree burns in paediatric patients 2 should any tetanus prophylaxis be considered in this case 3 what follow up care should be recommended for burn healing'

In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": prompt_test_message},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Based on your scenario as a 28 year old nurse with 18 years of experience in general nursing working in a sub county hospitals and nursing homes in uasin gishu county in kenya i'll provide guidance on the immediate treatment protocol for second degree burns in paediatric patients along with suggestions on follow up care and considerations regarding tetanus prophylaxis 1 immediate treatment protocol for second degree burns in paediatric patients burn assessment carefully assess the child's burns to identify severity using the following criteria ‚Ä¢ superficial burns limited to the epidermis (outer layer of skin) ‚Ä¢ moderate burns damage extends to the dermis (middle layer of skin


In [None]:
# Merge to 16bit
#if True: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
#if True: model.push_to_hub_merged(f"{model_name}-16bit", tokenizer, save_method = "merged_16bit", token = token)

if True:
    model.save_pretrained_gguf(
    #model.push_to_hub_gguf(
        f"{model_name}-gguf",
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        #token = token,
    )

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.5G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.44 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:01<00:00, 15.97it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving software-together/model-v2-gguf/pytorch_model.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m', 'q8_0', 'q5_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at software-together/model-v2-gguf into f16 GGUF format.
The output location will be /content/software-together/model-v2-gguf/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: model-v2-gguf
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {32}
INFO:hf-to-gguf:gguf: load