This notebook takes my fine-tuned LoRA adapter from the MedGemma training notebook, merges it into the base MedGemma model, converts the merged model to GGUF, and then quantizes it to Q4_K_M so it can be run easily with llama.cpp. <br>
I tested a few different quantization variants, and Q4 gave the best results without crashing my local Mac Mini M1 (16 GB).

In [None]:
!pip install --upgrade pip
!pip install torch transformers accelerate peft huggingface_hub

Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.3


In [None]:
from huggingface_hub import login
# Interactive Hugging Face authentication; must be completed by hand before
# the cells that download from or create repos on the Hub.
login()  # This will ask for your HF token

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

model_id_base = "google/medgemma-1.5-4b-it"
model_id_lora = "CharlieKingOfTheRats/medgemma-1.5-4b-tccc-lora"

# Load the tokenizer that was published alongside the LoRA adapter
tokenizer = AutoTokenizer.from_pretrained(model_id_lora)

# Load the base model in whatever dtype the checkpoint was saved in
# ("auto" does not force float16). `dtype` replaces the deprecated
# `torch_dtype` keyword, which triggers a deprecation warning.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id_base,
    dtype="auto",
    device_map="auto"
)

# Wrap the base model with the LoRA adapter weights
model = PeftModel.from_pretrained(base_model, model_id_lora)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/2.55k [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/23.9M [00:00<?, ?B/s]

In [None]:
# Fold the LoRA weights into the base model; the result is a regular
# Hugging Face model with no PEFT wrapper around it.
model = model.merge_and_unload()

# Write the merged weights and the tokenizer into the same local directory.
merged_dir = "medgemma-1.5-4b-tccc-lora-merged"
model.save_pretrained(merged_dir)
tokenizer.save_pretrained(merged_dir)

('medgemma-1.5-4b-tccc-lora-merged/tokenizer_config.json',
 'medgemma-1.5-4b-tccc-lora-merged/special_tokens_map.json',
 'medgemma-1.5-4b-tccc-lora-merged/chat_template.jinja',
 'medgemma-1.5-4b-tccc-lora-merged/tokenizer.model',
 'medgemma-1.5-4b-tccc-lora-merged/added_tokens.json',
 'medgemma-1.5-4b-tccc-lora-merged/tokenizer.json')

In [None]:
!git clone https://github.com/ggerganov/llama.cpp

Cloning into 'llama.cpp'...
remote: Enumerating objects: 77188, done.[K
remote: Counting objects: 100% (364/364), done.[K
remote: Compressing objects: 100% (203/203), done.[K
remote: Total 77188 (delta 290), reused 161 (delta 161), pack-reused 76824 (from 3)[K
Receiving objects: 100% (77188/77188), 283.97 MiB | 33.84 MiB/s, done.
Resolving deltas: 100% (55784/55784), done.


In [None]:
# Force a clean checkout: delete any existing llama.cpp directory, clone it
# again, then list the working directory to confirm what is present.
# NOTE(review): on a top-to-bottom run this repeats the clone done by the
# previous cell.
!rm -rf llama.cpp
!git clone https://github.com/ggerganov/llama.cpp
!ls

Cloning into 'llama.cpp'...
remote: Enumerating objects: 77188, done.[K
remote: Counting objects: 100% (355/355), done.[K
remote: Compressing objects: 100% (199/199), done.[K
remote: Total 77188 (delta 283), reused 156 (delta 156), pack-reused 76833 (from 3)[K
Receiving objects: 100% (77188/77188), 283.00 MiB | 34.24 MiB/s, done.
Resolving deltas: 100% (55783/55783), done.
llama.cpp  medgemma-1.5-4b-tccc-lora-merged  sample_data


In [None]:
# List the top level of the llama.cpp checkout (sanity check that the
# conversion scripts are present).
!ls llama.cpp

AGENTS.md		       convert_lora_to_gguf.py	pocs
AUTHORS			       docs			poetry.lock
benches			       examples			pyproject.toml
build-xcframework.sh	       flake.lock		pyrightconfig.json
ci			       flake.nix		README.md
CLAUDE.md		       ggml			requirements
cmake			       gguf-py			requirements.txt
CMakeLists.txt		       grammars			scripts
CMakePresets.json	       include			SECURITY.md
CODEOWNERS		       LICENSE			src
common			       licenses			tests
CONTRIBUTING.md		       Makefile			tools
convert_hf_to_gguf.py	       media			vendor
convert_hf_to_gguf_update.py   models
convert_llama_ggml_to_gguf.py  mypy.ini


In [None]:
# Convert the merged Hugging Face checkpoint directory into a single GGUF file.
# No --outtype is passed; in the run recorded here the script selected bf16
# on its own.
!python llama.cpp/convert_hf_to_gguf.py \
    ./medgemma-1.5-4b-tccc-lora-merged \
    --outfile medgemma-1.5-4b-tccc-lora.gguf

INFO:hf-to-gguf:Loading model: medgemma-1.5-4b-tccc-lora-merged
INFO:hf-to-gguf:Model architecture: Gemma3ForConditionalGeneration
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: indexing model part 'model-00001-of-00002.safetensors'
INFO:hf-to-gguf:gguf: indexing model part 'model-00002-of-00002.safetensors'
INFO:hf-to-gguf:heuristics detected bfloat16 tensor dtype, setting --outtype bf16
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:token_embd.weight,                 torch.bfloat16 --> BF16, shape = {2560, 262208}
INFO:hf-to-gguf:blk.0.attn_norm.weight,            torch.bfloat16 --> F32, shape = {2560}
INFO:hf-to-gguf:blk.0.ffn_down.weight,             torch.bfloat16 --> BF16, shape = {10240, 2560}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,             torch.bfloat16 --> BF16, shape = {2560, 10240}
INFO:hf-to-gguf:blk.0.ffn_up.weight,               torch.bflo

In [None]:
# Refresh the apt package lists, then install cmake and the compiler toolchain
# needed to build llama.cpp (-y answers the confirmation prompt).
!apt-get update
!apt-get install -y cmake build-essential

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
            Get:2 https://cli.github.com/packages stable InRelease [3,917 B]
0% [Connecting to archive.ubuntu.com (185.125.190.83)] [Connecting to security.0% [Connecting to archive.ubuntu.com (185.125.190.83)] [Connecting to security.                                                                               Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
0% [Connecting to archive.ubuntu.com (185.125.190.83)] [Waiting for headers] [C0% [Connecting to archive.ubuntu.com (185.125.190.83)] [Waiting for headers] [C                                                                               Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [83.8 kB]
Hit:6 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:7 https:

In [None]:
# Remove any previous CMake build directory so the configure step starts clean.
!rm -rf llama.cpp/build

In [None]:
!cd llama.cpp && cmake -B build -DCMAKE_BUILD_TYPE=Release

-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
[0mCMAKE_BUILD_TYPE=Release[0m
-- Found Git: /usr/bin/git (found version "2.34.1")
-- The ASM compiler identification is GNU
-- Found assembler: /usr/bin/cc
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE
-- CMAKE_SYSTEM_PROCESSOR: x86_64
-- GGML_SYSTEM_ARCH: x86
-- Including CPU backend
-- Found OpenMP_C: -fopenmp (found version "4.5")
-- Found OpenMP_CXX: -fopenmp (found version "4.5")
-- Found OpenMP: TRUE (found ve

In [None]:
!cd llama.cpp && cmake --build build --target llama-quantize -- -j1

[  2%] [32mBuilding CXX object vendor/cpp-httplib/CMakeFiles/cpp-httplib.dir/httplib.cpp.o[0m
[  2%] [32m[1mLinking CXX static library libcpp-httplib.a[0m
[  2%] Built target cpp-httplib
[  4%] [32mBuilding C object ggml/src/CMakeFiles/ggml-base.dir/ggml.c.o[0m
[  4%] [32mBuilding CXX object ggml/src/CMakeFiles/ggml-base.dir/ggml.cpp.o[0m
[  4%] [32mBuilding C object ggml/src/CMakeFiles/ggml-base.dir/ggml-alloc.c.o[0m
[  4%] [32mBuilding CXX object ggml/src/CMakeFiles/ggml-base.dir/ggml-backend.cpp.o[0m
[  6%] [32mBuilding CXX object ggml/src/CMakeFiles/ggml-base.dir/ggml-opt.cpp.o[0m
[  6%] [32mBuilding CXX object ggml/src/CMakeFiles/ggml-base.dir/ggml-threading.cpp.o[0m
[  6%] [32mBuilding C object ggml/src/CMakeFiles/ggml-base.dir/ggml-quants.c.o[0m
[  6%] [32mBuilding CXX object ggml/src/CMakeFiles/ggml-base.dir/gguf.cpp.o[0m
[  8%] [32m[1mLinking CXX shared library ../../bin/libggml-base.so[0m
[  8%] Built target ggml-base
[  8%] [32mBuilding C object ggml

In [None]:
# Confirm the build produced the llama-quantize binary.
!ls llama.cpp/build/bin

libggml-base.so        libggml-cpu.so.0.9.5  libllama.so.0
libggml-base.so.0      libggml.so	     libllama.so.0.0.7839
libggml-base.so.0.9.5  libggml.so.0	     llama-quantize
libggml-cpu.so	       libggml.so.0.9.5
libggml-cpu.so.0       libllama.so


In [None]:
# Quantize the converted GGUF to Q4_K_M.
# Arguments, in order: input GGUF, output GGUF, quantization type.
!./llama.cpp/build/bin/llama-quantize \
    medgemma-1.5-4b-tccc-lora.gguf \
    medgemma-1.5-4b-tccc-lora-q4.gguf \
    Q4_K_M

main: build = 7839 (8f80d1b25)
main: built with GNU 11.4.0 for Linux x86_64
main: quantizing 'medgemma-1.5-4b-tccc-lora.gguf' to 'medgemma-1.5-4b-tccc-lora-q4.gguf' as Q4_K_M
llama_model_loader: direct I/O is enabled, disabling mmap
llama_model_loader: loaded meta data with 36 key-value pairs and 444 tensors from medgemma-1.5-4b-tccc-lora.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = gemma3
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Medgemma 1.5 4b Tccc Lora Merged
llama_model_loader: - kv   3:                           general.finetune str              = tccc-lora-merged
llama_model_loader: - kv   4:                           general.basename str              = medgem

In [None]:
# Show the size of the quantized file in human-readable units.
!ls -lh medgemma-1.5-4b-tccc-lora-q4.gguf

-rw-r--r-- 1 root root 2.4G Jan 26 19:39 medgemma-1.5-4b-tccc-lora-q4.gguf


In [None]:
from huggingface_hub import list_repo_files, repo_exists

gguf_repo_id = "CharlieKingOfTheRats/medgemma-1.5-4b-tccc-lora-gguf"

# list_repo_files raises when the repo does not exist. On a fresh
# top-to-bottom run this cell executes before the repo is created, so check
# first instead of aborting the run.
if repo_exists(gguf_repo_id):
    print(list_repo_files(gguf_repo_id))
else:
    print(f"{gguf_repo_id} does not exist yet")

['.gitattributes']


In [None]:
from huggingface_hub import HfApi

api = HfApi()
repo_id = "CharlieKingOfTheRats/medgemma-1.5-4b-tccc-lora-gguf"

# Create the model repo that will hold the GGUF file. exist_ok=True makes the
# call safe to re-run when the repo is already there.
api.create_repo(
    repo_id=repo_id,
    repo_type="model",
    exist_ok=True,
    private=False  # set True if you want it private
)

RepoUrl('https://huggingface.co/CharlieKingOfTheRats/medgemma-1.5-4b-tccc-lora-gguf', endpoint='https://huggingface.co', repo_type='model', repo_id='CharlieKingOfTheRats/medgemma-1.5-4b-tccc-lora-gguf')

Mount Google Drive and copy the quantized GGUF file to it

In [None]:
from google.colab import drive
import os
import shutil

# Mount Google Drive
drive.mount('/content/drive')

# Paths
src = "/content/medgemma-1.5-4b-tccc-lora-q4.gguf"
dst_dir = "/content/drive/MyDrive/ai_models"
dst = os.path.join(dst_dir, os.path.basename(src))

# Make destination folder if needed
os.makedirs(dst_dir, exist_ok=True)

# Copy with progress
print(f"Copying {src} to {dst}")
shutil.copy2(src, dst)

# Verify size
print("\nVerification:")
!ls -lh "$dst"


Mounted at /content/drive
Copying /content/medgemma-1.5-4b-tccc-lora-q4.gguf to /content/drive/MyDrive/ai_models/medgemma-1.5-4b-tccc-lora-q4.gguf

Verification:
-rw------- 1 root root 2.4G Jan 26 19:39 /content/drive/MyDrive/ai_models/medgemma-1.5-4b-tccc-lora-q4.gguf
