In [1]:
# Check for GPU and install build dependencies
!nvidia-smi
!apt-get update && apt-get install -y cmake build-essential python3-pip

# Clone llama.cpp repository
!git clone https://github.com/ggerganov/llama.cpp
%cd llama.cpp

# Install Python requirements for the conversion script
!pip install -r requirements.txt

# Build llama.cpp with CUDA (cuBLAS) support for GPU acceleration
!cmake -B build -DGGML_CUDA=ON
!cmake --build build --config Release -j $(nproc)

Mon Dec 29 03:39:04 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   51C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
[0mCMAKE_BUILD_TYPE=Release[0m
-- Found Git: /usr/bin/git (found version "2.34.1")
-- The ASM compiler identification is GNU
-- Found assembler: /usr/bin/cc
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE
-- CMAKE_SYSTEM_PROCESSOR: x86_64
-- GGML_SYSTEM_ARCH: x86
-- Including CPU backend
-- Found OpenMP_C: -fopenmp (found version "4.5")
-- Found OpenMP_CXX: -fopenmp (found version "4.5")
-- Found OpenMP: TRUE (found ve

In [15]:
# Install/Update Hugging Face Hub (Ensure we have the latest 'hf' command)
!pip install -U huggingface_hub

import os

# Define model ID and directory
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
MODEL_DIR = "model_raw"

# Method 1: Try the modern 'hf' CLI (Flag '--local-dir-use-symlinks' is REMOVED in v1.0+)
print("Attempting download with 'hf' CLI...")
exit_code = os.system(f"hf download {MODEL_ID} --local-dir {MODEL_DIR}")

# Method 2: Python Fallback (If CLI fails or isn't in PATH)
if exit_code != 0:
    print("\nCLI failed or not found. Switching to Python API...")
    from huggingface_hub import snapshot_download

    # In newer versions, 'local_dir_use_symlinks' is often implicit or deprecated,
    # but 'local_dir' ensures files are placed where we want them.
    snapshot_download(
        repo_id=MODEL_ID,
        local_dir=MODEL_DIR,
        ignore_patterns=["*.msgpack", "*.h5", "*.ot"], # Ignore non-GGUF/SafeTensors formats to save space
    )

print(f"\nSuccess! Model downloaded to: {MODEL_DIR}")

Attempting download with 'hf' CLI...

Success! Model downloaded to: model_raw


In [17]:
# Fix Version Conflict: Upgrade transformers to support Qwen 2.5 and newer huggingface-hub
!pip install -U transformers accelerate

import os

# Define paths (Ensure these match your previous cells)
MODEL_DIR = "model_raw"
fp16_output = "qwen2.5-1.5b-instruct-fp16.gguf"

# Verify model directory exists before running
if not os.path.exists(MODEL_DIR):
    print(f"Error: Model directory '{MODEL_DIR}' not found. Did Cell 2 finish successfully?")
else:
    # Run conversion script
    # Usage: convert_hf_to_gguf.py [dir] --outfile [out_file]
    !python convert_hf_to_gguf.py {MODEL_DIR} --outfile {fp16_output} --outtype f16

    print(f"Conversion complete: {fp16_output}")

Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Downloading huggingface_hub-0.36.0-py3-none-any.whl (566 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.1/566.1 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface-hub
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface_hub 1.2.3
    Uninstalling huggingface_hub-1.2.3:
      Successfully uninstalled huggingface_hub-1.2.3
Successfully installed huggingface-hub-0.36.0
INFO:hf-to-gguf:Loading model: model_raw
INFO:hf-to-gguf:Model architecture: Qwen2ForCausalLM
INFO:hf-to-gguf:gguf: indexing model part 'model.safetensors'
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:token_embd.weight,         torch.bfloat16 --> F16, shape = {1536, 151936}
INFO:hf-to-gguf:blk.0.attn_norm.weight,    torch.b

In [18]:
# Define quantization target
quantized_output = "qwen2.5-1.5b-instruct-q4_k_m.gguf"
quant_method = "q4_k_m"
binary_path = "./build/bin/llama-quantize"

# Run quantization
# Usage: ./llama-quantize [input_gguf] [output_gguf] [method]
!{binary_path} {fp16_output} {quantized_output} {quant_method}

print(f"Quantization complete. File ready: {quantized_output}")

# Cleanup the FP16 file to free up disk space before downloading
if os.path.exists(fp16_output):
    os.remove(fp16_output)
    print("Intermediate FP16 file removed to save space.")

main: build = 7565 (382808c14)
main: built with GNU 11.4.0 for Linux x86_64
main: quantizing 'qwen2.5-1.5b-instruct-fp16.gguf' to 'qwen2.5-1.5b-instruct-q4_k_m.gguf' as Q4_K_M
llama_model_loader: loaded meta data with 35 key-value pairs and 338 tensors from qwen2.5-1.5b-instruct-fp16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                     general.sampling.top_k i32              = 20
llama_model_loader: - kv   3:                     general.sampling.top_p f32              = 0.800000
llama_model_loader: - kv   4:                      general.sampling.temp f32              = 0.700000
llama_model_loader: - kv   5:                               general.name str              = 

In [19]:
# Quick inference test using llama-cli
cli_path = "./build/bin/llama-cli"
prompt = "Explain quantum computing in one sentence."

!{cli_path} -m {quantized_output} -p "{prompt}" -n 32 -t 4 -ngl 99

ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: Tesla T4, compute capability 7.5, VMM: yes

Loading model... |-\|/-\|/-\|/-\|/-\|/- 


▄▄ ▄▄
██ ██
██ ██  ▀▀█▄ ███▄███▄  ▀▀█▄    ▄████ ████▄ ████▄
██ ██ ▄█▀██ ██ ██ ██ ▄█▀██    ██    ██ ██ ██ ██
██ ██ ▀█▄██ ██ ██ ██ ▀█▄██ ██ ▀████ ████▀ ████▀
                                    ██    ██
                                    ▀▀    ▀▀

build      : b7565-382808c14
model      : qwen2.5-1.5b-instruct-q4_k_m.gguf
modalities : text

available commands:
  /exit or Ctrl+C     stop or exit
  /regen              regenerate the last response
  /clear              clear the chat history
  /read               add a text file

[1m[32m
> Explain quantum computing in one sentence.
[0m
| Quantum computing leverages quantum-mechanical phenomena to perform operations on data, potentially solving complex problems much faster than classical comput

In [20]:
import shutil
from google.colab import drive

# Persist the quantized model outside the ephemeral Colab VM.
# Relies on `quantized_output` defined by the quantization cell.

# Make Google Drive reachable from the Colab filesystem
drive.mount('/content/drive')

# Target location: the file goes directly into the Drive root folder
destination_path = f"/content/drive/My Drive/{quantized_output}"

print(f"Copying {quantized_output} to Google Drive...")
shutil.copy(src=quantized_output, dst=destination_path)
print(f"Successfully saved to: {destination_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Copying qwen2.5-1.5b-instruct-q4_k_m.gguf to Google Drive...
Successfully saved to: /content/drive/My Drive/qwen2.5-1.5b-instruct-q4_k_m.gguf
