In [1]:
!pip install transformers huggingface_hub



In [2]:
!pip install huggingface_hub[hf_xet]

Collecting hf-xet>=0.1.4 (from huggingface_hub[hf_xet])
  Downloading hf_xet-1.1.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Downloading hf_xet-1.1.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hf-xet
Successfully installed hf-xet-1.1.0


In [3]:
!git clone https://github.com/ggerganov/llama.cpp
%cd llama.cpp
%cd /content/llama.cpp
!mkdir build
%cd build
!cmake ..
!cmake --build . --config Release

Cloning into 'llama.cpp'...
remote: Enumerating objects: 50596, done.[K
remote: Counting objects: 100% (386/386), done.[K
remote: Compressing objects: 100% (267/267), done.[K
remote: Total 50596 (delta 269), reused 125 (delta 119), pack-reused 50210 (from 4)[K
Receiving objects: 100% (50596/50596), 111.20 MiB | 37.91 MiB/s, done.
Resolving deltas: 100% (36435/36435), done.
/content/llama.cpp
/content/llama.cpp
/content/llama.cpp/build
-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Found Git: /usr/bin/git (found version "2.34.1") 

In [7]:
!cmake .. -DLLAMA_CUDA=ON

  LLAMA_CUDA is deprecated and will be removed in the future.

  Use GGML_CUDA instead

Call Stack (most recent call first):
  CMakeLists.txt:114 (llama_option_depr)

[0m
-- CMAKE_SYSTEM_PROCESSOR: x86_64
-- Including CPU backend
-- x86 detected
-- Adding CPU backend variant ggml-cpu: -march=native 
-- Could not find nvcc, please set CUDAToolkit_ROOT.
[31mCMake Error at ggml/src/ggml-cuda/CMakeLists.txt:183 (message):
  CUDA Toolkit not found

[0m
-- Configuring incomplete, errors occurred!
See also "/content/llama.cpp/build/CMakeFiles/CMakeOutput.log".


In [None]:
%env HF_TOKEN=

env: HF_TOKEN=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


In [8]:
import os
import subprocess
import shutil
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer

# Define paths and model ID
model_id = "unsloth/Llama-3.2-1B"
model_dir = "./llama-3.2-1b"
gguf_dir = "./llama-3.2-1b-gguf"
gguf_model_path = os.path.join(gguf_dir, "ggml-model-f16.gguf")
quantized_model_path = os.path.join(gguf_dir, "ggml-model-Q4_K_M.gguf")
llama_cpp_dir = "/content/llama.cpp"

# Create directories
os.makedirs(model_dir, exist_ok=True)
os.makedirs(gguf_dir, exist_ok=True)

def run_command(command, error_message):
    result = subprocess.run(command, capture_output=True, text=True, shell=True)
    if result.returncode != 0:
        raise RuntimeError(f"{error_message}: {result.stderr}")
    return result

try:
    # Step 0: Set up llama.cpp
    if not os.path.exists(llama_cpp_dir):
        print("Setting up llama.cpp...")
        # Install dependencies (for Ubuntu/Colab-like environments)
        run_command(
            "apt-get update && apt-get install -y git cmake build-essential",
            "Failed to install dependencies"
        )
        # Clone llama.cpp
        run_command(
            f"git clone https://github.com/ggerganov/llama.cpp {llama_cpp_dir}",
            "Failed to clone llama.cpp"
        )
        # Build llama.cpp
        os.makedirs(os.path.join(llama_cpp_dir, "build"), exist_ok=True)
        os.chdir(os.path.join(llama_cpp_dir, "build"))
        run_command(
            "cmake .. && make -j$(nproc)",
            "Failed to compile llama.cpp"
        )
        os.chdir("/content")  # Return to working directory

    # Step 1: Download model from Hugging Face
    print(f"Downloading model {model_id} to {model_dir}...")
    snapshot_download(
        repo_id=model_id,
        local_dir=model_dir,
        token=os.environ.get("HF_TOKEN")  # Or replace with token="your_token"
    )

    # Step 2: Save tokenizer
    print(f"Saving tokenizer to {model_dir}...")
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ.get("HF_TOKEN"))
    tokenizer.save_pretrained(model_dir)

    # Step 3: Convert to GGUF
    print("Converting model to GGUF format...")
    convert_script = os.path.join(llama_cpp_dir, "convert_hf_to_gguf.py")
    result = subprocess.run(
        ["python", convert_script, model_dir, "--outfile", gguf_model_path, "--outtype", "f16"],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        raise RuntimeError(f"GGUF conversion failed: {result.stderr}")
    if not os.path.exists(gguf_model_path):
        raise FileNotFoundError("GGUF conversion failed: GGUF model not found.")

    # Step 4: Quantize to Q4_K_M
    print("Quantizing model to Q4_K_M...")
    quantize_bin = os.path.join(llama_cpp_dir, "build/bin/llama-quantize")  # Updated path
    if not os.path.exists(quantize_bin):
        raise FileNotFoundError(f"llama-quantize binary not found at {quantize_bin}. Please ensure llama.cpp was compiled.")
    result = subprocess.run(
        [quantize_bin, gguf_model_path, quantized_model_path, "Q4_K_M"],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        raise RuntimeError(f"Quantization failed: {result.stderr}")

    # Verify quantized model
    if os.path.exists(quantized_model_path):
        print(f"Quantized model saved to {quantized_model_path}")
    else:
        raise FileNotFoundError("Quantization failed: Quantized model not found.")

except Exception as e:
    print(f"An error occurred: {str(e)}")
    import traceback
    traceback.print_exc()

finally:
    # Clean up
    if os.path.exists(model_dir):
        print(f"Cleaning up temporary model directory {model_dir}...")
        shutil.rmtree(model_dir)

Downloading model unsloth/Llama-3.2-1B to ./llama-3.2-1b...


Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/935 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.72k [00:00<?, ?B/s]

Saving tokenizer to ./llama-3.2-1b...
Converting model to GGUF format...
Quantizing model to Q4_K_M...
Quantized model saved to ./llama-3.2-1b-gguf/ggml-model-Q4_K_M.gguf
Cleaning up temporary model directory ./llama-3.2-1b...


In [10]:
!pip install psutil torch



In [11]:
import os
import subprocess
import shutil
import time
import psutil
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
import torch

# Define paths and model ID
model_id = "unsloth/Llama-3.2-1B"
model_dir = "./llama-3.2-1b"
gguf_dir = "./llama-3.2-1b-gguf"
gguf_model_path = os.path.join(gguf_dir, "ggml-model-f16.gguf")
quantized_model_path = os.path.join(gguf_dir, "ggml-model-Q4_K_M.gguf")
llama_cpp_dir = "/content/llama.cpp"

# Define prompts for testing
test_prompts = [
    "What is the capital of France?",
    "Explain the theory of relativity in simple terms.",
    "Write a short poem about the stars."
]

# Define quantization configurations
quantization_configs = [
    "Q4_K_M",  # Default quantization
    "Q8_0"     # Additional quantization for testing
]

# Create directories
os.makedirs(model_dir, exist_ok=True)
os.makedirs(gguf_dir, exist_ok=True)

def run_command(command, error_message):
    result = subprocess.run(command, capture_output=True, text=True, shell=True)
    if result.returncode != 0:
        raise RuntimeError(f"{error_message}: {result.stderr}")
    return result

def check_gpu_bfloat16_support():
    print("Checking GPU compatibility with bfloat16...")
    try:
        # Check if NVIDIA GPU is available
        result = subprocess.run("nvidia-smi", shell=True, capture_output=True, text=True)
        if result.returncode != 0:
            print("No NVIDIA GPU detected or nvidia-smi not installed.")
            return False

        # Check for bfloat16 support (requires CUDA and specific GPU architecture, e.g., Ampere or later)
        if torch.cuda.is_available():
            device = torch.device("cuda")
            # bfloat16 is supported on GPUs with compute capability >= 8.0 (e.g., A100, RTX 3000 series)
            compute_capability = torch.cuda.get_device_capability(device)
            if compute_capability[0] >= 8:
                print(f"GPU supports bfloat16 (Compute Capability: {compute_capability[0]}.{compute_capability[1]})")
                return True
            else:
                print(f"GPU does not support bfloat16 (Compute Capability: {compute_capability[0]}.{compute_capability[1]})")
                return False
        else:
            print("CUDA not available. GPU bfloat16 support cannot be checked.")
            return False
    except Exception as e:
        print(f"Error checking GPU bfloat16 support: {str(e)}")
        return False

def measure_memory():
    """Measure current memory usage in MB."""
    process = psutil.Process(os.getpid())
    mem_info = process.memory_info()
    return mem_info.rss / 1024 / 1024  # Convert to MB

def run_inference(model_path, prompt, model_name):
    """Run inference using llama.cpp and measure performance."""
    print(f"\nRunning inference on {model_name} with prompt: {prompt}")
    start_time = time.time()
    baseline_memory = measure_memory()

    llama_cli = os.path.join(llama_cpp_dir, "build/bin/llama-cli")
    if not os.path.exists(llama_cli):
        raise FileNotFoundError(f"llama-cli binary not found at {llama_cli}. Please compile llama.cpp.")

    # Run inference
    result = subprocess.run(
        [llama_cli, "-m", model_path, "-p", prompt, "--n-predict", "100"],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        raise RuntimeError(f"Inference failed for {model_name}: {result.stderr}")

    end_time = time.time()
    inference_time = end_time - start_time
    peak_memory = measure_memory()

    print(f"Output: {result.stdout}")
    print(f"Inference Time: {inference_time:.2f} seconds")
    print(f"Memory Usage: Baseline {baseline_memory:.2f} MB, Peak {peak_memory:.2f} MB")
    return result.stdout, inference_time, peak_memory

try:
    # Step 0: Check GPU bfloat16 support
    has_bfloat16_support = check_gpu_bfloat16_support()

    # Step 1: Download model from Hugging Face (if not already done)
    if not os.path.exists(gguf_model_path):
        print(f"Downloading model {model_id} to {model_dir}...")
        snapshot_download(
            repo_id=model_id,
            local_dir=model_dir,
            token=os.environ.get("HF_TOKEN")
        )

        # Step 2: Save tokenizer
        print(f"Saving tokenizer to {model_dir}...")
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ.get("HF_TOKEN"))
        tokenizer.save_pretrained(model_dir)

        # Step 3: Convert to GGUF
        print("Converting model to GGUF format...")
        convert_script = os.path.join(llama_cpp_dir, "convert_hf_to_gguf.py")
        result = subprocess.run(
            ["python", convert_script, model_dir, "--outfile", gguf_model_path, "--outtype", "f16"],
            capture_output=True, text=True
        )
        if result.returncode != 0:
            raise RuntimeError(f"GGUF conversion failed: {result.stderr}")
        if not os.path.exists(gguf_model_path):
            raise FileNotFoundError("GGUF conversion failed: GGUF model not found.")

    # Step 4: Quantize model for each configuration
    quantize_bin = os.path.join(llama_cpp_dir, "build/bin/llama-quantize")
    if not os.path.exists(quantize_bin):
        raise FileNotFoundError(f"llama-quantize binary not found at {quantize_bin}. Please compile llama.cpp.")

    for quant_type in quantization_configs:
        quantized_model_path = os.path.join(gguf_dir, f"ggml-model-{quant_type}.gguf")
        if not os.path.exists(quantized_model_path):
            print(f"Quantizing model to {quant_type}...")
            result = subprocess.run(
                [quantize_bin, gguf_model_path, quantized_model_path, quant_type],
                capture_output=True, text=True
            )
            if result.returncode != 0:
                raise RuntimeError(f"Quantization failed for {quant_type}: {result.stderr}")
            if not os.path.exists(quantized_model_path):
                raise FileNotFoundError(f"Quantization failed: {quantized_model_path} not found.")
        print(f"Quantized model saved to {quantized_model_path}")

    # Step 5: Test original (FP16) and quantized models
    print("\nTesting models...")
    models_to_test = [
        (gguf_model_path, "Original FP16"),
        (os.path.join(gguf_dir, "ggml-model-Q4_K_M.gguf"), "Quantized Q4_K_M"),
        (os.path.join(gguf_dir, "ggml-model-Q8_0.gguf"), "Quantized Q8_0")
    ]

    for model_path, model_name in models_to_test:
        if os.path.exists(model_path):
            for prompt in test_prompts:
                output, inference_time, peak_memory = run_inference(model_path, prompt, model_name)
        else:
            print(f"Skipping {model_name}: Model file {model_path} not found.")

except Exception as e:
    print(f"An error occurred: {str(e)}")
    import traceback
    traceback.print_exc()

finally:
    # Clean up
    if os.path.exists(model_dir):
        print(f"Cleaning up temporary model directory {model_dir}...")
        shutil.rmtree(model_dir)

Checking GPU compatibility with bfloat16...
No NVIDIA GPU detected or nvidia-smi not installed.
Quantized model saved to ./llama-3.2-1b-gguf/ggml-model-Q4_K_M.gguf
Quantized model saved to ./llama-3.2-1b-gguf/ggml-model-Q8_0.gguf

Testing models...

Running inference on Original FP16 with prompt: What is the capital of France?
Output: What is the capital of France? A. Washington, D.C. B. Montpelier, Vermont C. Baton Rouge, Louisiana D. Paris, France
What is the capital of France? A. Washington, D.C. B. Montpelier, Vermont C. Baton Rouge, Louisiana D. Paris, France
What is the capital of France? A. Washington, D.C. B. Montpelier, Vermont C. Baton Rouge, Louisiana D. Paris, France
The capital of France is Paris, France


Inference Time: 109.96 seconds
Memory Usage: Baseline 1566.64 MB, Peak 1566.64 MB

Running inference on Original FP16 with prompt: Explain the theory of relativity in simple terms.
Output: Explain the theory of relativity in simple terms. The theory of relativity is the 