In [None]:
!pip install transformers torch

In [None]:
!pip install gguf

In [None]:
!pip install huggingface_hub[hf_xet]

In [None]:
# Fetch llama.cpp into the location the later cells expect and build its tools
# (the quantize step looks for build/bin/llama-quantize under this checkout).
# Cloning to an explicit path makes the cell independent of the current directory.
!git clone https://github.com/ggerganov/llama.cpp /content/llama.cpp
%cd /content/llama.cpp
# -p keeps the cell re-runnable when the build directory already exists.
!mkdir -p build
%cd build
!cmake ..
!cmake --build . --config Release

In [None]:
# Optional: reconfigure the build with CUDA enabled. GGML_CUDA is the current
# option name; LLAMA_CUDA is its deprecated alias. Explicit -S/-B paths make the
# cell independent of the current directory.
# NOTE: this only reconfigures; re-run `cmake --build` for it to take effect.
!cmake -S /content/llama.cpp -B /content/llama.cpp/build -DGGML_CUDA=ON

In [None]:
import os
import subprocess
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Step 1: Load and save the model in a compatible format
def save_hf_model(model_name, output_dir):
    """Load a Hugging Face causal-LM and its tokenizer and save both to disk.

    The model is loaded with ``torch_dtype=torch.float16`` and written, together
    with the tokenizer, into ``output_dir``.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        output_dir: Directory that receives the tokenizer and model files.
            Created (including parents) if it does not exist.
    """
    # Create the target first so an unusable output path fails before the
    # model is loaded; exist_ok makes re-runs safe without a separate
    # exists() check.
    os.makedirs(output_dir, exist_ok=True)

    print(f"Loading model {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

    print(f"Saving model to {output_dir}...")
    tokenizer.save_pretrained(output_dir)
    model.save_pretrained(output_dir)
    print("Model saved successfully.")

# Step 2: Convert to GGUF using llama.cpp's convert_hf_to_gguf.py
def convert_to_gguf(hf_model_dir, gguf_output_path, llama_cpp_path, model_name="Qwen3-1.7B"):
    """Convert a saved Hugging Face model directory to GGUF with llama.cpp's script.

    Args:
        hf_model_dir: Directory containing the saved Hugging Face model.
        gguf_output_path: Path passed to the converter's ``--outfile`` option.
        llama_cpp_path: Root of the llama.cpp checkout searched for the script.
        model_name: Value passed to the converter's ``--model-name`` option.

    Raises:
        FileNotFoundError: If none of the known converter script locations exist.
        RuntimeError: If the converter exits with a non-zero status.
    """
    # Function-scope import keeps this block self-contained within its cell.
    import sys

    # Known locations/names of the converter across llama.cpp layouts.
    candidate_paths = (
        os.path.join(llama_cpp_path, "convert_hf_to_gguf.py"),
        os.path.join(llama_cpp_path, "scripts/convert_hf_to_gguf.py"),
        os.path.join(llama_cpp_path, "convert-hf-to-gguf.py"),
    )
    convert_script = next((p for p in candidate_paths if os.path.exists(p)), None)
    if convert_script is None:
        raise FileNotFoundError(
            f"Could not find convert_hf_to_gguf.py or similar script in {llama_cpp_path}. "
            f"Run '!ls {llama_cpp_path}' to verify contents."
        )

    print("Converting to GGUF format...")
    cmd = [
        # Run the converter with the interpreter executing this code, so it sees
        # the same installed packages; a bare "python" may resolve elsewhere.
        sys.executable or "python", convert_script,
        hf_model_dir,
        "--outfile", gguf_output_path,
        "--model-name", model_name,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print("Error during GGUF conversion:")
        print(result.stderr)
        raise RuntimeError("GGUF conversion failed")
    print(f"GGUF model saved to {gguf_output_path}")

# Step 3: Quantize the GGUF model
def quantize_gguf(gguf_input_path, quantized_output_path, llama_cpp_path, quant_type="Q4_0"):
    """Quantize a GGUF file by invoking llama.cpp's ``llama-quantize`` binary.

    Args:
        gguf_input_path: GGUF file to quantize.
        quantized_output_path: Path the quantized file is written to.
        llama_cpp_path: Root of the llama.cpp checkout; the binary is expected
            at ``build/bin/llama-quantize`` below it.
        quant_type: Quantization type passed to the binary as its last argument.

    Raises:
        FileNotFoundError: If the ``llama-quantize`` binary does not exist.
        RuntimeError: If the binary exits with a non-zero status.
    """
    tool_path = os.path.join(llama_cpp_path, "build/bin/llama-quantize")
    tool_missing = not os.path.exists(tool_path)
    if tool_missing:
        raise FileNotFoundError(
            f"llama-quantize binary not found in {llama_cpp_path}/build/bin. "
            "Ensure 'cmake' and 'cmake --build' completed successfully."
        )

    print(f"Quantizing GGUF model to {quant_type}...")
    completed = subprocess.run(
        [tool_path, gguf_input_path, quantized_output_path, quant_type],
        capture_output=True,
        text=True,
    )

    # Success path first; everything after it handles a failed run.
    if completed.returncode == 0:
        print(f"Quantized model saved to {quantized_output_path}")
        return

    print("Error during quantization:")
    print(completed.stderr)
    raise RuntimeError("Quantization failed")

def main():
    """Run the pipeline: save the HF model, convert it to GGUF, then quantize it."""
    # Configuration
    model_name = "Qwen/Qwen3-1.7B"
    llama_cpp_path = "/content/llama.cpp"  # Path to llama.cpp in Colab
    quant_type = "Q4_0"  # Options: Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, etc.

    # Derive every artifact path from one stem so file names cannot drift out
    # of sync with the model being converted (the intermediate GGUF used to be
    # labelled "8b" although the model is the 1.7B variant).
    work_dir = "/content"
    stem = "qwen3_1.7b"
    hf_model_dir = f"{work_dir}/{stem}_hf"
    gguf_output_path = f"{work_dir}/{stem}.gguf"
    quantized_output_path = f"{work_dir}/{stem}_{quant_type.lower()}.gguf"

    # Step 1: Save Hugging Face model
    save_hf_model(model_name, hf_model_dir)

    # Step 2: Convert to GGUF
    convert_to_gguf(hf_model_dir, gguf_output_path, llama_cpp_path)

    # Step 3: Quantize
    quantize_gguf(gguf_output_path, quantized_output_path, llama_cpp_path, quant_type)

if __name__ == "__main__":
    main()

In [None]:
!pip install huggingface_hub

In [None]:
!huggingface-cli login

In [None]:
from huggingface_hub import HfApi, create_repo, login
import os

def upload_to_huggingface(
    model_path: str,
    repo_name: str,
    token: str,
    private: bool = False,
    exist_ok: bool = True
):
    """
    Uploads a local model folder or a single model file to Hugging Face Hub.

    Args:
        model_path: Path to the local model directory or file. A directory is
            uploaded in full; a file is uploaded under its base name.
        repo_name: Repository name on Hugging Face (e.g., "qwen3-1.7b-gguf").
            A bare name is placed under the "avinashhm" account; a value that
            already contains "/" is used as the full repository ID.
        token: Your Hugging Face API token.
        private: Whether to make the repo private.
        exist_ok: If True, uploads to existing repo if exists; otherwise raises error.

    Raises:
        FileNotFoundError: If ``model_path`` does not exist locally.
    """
    # Fail before any network call when there is nothing to upload.
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model path does not exist: {model_path}")

    # Log in to Hugging Face
    login(token=token)

    api = HfApi()

    # Full repository ID
    repo_id = repo_name if "/" in repo_name else f"avinashhm/{repo_name}"

    # Create a new repo if it doesn't exist
    try:
        create_repo(repo_id=repo_id, private=private, token=token, exist_ok=exist_ok)
    except Exception as e:
        print(f"Error creating repository: {e}")
        return

    # Upload files
    print(f"Uploading model from {model_path} to {repo_id}...")
    if os.path.isfile(model_path):
        # upload_folder only accepts directories; a single file (e.g. one
        # .gguf) goes through upload_file and keeps its file name in the repo.
        api.upload_file(
            path_or_fileobj=model_path,
            path_in_repo=os.path.basename(model_path),
            repo_id=repo_id,
            repo_type="model",
            token=token
        )
    else:
        api.upload_folder(
            folder_path=model_path,
            repo_id=repo_id,
            repo_type="model",
            token=token
        )
    print(f"Model uploaded successfully to https://huggingface.co/{repo_id}")

if __name__ == "__main__":
    # Configuration
    # SECURITY: never embed an access token in the notebook; anything saved or
    # shared with the file is leaked. The token that was previously hard-coded
    # here must be treated as compromised and revoked. Read it from the
    # environment instead.
    HF_TOKEN = os.environ.get("HF_TOKEN")
    if not HF_TOKEN:
        raise RuntimeError(
            "HF_TOKEN is not set. Provide your Hugging Face access token via the "
            "HF_TOKEN environment variable before running this cell."
        )
    # NOTE(review): "/content" is the whole working directory. Besides the
    # quantized .gguf it also holds the saved HF model, the unquantized GGUF and
    # the llama.cpp checkout created by earlier cells, and all of it would be
    # uploaded. Point this at a directory containing only the files to publish.
    MODEL_PATH = "/content"  # Folder containing the quantized .gguf model
    REPO_NAME = "qwen3-1.7b-gguf-q4_0"  # Name of the repo on Hugging Face

    upload_to_huggingface(
        model_path=MODEL_PATH,
        repo_name=REPO_NAME,
        token=HF_TOKEN,
        private=False
    )