In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from safetensors.torch import load_file, save_file
from huggingface_hub import InferenceApi, InferenceClient, upload_folder, create_repo

import torch
import os
import json

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# === Paths ===
# Original LoRA adapter file produced by MLX
mlx_path = "../adapters/lora-Llama-3.2-3B-Instruct-lr-6/adapters.safetensors"
output_dir = "../converted_adapters"  # directory that receives the converted adapter
os.makedirs(output_dir, exist_ok=True)

# === Load the MLX weights ===
print(f"🔄 讀取 MLX LoRA 權重檔：{mlx_path}")
mlx_tensors = load_file(mlx_path)

# === Re-save under the file name PEFT looks for ===
# NOTE(review): the tensors are written back unchanged. This assumes the key
# names and tensor layouts in the MLX file already match what PEFT expects --
# TODO confirm by comparing the merged model's outputs against the base model.
adapter_model_path = os.path.join(output_dir, "adapter_model.safetensors")
save_file(mlx_tensors, adapter_model_path)
print(f"✅ 儲存轉換後的權重到：{adapter_model_path}")

# === Build adapter_config.json ===
adapter_config = {
    "peft_type": "LORA",
    # Must name the model the adapter was trained on: the adapter path above
    # is a Llama-3.2-3B-Instruct LoRA, and that is the base model it is later
    # merged into. The previous value pointed at an unrelated Gemma checkpoint.
    "base_model_name_or_path": "meta-llama/Llama-3.2-3B-Instruct",
    "inference_mode": True,
    # NOTE(review): rank / alpha / dropout are assumed to mirror the MLX
    # training settings -- verify against the training configuration.
    "r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.0,
    "bias": "none",
    "task_type": "CAUSAL_LM"
}

adapter_config_path = os.path.join(output_dir, "adapter_config.json")
with open(adapter_config_path, "w", encoding="utf-8") as f:
    json.dump(adapter_config, f, indent=4)

print(f"✅ 建立 adapter_config.json：{adapter_config_path}")

In [None]:
# Identifiers and locations used by the merge step.
base_model_name = "meta-llama/Llama-3.2-3B-Instruct"
lora_adapter_path = "../converted_adapters"
output_path = "../models/lora-Llama-3.2-3B-Instruct"

# Use the MPS backend when torch reports it as available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load the base model in float16 and move it to the selected device.
model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float16)
model.to(device)

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Wrap the base model with the converted LoRA adapter ...
model = PeftModel.from_pretrained(model, model_id=lora_adapter_path)
# ... then merge the adapter and drop the PEFT wrapper.
model = model.merge_and_unload()
model.to(device)

# Write the merged model and its tokenizer to the same output directory.
model.save_pretrained(output_path)
tokenizer.save_pretrained(output_path)

print("模型與 tokenizer 已儲存完畢")

In [28]:
# Push the local merged-model folder to the Hugging Face Hub model repository.
upload_folder(
    folder_path="../models/lora-Llama-3.2-3B-Instruct",
    repo_id="weber50432/lora-Llama-3.2-3B-Instruct",
    repo_type="model",
    path_in_repo=".",  # upload the whole folder into the repository root
)

CommitInfo(commit_url='https://huggingface.co/weber50432/lora-Llama-3.2-3B-Instruct/commit/11dc56166ac47baf7d120b170e9a5c73728d7314', commit_message='Upload folder using huggingface_hub', commit_description='', oid='11dc56166ac47baf7d120b170e9a5c73728d7314', pr_url=None, repo_url=RepoUrl('https://huggingface.co/weber50432/lora-Llama-3.2-3B-Instruct', endpoint='https://huggingface.co', repo_type='model', repo_id='weber50432/lora-Llama-3.2-3B-Instruct'), pr_revision=None, pr_num=None)

In [9]:
# Model repository that call_hf queries; replace with your own repository ID.
repo_id = "meta-llama/Llama-3.2-3B-Instruct"

# Load the API token from a local JSON file; the value is stored under "key".
with open("hg-api-key.json", "r") as f:
    data = json.load(f)
token = data["key"]


def call_hf(prompt, max_new_tokens, temperature=0, model=None):
    """
    Send a single-turn chat prompt to the Hugging Face Inference API.

    Args:
        prompt: Text sent as the content of one "user" message.
        max_new_tokens: Forwarded as ``max_tokens`` to the chat completion call.
        temperature: Sampling temperature forwarded to the API (default 0).
        model: Optional model repository ID to query. When omitted (``None``),
            the notebook-level ``repo_id`` is used, which is the previous
            behavior.

    Returns:
        The message content of the first choice in the response.
    """
    client_hf = InferenceClient(api_key=token)

    # Fall back to the notebook-wide default unless the caller picked a model.
    target_model = repo_id if model is None else model

    response = client_hf.chat_completion(
        model=target_model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_new_tokens,
        temperature=temperature
    )
    return response.choices[0].message.content

In [10]:
# Try the wrapper with a short prompt and a small token budget; the returned
# text is displayed as the cell output.
prompt = "What is the meaning of life?"
max_new_tokens = 50
call_hf(prompt=prompt, max_new_tokens=max_new_tokens)

'The question of the meaning of life is one of the most profound and debated topics in human history. It has been explored by philosophers, theologians, scientists, and many others across various cultures and disciplines. While there is no one definitive answer, here'