In [None]:
%%capture
# Install the notebook's dependencies; %%capture hides the pip output.
# NOTE(review): gradio, huggingface_hub and the nightly unsloth build are not
# version-pinned, so a later re-run may resolve to different versions.
!pip install gradio
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
# peft and transformers are pinned to exact versions and installed after
# unsloth, so these are the versions left in the environment.
!pip install peft==0.14.0
!pip install huggingface_hub
!pip install transformers==4.46.3

In [None]:
# Sanity check: show the interpreter version and which peft / transformers /
# unsloth builds are actually installed after the setup cell.
!python --version
!pip list | grep -E 'peft|transformers|unsloth'

Python 3.10.12
peft                               0.14.0
sentence-transformers              3.2.1
transformers                       4.46.3
unsloth                            2024.12.4
unsloth_zoo                        2024.12.1


In [None]:
from unsloth import FastLanguageModel
from huggingface_hub import snapshot_download
from transformers import AutoConfig
# Short, UI-facing model names mapped to the Hugging Face Hub repo ids they load.
models = {
    "orpo": "EITD/orpo_1",
    # "orpo": "EITD/orpo",  # alternative repo, currently disabled
    "origin_llama": "unsloth/Llama-3.2-1B-Instruct",
}

# Local root folder; every repo is mirrored under <output_dir>/<repo_id>.
output_dir = "./models"

# The loop variable is deliberately not called `model`: at cell scope it would
# become a module-level global and overwrite the loaded network that `respond`
# reads from the global `model`, replacing it with a plain repo-id string
# whenever this cell is re-run.
for repo_id in models.values():
    print(f"Downloading model: {repo_id}")
    snapshot_download(repo_id=repo_id, local_dir=f"{output_dir}/{repo_id}")
    print(f"Model {repo_id} downloaded successfully!")

Downloading model: EITD/orpo_1
Model EITD/orpo_1 downloaded successfully!
Downloading model: unsloth/Llama-3.2-1B-Instruct
Model unsloth/Llama-3.2-1B-Instruct downloaded successfully!


In [None]:
def load_model(model_name):
    """Load one of the locally downloaded checkpoints through Unsloth.

    Args:
        model_name: A key of the module-level `models` dict (e.g. "orpo").

    Returns:
        The `(model, tokenizer)` pair returned by
        `FastLanguageModel.from_pretrained`.

    Raises:
        ValueError: If `model_name` is not a key of `models`.
    """
    repo_id = models.get(model_name)
    if repo_id is None:
        # `dict.get` yields None for unknown names; report the valid choices
        # instead of failing later on a str + None concatenation.
        raise ValueError(
            f"Unknown model {model_name!r}; expected one of {sorted(models)}"
        )

    # Same layout the download step writes to: <output_dir>/<repo_id>.
    model_path = f"{output_dir}/{repo_id}"
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_path, # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = 2048,
        dtype = None,  # presumably lets Unsloth pick the dtype -- confirm in its docs
        load_in_4bit = True,
    )
    return model, tokenizer

def update_model_and_tokenizer(model_name):
    """Make `model_name` the active chat model.

    Loads the checkpoint via `load_model`, stores the result in the
    module-level `model` and `tokenizer` globals, passes the model to
    `FastLanguageModel.for_inference`, and prints which model is now in use.

    Args:
        model_name: Name forwarded unchanged to `load_model`.
    """
    global model, tokenizer
    loaded_pair = load_model(model_name)
    model, tokenizer = loaded_pair
    # Hand the freshly loaded model to Unsloth's inference setup before use.
    FastLanguageModel.for_inference(model)
    print(f"now using {model_name}")

In [None]:
def respond(
    message,
    history: list[tuple[str, str]],
    model_name,
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Generate one assistant reply for the chat interface.

    Builds a chat prompt from the system message, the earlier turns and the
    new user message, then generates with the module-level `model` and
    `tokenizer` globals.

    Args:
        message: The new user message.
        history: Earlier turns as (user_text, assistant_text) pairs; empty or
            None entries are skipped.
        model_name: Name of the selected model. Not used for generation: the
            active model is whatever the module-level `model` global holds.
        system_message: Text placed in the leading system turn.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling cumulative probability cutoff.

    Yields:
        The decoded reply text, as a single value.
    """
    messages = [{"role": "system", "content": system_message}]

    for turn in history:
        user_text, assistant_text = turn[0], turn[1]
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})

    messages.append({"role": "user", "content": message})

    # Put the prompt on the same device as the model weights; the template
    # call itself returns the ids without any device placement.
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to(model.device)

    output_ids = model.generate(
        input_ids = input_ids,
        max_new_tokens = int(max_tokens),  # UI sliders may deliver a float
        use_cache = True,
        # Sample explicitly so temperature / top_p take effect regardless of
        # the checkpoint's default generation settings.
        do_sample = True,
        temperature = temperature,
        # The UI control is a nucleus-sampling (top-p) value, so it must be
        # passed as `top_p`, not `min_p`.
        top_p = top_p,
    )

    # The output starts with the prompt tokens. Decode only what was generated
    # after them, rather than splitting the full text on the word "assistant",
    # which would cut any reply that contains that word.
    prompt_length = input_ids.shape[-1]
    response = tokenizer.decode(
        output_ids[0][prompt_length:], skip_special_tokens = True
    ).strip()

    yield response

In [None]:
import gradio as gr

# Module-level slots for the active model/tokenizer; they are filled in by
# update_model_and_tokenizer() and read by respond().
model, tokenizer = None, None

# Input components are created up front so they can be both wired to events
# and handed to the chat interface as additional inputs below.
model_dropdown = gr.Dropdown(choices=list(models.keys()), label="Choose a Language Model", value="orpo")
system_message = gr.Textbox(value="You are a friendly Chatbot.", label="System message")
new_token = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")

# Load the dropdown's default model eagerly so `model` / `tokenizer` are not
# None when the first chat message arrives.
update_model_and_tokenizer(model_dropdown.value)

with gr.Blocks(fill_height = True, fill_width = True) as demo:
    # Reload the global model whenever a different entry is selected; the
    # handler updates globals only, hence no output components.
    model_dropdown.change(update_model_and_tokenizer, inputs=model_dropdown, outputs=None)

    # The additional inputs are listed in the order of respond()'s parameters
    # after (message, history): model_name, system_message, max_tokens,
    # temperature, top_p.
    # NOTE(review): these components are created outside the Blocks context;
    # presumably ChatInterface renders them itself -- confirm for the
    # installed (unpinned) gradio version.
    chatbot = gr.ChatInterface(
        respond,
        additional_inputs=[model_dropdown, system_message, new_token, temperature, top_p],
        title="HuggingFace Chatbot",
        description="A chat assistant using a Hugging Face model that supports custom system prompts, token length, generation temperature, and other parameters.",
    )


# Start the app when this code is executed as the main module.
if __name__ == "__main__":
    demo.launch()

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
now using orpo




Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://dc0a81bb3d797fb501.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
