<a href="https://colab.research.google.com/github/Aang-CHO2/gradio_LLaMa/blob/main/Llama_3_1_MedPalm2_8B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
!pip install gradio # Install the gradio package

import os
import gradio as gr # Now this line should work
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def get_huggingface_token():
    """Return the Hugging Face access token.

    The ``HUGGINGFACE_TOKEN`` environment variable is consulted first. When it
    is unset or contains only whitespace, the user is prompted for the token.

    Returns:
        The token with surrounding whitespace removed.

    Raises:
        ValueError: If neither the environment nor the prompt yields a
            non-empty token.
    """
    # Imported locally so the function is self-contained; only the prompt
    # path below needs it.
    import getpass

    # Strip the environment value the same way the prompted value is stripped,
    # so a blank or newline-padded variable is not passed on verbatim.
    hf_token = (os.getenv('HUGGINGFACE_TOKEN') or "").strip()
    if not hf_token:
        print("HUGGINGFACE_TOKEN environment variable is not set.")
        # getpass hides the typed characters, keeping the secret out of the
        # terminal / saved notebook output.
        hf_token = getpass.getpass("Please enter your Hugging Face token: ").strip()
        if not hf_token:
            raise ValueError("A valid Hugging Face token is required to proceed.")
    return hf_token

try:
    # Resolve credentials first so a missing token fails before any download.
    hf_token = get_huggingface_token()

    # Hub repository id of the checkpoint to load.
    model_name = "Laim/Llama-3.1-MedPalm2-imitate-8B-Instruct"

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load the tokenizer and model, and move the model to the chosen device.
    # `token` is the supported keyword for authentication; the older
    # `use_auth_token` spelling is deprecated in Transformers.
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token).to(device)

except Exception as e:
    # Report the failure in the cell output, then re-raise so execution stops
    # instead of continuing without a model.
    print(f"Error loading model: {e}")
    raise

# Token budget for the prompt fed to the model and for the generated reply.
_MAX_INPUT_TOKENS = 512
_MAX_NEW_TOKENS = 100


def _build_prompt(history, prompt):
    """Render prior turns plus the new user message as a User:/Assistant: transcript."""
    labels = {"user": "User", "assistant": "Assistant"}
    lines = [
        f"{labels.get(turn.get('role'), 'User')}: {turn['content']}"
        for turn in history
    ]
    lines.append(f"User: {prompt}")
    # Trailing cue with no content: the model is expected to continue from here.
    lines.append("Assistant:")
    return "\n".join(lines)


def chatbot_fn(prompt, chatbot_history=None):
    """Generate a response based on user input and conversation history.

    Args:
        prompt: The new user message.
        chatbot_history: Previous turns as a list of
            ``{"role": ..., "content": ...}`` dicts, or ``None`` when the
            conversation has not started yet.

    Returns:
        A ``(reply, history)`` tuple. ``history`` is a new list; on success it
        has the user message and the assistant reply appended. For a blank
        prompt or a generation error, ``reply`` is a message describing the
        problem and ``history`` carries no new turns.
    """
    # Work on a private copy: this avoids a shared mutable default and also
    # copes with the state being None before the first turn.
    chatbot_history = list(chatbot_history) if chatbot_history else []

    if not prompt or not prompt.strip():
        return "Please enter a valid prompt.", chatbot_history

    # Build the input text from conversation history and the new prompt,
    # labelling every earlier turn with its speaker.
    input_text = _build_prompt(chatbot_history, prompt)

    try:
        encoded = tokenizer(input_text, return_tensors="pt")
        # Keep the most recent tokens. Truncating from the right would drop
        # the newest user message and the "Assistant:" cue once the history
        # grows past the limit.
        inputs = {
            name: tensor[:, -_MAX_INPUT_TOKENS:].to(device)
            for name, tensor in encoded.items()
        }
        prompt_length = inputs["input_ids"].shape[-1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=_MAX_NEW_TOKENS,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens instead of searching the full
        # text for "Assistant:", which breaks when that marker appears in the
        # user's message or in the reply itself.
        generated = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        # Drop any follow-up "User:" turn the model may go on to write.
        assistant_response = generated.split("\nUser:", 1)[0].strip()
    except Exception as e:
        return f"An error occurred: {e}", chatbot_history

    # Append the user input and assistant response to the conversation history
    chatbot_history.append({"role": "user", "content": prompt})
    chatbot_history.append({"role": "assistant", "content": assistant_response})

    return assistant_response, chatbot_history

# Set up Gradio interface
# The "state" input/output pair carries the conversation history returned by
# chatbot_fn from one call to the next.
iface = gr.Interface(
    fn=chatbot_fn,
    inputs=["text", "state"],
    outputs=["text", "state"],
    title="Llama-3.1-MedPalm2-imitate-8B-Instruct Chatbot",
    description="Chat with a Llama-3.1-MedPalm2-imitate-8B-Instruct model!",
    # `flagging_mode` is the Gradio 5 name for the deprecated `allow_flagging`.
    flagging_mode="never",
)

# Start the web UI only when this code runs as the main program.
if __name__ == "__main__":
    iface.launch()



Collecting gradio
  Downloading gradio-5.4.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.4-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.2 (from gradio)
  Downloading gradio_client-1.4.2-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub>=0.25.1 (from gradio)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart==0.0.12 (from gradio)
  Downloading python_multipart-0.0.12-py3-none-any.whl.metadata (1.9 kB)
Col



tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/914 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]



Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b2cda93a899ddcb0c5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
