<a href="https://colab.research.google.com/github/AlessandroDiLauro/Chat-with-any-LLMs---/blob/main/inference_api_with_gradio_mistralai_mistral_7b_instruct_v0_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mistral + HF Gradio Chat with any LLMs 🤘

###  Using the new, powerful [Mistral 7B](https://mistral.ai/news/announcing-mistral-7b/) and [Gradio](https://www.gradio.app/) to build and share LLM apps.



<a target="_blank" href="https://gradio.app">
<img src="https://huggingface.co/front/assets/spaces-launch-page/gradio-logo.svg" width="270" style="display:inline;">
<div align="left">

<em>Build & share delightful machine learning apps easily</em>
</div>
</a>


<img src="https://techcrunch.com/wp-content/uploads/2023/09/mistral-7b-v0.1.jpg" width="270" style="display:inline;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
<div align="center">


---



In [None]:
pip install gradio # 'gradio' is the name of the package you want to install. Gradio makes it easy to create UIs for prototyping your machine learning model or for creating an interface for it.

In [None]:
import requests # The 'requests' library is imported. This library allows you to send HTTP requests using Python.

import os

# Hosted Inference API endpoint for the Mistral-7B-Instruct model.
API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1"

# Access tokens must never be committed to a notebook: read the token from the
# HF_TOKEN environment variable (set it before starting the kernel).
# NOTE(review): the token that used to be hard-coded here has been published
# and should be revoked.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# When no token is configured, send no Authorization header at all instead of
# a malformed "Bearer " value.
headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}

def query(payload, timeout=120):
    """POST `payload` as JSON to the Inference API and return the decoded reply.

    Args:
        payload: JSON-serialisable request body, e.g. ``{"inputs": "some prompt"}``.
        timeout: Seconds to wait for the server before giving up, so a stalled
            request cannot hang the notebook indefinitely.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no answer arrives within `timeout` seconds.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    # Fail loudly on HTTP errors instead of handing an error payload to callers.
    response.raise_for_status()
    return response.json()

# Smoke test: ask the model to continue an Italian sentence.
output = query({"inputs": "Gli LLms sono fantastici perché riescono"})

In [None]:
# Show the decoded reply from the previous cell as this cell's output.
output

[{'generated_text': 'Gli LLms sono fantastici perché riescono a capire e rispondere a qualsiasi domanda che ti sia stata posta'}]

In [None]:
import gradio as gr

def generate(input):
    """Return the model's completion for the prompt typed into the Gradio textbox.

    Args:
        input: Prompt text. The name shadows the built-in ``input`` but is kept
            so existing keyword callers keep working.

    Returns:
        The ``generated_text`` field of the first item in the API reply.

    Raises:
        gr.Error: If the reply is not a non-empty list whose first item carries
            ``generated_text``; the raw reply is included in the message.
    """
    output = query({"inputs": input})
    try:
        return output[0]["generated_text"]
    except (KeyError, IndexError, TypeError) as exc:
        # Surface the unexpected reply in the UI instead of a bare lookup error.
        raise gr.Error(f"Unexpected response from the Inference API: {output!r}") from exc

# Shut down Gradio servers left over from earlier runs before starting a new one.
gr.close_all()

# Minimal UI: a single text input wired to `generate`, and a single text output.
demo = gr.Interface(
    fn=generate,
    inputs="text",
    outputs="text",
)

# share=True asks Gradio for a publicly shareable link to the running app.
demo.launch(share=True)

In [None]:
pip install text_generation

In [None]:
# Helper function
import requests, json
from text_generation import Client


# Build the text_generation client for the Mistral-7B-Instruct endpoint.
# It reuses API_URL and headers from the earlier cell so the endpoint and the
# credentials are defined in exactly one place; requests time out after 120 s.
client = Client(API_URL, headers=headers, timeout=120)

In [None]:
# Single-turn completion through the text_generation client.
prompt = "Has math been invented or discovered?"

# max_new_tokens caps the reply at 200 generated tokens; the generated text is
# the last expression, so it becomes the cell output.
client.generate(
    prompt,
    max_new_tokens=200,
).generated_text

'\nAnswer: Discovered'

Building an app to chat with any LLM!

In [None]:
#
import gradio as gr
def generate(input, slider):
    """Complete `input` with the shared client, generating at most `slider` new tokens."""
    return client.generate(input, max_new_tokens=slider).generated_text

# UI for `generate`: a prompt textbox plus a slider that sets the maximum number
# of new tokens, and a textbox that shows the completion.
demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Textbox(label="Prompt"),
        gr.Slider(label="Max new tokens", value=20, maximum=1024, minimum=1),
    ],
    outputs=[gr.Textbox(label="Completion")],
)

# Stop previously started servers, then launch with a shareable link.
gr.close_all()
demo.launch(share=True)

gr.Chatbot() to the rescue!

In [None]:
def format_chat_prompt(message, chat_history):
    """Flatten the conversation into one "User:/Assistant:" transcript prompt.

    Args:
        message: The new user message to answer.
        chat_history: Iterable of (user_message, bot_message) pairs, oldest first.

    Returns:
        One "\\nUser: ...\\nAssistant: ..." segment per past turn, followed by
        the new message and an open "Assistant:" cue for the model to complete.
    """
    past_turns = [
        f"\nUser: {asked}\nAssistant: {answered}"
        for asked, answered in chat_history
    ]
    return "".join(past_turns) + f"\nUser: {message}\nAssistant:"

def respond(message, chat_history):
    """Answer `message` in the context of `chat_history` and record the new turn.

    The transcript built by `format_chat_prompt` is sent to the shared client,
    with generation capped at 1024 new tokens and the stop sequences "\\nUser:"
    and "<|endoftext|>".

    Returns:
        A tuple of an empty string (used to clear the prompt textbox) and
        `chat_history`, which has the (message, reply) pair appended in place.
    """
    reply = client.generate(
        format_chat_prompt(message, chat_history),
        max_new_tokens=1024,
        stop_sequences=["\nUser:", "<|endoftext|>"],
    ).generated_text
    chat_history.append((message, reply))
    return "", chat_history


# Lay out the chat UI inside a Blocks context so the components are grouped together.
with gr.Blocks() as demo:
    # Conversation view; the height is kept small so it fits in the notebook.
    chatbot = gr.Chatbot(height=240)
    msg = gr.Textbox(label="Prompt")
    btn = gr.Button("Submit")
    # Resets both the prompt textbox and the conversation view.
    clear = gr.ClearButton(
        components=[msg, chatbot],
        value="Clear console",
    )

    # Clicking Submit and pressing Enter in the textbox both run `respond`,
    # which reads the prompt and history and writes both components back.
    btn.click(
        respond,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
    )
    msg.submit(
        respond,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
    )

# Stop servers started by earlier cells before launching this one.
gr.close_all()
# Launch the app with a shareable link.
demo.launch(share=True)

Adding other advanced features

In [None]:
 # Initialize the prompt with the system instruction.
def format_chat_prompt(message, chat_history, instruction):
    """Build the transcript prompt, prefixed with a system instruction.

    Args:
        message: The new user message to answer.
        chat_history: Iterable of (user_message, bot_message) pairs, oldest first.
        instruction: System message placed at the very start of the prompt.

    Returns:
        "System:<instruction>", then one "\\nUser: ...\\nAssistant: ..." segment
        per past turn, then the new message and an open "Assistant:" cue.
    """
    segments = [f"System:{instruction}"]
    for asked, answered in chat_history:
        segments.append(f"\nUser: {asked}\nAssistant: {answered}")
    segments.append(f"\nUser: {message}\nAssistant:")
    return "".join(segments)

def respond(message, chat_history, instruction, temperature=0.7):
    """Stream the assistant's reply to `message`, one token at a time.

    Args:
        message: The new user message.
        chat_history: Previous turns as [user_message, bot_message] pairs.
            This list is never modified.
        instruction: System message placed at the start of the prompt.
        temperature: Sampling temperature forwarded to the client.

    Yields:
        ("", history) after every streamed token. The empty string clears the
        prompt textbox; `history` is a new list made of the previous turns plus
        a final [message, reply_so_far] pair, so lists yielded earlier are
        never altered afterwards.
    """
    prompt = format_chat_prompt(message, chat_history, instruction)
    # The stop sequences keep the model from writing the user's next turn itself.
    stream = client.generate_stream(prompt,
                                    max_new_tokens=1024,
                                    stop_sequences=["\nUser:", "<|endoftext|>"],
                                    temperature=temperature)
    previous_turns = list(chat_history)
    reply_so_far = ""
    for idx, response in enumerate(stream):
        # A response carrying `details` ends the reply; its token is not shown.
        # NOTE(review): presumably only the final stream message has details — confirm.
        if response.details:
            return
        text_token = response.token.text
        # Drop the leading space of the very first token so the reply does not
        # start with a blank.
        if idx == 0 and text_token.startswith(" "):
            text_token = text_token[1:]
        reply_so_far += text_token
        yield "", previous_turns + [[message, reply_so_far]]

# Chat UI with an "Advanced options" panel for the system message and temperature.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(height=240)  # kept short so it fits in the notebook
    msg = gr.Textbox(label="Prompt")
    # Collapsed by default; holds the settings that most users never need to touch.
    with gr.Accordion(label="Advanced options", open=False):
        # Editable system message, pre-filled with a generic assistant description.
        system = gr.Textbox(label="System message", lines=2, value="A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.")
        # Sampling temperature handed to `respond`.
        temperature = gr.Slider(label="temperature", minimum=0.1, maximum=1, value=0.7, step=0.1)
    btn = gr.Button("Submit")
    # Resets both the prompt textbox and the conversation view.
    clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")

    # The temperature slider is listed among the inputs so that moving it
    # actually changes generation; without it `respond` always uses its 0.7 default.
    btn.click(respond, inputs=[msg, chatbot, system, temperature], outputs=[msg, chatbot])
    msg.submit(respond, inputs=[msg, chatbot, system, temperature], outputs=[msg, chatbot])  # press Enter to submit
# Stop servers started by earlier cells before launching this one.
gr.close_all()
# `respond` is a generator, so queueing is enabled before launching.
# NOTE(review): Gradio 3.x needs queue() for streaming handlers — confirm for the installed version.
demo.queue().launch(share=True)

In [None]:
# Shut down every Gradio server started by this notebook.
gr.close_all()

Closing server running on port: 7860
Closing server running on port: 7860
