In [None]:
!pip -q install git+https://github.com/huggingface/transformers # need to install from github
#!pip install -q datasets loralib
!pip -q install bitsandbytes
#!pip -q install langchain
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install huggingface_hub
!pip install --update gradio
#!pip install torch

In [None]:
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub from inside the notebook. The model-loading
# cells further down request authenticated downloads, so this presumably has
# to run first on a fresh runtime.
notebook_login()

## 1. Approach 1

In [None]:
import torch
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# Load your base model
base_model_id = "mistralai/Mistral-7B-v0.1"
# Quantization settings: 4-bit NF4 weights with double quantization, bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    # `use_auth_token` is deprecated in transformers in favour of `token`;
    # `token=True` keeps the same meaning (use the locally stored Hub credential).
    token=True
)

# Load the tokenizer
eval_tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_bos_token=True, trust_remote_code=True)

# Load the fine-tuned model: attach the adapter weights on top of the quantized base model
ft_model = PeftModel.from_pretrained(base_model, "Phanh2532/GAML_Mistral7B")
# Inference only: switch off training-time behaviour such as dropout.
ft_model.eval()

In [None]:
import torch
import gradio as gr
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


#ft_model.save_pretrained('/content/mistral-allblocksft/')
#ft_model.eval()
conversation_history = []

def chat(input_text):
    """Generate a completion for ``input_text`` with the fine-tuned model.

    Args:
        input_text: Prompt typed by the user in the Gradio textbox.

    Returns:
        The decoded sequence returned by ``ft_model.generate`` (special tokens
        removed, surrounding whitespace stripped). The whole sequence is
        decoded, so the text starts with the prompt itself.
    """
    # Keep the tokenized prompt under one name: the original stored it in
    # `inputs` but passed the undefined `model_input` to generate(), which
    # raises NameError on a fresh kernel.
    inputs = eval_tokenizer(input_text, return_tensors='pt').to(ft_model.device)

    ft_model.eval()
    with torch.no_grad():
        generated = ft_model.generate(**inputs, max_new_tokens=700, repetition_penalty=1.15)

    return eval_tokenizer.decode(generated[0], skip_special_tokens=True).strip()
'''
# Define the Gradio process function
def generate_response(prompt):
    model_input = eval_tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generate a response using the fine-tuned model
    with torch.no_grad():
        response = ft_model.generate(**model_input, max_new_tokens=700, repetition_penalty=1.15)[0]

    # Decode the response
    decoded_response = eval_tokenizer.decode(response, skip_special_tokens=True)

    return f"Response: {decoded_response}\n"
'''
# Create the Gradio interface
# Components are taken from the top-level `gr` namespace: the legacy
# `gr.inputs` / `gr.outputs` modules are not available in Gradio 4.x, which is
# the version the install cell upgrades to.
demo = gr.Interface(
    fn=chat,
    inputs=gr.Textbox(placeholder="Start a question..."),
    outputs=gr.Textbox(),
    live=False,
    title="Mistral GAML",
    description="Have a question with the chatbot.",
)

In [None]:
# Start the Gradio app defined above; share=True asks Gradio for a shareable link.
demo.launch(share=True)

## 2. Approach 2

In [None]:
import functools

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Heading passed as `title=` to the gr.Interface built at the end of this cell.
title = "Mistral7B for code generation"

@functools.lru_cache(maxsize=1)
def load_model():
    """Load the base Mistral-7B model and its tokenizer.

    The result is memoized: callers invoke this once per request, and without
    the cache every request would reload the full checkpoint from disk.

    Returns:
        A ``(model, tokenizer)`` tuple; the same objects are returned on every
        call after the first.
    """
    # Load your base model
    base_model_id = "mistralai/Mistral-7B-v0.1"
    base_model = AutoModelForCausalLM.from_pretrained(base_model_id)

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(base_model_id)

    return base_model, tokenizer

def generate_code(inp_chat):
    """Generate a sampled completion for the user's prompt.

    Args:
        inp_chat: Prompt text entered in the Gradio text input.

    Returns:
        The decoded sequence (prompt plus up to 50 newly generated tokens)
        with special tokens removed.
    """
    model, tokenizer = load_model()

    # Tokenize the input. The original referenced an undefined `prompt` here;
    # the function's argument is `inp_chat`. Tensors follow the model's own
    # device because load_model() never moves the model to "cuda".
    model_input = tokenizer(inp_chat, return_tensors="pt").to(model.device)

    # Generate response. Pass the tokenizer output itself (input_ids plus
    # attention_mask) instead of the undefined `input_ids`. `max_new_tokens`
    # bounds only the continuation, so prompts of 50+ tokens still work;
    # `max_length=50` counted the prompt tokens as well.
    with torch.no_grad():
        response = model.generate(
            **model_input,
            max_new_tokens=50,
            num_return_sequences=1,
            do_sample=True,
        )[0]

    # Decode the response
    decoded_response = tokenizer.decode(response, skip_special_tokens=True)

    return decoded_response

# Example prompts offered under the input box.
examples = [
    'Write me a GAML code snippet to simulate water pollution'
]

# `enable_queue` is not accepted by gr.Interface in Gradio 4.x (TypeError);
# queuing is already switched on by the `.queue()` call below.
gr.Interface(
    fn=generate_code,
    inputs="text",
    outputs="text",
    title=title,
    examples=examples
).queue().launch()


## 3. Approach 3

In [4]:
# Upgrade gradio in this runtime before building the Blocks UI in the next cell.
!pip install --upgrade gradio

Collecting gradio
  Downloading gradio-4.16.0-py3-none-any.whl (16.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.7/16.7 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.109.0-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.1.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==0.8.1 (from gradio)
  Downloading gradio_client-0.8.1-py3-none-any.whl (305 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m305.2/305.2 kB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx (from gradio)
  Downloading httpx-0.26.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
import gradio as gr
import os
import time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Load Mistral 7B model and tokenizer
model_id = "Phanh2532/GAML-151-500"
# Quantization settings: 4-bit NF4 weights with double quantization, bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    # `use_auth_token` is deprecated in transformers in favour of `token`;
    # `token=True` keeps the same meaning (use the locally stored Hub credential).
    token=True
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, add_bos_token=True, trust_remote_code=True)

def greet(name):
    """Return the welcome message shown for the given visitor name."""
    greeting_template = "Hello {}! I'm Gemini Chatbot. \n A chat bot, who will help you answer your question."
    return greeting_template.format(name)

def vote(data: gr.LikeData):
    """Print whether a chatbot response was upvoted or downvoted.

    Args:
        data: Like/dislike event payload; ``data.liked`` selects the wording
            and ``data.value`` is the content that was rated.
    """
    verdict = "upvoted" if data.liked else "downvoted"
    # Format instead of concatenating: `"..." + data.value` raises TypeError
    # whenever the rated value is not a plain string (e.g. a file message).
    print(f"You {verdict} this response: {data.value}")

def generate_response(history):
    """Generate the model's reply for the current chat history.

    Args:
        history: Sequence of ``[user_message, bot_message]`` pairs. Only the
            user side of each pair is used to build the prompt.

    Returns:
        The newly generated text (the prompt is not repeated). If the text
        contains ``{`` it is wrapped in a Markdown code fence.
    """
    # Concatenate chat history. Entries created by add_file hold a tuple
    # (the file path) instead of a string, and concatenating those with "\n"
    # raises TypeError, so only plain-text user messages go into the prompt.
    user_messages = [entry[0] for entry in history if isinstance(entry[0], str)]
    chat_history = "".join(message + "\n" for message in user_messages)

    # Tokenize input and place it on the device the model was loaded on.
    model_input = tokenizer(chat_history, return_tensors="pt").to(model.device)

    # Generate response
    output = model.generate(**model_input, max_new_tokens=700, repetition_penalty=1.15)

    # generate() returns prompt + continuation for a causal LM; decode only
    # the continuation so the bot bubble does not echo every user message.
    prompt_length = model_input["input_ids"].shape[1]
    response = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    # Check if the response contains code
    if "{" in response:
        response = f"```\n{response}\n```"  # Enclose response in triple backticks for Markdown code block

    return response

def bot(history):
    """Stream the generated reply into the last chat turn, one character at a time.

    Yields the (mutated) ``history`` after each appended character, pausing
    0.05 s between characters.
    """
    reply = generate_response(history)
    # Start the bot side of the latest turn empty, then grow it step by step.
    history[-1][1] = ""
    for next_char in reply:
        partial_reply = history[-1][1] + next_char
        history[-1][1] = partial_reply
        time.sleep(0.05)
        yield history

def print_like_dislike(x):
    """Print the index, value and liked flag of a like/dislike event."""
    index, value, liked = x.index, x.value, x.liked
    print(index, value, liked)

def add_text(history, text):
    """Append the submitted text as a new, unanswered chat turn.

    Returns:
        A tuple of the extended history (a new list; the input list is not
        modified) and a Textbox with an empty value and ``interactive=False``.
    """
    pending_turn = (text, None)
    extended_history = history + [pending_turn]
    locked_textbox = gr.Textbox(value="", interactive=False)
    return extended_history, locked_textbox

def add_file(history, file):
    """Append an uploaded file as a new, unanswered chat turn.

    The user side of the turn is a one-element tuple holding ``file.name``.
    Returns a new list; the input history is not modified.
    """
    upload_turn = ((file.name,), None)
    return history + [upload_turn]

avatar_image_url = "https://i.imgur.com/DxhdL3t.jpg"  # Modify this path accordingly

# Build the chat UI: a header, a name/greeting pair of textboxes, the chatbot
# panel, and a row with the message box and an upload button.
with gr.Blocks() as demo:
    # Page header rendered from raw HTML inside a Markdown component.
    gr.Markdown(
        """
        <div style='display: flex; justify-content: center; align-items: center; font-family: SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;'>
            <div style='margin-right: 10px;'>
                <img src='https://i.imgur.com/DuzoviE.png' alt='Your Image' width='30'>
            </div>
            <div style='text-align: center;'>
                <h1>GAMA PLATFORM CHATBOT</h1>
            </div>

        </div>
        """
    )

    # Greeting widgets: every change of the name box re-runs greet() and
    # writes its result into the second textbox.
    greet_input = gr.Textbox(
        placeholder="What is your name?")
    greet_output = gr.Textbox()
    greet_input.change(greet, greet_input, greet_output)

    # Chat panel, initially empty; only the bot side gets an avatar image.
    chatbot = gr.Chatbot(
        [],
        elem_id="chatbot",
        bubble_full_width=False,
        avatar_images=(None, avatar_image_url),
    )

    with gr.Row():
        txt = gr.Textbox(
            scale=30,
            show_label=False,
            placeholder="Enter text and press enter, or upload an text file",
            container=False,
        )
        # The string literal below is disabled code (an extra prompt textbox);
        # it is evaluated as a bare expression and creates no component.
        '''
          experiment_prompt_input = gr.Textbox(
              label="Experiment Prompt",
              placeholder="Enter your experiment prompt here",
              container=False
          )
        '''
        btn = gr.UploadButton("📁", file_types=["text"])

    # Text submit chain: add_text appends the message and returns a
    # non-interactive textbox, then bot streams the reply into the chatbot.
    txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
        bot, chatbot, chatbot, api_name="bot_response"
    )
    # Once the reply has finished, make the textbox interactive again.
    txt_msg.then(lambda: gr.Textbox(interactive=True), None, [txt], queue=False)
    # Upload chain: add_file appends the file entry, then bot runs on the updated history.
    file_msg = btn.upload(add_file, [chatbot, btn], [chatbot], queue=False).then(
        bot, chatbot, chatbot
    )

    # Log like/dislike clicks on chatbot messages.
    chatbot.like(print_like_dislike, None, None)

# Enable the request queue before launching (bot is a generator that yields
# partial replies), then start the app with a share link.
demo.queue()
demo.launch(share=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

TypeError: Chatbot.__init__() got an unexpected keyword argument 'placeholder'