# Importing Libraries

In [8]:
!pip install -q transformers accelerate bitsandbytes gradio torch

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
import gradio as gr


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# Model Tokenizer & pipeline

In [9]:
MODEL_ID = "Qwen/Qwen3-4B-Instruct-2507"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.float16,
    load_in_4bit=True
)

chatbot = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


# chat_response

In [10]:
def ask_bot(user_input, max_tokens=300):
    
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant. Answer with short, useful replies in the user's language."},
        {"role": "user", "content": user_input}
    ]

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    outputs = chatbot(
        prompt,
        max_new_tokens=max_tokens,
        temperature=0.6,
        top_p=0.85,
        do_sample=True
    )

    reply = outputs[0]["generated_text"][len(prompt):].strip()
    return reply

# Gradio Interface

In [11]:
with gr.Blocks(title=" Chatbot (Zaki) ") as demo:
    with gr.Row():
        user_input = gr.Textbox(label=" Write your question ")
        output = gr.Textbox(label=" answer ")
    submit_btn = gr.Button("send")
    submit_btn.click(fn=ask_bot, inputs=user_input, outputs=output)

demo.launch(share=True)

* Running on local URL:  http://127.0.0.1:7861
* Running on public URL: https://b58dfad5dc62123079.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


