# 🧠 Qwen3-1.7B — Chat with Real-Time Thinking

Run all cells to launch a web UI. A **public link** will appear at the bottom — click it to start chatting!

> **Requirements**: Google Colab with **T4 GPU** runtime (free tier).

In [None]:
# Cell 1 — Install Dependencies
# Quietly (-q) installs the packages the notebook relies on. torch and
# transformers are imported in Cell 2 and gradio in Cell 4. accelerate and
# bitsandbytes are never imported directly here; presumably transformers needs
# them for device_map="auto" and BitsAndBytesConfig — TODO confirm.
!pip install -q transformers accelerate gradio torch bitsandbytes

In [None]:
# Cell 2 — Load Model & Tokenizer
import torch
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig

# Clear any leftover GPU memory from previous runs.
# When this cell is re-run, the old `model`/`tokenizer` globals still reference
# the previously loaded objects, so gc.collect()/empty_cache() alone cannot
# release them. Unbind the names first (a no-op on the first run).
for _stale_name in ("model", "tokenizer"):
    globals().pop(_stale_name, None)
gc.collect()
torch.cuda.empty_cache()

MODEL_ID = "Qwen/Qwen3-1.7B"

# 4-bit quantization to save GPU memory, with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

print(f"⏳ Loading tokenizer for {MODEL_ID}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

print(f"⏳ Loading model {MODEL_ID} with 4-bit quantization...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
)

# memory_allocated() is reported in bytes; 1024**3 converts it to GiB.
print(f"✅ Model loaded! GPU memory used: {torch.cuda.memory_allocated()/1024**3:.1f} GB")

In [None]:
# Cell 3 — Chat Logic with Thinking Parser
import threading
import re

def split_thinking(text):
    """
    Split raw model output into its reasoning and answer parts.

    The reasoning is whatever sits between the first ``<think>`` and the first
    ``</think>``; the answer is everything after that first ``</think>``.
    Both tags are matched on their first occurrence so that an answer which
    itself mentions ``</think>`` is not truncated.

    Args:
        text: The text generated so far (may be incomplete while streaming).

    Returns:
        A ``(thinking, answer)`` tuple of whitespace-stripped strings:
        * closed block       -> (reasoning, text after ``</think>``)
        * open block only    -> (partial reasoning, "")
        * no tags at all     -> ("", text)
        A ``</think>`` without an opening tag is treated as closing a block
        that started at the beginning of the text.
    """
    open_tag, close_tag = "<think>", "</think>"

    head, closed, tail = text.partition(close_tag)
    if closed:
        # Drop anything up to and including the opening tag, if there is one.
        _, opened, inner = head.partition(open_tag)
        reasoning = inner if opened else head
        return reasoning.strip(), tail.strip()

    _, opened, partial = text.partition(open_tag)
    if opened:
        # Still inside the thinking block: no answer yet.
        return partial.strip(), ""

    # No thinking markup seen (yet): show everything as the answer.
    return "", text.strip()


def respond(message, history):
    """
    Generator that streams the model's response.

    Builds the conversation from ``history`` plus ``message``, runs
    ``model.generate`` in a background thread, and parses the
    <think>...</think> markup of the text produced so far.

    Args:
        message: The new user message.
        history: Iterable of ``(user_msg, bot_msg)`` pairs from earlier turns;
            pairs with an empty ``bot_msg`` contribute only the user turn.

    Yields:
        ``(thinking_text, answer_text)`` tuples, re-computed from the full
        response each time new text arrives.

    Raises:
        RuntimeError: If generation fails in the background thread (the
            original exception is attached as ``__cause__``).
    """
    # Build conversation messages
    messages = []
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    # Apply chat template with thinking enabled
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=True,
    )
    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Set up streamer
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    # Generation config
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=8192,
        streamer=streamer,
        temperature=0.6,
        top_p=0.95,
        top_k=20,
        do_sample=True,
    )

    errors = []

    def _generate():
        # If generate() raises, it never signals end-of-stream, which would
        # leave the consumer loop below blocked forever. Record the error and
        # close the stream ourselves so the caller can see the failure.
        try:
            model.generate(**gen_kwargs)
        except Exception as exc:  # boundary of the worker thread
            errors.append(exc)
            streamer.end()

    # Start generation in a separate thread
    thread = threading.Thread(target=_generate)
    thread.start()

    # Stream text and parse thinking vs answer
    full_response = ""
    for new_text in streamer:
        if not new_text:
            # Nothing new to show; avoid a redundant UI update.
            continue
        full_response += new_text
        yield split_thinking(full_response)

    thread.join()

    if errors:
        raise RuntimeError("Text generation failed") from errors[0]

# Status message confirming that the cell defining respond() has run.
print("✅ Chat logic ready.")

In [None]:
# Cell 4 — Gradio Web UI
import gradio as gr

# Stylesheet passed to gr.Blocks(css=...). Each rule targets a class name that
# the UI components below attach through elem_classes (title-text is set on the
# <h1> in the gr.HTML header): thinking-box, answer-box, chat-history,
# input-box, send-btn and clear-btn.
CUSTOM_CSS = """
/* ‚îÄ‚îÄ Global ‚îÄ‚îÄ */
.gradio-container {
    max-width: 900px !important;
    margin: 0 auto !important;
    font-family: 'Inter', 'Segoe UI', sans-serif !important;
}

/* ‚îÄ‚îÄ Title ‚îÄ‚îÄ */
.title-text {
    text-align: center;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 2rem;
    font-weight: 800;
    margin-bottom: 0;
}

/* ‚îÄ‚îÄ Thinking Panel ‚îÄ‚îÄ */
.thinking-box {
    background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%) !important;
    border: 1px solid #334155 !important;
    border-radius: 12px !important;
    padding: 16px !important;
    font-size: 0.9rem !important;
    line-height: 1.6 !important;
    color: #94a3b8 !important;
    max-height: 350px !important;
    overflow-y: auto !important;
}

/* ‚îÄ‚îÄ Answer Panel ‚îÄ‚îÄ */
.answer-box {
    background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%) !important;
    border: 1px solid #475569 !important;
    border-radius: 12px !important;
    padding: 16px !important;
    font-size: 1rem !important;
    line-height: 1.7 !important;
    color: #e2e8f0 !important;
    min-height: 80px !important;
}

/* ‚îÄ‚îÄ Chat History ‚îÄ‚îÄ */
.chat-history {
    border: 1px solid #334155 !important;
    border-radius: 12px !important;
    max-height: 400px !important;
    overflow-y: auto !important;
}

/* ‚îÄ‚îÄ Input Box ‚îÄ‚îÄ */
.input-box textarea {
    border-radius: 12px !important;
    border: 1px solid #475569 !important;
    font-size: 1rem !important;
}

/* ‚îÄ‚îÄ Buttons ‚îÄ‚îÄ */
.send-btn {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    border: none !important;
    border-radius: 12px !important;
    color: white !important;
    font-weight: 600 !important;
    min-height: 45px !important;
}
.send-btn:hover {
    opacity: 0.9 !important;
    transform: translateY(-1px);
}
.clear-btn {
    border-radius: 12px !important;
    min-height: 45px !important;
}
"""

def user_submit(message, history):
    """
    Queue the user's message as a new, not-yet-answered chat turn.

    Returns an empty string (which clears the input box) together with a new
    history list; the list passed in is left unmodified.
    """
    pending_turn = [message, None]  # None marks "bot has not replied yet"
    return "", history + [pending_turn]

def bot_respond(history):
    """
    Generator: streams thinking + answer and updates Gradio components live.

    Args:
        history: Chat history as a list of ``[user_msg, bot_msg]`` pairs whose
            last entry is the turn to answer. Its bot slot is overwritten in
            place as the response streams in.

    Yields:
        ``(history, thinking_markdown, answer)`` tuples, one per streamed
        update plus a final one. While no answer text exists yet, the chat
        shows a "Thinking..." placeholder; if generation ends without an
        answer, the reasoning (or a "no response" notice) is shown instead.
    """
    if not history:
        yield history, "", ""
        return

    def _format_reasoning(text):
        # Heading + reasoning for the thinking panel; empty while there is none.
        return f"💭 **Model's Reasoning:**\n\n{text}" if text else ""

    user_message = history[-1][0]
    # Build clean history (exclude latest); a missing bot reply becomes "".
    clean_history = [(turn[0], turn[1] or "") for turn in history[:-1]]

    thinking = ""
    answer = ""

    for thinking, answer in respond(user_message, clean_history):
        # Update bot message in history with the answer
        history[-1][1] = answer if answer else "⏳ Thinking..."
        yield history, _format_reasoning(thinking), answer

    # Final update: never leave the "Thinking..." placeholder in the chat.
    if answer:
        history[-1][1] = answer
    elif thinking:
        history[-1][1] = thinking
    else:
        history[-1][1] = "(No response generated)"

    yield history, _format_reasoning(thinking), answer


# Build the two-column chat UI: conversation + input on the left, live
# "thinking" and "answer" panels on the right, then launch it with a public
# share link.
with gr.Blocks(
    theme=gr.themes.Soft(primary_hue="purple", neutral_hue="slate"),
    css=CUSTOM_CSS,
    title="Qwen3 Chat",
) as demo:

    # Placeholder texts shared by the initial state and the Clear button so
    # that clearing restores exactly what the panels showed at start-up.
    THINKING_PLACEHOLDER = "*Thinking will appear here in real-time as the model reasons...*"
    ANSWER_PLACEHOLDER = "*The model's answer will appear here...*"

    gr.HTML("<h1 class='title-text'>🧠 Qwen3-1.7B Chat</h1>")
    gr.HTML("<p style='text-align:center;color:#94a3b8;margin-top:-8px;'>Chat with real-time thinking · Powered by Qwen3</p>")

    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(
                label="💬 Conversation",
                height=420,
                elem_classes=["chat-history"],
                show_copy_button=True,
            )
            with gr.Row():
                msg_input = gr.Textbox(
                    placeholder="Type your message here... (Enter to send)",
                    show_label=False,
                    scale=5,
                    elem_classes=["input-box"],
                    lines=1,
                )
                send_btn = gr.Button("Send ▶", scale=1, elem_classes=["send-btn"])
                clear_btn = gr.Button("🗑️ Clear", scale=1, elem_classes=["clear-btn"])

        with gr.Column(scale=2):
            with gr.Accordion("🔍 Thinking Process", open=True):
                thinking_display = gr.Markdown(
                    value=THINKING_PLACEHOLDER,
                    elem_classes=["thinking-box"],
                )
            with gr.Accordion("📝 Final Answer", open=True):
                answer_display = gr.Markdown(
                    value=ANSWER_PLACEHOLDER,
                    elem_classes=["answer-box"],
                )

    # ── Event Handlers ──
    # Pressing Enter and clicking Send run the same two-step chain:
    # 1) user_submit appends the message and clears the input box,
    # 2) bot_respond streams the reply into the chat and side panels.
    for submit_event in (msg_input.submit, send_btn.click):
        submit_event(
            fn=user_submit,
            inputs=[msg_input, chatbot],
            outputs=[msg_input, chatbot],
        ).then(
            fn=bot_respond,
            inputs=[chatbot],
            outputs=[chatbot, thinking_display, answer_display],
        )

    # Clear button: empty the conversation and restore both placeholders.
    clear_btn.click(
        fn=lambda: ([], THINKING_PLACEHOLDER, ANSWER_PLACEHOLDER),
        outputs=[chatbot, thinking_display, answer_display],
    )

print("🚀 Launching Gradio UI...")
demo.launch(share=True, debug=False)

---
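## ⛔ Stop the Chatbot
Run the cell below to **stop** the chatbot server. The model itself stays loaded on the GPU, so you can restart the UI by re-running Cell 4; to free GPU memory, restart the runtime.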

In [None]:
# Cell 5 — Stop the chatbot server
# Closes the Gradio app created in Cell 4 (`demo`); Cell 4 must have been run.
demo.close()
print("🛑 Chatbot server stopped.")
print("💡 To restart, just re-run Cell 4 above.")