In [None]:
"""
README

LLM Rephraser
A single-page application (SPA) that uses a Hugging Face-hosted Large Language Model (LLM) to rephrase user input into four distinct writing styles: Professional, Casual, Polite, and Social-media. Built with Gradio for the frontend and Python for the backend, this app runs entirely in Google Colab and supports streaming output and cancellation.

Features
* Rephrases input into multiple writing styles using an LLM
* Real-time streaming output (word-by-word)
* Cancel button to interrupt processing
* Unit and integration tests included
* Docker containerization for backend
* Clean, enterprise-grade UI using Gradio Blocks

Tech Stack
Layer    | Technology
Frontend | HTML + JavaScript via Gradio
Backend  | Python (no Flask)
LLM      | Hugging Face mistralai/Mistral-7B-Instruct-v0.2
Platform | Google Colab

Setup Instructions (Google Colab)
1. Enable GPU: Go to Runtime → Change runtime type → GPU
2. Install dependencies:
!pip install -q transformers accelerate gradio huggingface_hub
3. Authenticate with Hugging Face:
from huggingface_hub import login
login("your_huggingface_token_here")
4. Run the notebook cells to launch the Gradio app.

Testing
Run the following test functions in Colab:

test_prompt_builder()
test_model_response()
test_end_to_end()

All tests validate prompt formatting, model output, and full pipeline integration.

Assumptions
* You have access to the gated Hugging Face model mistralai/Mistral-7B-Instruct-v0.2
* You are running this in a GPU-enabled Colab environment
* You have a valid Hugging Face token with read access

"""  


'\nREADME\n\nLLM Rephraser\nA single-page application (SPA) that uses a Hugging Face-hosted Large Language Model (LLM) to rephrase user input into four distinct writing styles: Professional, Casual, Polite, and Social-media. Built with Gradio for the frontend and Python for the backend, this app runs entirely in Google Colab and supports streaming output and cancellation.\n\nFeatures\n*  Rephrases input into multiple writing styles using an LLM\n*  Real-time streaming output (word-by-word)\n* Cancel button to interrupt processing\n*  Unit and integration tests included\n* Docker containerization for backend\n* Clean, enterprise-grade UI using Gradio Blocks\n\nTech Stack\nLayer\tTechnology\nFrontend\tHTML + JavaScript via Gradio\nBackend\tPython (no Flask)\nLLM\tHugging Face mistralai/Mistral-7B-Instruct-v0.2\nPlatform\tGoogle Colab\nSetup Instructions (Google Colab)\n1. Enable GPU: Go to Runtime → Change runtime type → GPU\n2. Install dependencies: \n!pip install -q transformers accele

In [2]:
!pip install -q transformers accelerate gradio

In [3]:
import os
from getpass import getpass

from huggingface_hub import login

# Access tokens must never be stored in the notebook source: anyone who can read the
# file (or its history) can use them. Any token that has ever been committed here
# should be treated as compromised and revoked in the Hugging Face account settings.
# Read the token from the HF_TOKEN environment variable, falling back to an
# interactive prompt that does not echo the value.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
import torch

# Load Mistral-7B-Instruct from Hugging Face
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Notes on the arguments:
# - `device_map="auto"` lets accelerate place the weights on the available device(s).
# - `dtype` replaces `torch_dtype`, which this cell's recorded output reports as
#   deprecated; half precision keeps the checkpoint within a single Colab GPU.
# - `trust_remote_code` is intentionally NOT enabled: the Mistral architecture ships
#   with transformers, and the flag would allow Python code from the model repository
#   to execute in this kernel.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    dtype=torch.float16,
)



tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]



In [5]:
def build_prompt(user_input):
    """Build the instruction prompt asking for four rephrasings of `user_input`.

    The sentence is embedded in double quotes after ``Input:``, followed by a
    ``Styles:`` section listing the four empty labels the model is expected to fill
    in (Professional, Casual, Polite, Social-media). The returned string has no
    trailing newline.
    """
    prompt_lines = [
        "Rephrase the following sentence into four distinct writing styles:",
        "",
        f'Input: "{user_input}"',
        "",
        "Styles:",
        "Professional:",
        "Casual:",
        "Polite:",
        "Social-media:",
    ]
    return "\n".join(prompt_lines)


In [6]:
def generate_styles_stream(user_input):
    """Run generation for `user_input`, routing decoded text through a `TextStreamer`.

    The prompt comes from `build_prompt`, is tokenized onto the model's device, and
    is passed to `model.generate` with at most 200 new tokens. The streamer is
    configured to skip the prompt and special tokens. Nothing is returned; the
    generated ids are discarded.
    """
    encoded = tokenizer(build_prompt(user_input), return_tensors="pt").to(model.device)
    model.generate(
        **encoded,
        streamer=TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True),
        max_new_tokens=200,
    )


In [7]:
import gradio as gr

# Style labels in the order the UI displays them; also the line prefixes parsed
# out of the model's text.
_STYLE_LABELS = ("Professional", "Casual", "Polite", "Social-media")


def _parse_styles(decoded):
    """Return one string per style label, taken from "<Label>: text" lines in `decoded`."""
    found = {label: "" for label in _STYLE_LABELS}
    for raw_line in decoded.split("\n"):
        # Tolerate indentation in front of a label.
        line = raw_line.strip()
        for label in _STYLE_LABELS:
            prefix = f"{label}:"
            if line.startswith(prefix):
                value = line[len(prefix):].strip()
                # The decoded text starts with the prompt, whose labels are empty, and
                # may end with a truncated repeat of the template; an empty match must
                # never erase text that was already captured.
                if value:
                    found[label] = value
    return [found[label] for label in _STYLE_LABELS]


def rephrase(text, flag):
    """Rephrase `text` into the four styles and return them as a list of four strings.

    Defined at module level (not inside the `gr.Blocks` context) because the
    end-to-end test calls it directly.

    Args:
        text: Sentence entered by the user.
        flag: When truthy, generation is skipped and every slot is "Cancelled".

    Returns:
        ``[professional, casual, polite, social_media]``; a slot is an empty string
        when no matching non-empty line was found. Blank input yields four empty
        strings without invoking the model.
    """
    if flag:
        return ["Cancelled"] * 4
    if not text or not text.strip():
        return [""] * 4
    prompt = build_prompt(text)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(
        **inputs,
        max_new_tokens=200,
        # Same value generate() falls back to; passing it explicitly avoids the
        # "Setting `pad_token_id`" warning on every request.
        pad_token_id=tokenizer.eos_token_id,
    )
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    return _parse_styles(decoded)


with gr.Blocks(title="LLM Rephraser") as demo:
    gr.Markdown("## ✨ Rephrase Your Text into Different Styles")
    input_text = gr.Textbox(label="Enter your sentence", placeholder="Type something...", lines=2)
    process_btn = gr.Button("🚀 Process")
    cancel_btn = gr.Button("❌ Cancel")
    professional = gr.Textbox(label="Professional", lines=2)
    casual = gr.Textbox(label="Casual", lines=2)
    polite = gr.Textbox(label="Polite", lines=2)
    social = gr.Textbox(label="Social-media", lines=2)

    # Kept so `rephrase` still receives its second argument. It is never set to True
    # by the UI: latching it would make every later "Process" click return "Cancelled".
    stop_flag = gr.State(False)

    process_event = process_btn.click(
        fn=rephrase,
        inputs=[input_text, stop_flag],
        outputs=[professional, casual, polite, social],
    )
    # Cancel the pending/queued Process event through Gradio's own mechanism.
    # NOTE(review): per the Gradio docs, a non-generator function that is already
    # executing is allowed to finish; interrupting mid-generation would require
    # turning `rephrase` into a streaming generator.
    cancel_btn.click(fn=None, inputs=None, outputs=None, cancels=[process_event])


In [8]:
# Start the Gradio server for `demo`. The recorded output of this cell shows Gradio
# detecting a hosted notebook and turning on `share=True` by itself, which publishes
# the app on a temporary public URL.
demo.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2fa4b7d4ccf7ccddb4.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [9]:
def test_prompt_builder():
    """Check that the prompt from `build_prompt` contains all four style labels."""
    sample = "Hey guys, let's huddle about AI."
    built = build_prompt(sample)
    required_labels = ("Professional:", "Casual:", "Polite:", "Social-media:")
    assert all(label in built for label in required_labels)
    print("✅ Prompt builder test passed.")


In [10]:
def test_model_response():
    """Check that the model generates text beyond the prompt.

    Only the tokens after the prompt are decoded. The prompt itself already
    contains "Professional:", so asserting on the full decoded sequence would pass
    even if the model produced nothing.
    """
    input_text = "Hey guys, let's huddle about AI."
    prompt = build_prompt(input_text)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=100)
    # For this causal LM the returned sequence begins with the prompt ids; slice
    # them off so the assertion sees only the newly generated text.
    prompt_length = inputs["input_ids"].shape[1]
    continuation = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
    assert continuation.strip(), "model produced no text beyond the prompt"
    print("✅ Model response test passed.")


In [11]:
def test_end_to_end():
    """Call `rephrase` with the cancel flag off and require a non-empty string per item."""
    outputs = rephrase("Hey guys, let's huddle about AI.", False)
    for item in outputs:
        assert isinstance(item, str) and len(item) > 0
    print("✅ End-to-end test passed.")


In [12]:
# Run the checks in order: prompt formatting, raw model output, then the full pipeline.
for run_check in (test_prompt_builder, test_model_response, test_end_to_end):
    run_check()


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


✅ Prompt builder test passed.


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


✅ Model response test passed.
✅ End-to-end test passed.
