In [None]:
!pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
!pip install pillow

Looking in indexes: https://pypi.org/simple, https://wheels.vllm.ai/nightly
Collecting vllm
  Downloading https://wheels.vllm.ai/7b926e890195f026b60f36506503a80afc583b33/vllm-0.14.0rc1.dev58%2Bg7b926e890-cp38-abi3-manylinux_2_31_x86_64.whl (487.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.3/487.3 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting blake3 (from vllm)
  Downloading blake3-1.0.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)
  Downloading prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl.metadata (13 kB)
Collecting lm-format-enforcer==0.11.3 (from vllm)
  Downloading lm_format_enforcer-0.11.3-py3-none-any.whl.metadata (17 kB)
Collecting llguidance<1.4.0,>=1.3.0 (from vllm)
  Downloading llguidance-1.3.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting outlines_core==0.2.11 (from vllm)
  Downloading outlin



In [None]:
from vllm import LLM, SamplingParams
# Note: the DeepSeek-OCR import path may change between vLLM nightly builds;
# the import below assumes a build that ships `vllm.model_executor.models.deepseek_ocr`.
from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor
from PIL import Image
import requests
from io import BytesIO



In [None]:
import os

# Path of the image to run OCR on
image_path = "/content/Car.jpg"

# Fail fast when the file is missing. Only printing a message would let the
# following cells run and break later with a NameError (or silently reuse
# images left over from an earlier run of this cell).
if not os.path.exists(image_path):
    raise FileNotFoundError(
        f"Error: '{image_path}' file nahi mili. Please Colab ke left side me folder icon pe click karke image upload karein."
    )

# Decode the file once inside a context manager so the file handle is released
# as soon as the RGB copy has been made.
with Image.open(image_path) as source_image:
    image_1 = source_image.convert("RGB")

# Second, independent copy of the same picture, used as an extra batch item for testing
image_2 = image_1.copy()
print("Image successfully load ho gayi!")

Image successfully load ho gayi!


In [None]:
# 2. Model initialization
# A Colab T4 GPU (16 GB VRAM) leaves little headroom, so we have to be a bit careful.
# If an out-of-memory (OOM) error occurs, lower `gpu_memory_utilization` below the
# 0.9 used here (e.g. 0.8) or limit `max_model_len` further.
llm = LLM(
    model="deepseek-ai/DeepSeek-OCR",
    enable_prefix_caching=False,
    mm_processor_cache_gb=0,
    logits_processors=[NGramPerReqLogitsProcessor],
    trust_remote_code=True, # Often required for newly released models
    gpu_memory_utilization=0.9, # Considered a safe limit for the T4 GPU
    max_model_len=4096 # Context length reduced a little to save memory
)

INFO 12-22 18:47:23 [utils.py:253] non-default args: {'trust_remote_code': True, 'max_model_len': 4096, 'enable_prefix_caching': False, 'disable_log_stats': True, 'mm_processor_cache_gb': 0, 'logits_processors': [<class 'vllm.model_executor.models.deepseek_ocr.NGramPerReqLogitsProcessor'>], 'model': 'deepseek-ai/DeepSeek-OCR'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 12-22 18:47:24 [model.py:514] Resolved architecture: DeepseekOCRForCausalLM
INFO 12-22 18:47:24 [model.py:1667] Using max model len 4096
INFO 12-22 18:47:28 [scheduler.py:230] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 12-22 18:49:02 [llm.py:344] Supported tasks: ['generate']


In [None]:
# 3. Build the batch: one request per image, all sharing the same OCR prompt
prompt = "<image>\nFree OCR."

model_input = [
    {"prompt": prompt, "multi_modal_data": {"image": picture}}
    for picture in (image_1, image_2)
]

In [None]:
# 4. Sampling parameters shared by every request in the batch.
# `extra_args` is presumably consumed by NGramPerReqLogitsProcessor, which the
# engine was configured with -- confirm against the vLLM DeepSeek-OCR docs.
sampling_param = SamplingParams(
    temperature=0.0,
    skip_special_tokens=False,
    max_tokens=2048,  # kept somewhat low to avoid memory problems on the T4
    extra_args={
        "ngram_size": 30,
        "window_size": 90,
        "whitelist_token_ids": {128821, 128822},
    },
)

In [None]:
# 5. Run OCR on the whole batch in a single generate() call
model_outputs = llm.generate(model_input, sampling_param)

# 6. Show each result underneath a 50-dash separator line
for output in model_outputs:
    print("-" * 50, output.outputs[0].text, sep="\n")

Adding requests:   0%|          | 0/2 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

--------------------------------------------------
MH 20 EE 7602
--------------------------------------------------
MH 20 EE 7602


In [None]:
!pip install gradio vllm pillow



In [None]:
import gradio as gr
from vllm import LLM, SamplingParams
from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor
from PIL import Image
import torch

In [None]:
# Module-level cache for the vLLM engine used by the Gradio app.
# It starts as None and is filled in by load_model() on first use.
# Note: re-running this cell resets the cache, so the next load_model() call
# will build a new engine.
llm_model = None

In [None]:
def load_model():
    """
    Return the shared DeepSeek-OCR vLLM engine, creating it on first use.

    The engine is cached in the module-level ``llm_model`` variable, so only
    the first call pays the model-loading cost; later calls return the cached
    object.

    If an ``LLM`` instance named ``llm`` already exists in this notebook's
    namespace (the batch-OCR section builds one with the same settings), it is
    adopted instead of building a second engine. Both engines would request
    ``gpu_memory_utilization=0.9`` on the same GPU, so a second one is not
    expected to fit.

    Returns:
        The cached ``LLM`` instance.
    """
    global llm_model

    # Fast path: the engine has already been created (or adopted).
    if llm_model is not None:
        return llm_model

    # Reuse an engine created earlier in the same kernel session, if any.
    existing_engine = globals().get("llm")
    if isinstance(existing_engine, LLM):
        print("Reusing the DeepSeek-OCR model already loaded in this session.")
        llm_model = existing_engine
        return llm_model

    print("Initializing DeepSeek-OCR model... this may take a minute.")
    # Same configuration as the batch-OCR section, chosen for T4 GPU compatibility
    llm_model = LLM(
        model="deepseek-ai/DeepSeek-OCR",
        enable_prefix_caching=False,
        mm_processor_cache_gb=0,
        logits_processors=[NGramPerReqLogitsProcessor],
        trust_remote_code=True,
        gpu_memory_utilization=0.9,  # chosen with the 16 GB T4 in mind
        max_model_len=4096,
    )
    print("Model loaded successfully!")
    return llm_model

In [None]:
def run_ocr(image):
    """
    Run DeepSeek-OCR on a single image and return the extracted text.

    Args:
        image: PIL image supplied by the Gradio ``Image`` component, or
            ``None`` when the user submitted without uploading anything.

    Returns:
        The generated text of the first completion, or the string
        ``"Error: No image provided."`` when ``image`` is ``None``.
    """
    if image is None:
        return "Error: No image provided."

    # Uploads may arrive as RGBA, greyscale, palette, etc. The batch-OCR section
    # of this notebook always feeds RGB images to the model, so do the same here.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Ensure the model is loaded (cheap after the first call)
    model = load_model()

    # Prompt format used for DeepSeek-OCR throughout this notebook
    prompt = "<image>\nFree OCR."

    # Single multimodal request for vLLM
    model_input = {
        "prompt": prompt,
        "multi_modal_data": {"image": image},
    }

    # Same sampling settings as the batch-OCR section. The extra_args are
    # presumably read by NGramPerReqLogitsProcessor -- confirm against vLLM docs.
    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=2048,
        extra_args={
            "ngram_size": 30,
            "window_size": 90,
            "whitelist_token_ids": {128821, 128822},
        },
        skip_special_tokens=False,
    )

    # generate() is given a list of requests; here it holds exactly one
    outputs = model.generate([model_input], sampling_params)

    # First (and only) request -> first completion -> its text
    return outputs[0].outputs[0].text

In [None]:
# Define the Gradio interface: a two-column page with the image upload and
# submit button on the left and the OCR result on the right.
with gr.Blocks(title="DeepSeek-OCR Interface") as demo:
    # Page heading and a short usage hint
    gr.Markdown("# 📄 DeepSeek-OCR Demo")
    gr.Markdown("Upload an image containing text to extract it using the DeepSeek-OCR model powered by vLLM.")

    with gr.Row():
        # Left column: input image and the button that triggers OCR
        with gr.Column():
            # type="pil" makes Gradio hand run_ocr a PIL image object
            input_image = gr.Image(type="pil", label="Upload Image")
            submit_btn = gr.Button("Extract Text", variant="primary")

        # Right column: extracted text
        with gr.Column():
            # NOTE(review): gradio is installed without a version pin; confirm that
            # `show_copy_button` is still accepted by the installed release.
            output_text = gr.Textbox(
                label="Extracted Text",
                lines=15,
                show_copy_button=True,
                placeholder="OCR output will appear here..."
            )

    # Clicking the button calls run_ocr(input_image) and writes the returned
    # string into the output textbox
    submit_btn.click(fn=run_ocr, inputs=input_image, outputs=output_text)

In [None]:
# Launch the app
if __name__ == "__main__":
    # Load the model before the server starts, so the first OCR request does
    # not have to wait for model initialization
    load_model()
    # share=True creates a public link (useful for Colab); the service is then
    # reachable through that public URL, so avoid sharing it more widely than intended
    demo.launch(share=True)

Initializing DeepSeek-OCR model... this may take a minute.
INFO 12-22 19:09:38 [utils.py:253] non-default args: {'trust_remote_code': True, 'max_model_len': 4096, 'enable_prefix_caching': False, 'disable_log_stats': True, 'mm_processor_cache_gb': 0, 'logits_processors': [<class 'vllm.model_executor.models.deepseek_ocr.NGramPerReqLogitsProcessor'>], 'model': 'deepseek-ai/DeepSeek-OCR'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 12-22 19:09:38 [model.py:514] Resolved architecture: DeepseekOCRForCausalLM
INFO 12-22 19:09:38 [model.py:1667] Using max model len 4096
INFO 12-22 19:09:38 [scheduler.py:230] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 12-22 19:11:09 [llm.py:344] Supported tasks: ['generate']
Model loaded successfully!
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b15acdebb780245315.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
