# Qwen Model

In [None]:
!pip install uv

Collecting uv
  Downloading uv-0.9.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading uv-0.9.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m93.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uv
Successfully installed uv-0.9.5


In [None]:
!uv pip install transformers==4.57.1 torch==2.8.0 bitsandbytes==0.48.1 flash-attn==2.8.3 --no-build-isolation

[2mUsing Python 3.12.12 environment at: /usr[0m
[2K[2mResolved [1m43 packages[0m [2min 1.07s[0m[0m
[2K[2mPrepared [1m2 packages[0m [2min 10.13s[0m[0m
[2K[2mInstalled [1m2 packages[0m [2min 5ms[0m[0m
 [32m+[39m [1mbitsandbytes[0m[2m==0.48.1[0m
 [32m+[39m [1mflash-attn[0m[2m==2.8.3[0m


In [None]:
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig

# 4-bit NF4 quantization with double quantization; matmuls are computed in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)
print("Model downloading!")

# On GPU let accelerate place the weights; on CPU load without a device map.
device_map = "auto" if device == "cuda" else None

try:
    # NOTE(review): trust_remote_code=True allows executing code shipped in the
    # model repo. The model class is imported directly from transformers here,
    # so the flag is probably unnecessary -- confirm and consider dropping it.
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map=device_map,
        attn_implementation="flash_attention_2" if device == "cuda" else "eager",
        trust_remote_code=True
    )
    # When a device map is used the weights are already placed during loading;
    # calling .to() on a dispatched model is redundant at best and can fail
    # when layers are spread over several devices. Only move the model
    # explicitly when no device map was applied.
    if device_map is None:
        model = model.to(device)
    print("Model loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")
    raise

print("Processor downloading!")
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
print("Processor loaded successfully!")

device: cuda
Model downloading!


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.53G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/216 [00:00<?, ?B/s]

Model loaded successfully!
Processor downloading!


preprocessor_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

Processor loaded successfully!


In [None]:
!uv pip install -U qwen_vl_utils

In [None]:
from qwen_vl_utils import process_vision_info

device = "cuda" if torch.cuda.is_available() else "cpu"

def qwen_inference(messages, max_new_tokens=512):
    """Run one chat-style generation with the global `model` / `processor`.

    Args:
        messages: Chat messages passed to `processor.apply_chat_template` and
            `process_vision_info` (presumably the Qwen-VL message format with
            "role"/"content" entries -- confirm against callers).
        max_new_tokens: Upper bound on generated tokens, forwarded to
            `model.generate`. Defaults to 512, the previously hard-coded value.

    Returns:
        The decoded text of the first (and only) sequence in the batch, with
        the prompt tokens removed.
    """
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Only the first return value (image inputs) is used; the second is ignored.
    image_inputs, _ = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        padding=True,
        return_tensors="pt"
    ).to(model.device)
    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # generate() returns prompt + completion; drop the prompt prefix per sequence.
    generated_ids_trimmed = [
        out_id[len(in_id):] for in_id, out_id in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return output_text[0]

In [None]:
import torch

def clear_gpu_cache():
    """Release PyTorch's cached CUDA memory, reporting what was done.

    Prints "GPU cache cleared." after calling `torch.cuda.empty_cache()` when
    CUDA is available, otherwise prints "No GPU available." and does nothing.
    """
    if not torch.cuda.is_available():
        print("No GPU available.")
        return
    torch.cuda.empty_cache()
    print("GPU cache cleared.")


clear_gpu_cache()

GPU cache cleared.


# Gradio Client

In [None]:
from gradio_client import Client

class GradioRetriever():
    """Thin wrapper around a Gradio `Client` that queries its "/predict" endpoint."""

    def __init__(self, gradio_url: str, k: int = 1):
        # k is forwarded as the third positional argument of every predict call
        # (presumably the number of documents to retrieve -- confirm with the server).
        self.k = k
        self.client = Client(gradio_url)

    def _get_relevant_documents(self, query: str, collection_name="colpali-financial-multimodal-db"):
        """Query the remote endpoint and return its result, or [] on failure.

        Returns [] (after printing a message) when the call raises, when the
        result is falsy, or when it equals the sentinel string
        "No relevant docs found.". Otherwise the raw result is returned.
        """
        try:
            hits = self.client.predict(
                query,
                collection_name,
                self.k,
                api_name="/predict",
            )
            if hits and not (hits == "No relevant docs found."):
                return hits
            print("No relevant docs found.")
            return []
        except Exception as err:
            # Best-effort retrieval: report the problem and fall back to no documents.
            print(f"Retriever error: {err}")
            return []

In [None]:
from langchain.prompts import PromptTemplate
import base64
from PIL import Image
from io import BytesIO

# Instruction sent to the VLM alongside the retrieved images; `{query}` is the
# only placeholder and is filled with the user's question.
_FINANCIAL_QA_TEMPLATE = (
    "Context: Analyze given financial documents like charts/graphs/text/image and answer: {query}"
)

prompt_template = PromptTemplate(
    template=_FINANCIAL_QA_TEMPLATE,
    input_variables=["query"],
)

class QwenRAGChain:
    """Retrieve page images for a query and ask a vision-language model about them."""

    def __init__(self, retriever, vlm_inference):
        """Store the collaborators.

        Args:
            retriever: Object exposing `_get_relevant_documents(query)`.
            vlm_inference: Callable taking a list of chat messages and
                returning the model's answer.
        """
        self.retriever = retriever
        self.vlm_inference = vlm_inference
        # Not read anywhere in this class; kept for callers that may inspect it.
        self.max_context_length = 5000

    def run(self, query):
        """Answer `query` using the images of the retrieved documents.

        Returns "No relevant docs found." when the retriever yields nothing.
        Otherwise builds a single user message containing every successfully
        decoded image followed by the formatted text prompt, and returns
        whatever `vlm_inference` returns for it. Documents without a usable
        `payload['image_base64']` entry are skipped.
        """
        docs = self.retriever._get_relevant_documents(query)
        if not docs:
            return "No relevant docs found."

        content = []
        for doc in docs:
            # Tolerate malformed entries (non-dict docs, missing/None payload)
            # instead of crashing on `.get` / `in`.
            payload = doc.get('payload') if isinstance(doc, dict) else None
            if not payload or 'image_base64' not in payload:
                continue
            try:
                image_data = base64.b64decode(payload['image_base64'])
                image = Image.open(BytesIO(image_data))
                # Image.open is lazy: force decoding now so corrupt data is
                # caught here rather than later inside the model processor.
                image.load()
            except Exception as e:
                print(f"Error decoding image: {e}")
                continue
            content.append({"type": "image", "image": image})

        content.append({"type": "text", "text": prompt_template.format(query=query)})

        messages = [
            {
                "role": "user",
                "content": content,
            }
        ]
        return self.vlm_inference(messages)

In [None]:
import os

# The default is a gradio.live share link, which is short-lived; set the
# GRADIO_URL environment variable to point at the current retriever endpoint
# without editing this cell.
gradio_url = os.environ.get("GRADIO_URL", "https://1ab40c6e6ab750e021.gradio.live")
retriever = GradioRetriever(gradio_url)
qa_chain = QwenRAGChain(retriever=retriever, vlm_inference=qwen_inference)

Loaded as API: https://1ab40c6e6ab750e021.gradio.live/ ✔


In [None]:
import time

def financial_chatbot(query):
    """Answer a user query through the global `qa_chain`, timing the call.

    Args:
        query: The user's question. Empty, None, or whitespace-only input is
            rejected without touching the GPU or the retriever.

    Returns:
        "Please provide a query." for blank input, otherwise the value
        returned by `qa_chain.run(query)`.

    Raises:
        Exception: Any error from cache clearing or the chain is printed and
            re-raised unchanged.
    """
    if not query or (isinstance(query, str) and not query.strip()):
        return "Please provide a query."
    try:
        clear_gpu_cache()

        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # wall-clock adjustments.
        start = time.perf_counter()
        answer = qa_chain.run(query)
        qa_time = time.perf_counter() - start
        print(f"QA Chain Time: {qa_time}s")
    except Exception as e:
        print(f"Error: {e}")
        raise
    return answer

In [None]:
# Demo query: text-heavy governance question.
user_input = "What is the role of THE COMMITTEE of Unilever?"
answer = financial_chatbot(user_input)
print(answer)

GPU cache cleared.
QA Chain Time: 16.399452686309814s
The Committee of Unilever is concerned with the remuneration and benefits of the Directors and other members of the Unilever Leadership Executive. It also has responsibility for the design and terms of all-employee share-based incentive plans and Executive cash or share-based incentive plans. Finally, it sets the remuneration policy for, and is responsible for the performance evaluation of, the Unilever Leadership Executive and Executive Directors.


In [None]:
# Demo query: numeric lookup from a financial table.
user_input = "What is the Coca-Cola Reported Operating Income in Latin America?"
answer = financial_chatbot(user_input)
print(answer)

GPU cache cleared.
QA Chain Time: 8.137307405471802s
The Coca-Cola Reported Operating Income in Latin America is 4.
