In [1]:
pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
from flask import Flask, request, render_template_string
from PIL import Image
from io import BytesIO
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from faster_whisper import WhisperModel
from tempfile import NamedTemporaryFile

In [4]:
# Flask application object; the @app.route decorators below register handlers on it.
app = Flask(__name__)

In [5]:
# Environment check: print the installed PyTorch version and whether CUDA is usable.
import torch
print(torch.__version__)
print(torch.cuda.is_available()) 

2.8.0+cpu
False


In [7]:
# Print the path of the Python interpreter this kernel is running on.
import sys
print(sys.executable)

C:\ProgramData\anaconda3\python.exe


In [8]:
# One-time startup: build the BLIP-2 processor and model from a single checkpoint id
# so the two can never drift apart.
BLIP2_CHECKPOINT = "Salesforce/blip2-flan-t5-xl"
processor = Blip2Processor.from_pretrained(BLIP2_CHECKPOINT)
blip2_model = Blip2ForConditionalGeneration.from_pretrained(BLIP2_CHECKPOINT)

# One-time startup: build the Whisper speech-to-text model.
MODEL_SIZE = "base"
DEVICE = "cpu"  # switch to "cuda" when a GPU is available
whisper_model = WhisperModel(MODEL_SIZE, device=DEVICE)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# HTML templates
# Landing page: links to the /describe and /transcribe routes.
HOME_HTML = """
<!doctype html>
<title>Multi-Modal Flask App</title>
<h1>Welcome to the Multi-Modal Flask App</h1>
<ul>
  <li><a href="/describe">Describe an Image</a></li>
  <li><a href="/transcribe">Transcribe Audio</a></li>
</ul>
"""

# Image upload page: a multipart POST form with a single file field named "image".
# The Caption section is rendered only when the `caption` template variable is truthy.
IMAGE_UPLOAD_HTML = """
<!doctype html>
<title>Upload Image</title>
<h1>Upload an image for description</h1>
<form method="POST" enctype="multipart/form-data">
  <input type="file" name="image" accept="image/*" required>
  <input type="submit" value="Upload">
</form>
{% if caption %}
  <h2>Caption:</h2>
  <p>{{ caption }}</p>
{% endif %}
<a href="/">Back to Home</a>
"""

# Audio upload page: a multipart POST form with a single file field named "file".
# Template variables: `language`, `language_prob`, `transcript`.
# - When `language` is set, the detected language and the full transcript are shown.
# - When only `transcript` is set (the view uses it to carry error messages while
#   `language` stays None), the message is shown on its own so failures are
#   visible to the user instead of being silently dropped.
AUDIO_UPLOAD_HTML = """
<!doctype html>
<title>Upload Audio for Transcription</title>
<h1>Upload audio file for speech-to-text transcription</h1>
<form method="POST" enctype="multipart/form-data">
  <input type="file" name="file" accept="audio/*" required>
  <input type="submit" value="Upload & Transcribe">
</form>
{% if language %}
  <h3>Detected Language: {{ language }} (Probability: {{ language_prob }})</h3>
  <h3>Full Transcript:</h3>
  <p>{{ transcript }}</p>
{% elif transcript %}
  <p>{{ transcript }}</p>
{% endif %}
<a href="/">Back to Home</a>
"""

@app.route("/")
def home():
    """Serve the landing page rendered from the HOME_HTML template."""
    landing_page = render_template_string(HOME_HTML)
    return landing_page

@app.route("/describe", methods=["GET", "POST"])
def describe():
    """Show the image upload form and, on POST, caption the uploaded image.

    GET renders the empty form. POST reads the "image" file field, converts it
    to RGB, runs it through the BLIP-2 processor/model and decodes the first
    generated sequence into a caption.

    Returns:
        The rendered IMAGE_UPLOAD_HTML page. `caption` is None on GET, the
        generated caption on success, or a human-readable message when the
        upload is missing, empty, or processing fails.
    """
    caption = None
    if request.method == "POST":
        image_file = request.files.get("image")
        if image_file is None:
            caption = "No image file provided."
        elif image_file.filename == "":
            # A form submitted without choosing a file still sends the field,
            # with an empty filename; report that directly instead of letting
            # it surface as an image-decoding error (mirrors /transcribe).
            caption = "No selected file."
        else:
            try:
                image = Image.open(image_file.stream).convert("RGB")
                inputs = processor(images=image, return_tensors="pt")
                generated_ids = blip2_model.generate(**inputs)
                caption = processor.decode(generated_ids[0], skip_special_tokens=True)
            except Exception as e:
                # Best-effort: show the failure on the page rather than a 500.
                caption = f"Error processing image: {str(e)}"
    return render_template_string(IMAGE_UPLOAD_HTML, caption=caption)

@app.route("/transcribe", methods=["GET", "POST"])
def transcribe():
    """Show the audio upload form and, on POST, transcribe the uploaded audio.

    GET renders the empty form. POST saves the "file" upload to a temporary
    file, runs the Whisper model on it and joins the stripped segment texts
    with single spaces.

    Returns:
        The rendered AUDIO_UPLOAD_HTML page. On success `language`,
        `language_prob` (probability formatted to two decimals) and
        `transcript` are set. On a missing/empty upload or a transcription
        failure, `transcript` carries the message and `language` stays None.
    """
    import os  # local import: only needed here, for temp-file cleanup

    language = None
    language_prob = None
    transcript = None
    if request.method == "POST":
        file = request.files.get("file")
        if file is None:
            transcript = "No file part."
        elif file.filename == "":
            transcript = "No selected file."
        else:
            temp_path = None
            try:
                # Create the temp file with delete=False and close it before
                # reuse: on Windows a NamedTemporaryFile cannot be reopened by
                # name while it is still open, so saving into it would fail.
                with NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
                    temp_path = temp_audio.name
                file.save(temp_path)
                segments, info = whisper_model.transcribe(temp_path, beam_size=5)
                # Materialize inside the try so any error raised while
                # iterating the segments is reported like other failures.
                segments = list(segments)
                full_text = " ".join(seg.text.strip() for seg in segments)
                language = info.language
                language_prob = f"{info.language_probability:.2f}"
                transcript = full_text
            except Exception as e:
                # Best-effort: show the failure on the page rather than a 500.
                transcript = f"Error transcribing audio: {str(e)}"
            finally:
                # delete=False means cleanup is our responsibility.
                if temp_path is not None:
                    try:
                        os.remove(temp_path)
                    except OSError:
                        # Cleanup is best-effort; a leftover temp file must
                        # not mask the transcription result.
                        pass
    return render_template_string(AUDIO_UPLOAD_HTML, language=language, language_prob=language_prob, transcript=transcript)


In [None]:
# Start Flask's development server when this code runs as the main module.
# NOTE(review): debug=True turns on Flask's debug mode; fine for local use, but
# confirm it is switched off before exposing the server beyond localhost.
# use_reloader=False presumably keeps the reloader from restarting the process
# inside a notebook kernel — TODO confirm.
if __name__ == "__main__":
    app.run(debug=True, use_reloader=False)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [26/Sep/2025 11:31:53] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [26/Sep/2025 11:31:53] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [26/Sep/2025 11:31:55] "GET /describe HTTP/1.1" 200 -
127.0.0.1 - - [26/Sep/2025 11:31:56] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [26/Sep/2025 11:31:57] "GET /transcribe HTTP/1.1" 200 -
127.0.0.1 - - [26/Sep/2025 11:31:58] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [26/Sep/2025 11:32:20] "GET /describe HTTP/1.1" 200 -
127.0.0.1 - - [26/Sep/2025 11:32:21] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [26/Sep/2025 11:32:21] "GET /transcribe HTTP/1.1" 200 -
127.0.0.1 - - [26/Sep/2025 11:32:22] "GET / HTTP/1.1" 200 -
