In [None]:
!pip install fastapi uvicorn transformers torch Pillow opencv-python-headless pyngrok
!git clone https://github.com/X-PLUG/mPLUG-Owl.git


In [1]:
%%writefile model.py
import torch
from transformers import AutoModelForVision2Seq, AutoProcessor, AutoTokenizer

# Checkpoint location handed to every `from_pretrained` call below.
# NOTE(review): this is the directory of the cloned GitHub checkout; `from_pretrained`
# expects a directory (or Hub id) containing a config and weights -- confirm that the
# checkpoint really lives here before relying on this module.
model_name = "/content/mPLUG-Owl"  # path to the cloned repo
model = AutoModelForVision2Seq.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# A text tokenizer cannot turn an image into model inputs; the processor performs the
# image preprocessing (and tokenises an optional text prompt alongside it).
processor = AutoProcessor.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def _generate_text(image, prompt=None, **generate_kwargs):
    """Run one generation pass for `image` (optionally with a text prompt) and decode it.

    Args:
        image: The image to condition on (callers in this project pass RGB PIL images).
        prompt: Optional text given to the processor together with the image.
        **generate_kwargs: Extra keyword arguments forwarded to `model.generate`.

    Returns:
        The first generated sequence decoded to a string with special tokens removed.
        NOTE(review): depending on the model architecture the decoded text may also
        contain the prompt itself -- confirm and strip it if necessary.
    """
    processor_inputs = {"images": image, "return_tensors": "pt"}
    if prompt is not None:
        processor_inputs["text"] = prompt
    inputs = processor(**processor_inputs).to(device)
    outputs = model.generate(**inputs, **generate_kwargs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def describe_key_elements(image):
    """Generate a description of `image` using the model's default generation settings.

    Args:
        image: The image to describe.

    Returns:
        The decoded description string.
    """
    return _generate_text(image)


def assess_cognitive_load(image, definition, max_new_tokens=256):
    """Ask the model to assess the cognitive load of `image` given a written definition.

    process_b.py imports this function from this module, so it has to exist here for the
    application to start.

    Args:
        image: The image to assess.
        definition: Text explaining what "cognitive load" means; it is placed at the
            start of the prompt.
        max_new_tokens: Upper bound on the number of generated tokens, forwarded to
            `model.generate`.

    Returns:
        The decoded assessment string.
    """
    prompt = (
        f"{definition.strip()}\n\n"
        "Using the definition above, assess the cognitive load this image places on a viewer."
    )
    return _generate_text(image, prompt, max_new_tokens=max_new_tokens)


Writing model.py


In [2]:
%%writefile process_a.py
from model import describe_key_elements
import cv2
import numpy as np
from PIL import Image
import io

def process_a1(image_bytes):
    """Process A1: describe the uploaded image as a whole.

    Args:
        image_bytes: Raw contents of an encoded image file.

    Returns:
        Whatever `describe_key_elements` returns for the RGB-converted image.
    """
    buffer = io.BytesIO(image_bytes)
    rgb_image = Image.open(buffer).convert("RGB")
    return describe_key_elements(rgb_image)

def process_a2(image_bytes, heatmap_bytes, red_threshold=200, max_regions=5):
    """Process A2: describe the image regions that the heatmap marks as most salient.

    The red channel of the heatmap is thresholded, the external contours of the
    resulting mask are found, and the bounding box of each of the largest contours is
    cropped from the image and described separately.

    Args:
        image_bytes: Raw contents of the encoded image file.
        heatmap_bytes: Raw contents of the encoded heatmap file.
        red_threshold: Red-channel value a heatmap pixel must exceed to count as hot.
        max_regions: Maximum number of regions (largest contour area first) to describe.

    Returns:
        A list with one description per cropped region, ordered from the largest
        contour to the smallest; empty when no pixel exceeds the threshold.
    """
    image = np.array(Image.open(io.BytesIO(image_bytes)).convert("RGB"))
    heatmap = np.array(Image.open(io.BytesIO(heatmap_bytes)).convert("RGB"))

    # Contour boxes are computed in heatmap pixel space but used to crop the image, so
    # both arrays must share the same height and width. cv2.resize takes (width, height).
    image_height, image_width = image.shape[:2]
    if heatmap.shape[:2] != (image_height, image_width):
        heatmap = cv2.resize(
            heatmap, (image_width, image_height), interpolation=cv2.INTER_LINEAR
        )

    # Arrays come from PIL in RGB order, so index 0 is the red channel.
    red_channel = heatmap[:, :, 0]
    _, hot_mask = cv2.threshold(red_channel, red_threshold, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(hot_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    largest_contours = sorted(contours, key=cv2.contourArea, reverse=True)[:max_regions]

    salient_elements = []
    for contour in largest_contours:
        x, y, w, h = cv2.boundingRect(contour)
        roi = image[y:y + h, x:x + w]
        salient_elements.append(describe_key_elements(Image.fromarray(roi)))

    return salient_elements


Writing process_a.py


In [None]:
%%writefile process_b.py
from model import assess_cognitive_load
from PIL import Image
import io

# Background text on Cognitive Load Theory; process_b passes it, unchanged, as the second
# argument to assess_cognitive_load.
cognitive_load_definition = """
Cognitive Load Theory is based on the model of human information processing. This model describes memory as having three main parts: sensory, working, and long-term. Sensory memory filters out most of what is going on around us, passing select information on to our working memory for additional processing. Working memory can typically process 5-9 pieces, or chunks, of information at any given time. Our working memory either discards the information or categorizes it for storing in our long-term memory. Long-term memory stores information in structures called “schemas,” which organize information based on how we use it. The more we use these schemas, the more developed they become and the easier it is to recall them. Cognitive load refers to the amount of information our working memory can process at any given time. For educational purposes, cognitive load theory helps us to avoid overloading learners with more than they can effectively process into schemas for long-term memory storage and future recall.
"""

def process_b(image_bytes):
    """Process B: assess the cognitive load of the uploaded image.

    Args:
        image_bytes: Raw contents of an encoded image file.

    Returns:
        Whatever `assess_cognitive_load` returns for the RGB-converted image and the
        module-level `cognitive_load_definition` text.
    """
    buffer = io.BytesIO(image_bytes)
    rgb_image = Image.open(buffer).convert("RGB")
    return assess_cognitive_load(rgb_image, cognitive_load_definition)

Overwriting process_b.py


In [3]:
%%writefile process_c.py
def process_c(description_a1, salient_elements_a2, cognitive_load_b):
    """Process C: bundle the outputs of processes A1, A2 and B into one summary dict.

    Args:
        description_a1: Result of process A1.
        salient_elements_a2: Result of process A2.
        cognitive_load_b: Result of process B.

    Returns:
        A dict with the keys "key_elements_description", "salient_elements" and
        "cognitive_load_assessment" (in that order) mapped to the three arguments.
    """
    labelled_results = (
        ("key_elements_description", description_a1),
        ("salient_elements", salient_elements_a2),
        ("cognitive_load_assessment", cognitive_load_b),
    )
    return dict(labelled_results)


Writing process_c.py


In [4]:
%%writefile main.py
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import JSONResponse
from process_a import process_a1, process_a2
from process_b import process_b
from process_c import process_c
import uvicorn

# ASGI application object; routes are registered on it via decorators and it is the
# object handed to uvicorn when this file is run as a script.
app = FastAPI()

def _run_analysis(image_bytes, heatmap_bytes):
    """Run processes A1, A2, B and C in order and return the combined summary dict."""
    # Process A1
    description_a1 = process_a1(image_bytes)

    # Process A2
    salient_elements_a2 = process_a2(image_bytes, heatmap_bytes)

    # Process B
    cognitive_load_b = process_b(image_bytes)

    # Process C
    return process_c(description_a1, salient_elements_a2, cognitive_load_b)


@app.post("/analyze")
async def analyze_image(image_file: UploadFile = File(...), heatmap_file: UploadFile = File(...)):
    """Analyze an uploaded image together with its saliency heatmap.

    Args:
        image_file: Multipart upload containing the image.
        heatmap_file: Multipart upload containing the heatmap for that image.

    Returns:
        A JSON response holding the summary dict produced by process C.

    Raises:
        HTTPException: With status 400 when either upload is empty.
    """
    # Local imports: these names are not part of the file-level fastapi import line.
    from fastapi import HTTPException
    from fastapi.concurrency import run_in_threadpool

    image_bytes = await image_file.read()
    heatmap_bytes = await heatmap_file.read()

    if not image_bytes or not heatmap_bytes:
        raise HTTPException(
            status_code=400,
            detail="image_file and heatmap_file must both be non-empty.",
        )

    # The processing functions are synchronous and slow; running them directly inside
    # this coroutine would block the event loop for the whole duration of the request.
    final_output = await run_in_threadpool(_run_analysis, image_bytes, heatmap_bytes)

    return JSONResponse(content=final_output)

# When executed as a script (`python main.py`), serve the app with uvicorn on all
# network interfaces at port 8000.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)


Writing main.py


In [None]:
!pip install pyngrok


In [None]:
import subprocess
import sys

from pyngrok import ngrok

# Terminate open tunnels if necessary
ngrok.kill()

# Stop a server left over from a previous run of this cell so port 8000 is free again.
if "server_process" in globals() and server_process.poll() is None:
    server_process.terminate()
    server_process.wait()

# Run the FastAPI app in the background. A foreground `!python main.py` never returns,
# which would keep every later cell (including the curl test) from running while the
# server is up. Server output is written to server.log.
with open("server.log", "w") as server_log:
    server_process = subprocess.Popen(
        [sys.executable, "main.py"],
        stdout=server_log,
        stderr=subprocess.STDOUT,
    )

# Start ngrok tunnel
# The port is passed positionally because the keyword for it differs between pyngrok
# releases (`port` in older ones, `addr` in newer ones).
tunnel = ngrok.connect(8000)
# Newer pyngrok returns a tunnel object exposing `.public_url`; older releases return
# the URL string itself.
public_url = getattr(tunnel, "public_url", tunnel)
print(f"Public URL: {public_url}")

# NOTE: main.py loads the model at import time, so the endpoint only starts answering
# once server.log shows that uvicorn has finished starting up.


In [None]:
curl -X POST "{ngrok_url}/analyze" -F "image_file=@path_to_image.jpg" -F "heatmap_file=@path_to_heatmap.jpg"
