In [None]:
!pip install ultralytics
!pip install transformers_stream_generator
!pip install -U bitsandbytes
!pip install paddlepaddle-gpu==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
!pip install paddleocr
!pip install -q ultralytics
!pip install -q accelerate bitsandbytes
!pip install -q gradio
!wget -q -O license-plate-finetune-v1x.pt https://huggingface.co/morsetechlab/yolov11-license-plate-detection/resolve/main/license-plate-finetune-v1x.pt
!git clone https://github.com/A190nux/AI-Powered-Traffic-Scene-Analysis

In [None]:
import json

# Third-party imports keep their original relative order to avoid changing
# the load order of the native libraries they pull in.
import torch
import gradio as gr
from PIL import Image
from ultralytics import YOLO
from paddleocr import PaddleOCR
from transformers import AutoProcessor, BitsAndBytesConfig, LlavaForConditionalGeneration
import requests
import numpy as np
import cv2

In [None]:
# Load the vision-language model (LLaVA) used for the scene description.
# 8-bit quantization is requested through `quantization_config`; passing
# `load_in_8bit=True` directly to `from_pretrained` is the deprecated route.
LLAVA_MODEL_ID = "llava-hf/llava-1.5-7b-hf"
llava_model = LlavaForConditionalGeneration.from_pretrained(
    LLAVA_MODEL_ID,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)
llava_processor = AutoProcessor.from_pretrained(LLAVA_MODEL_ID)

# Load the license-plate detection model (YOLOv11). The path is relative to the
# working directory, which is where the setup cell's `wget -O` saves the weights.
yolo_model = YOLO('license-plate-finetune-v1x.pt')

# Load the OCR model (PaddleOCR) with text-line orientation handling enabled.
ocr_model = PaddleOCR(use_textline_orientation=True)

print("Models loaded successfully!")

In [None]:
# Instruction sent to LLaVA. The wording is kept verbatim so that generated
# descriptions stay comparable with earlier runs of this notebook.
SCENE_PROMPT = (
    "Please describe this traffic scene in detail. You should focus on the following points: "
    "The state of the road itself and whether is has any problems such as cracks or potholes, "
    "how busy the road is both in terms of people and vehicles, "
    "is the traffic flowing normally or if there is a problem as well as the state of traffic lights, "
    "and finally, whether there are any unique events such as an accident or any congregation of people. "
    "The description also shouldn't mention things like the general atmosphere, "
    "we need a concise and factual description of the state of the traffic in the image "
    "that is reliable for monitoring the area and send personnel to interfere if neccesary "
    "such as traffic police, ambulances, etc. "
    "Do not count the vehicles or pedestrians in the image. "
    "Ignore the buildings and focus on the road."
)


def _describe_scene(image, temperature, top_p):
    """Return LLaVA's description of ``image`` (an RGB PIL image) for SCENE_PROMPT."""
    conversation = [
        {"role": "user", "content": [
            {"type": "text", "text": SCENE_PROMPT},
            {"type": "image"},
        ]},
    ]
    prompt = llava_processor.apply_chat_template(conversation, add_generation_prompt=True)
    # Follow the model's own device instead of assuming "cuda".
    inputs = llava_processor(images=image, text=prompt, return_tensors='pt').to(
        llava_model.device, torch.float16
    )

    output = llava_model.generate(
        **inputs,
        max_new_tokens=250,
        do_sample=True,  # sampling must be on for temperature/top_p to take effect
        temperature=temperature,
        top_p=top_p,
    )
    # Decode only the newly generated tokens. Splitting the decoded text on
    # "ASSISTANT:" would truncate the answer if the model ever emitted that
    # marker inside its own reply.
    prompt_length = inputs["input_ids"].shape[1]
    return llava_processor.decode(output[0][prompt_length:], skip_special_tokens=True).strip()


def _select_plate_text(texts, scores, ocr_conf):
    """Join OCR lines scoring above ``ocr_conf``; return (text, best_score) or None."""
    kept = [(text, score) for text, score in zip(texts, scores) if score > ocr_conf]
    if not kept:
        return None
    joined_text = " ".join(text for text, _ in kept)
    best_score = max(score for _, score in kept)
    return joined_text, float(best_score)


def _read_plates(image, yolo_conf, ocr_conf):
    """Detect plates in ``image`` with YOLO and OCR each crop; return a list of plate dicts."""
    yolo_results = yolo_model.predict(source=image, conf=yolo_conf)
    boxes = yolo_results[0].boxes.xyxy.cpu().numpy()

    plates = []
    for box in boxes:
        x1, y1, x2, y2 = map(int, box)
        # Truncating to int can collapse a very thin box; a zero-area crop
        # cannot be read, so skip it.
        if x2 <= x1 or y2 <= y1:
            continue

        # NOTE(review): the crop is handed to PaddleOCR in RGB channel order;
        # confirm whether the OCR pipeline expects BGR (cv2 convention).
        plate_image = np.asarray(image.crop((x1, y1, x2, y2)))
        ocr_result = ocr_model.predict(plate_image)

        # Only the first result is used, and only when it is dict-like.
        if not (ocr_result and isinstance(ocr_result[0], dict)):
            continue
        ocr_data = ocr_result[0]
        selection = _select_plate_text(
            ocr_data.get('rec_texts', []),
            ocr_data.get('rec_scores', []),
            ocr_conf,
        )
        if selection is None:
            continue

        plate_text, best_score = selection
        plates.append({
            'bounding_box': [x1, y1, x2, y2],
            'text': plate_text,
            'confidence': best_score,
        })
    return plates


def process_traffic_image(image_input, yolo_conf, ocr_conf, temperature, top_p):
    """
    Run the full traffic-scene pipeline on one image.

    Steps:
    1. Describe the scene using LLaVA.
    2. Detect license plates using YOLO.
    3. Extract text from each plate crop using PaddleOCR.
    4. Combine everything into one JSON-serializable dict.

    Args:
        image_input: A PIL image, or an array accepted by ``Image.fromarray``.
        yolo_conf: Confidence threshold passed to YOLO's ``predict``.
        ocr_conf: OCR lines must score strictly above this value to be kept.
        temperature: Sampling temperature for LLaVA generation.
        top_p: Nucleus-sampling threshold for LLaVA generation.

    Returns:
        A tuple ``(scene_description, number_of_plates, combined_json)``.
        ``combined_json`` has the keys ``scene_description``,
        ``number_of_plates_detected`` and ``detected_plates``. Each plate entry
        holds ``bounding_box`` ([x1, y1, x2, y2]), ``text`` (kept OCR lines
        joined by spaces) and ``confidence`` (highest kept OCR score).
        Plates that YOLO detects but for which no OCR line passes ``ocr_conf``
        are omitted, so the count covers plates with readable text only.

    Raises:
        gr.Error: If no image was provided.
    """
    # Gradio passes None when the user clicks the button without uploading.
    if image_input is None:
        raise gr.Error("Please upload an image before running the analysis.")

    # Ensure the input is a PIL Image
    if not isinstance(image_input, Image.Image):
        image_input = Image.fromarray(image_input)
    # Normalize to 3-channel RGB so RGBA / palette / greyscale uploads reach
    # the models in a single, consistent mode.
    image_input = image_input.convert("RGB")

    scene_description = _describe_scene(image_input, temperature, top_p)
    detected_plates_list = _read_plates(image_input, yolo_conf, ocr_conf)

    # Combine Outputs into a single JSON
    number_of_plates = len(detected_plates_list)
    combined_json = {
        'scene_description': scene_description,
        'number_of_plates_detected': number_of_plates,
        'detected_plates': detected_plates_list
    }

    return scene_description, number_of_plates, combined_json

In [None]:
# Gradio front end: an image upload plus four tunable sliders on the left,
# the pipeline's three outputs on the right, and one cached example below.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🚦 AI-Powered Traffic Scene Analysis")
    intro_text = (
        "Upload an image of a traffic scene. The application will generate a detailed description of the scene, "
        "detect license plates, and extract the text from them."
    )
    gr.Markdown(intro_text)

    with gr.Row():
        # Left column: inputs.
        with gr.Column(scale=1):
            input_image = gr.Image(label="Upload Traffic Scene Image", type="pil")

            gr.Markdown("### ⚙️ Adjustable Parameters")
            with gr.Accordion("Fine-tune model settings", open=False):
                temp = gr.Slider(
                    label="VLLM Temperature", minimum=0.1, maximum=1.0, step=0.1, value=0.3
                )
                top_p = gr.Slider(
                    label="VLLM Top_p", minimum=0.1, maximum=1.0, step=0.1, value=0.3
                )
                yolo_confidence = gr.Slider(
                    label="YOLO Confidence Threshold", minimum=0.1, maximum=0.9, step=0.05, value=0.25
                )
                ocr_confidence = gr.Slider(
                    label="OCR Confidence Threshold", minimum=0.1, maximum=0.9, step=0.05, value=0.25
                )

            submit_btn = gr.Button("Analyze Image", variant="primary")

        # Right column: results.
        with gr.Column(scale=2):
            gr.Markdown("### 📝 Analysis Results")
            output_description = gr.Textbox(lines=5, label="Scene Description")
            output_plate_count = gr.Number(label="Number of Plates Detected")
            output_json = gr.JSON(label="Combined JSON Output")

    # The button and the example gallery drive the same function with the same
    # components, so the component lists are written once. Their order matches
    # the positional parameters / return values of `process_traffic_image`.
    pipeline_inputs = [input_image, yolo_confidence, ocr_confidence, temp, top_p]
    pipeline_outputs = [output_description, output_plate_count, output_json]

    submit_btn.click(
        fn=process_traffic_image,
        inputs=pipeline_inputs,
        outputs=pipeline_outputs,
    )

    # One example row, in the same order as `pipeline_inputs`:
    # image path, YOLO confidence, OCR confidence, temperature, top_p.
    example_row = [
        "/content/AI-Powered-Traffic-Scene-Analysis/images/Example_Image.jpg",
        0.25,
        0.25,
        0.3,
        0.3,
    ]
    gr.Examples(
        examples=[example_row],
        fn=process_traffic_image,
        inputs=pipeline_inputs,
        outputs=pipeline_outputs,
        examples_per_page=1,
        cache_examples=True,
    )

# Launch the Gradio App
demo.launch(share=True, debug=True)