# 1. Install Gradio and Required Libraries
### Start by installing Gradio if it's not already installed.

In [None]:
! pip install gradio
! pip install cv
! pip install ultralytics
! pip install supervision

# 2. Import Libraries
### Getting all the necessary Libraries

In [None]:
import gradio as gr
import random
import numpy as np
from PIL import Image
import cv2
import time
from ultralytics import YOLO
import supervision as sv
import pandas as pd
from google.colab.patches import cv2_imshow
from IPython.display import clear_output
from collections import defaultdict, deque

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


# 3. Import Drive


In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


# 4. Brand Recognition Backend



### Model for Grocery Detection

In [None]:
# Load the YOLO weights from the mounted Drive. This module-level `model` is
# the detector called by the image, video and OCR helper functions.
model = YOLO('/content/drive/MyDrive/kitkat_s.pt')

### Image uploading for Grocery detection

In [None]:
def detect_grocery_items(image):
    """Detect grocery items in one image and summarise them per class.

    Args:
        image: RGB image as delivered by the Gradio image input (anything
            ``np.array`` can convert to a height x width x channel array),
            or ``None`` when nothing was uploaded.

    Returns:
        A 3-tuple ``(display_image, summary_table, status_message)``.
        ``display_image`` is the annotated image in RGB order on success, the
        untouched RGB input when no detection reaches the confidence
        threshold, and ``None`` when no image was provided.
        ``summary_table`` is a list of ``[class_name, count, avg_confidence]``
        rows (the average is formatted with two decimals), empty on failure.
    """
    if image is None:
        # Gradio passes None when the user submits without uploading anything.
        return None, [], "No image provided. Please upload an image."

    rgb_image = np.array(image)
    # The detector is fed the channel-reversed (BGR) view of the upload.
    bgr_image = rgb_image[:, :, ::-1]
    detection = model(bgr_image)[0]

    class_ids = detection.boxes.cls.cpu().numpy()
    confidences = detection.boxes.conf.cpu().numpy()

    threshold = 0.4
    # class name -> confidences of every detection at or above the threshold;
    # the per-class count is simply the length of each list.
    class_confidences = {}
    for class_id, confidence in zip(class_ids, confidences):
        if confidence >= threshold:
            class_name = model.names[int(class_id)]
            class_confidences.setdefault(class_name, []).append(confidence)

    if not class_confidences:
        # Hand back the original RGB upload: returning the BGR view that was
        # fed to the model would display the picture with swapped colours.
        return rgb_image, [], "The model failed to recognize items or the image may contain untrained objects."

    summary_table = [[class_name, len(scores), f"{np.mean(scores):.2f}"]
                     for class_name, scores in class_confidences.items()]

    # NOTE(review): plot() draws every box in the result, so boxes below
    # `threshold` may appear in the image without being counted in the table.
    annotated_image_rgb = detection.plot()[:, :, ::-1]
    return annotated_image_rgb, summary_table, "Object Recognised Successfully 🥳 "


### Detect grocery brands from video

In [None]:
def iou(box1, box2):
    """Return the intersection-over-union of two axis-aligned boxes.

    Args:
        box1: Indexable ``[x1, y1, x2, y2]`` box.
        box2: Indexable ``[x1, y1, x2, y2]`` box.

    Returns:
        ``intersection / union`` as a float, or ``0.0`` when the union has no
        area (for example two zero-sized boxes), which would otherwise be a
        division by zero.
    """
    left = max(box1[0], box2[0])
    top = max(box1[1], box2[1])
    right = min(box1[2], box2[2])
    bottom = min(box1[3], box2[3])

    # Clamp at zero so non-overlapping boxes contribute no intersection.
    intersection = max(0, right - left) * max(0, bottom - top)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])

    union = area1 + area2 - intersection
    if union <= 0:
        return 0.0
    return intersection / float(union)

In [None]:
def smooth_box(box_history):
    """Average a history of boxes coordinate by coordinate.

    Args:
        box_history: Collection of boxes (each a sequence of coordinates).

    Returns:
        The element-wise mean over the history as computed by
        ``np.mean(..., axis=0)``, or ``None`` when the history is falsy.
    """
    return np.mean(box_history, axis=0) if box_history else None

In [None]:
def process_video(input_path, output_path):
    """Annotate a video with tracked brand boxes and return per-brand counts.

    The detector runs on every 5th frame. Each detection is matched by IoU to
    an already-tracked item or starts a new one; every frame is then drawn
    with the tracked boxes and written to ``output_path``.

    Args:
        input_path: Path of the video opened with ``cv2.VideoCapture``.
        output_path: Path of the annotated video, written with the ``mp4v``
            fourcc.

    Returns:
        dict mapping brand name to the rounded average number of detections
        per frame on which that brand was detected, restricted to brands seen
        on more than 10% of all frames. Empty when the input cannot be opened.
    """
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        # Nothing to read: skip creating a writer and report no items.
        cap.release()
        return {}

    out = None
    frame_count = 0
    # brand -> {frame index -> number of detections of that brand}
    detections_history = defaultdict(lambda: defaultdict(int))

    try:
        # Get video properties
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = int(cap.get(cv2.CAP_PROP_FPS))
        if fps <= 0:
            # A reported rate of 0 would make the 2-second staleness window
            # below (fps * 2) zero frames long; fall back to a nominal rate.
            fps = 30

        # Initialize video writer
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        # Tracked items keyed by a monotonically increasing id. Using
        # len(detected_items) as the id could collide with a live item once
        # stale entries have been deleted, overwriting that item.
        detected_items = {}
        next_item_id = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1

            # Run YOLO detection every 5th frame
            if frame_count % 5 == 0:
                results = model(frame)

                current_frame_detections = []
                for r in results:
                    for box in r.boxes:
                        x1, y1, x2, y2 = box.xyxy[0].tolist()
                        conf = box.conf.item()
                        brand = model.names[int(box.cls.item())]
                        current_frame_detections.append((brand, [x1, y1, x2, y2], conf))

                # Match current detections with existing items
                for brand, box, conf in current_frame_detections:
                    matched = False
                    for item_info in detected_items.values():
                        if iou(box, item_info['smoothed_box']) > 0.5:
                            item_info['frames_detected'] += 1
                            item_info['total_conf'] += conf
                            # The deque was created with maxlen=10, so the
                            # oldest box is dropped automatically when full.
                            item_info['box_history'].append(box)
                            item_info['smoothed_box'] = smooth_box(item_info['box_history'])
                            item_info['last_seen'] = frame_count
                            matched = True
                            break

                    if not matched:
                        detected_items[next_item_id] = {
                            'brand': brand,
                            'box_history': deque([box], maxlen=10),
                            'smoothed_box': box,
                            'frames_detected': 1,
                            'total_conf': conf,
                            'last_seen': frame_count
                        }
                        next_item_id += 1

                    detections_history[brand][frame_count] += 1

            # Iterate over a snapshot because stale items are deleted in the loop.
            for item_id, item_info in list(detected_items.items()):
                if frame_count - item_info['last_seen'] > fps * 2:  # 2 seconds
                    del detected_items[item_id]
                    continue

                # Interpolate box position
                if item_info['smoothed_box'] is not None:
                    alpha = 0.3
                    current_box = item_info['smoothed_box']
                    target_box = item_info['box_history'][-1] if item_info['box_history'] else current_box
                    # Move the drawn box a fraction `alpha` of the way towards
                    # the most recently detected box on every frame.
                    interpolated_box = [
                        current_box[i] * (1 - alpha) + target_box[i] * alpha
                        for i in range(4)
                    ]
                    item_info['smoothed_box'] = interpolated_box

                    x1, y1, x2, y2 = map(int, interpolated_box)
                    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                    cv2.putText(frame, f"{item_info['brand']}",
                                (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

            out.write(frame)
    finally:
        # Release the capture and writer even if detection or drawing raised.
        cap.release()
        if out is not None:
            out.release()

    # Calculate final counts and confirm results
    total_frames = frame_count
    confirmed_items = {}
    for brand, frame_counts in detections_history.items():
        detection_frames = len(frame_counts)
        # Detection only runs on every 5th frame, so "more than 10% of all
        # frames" means the brand appeared on more than half of those runs.
        if detection_frames > total_frames * 0.1:
            avg_count = sum(frame_counts.values()) / detection_frames
            confirmed_items[brand] = round(avg_count)

    return confirmed_items

In [None]:
def annotate_video(input_video):
    """Gradio handler: annotate an uploaded video and list the confirmed items.

    Args:
        input_video: Path of the uploaded video, or ``None`` when nothing was
            uploaded.

    Returns:
        A 3-tuple ``(output_path, item_list, status_message)`` where
        ``item_list`` holds ``(brand, quantity)`` pairs from ``process_video``.
        When no video is provided the result is ``(None, [], message)``.
    """
    if input_video is None:
        # Gradio passes None when the user submits without uploading anything.
        return None, [], "No video provided. Please upload a video."

    output_path = 'annotated_output.mp4'
    confirmed_items = process_video(input_video, output_path)

    item_list = list(confirmed_items.items())

    if item_list:
        status_message = "Video processed successfully!"
    else:
        # Mirror the image handler, which reports when nothing was recognised.
        status_message = "Video processed, but no items were confirmed."

    return output_path, item_list, status_message

# 5. OCR Backend


In [None]:
def process_OCR(image):
    """Run the shared detector on an image and collect its raw detections.

    Args:
        image: RGB image convertible with ``np.array``.

    Returns:
        A 2-tuple ``(annotated_rgb, summary)``. ``annotated_rgb`` is the
        plotted result converted from BGR to RGB. ``summary`` is a dict with
        the keys ``"boxes"``, ``"class_names"`` and ``"confidences"``, each a
        list with one entry per detected box.
    """
    bgr_frame = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    prediction = model(bgr_frame)
    # The model call may hand back a list of results; keep the first one.
    prediction = prediction[0] if isinstance(prediction, list) else prediction

    coords, labels, scores = [], [], []
    for detection in prediction.boxes:
        coords.append(detection.xyxy.tolist())
        scores.append(detection.conf.item())
        labels.append(prediction.names[int(detection.cls)])

    annotated_rgb = cv2.cvtColor(prediction.plot(), cv2.COLOR_BGR2RGB)
    summary = {"boxes": coords, "class_names": labels, "confidences": scores}
    return annotated_rgb, summary

# Dummy function for detect_and_ocr
def detect_and_ocr(image):
    """Placeholder for the detection + OCR pipeline.

    No detection or OCR is performed yet: the input is echoed back as an
    array together with three fixed strings standing in for the extracted,
    refined and validated text.

    Args:
        image: Image convertible with ``np.array``.

    Returns:
        A 4-tuple ``(result_image, extracted_text, refined_text,
        validated_text)``.
    """
    placeholder_outputs = (
        np.array(image),                                  # input echoed back unchanged
        "Detected Text: Example Text from Image",         # stand-in for real OCR output
        "Refined Text: Cleaned up text",                  # stand-in for the refinement step
        "Validated Output: Verified text after review",   # stand-in for the validation step
    )
    return placeholder_outputs

# Dummy function for further_processing
def further_processing(image, extracted_text):
    """Placeholder second pass over the extracted text.

    Args:
        image: Accepted for the button callback signature; not used here.
        extracted_text: Text to wrap.

    Returns:
        ``extracted_text`` embedded in a fixed "Further refined" template.
    """
    return f"Further refined: {extracted_text} (Refined Again)"

# Dummy function for handle_processing
def handle_processing(validated_text):
    """Decide whether the "Further Processing" button should be shown.

    Args:
        validated_text: Current content of the validated-output textbox.

    Returns:
        A ``gr.update`` whose ``visible`` flag is True when the text contains
        anything other than whitespace, and False otherwise.
    """
    has_content = bool(validated_text.strip())
    return gr.update(visible=has_content)


# 5. Frontend Of Brand Recognition

## Layout for Image interface

In [None]:
def create_image_interface():
    """Build the Gradio interface for single-image grocery detection.

    Returns:
        A ``gr.Interface`` that feeds one uploaded image to
        ``detect_grocery_items`` and shows the annotated image, the per-item
        summary table and a status message.
    """
    # Components are created in the same order as they are passed below.
    upload_box = gr.Image(label="Upload Image", height=400, width=400)
    result_widgets = [
        gr.Image(label="Image with Bounding Boxes", height=400, width=400),
        gr.Dataframe(headers=["Item", "Quantity", "Avg Confidence"], label="Detected Items and Quantities", elem_id="summary_table"),
        gr.Textbox(label="Status", elem_id="status_message"),
    ]

    return gr.Interface(
        fn=detect_grocery_items,
        inputs=upload_box,
        outputs=result_widgets,
        title="Grocery Item Detection in an Image",
        description="Upload an image for object detection. The model will return an annotated image, item quantities, and average confidence scores.",
        css=".gr-table { font-size: 16px; text-align: left; width: 50%; margin: auto; } #summary_table { margin-top: 20px; }",
    )

## Layout For Video Interface

In [None]:
def create_video_interface():
    """Build the Gradio interface for video grocery detection.

    Returns:
        A ``gr.Interface`` that feeds one uploaded video to ``annotate_video``
        and shows the annotated video, the item/quantity table and a status
        message.
    """
    # Components are created in the same order as they are passed below.
    upload_box = gr.Video(label="Upload Video", height=400, width=400)
    result_widgets = [
        gr.Video(label="Annotated Video", height=400, width=400),
        gr.Dataframe(headers=["Item", "Quantity"], label="Detected Items and Quantities", elem_id="summary_table"),
        gr.Textbox(label="Status", elem_id="status_message"),
    ]

    return gr.Interface(
        fn=annotate_video,
        inputs=upload_box,
        outputs=result_widgets,
        title="Grocery Item Detection in a Video",
        description="Upload a video for object detection. The model will return an annotated video with bounding boxes and item quantities. Low confidence values may indicate incorrect detection.",
        css="""
            .gr-table { font-size: 16px; text-align: left; width: 50%; margin: auto; }
            #summary_table { margin-top: 20px; }
        """,
    )

In [None]:
def create_brand_recog_interface():
    """Assemble the Brand Recognition app with one tab per input kind.

    Returns:
        A ``gr.Blocks`` containing a heading and two tabs, "Image" and
        "Video", each populated by the matching interface factory.
    """
    # Tab label -> factory called inside that tab, in display order.
    tab_builders = (
        ("Image", create_image_interface),
        ("Video", create_video_interface),
    )

    with gr.Blocks() as brand_app:
        gr.Markdown("# Flipkart Grid Robotics Track - Brand Recognition Interface")

        with gr.Tabs():
            for tab_label, build_tab in tab_builders:
                with gr.Tab(tab_label):
                    build_tab()
    return brand_app


# Build the Brand Recognition app once; the combined tabbed interface uses it.
Brand_recog = create_brand_recog_interface()


# Frontend Of OCR

In [None]:
def create_ocr_interface():
    """Assemble the OCR app: an upload/detection tab and a results tab.

    Returns:
        A ``gr.Blocks`` in which the "Detect and OCR" button runs
        ``detect_and_ocr``, the "Further Processing" button runs
        ``further_processing``, and changes to the validated-output textbox
        run ``handle_processing`` to toggle that second button.
    """
    with gr.Blocks() as ocr_app:
        gr.Markdown("# Flipkart Grid Robotics Track - OCR Interface")

        with gr.Tabs():
            with gr.TabItem("Upload & Detection"):
                with gr.Row():
                    upload_image = gr.Image(type="pil", label="Upload Image", height=400, width=400)
                    boxed_image = gr.Image(label="Image with Bounding Boxes", height=400, width=400)

                detect_button = gr.Button("Detect and OCR")

            with gr.TabItem("OCR Results"):
                with gr.Row():
                    extracted_box = gr.Textbox(label="Extracted Text", lines=5)
                with gr.Row():
                    refined_box = gr.Textbox(label="Refined Text from Gemini", lines=5)
                with gr.Row():
                    validated_box = gr.Textbox(label="Validated Output", lines=5)

                # Created hidden; the change handler below controls visibility.
                refine_again_button = gr.Button("Further Processing", visible=False)

        # Run detection + OCR and fill the image and the three text boxes.
        detect_button.click(
            fn=detect_and_ocr,
            inputs=[upload_image],
            outputs=[boxed_image, extracted_box, refined_box, validated_box],
        )

        # Second pass: rewrite the refined text from the image and extracted text.
        refine_again_button.click(
            fn=further_processing,
            inputs=[upload_image, extracted_box],
            outputs=refined_box,
        )

        # Show or hide the second-pass button whenever the validated text changes.
        validated_box.change(
            fn=handle_processing,
            inputs=[validated_box],
            outputs=[refine_again_button],
        )

    return ocr_app

# Build the OCR app once; the combined tabbed interface uses it.
ocr_interface = create_ocr_interface()


# 6. Create a Tabbed Interface for Brand Recognition and OCR
### Here, we combine the Brand Recognition and OCR interfaces into a tabbed structure so users can switch between them easily.

In [None]:
def create_tabbed_interface():
    """Combine the Brand Recognition and OCR apps into one tabbed interface.

    Returns:
        A ``gr.TabbedInterface`` with the module-level ``Brand_recog`` and
        ``ocr_interface`` apps under the tabs "Brand Recognition" and "OCR".
    """
    return gr.TabbedInterface(
        [Brand_recog, ocr_interface],
        # Tab labels are user-facing; "Recognition" was previously misspelled.
        ["Brand Recognition", "OCR"],
    )


tabbed_interface = create_tabbed_interface()

# 7. Launch the Gradio Interface
### Finally, launch the Gradio interface to make it interactable.

In [None]:
# Start the Gradio server for the combined Brand Recognition / OCR app.
tabbed_interface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c034444fe19ff19929.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


