<a href="https://colab.research.google.com/github/ArunAravind2001/Image-segmentation-object-detection-and-text-extraction-using-detetcron2/blob/main/Image%20segmentation/object%20detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import zipfile
import os
from tqdm import tqdm

def download_file(url, file_name, chunk_size=1024 * 1024, timeout=30):
    """Stream ``url`` to ``file_name`` on disk while showing a tqdm progress bar.

    Args:
        url: HTTP(S) address of the file to fetch.
        file_name: Destination path; an existing file is overwritten.
        chunk_size: Number of bytes requested per read from the response stream.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True when the server answered with status 200 and the body was written,
        False on any other status code or on a network error.
    """
    try:
        # The context manager releases the connection even if writing fails.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download {file_name}. Status code: {response.status_code}")
                return False

            total_size = int(response.headers.get('content-length', 0))

            with open(file_name, 'wb') as f, tqdm(
                desc=file_name,
                # None lets tqdm show an open-ended bar when the size is unknown.
                total=total_size or None,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as progress_bar:
                for data in response.iter_content(chunk_size=chunk_size):
                    progress_bar.update(f.write(data))
    except requests.RequestException as exc:
        # Report network failures through the boolean contract instead of a traceback.
        print(f"Failed to download {file_name}. Error: {exc}")
        return False
    return True

def extract_zip(file_name, extract_dir):
    """Unpack the ZIP archive ``file_name`` into ``extract_dir``, then delete the archive.

    A file that is not a valid ZIP archive is reported on stdout and left on
    disk; nothing is raised for that case. Always returns None.
    """
    print(f"Extracting {file_name}...")
    try:
        with zipfile.ZipFile(file_name, 'r') as archive:
            archive.extractall(path=extract_dir)
    except zipfile.BadZipFile:
        print(f"The file {file_name} is not a valid ZIP file. Please check the URL and try again.")
        return

    # Only reached when extraction succeeded: report and clean up the archive.
    print(f"Extraction of {file_name} completed successfully!")
    os.remove(file_name)
    print(f"Removed {file_name}")

# URLs for the COCO 2017 dataset components.
# NOTE: despite the "mini" naming used below, these are the full official
# archives (train2017.zip alone is about 18 GB).
urls = {
    "train_images": "http://images.cocodataset.org/zips/train2017.zip",
    "val_images": "http://images.cocodataset.org/zips/val2017.zip",
    "test_images": "http://images.cocodataset.org/zips/test2017.zip",
    "annotations": "http://images.cocodataset.org/annotations/annotations_trainval2017.zip",
    "stuff_annotations": "http://images.cocodataset.org/annotations/stuff_annotations_trainval2017.zip",
    "panoptic_annotations": "http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip",
}

extract_dir = "coco_2017_mini_dataset"
os.makedirs(extract_dir, exist_ok=True)

# Remember which archives could not be fetched so the summary below is honest.
failed_downloads = []

for key, url in urls.items():
    file_name = f"{key}.zip"
    print(f"Downloading {file_name}...")
    if download_file(url, file_name):
        extract_zip(file_name, extract_dir)
    else:
        failed_downloads.append(file_name)
    print()

if failed_downloads:
    print(f"COCO 2017 Mini dataset download finished with failures: {', '.join(failed_downloads)}")
else:
    print("COCO 2017 Mini dataset download and extraction completed!")
print(f"Dataset is located in: {os.path.abspath(extract_dir)}")

Downloading train_images.zip...


train_images.zip: 100%|██████████| 18.0G/18.0G [06:37<00:00, 48.6MiB/s]


Extracting train_images.zip...
Extraction of train_images.zip completed successfully!
Removed train_images.zip

Downloading val_images.zip...


val_images.zip: 100%|██████████| 778M/778M [00:19<00:00, 42.1MiB/s]


Extracting val_images.zip...
Extraction of val_images.zip completed successfully!
Removed val_images.zip

Downloading test_images.zip...


test_images.zip: 100%|██████████| 6.19G/6.19G [02:20<00:00, 47.5MiB/s]


Extracting test_images.zip...
Extraction of test_images.zip completed successfully!
Removed test_images.zip

Downloading annotations.zip...


annotations.zip: 100%|██████████| 241M/241M [00:05<00:00, 49.2MiB/s]


Extracting annotations.zip...
Extraction of annotations.zip completed successfully!
Removed annotations.zip

Downloading stuff_annotations.zip...


stuff_annotations.zip: 100%|██████████| 1.07G/1.07G [00:23<00:00, 49.3MiB/s]


Extracting stuff_annotations.zip...
Extraction of stuff_annotations.zip completed successfully!
Removed stuff_annotations.zip

Downloading panoptic_annotations.zip...


panoptic_annotations.zip: 100%|██████████| 821M/821M [00:19<00:00, 44.1MiB/s]


Extracting panoptic_annotations.zip...
Extraction of panoptic_annotations.zip completed successfully!
Removed panoptic_annotations.zip

COCO 2017 Mini dataset download and extraction completed!
Dataset is located in: /content/coco_2017_mini_dataset


In [2]:
!pip install torch torchvision streamlit opencv-python pillow pandas pycocotools


Collecting streamlit
  Downloading streamlit-1.38.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting tenacity<9,>=8.1.0 (from streamlit)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog<5,>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.2-py3-none-manylinux2014_x86_64.whl.metadata (38 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading streamlit-1.38.0-py2.py3-none-any.whl (8.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m82.3 MB

In [3]:
# Install PyTorch and torchvision
!pip install torch torchvision

# Install Detectron2
!pip install 'git+https://github.com/facebookresearch/detectron2.git'


Collecting git+https://github.com/facebookresearch/detectron2.git
  Cloning https://github.com/facebookresearch/detectron2.git to /tmp/pip-req-build-q8aj0ac4
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/detectron2.git /tmp/pip-req-build-q8aj0ac4
  Resolved https://github.com/facebookresearch/detectron2.git to commit ebe8b45437f86395352ab13402ba45b75b4d1ddb
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting yacs>=0.1.8 (from detectron2==0.6)
  Downloading yacs-0.1.8-py3-none-any.whl.metadata (639 bytes)
Collecting fvcore<0.1.6,>=0.1.5 (from detectron2==0.6)
  Downloading fvcore-0.1.5.post20221221.tar.gz (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting iopath<0.1.10,>=0.1.7 (from detectron2==0.6)
  Downloading iopath-0.1.9-py3-none-any.whl.metadata (370 bytes)
Collecting omegaconf<2.

In [1]:
import json
import os
from collections import defaultdict

def load_coco_annotations(file_path):
    """Read a JSON annotation file and return the parsed object.

    Args:
        file_path: Path to the JSON file.

    Returns:
        Whatever ``json.load`` produces for the file content.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def explore_coco_dataset(annotations_dir):
    """Print summary statistics for ``instances_train2017.json`` in ``annotations_dir``.

    Reports the number of images, annotations and categories, followed by the
    annotation count per category name, most frequent first.
    """
    train_json = os.path.join(annotations_dir, 'instances_train2017.json')
    coco = load_coco_annotations(train_json)

    # Map each category id to its human-readable name.
    id_to_name = {}
    for entry in coco['categories']:
        id_to_name[entry['id']] = entry['name']

    # Tally annotations under the name of the category they reference.
    per_category = defaultdict(int)
    for ann in coco['annotations']:
        per_category[id_to_name[ann['category_id']]] += 1

    image_total = len(coco['images'])
    annotation_total = len(coco['annotations'])
    print(f"Total number of images: {image_total}")
    print(f"Total number of annotations: {annotation_total}")
    print(f"Number of categories: {len(id_to_name)}")

    print("\nCategories and instance counts:")
    ranked = sorted(per_category.items(), key=lambda pair: pair[1], reverse=True)
    for name, total in ranked:
        print(f"{name}: {total}")

# Set the path to your COCO dataset annotations
# (expected to contain instances_train2017.json, which explore_coco_dataset reads)
coco_annotations_dir = '/content/coco_2017_mini_dataset/annotations'

# Explore the dataset: prints image/annotation/category totals and per-category counts
explore_coco_dataset(coco_annotations_dir)

Total number of images: 118287
Total number of annotations: 860001
Number of categories: 80

Categories and instance counts:
person: 262465
car: 43867
chair: 38491
book: 24715
bottle: 24342
cup: 20650
dining table: 15714
bowl: 14358
traffic light: 12884
handbag: 12354
umbrella: 11431
bird: 10806
boat: 10759
truck: 9973
bench: 9838
sheep: 9509
banana: 9458
kite: 9076
motorcycle: 8725
backpack: 8720
potted plant: 8652
cow: 8147
wine glass: 7913
carrot: 7852
knife: 7770
broccoli: 7308
donut: 7179
bicycle: 7113
skis: 6646
vase: 6613
horse: 6587
tie: 6496
cell phone: 6434
orange: 6399
cake: 6353
sports ball: 6347
clock: 6334
suitcase: 6192
spoon: 6165
surfboard: 6126
bus: 6069
apple: 5851
pizza: 5821
tv: 5805
couch: 5779
remote: 5703
sink: 5610
skateboard: 5543
elephant: 5513
dog: 5508
fork: 5479
zebra: 5303
airplane: 5135
giraffe: 5131
laptop: 4970
tennis racket: 4812
teddy bear: 4793
cat: 4768
train: 4571
sandwich: 4373
bed: 4192
toilet: 4157
baseball glove: 3747
oven: 3334
baseball bat: 

In [2]:
from detectron2.config import get_cfg
from detectron2 import model_zoo

def setup_cfg():
    """Build a Detectron2 config based on the model-zoo entry
    ``COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml``.

    Starts from the default config, merges the model-zoo file, points the
    weights at the matching model-zoo checkpoint and overrides dataset, solver,
    evaluation and output settings.

    Returns:
        The populated config object.
    """
    zoo_entry = "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"

    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file(zoo_entry))

    # Datasets used for training and testing
    cfg.DATASETS.TRAIN = ("coco_2017_train",)
    cfg.DATASETS.TEST = ("coco_2017_val",)

    # Model: pre-trained weights, 80 COCO classes, ROI batch size per image
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(zoo_entry)
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = 80
    cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128

    # Solver: images per batch, learning rate, iteration count, checkpoint period
    cfg.SOLVER.IMS_PER_BATCH = 2
    cfg.SOLVER.BASE_LR = 0.00025
    cfg.SOLVER.MAX_ITER = 300000
    cfg.SOLVER.CHECKPOINT_PERIOD = 5000

    # Evaluation period
    cfg.TEST.EVAL_PERIOD = 5000

    # Output directory for saving results
    cfg.OUTPUT_DIR = "./output"

    return cfg

# Create the configuration
cfg = setup_cfg()

# Now you can use cfg for training or inference
# Print two of the values set by setup_cfg() as a quick sanity check
print(cfg.DATASETS.TRAIN)
print(cfg.MODEL.ROI_HEADS.NUM_CLASSES)

('coco_2017_train',)
80


In [4]:
# NOTE(review): the wheel index below targets CUDA 10.2 / torch 1.9. The captured
# output of this cell shows only pytesseract being downloaded and installed by
# this command; confirm the index matches the runtime before relying on it for detectron2.
!pip install opencv-python-headless pytesseract detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu102/torch1.9/index.html


Looking in links: https://dl.fbaipublicfiles.com/detectron2/wheels/cu102/torch1.9/index.html
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [5]:
# Install Tesseract OCR
!apt-get install tesseract-ocr

# Import the required libraries
import pytesseract

# Specify the path to the Tesseract executable
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [7]:
import os
import cv2
import pytesseract
import torch
import matplotlib.pyplot as plt
from detectron2.engine import DefaultTrainer, DefaultPredictor
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.data.datasets import register_coco_instances
from detectron2.utils.visualizer import Visualizer
from detectron2.utils.visualizer import ColorMode
from detectron2.data import MetadataCatalog

# Function to set up the Mask R-CNN configuration for segmentation
def setup_mask_rcnn_cfg():
    """Register the COCO train/val sets and build the Mask R-CNN config.

    The datasets are registered as "coco_train" and "coco_val". Registration is
    skipped for names that already exist, so the function can be called more
    than once in the same process (for example when the cell is re-run).

    Returns:
        The config based on the model-zoo entry
        ``COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml``.
    """
    # Local import: only this function needs the catalog.
    from detectron2.data import DatasetCatalog

    dataset_root = "/content/coco_2017_mini_dataset"  # Update this path based on your Colab mount point
    train_images_path = os.path.join(dataset_root, "train2017")
    val_images_path = os.path.join(dataset_root, "val2017")
    annotation_train = os.path.join(dataset_root, "annotations", "instances_train2017.json")
    annotation_val = os.path.join(dataset_root, "annotations", "instances_val2017.json")

    # DatasetCatalog refuses to register the same name twice, so only register
    # the names that are not known yet.
    already_registered = set(DatasetCatalog.list())
    for name, annotation_file, image_dir in (
        ("coco_train", annotation_train, train_images_path),
        ("coco_val", annotation_val, val_images_path),
    ):
        if name not in already_registered:
            register_coco_instances(name, {}, annotation_file, image_dir)

    # Set up the Mask R-CNN configuration
    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))

    cfg.DATASETS.TRAIN = ("coco_train",)
    cfg.DATASETS.TEST = ("coco_val",)

    # Set the number of classes (COCO has 80 classes)
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = 80

    # Set the base learning rate and other parameters
    cfg.SOLVER.BASE_LR = 0.00025
    cfg.SOLVER.MAX_ITER = 10000
    cfg.SOLVER.CHECKPOINT_PERIOD = 5000
    cfg.TEST.EVAL_PERIOD = 5000
    cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128
    cfg.SOLVER.IMS_PER_BATCH = 4

    # Use pre-trained weights from COCO model
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
    cfg.OUTPUT_DIR = "./output"
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

    return cfg

# Function to perform segmentation using Mask R-CNN
def run_mask_rcnn(image_path, cfg):
    """Run the model described by ``cfg`` on one image and draw its predictions.

    Args:
        image_path: Path of the image to read with ``cv2.imread``.
        cfg: Config passed to ``DefaultPredictor``.

    Returns:
        A tuple ``(result_img, outputs)``: the rendered visualization (channel
        order flipped back to match the ``cv2.imread`` input) and the raw
        predictor output.

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    # cv2.imread signals failure by returning None; fail early with a clear
    # message before the (expensive) predictor is built.
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    predictor = DefaultPredictor(cfg)
    outputs = predictor(img)

    # Visualize the Mask R-CNN results (the Visualizer gets the channels reversed)
    v = Visualizer(img[:, :, ::-1],
                   MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
                   scale=1.2,
                   instance_mode=ColorMode.IMAGE_BW  # grayscale visualization
                   )
    out = v.draw_instance_predictions(outputs["instances"].to("cpu"))
    result_img = out.get_image()[:, :, ::-1]

    return result_img, outputs

# Use YOLO model for object detection
def run_yolo_detection(image_path):
    """Detect objects in one image with the pretrained YOLOv5s hub model.

    Args:
        image_path: Path of the image handed to the model.

    Returns:
        A tuple ``(detections, rendered)``: the detections of the first image
        (columns x1, y1, x2, y2, conf, cls) and a list with the annotated
        image(s) in BGR channel order.
    """
    # Load YOLO model (assuming you have a YOLOv5 PyTorch model)
    model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)

    # Load image and perform object detection
    results = model(image_path)

    # YOLO will return bounding boxes and other data
    detections = results.xyxy[0]  # x1, y1, x2, y2, conf, cls

    # YOLOv5 renders its images in RGB order, while the rest of this cell
    # (show_image, cv2.imwrite) works with OpenCV's BGR order; convert so the
    # colours are not swapped downstream.
    rendered_bgr = [cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) for frame in results.render()]

    return detections, rendered_bgr

# Function to extract text using OCR (Tesseract)
def extract_text_from_image(image_path):
    """Run Tesseract OCR on the image at ``image_path`` and return the text.

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    # cv2.imread signals failure by returning None instead of raising.
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Convert the image to grayscale for better OCR accuracy
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Apply OCR using pytesseract
    extracted_text = pytesseract.image_to_string(gray)

    return extracted_text

# Function to display images using matplotlib
def show_image(image, title="Image"):
    """Show ``image`` with matplotlib after a BGR-to-RGB conversion, without axes."""
    rgb_view = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    plt.imshow(rgb_view)
    plt.axis('off')
    plt.title(title)
    plt.show()

# Main function to run the segmentation, detection, and text extraction
def process_image(image_path, cfg):
    """Run segmentation, YOLO detection and OCR on one image and display the results.

    The extracted text is also written to ``extracted_text.txt`` inside
    ``cfg.OUTPUT_DIR``.

    Args:
        image_path: Path of the image to process.
        cfg: Mask R-CNN config; also provides the output directory.

    Returns:
        A tuple ``(segmented_img, yolo_img, extracted_text)``.
    """
    print(f"Processing image: {image_path}")

    # 1. Run Mask R-CNN for segmentation
    segmented_img, segmentation_outputs = run_mask_rcnn(image_path, cfg)
    print(f"Segmentation completed for {image_path}.")

    # 2. Run YOLO for object detection
    yolo_detections, yolo_img = run_yolo_detection(image_path)
    print(f"Object detection completed for {image_path}.")

    # 3. Extract text using OCR
    extracted_text = extract_text_from_image(image_path)
    print(f"Text extracted: {extracted_text}")

    # Save extracted text to a file in the configured output directory.
    # UTF-8 is explicit because OCR output may contain non-ASCII characters.
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
    text_file_path = os.path.join(cfg.OUTPUT_DIR, "extracted_text.txt")
    with open(text_file_path, 'w', encoding='utf-8') as text_file:
        text_file.write(extracted_text)
    print(f"Extracted text saved to {text_file_path}")

    # Show the Mask R-CNN output
    show_image(segmented_img, title="Segmented Image")

    # Show the YOLO detections
    show_image(yolo_img[0], title="YOLO Detection")  # Rendered YOLO image

    # Output the results (you can save images, return data, or process further)
    return segmented_img, yolo_img[0], extracted_text

# Create output directory if it doesn't exist
os.makedirs("./output", exist_ok=True)

# Setup the Mask R-CNN configuration
cfg = setup_mask_rcnn_cfg()

# Process a sample image
sample_image = "/content/coco_2017_mini_dataset/test2017/000000000016.jpg"  # Update with your image path
segmented_result, yolo_result, text_result = process_image(sample_image, cfg)

# Optional: Save the outputs
# cv2.imwrite reports failure through its return value instead of raising,
# so check it before claiming success.
save_targets = (
    ("./output/segmented_result.jpg", segmented_result),
    ("./output/yolo_result.jpg", yolo_result),
)
failed_saves = [path for path, image in save_targets if not cv2.imwrite(path, image)]
if failed_saves:
    print(f"Failed to save: {', '.join(failed_saves)}")
else:
    print("Results saved.")


Processing image: /content/coco_2017_mini_dataset/test2017/000000000016.jpg


Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-9-25 Python-3.10.12 torch-2.4.1+cu121 CUDA:0 (Tesla T4, 15102MiB)



Segmentation completed for /content/coco_2017_mini_dataset/test2017/000000000016.jpg.


Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 
  with amp.autocast(autocast):


Object detection completed for /content/coco_2017_mini_dataset/test2017/000000000016.jpg.
Text extracted:  

Extracted text saved to ./output/extracted_text.txt
Results saved.


In [8]:
!pip install streamlit




In [None]:
import streamlit as st
import os
import cv2
import pytesseract
import torch
import matplotlib.pyplot as plt
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.utils.visualizer import Visualizer
from detectron2.utils.visualizer import ColorMode
from detectron2.data import MetadataCatalog

# Function to set up the Mask R-CNN configuration for segmentation
def setup_mask_rcnn_cfg():
    """Register the COCO train/val sets and build the Mask R-CNN config.

    Registration is skipped for names that already exist. Streamlit re-executes
    the script in the same process on every interaction, and registering a
    name twice is rejected by the dataset catalog.

    Returns:
        The config based on the model-zoo entry
        ``COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml``.
    """
    # Imported here because this cell's import block does not provide them.
    from detectron2.data import DatasetCatalog
    from detectron2.data.datasets import register_coco_instances

    dataset_root = "/content/coco_2017_mini_dataset"  # Update this path based on your Colab mount point
    train_images_path = os.path.join(dataset_root, "train2017")
    val_images_path = os.path.join(dataset_root, "val2017")
    annotation_train = os.path.join(dataset_root, "annotations", "instances_train2017.json")
    annotation_val = os.path.join(dataset_root, "annotations", "instances_val2017.json")

    # Register the COCO dataset for training and validation (once per process)
    already_registered = set(DatasetCatalog.list())
    for name, annotation_file, image_dir in (
        ("coco_train", annotation_train, train_images_path),
        ("coco_val", annotation_val, val_images_path),
    ):
        if name not in already_registered:
            register_coco_instances(name, {}, annotation_file, image_dir)

    # Set up the Mask R-CNN configuration
    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))

    cfg.DATASETS.TRAIN = ("coco_train",)
    cfg.DATASETS.TEST = ("coco_val",)

    cfg.MODEL.ROI_HEADS.NUM_CLASSES = 80
    cfg.SOLVER.BASE_LR = 0.00025
    cfg.SOLVER.MAX_ITER = 10000
    cfg.SOLVER.CHECKPOINT_PERIOD = 5000
    cfg.TEST.EVAL_PERIOD = 5000
    cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128
    cfg.SOLVER.IMS_PER_BATCH = 4
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
    cfg.OUTPUT_DIR = "./output"
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

    return cfg

# Function to run Mask R-CNN
def run_mask_rcnn(image_path, cfg):
    """Run the model described by ``cfg`` on one image and draw its predictions.

    Returns:
        A tuple ``(result_img, outputs)``: the rendered visualization (channel
        order flipped back to match the ``cv2.imread`` input) and the raw
        predictor output.

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    # cv2.imread signals failure by returning None; check before building the predictor.
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    predictor = DefaultPredictor(cfg)
    outputs = predictor(img)

    v = Visualizer(img[:, :, ::-1],
                   MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
                   scale=1.2,
                   instance_mode=ColorMode.IMAGE_BW)
    out = v.draw_instance_predictions(outputs["instances"].to("cpu"))
    result_img = out.get_image()[:, :, ::-1]

    return result_img, outputs

# Function to run YOLO detection
def run_yolo_detection(image_path):
    """Run the pretrained YOLOv5s hub model on ``image_path``.

    Returns:
        A tuple of the first image's detections and the rendered image list.
    """
    yolo = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)
    prediction = yolo(image_path)

    # Column layout per detection: x1, y1, x2, y2, conf, cls
    boxes = prediction.xyxy[0]
    annotated = prediction.render()
    return boxes, annotated

# Function to extract text
def extract_text_from_image(image_path):
    """Run Tesseract OCR on the image at ``image_path`` and return the text.

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    # cv2.imread signals failure by returning None instead of raising.
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    extracted_text = pytesseract.image_to_string(gray)

    return extracted_text

# Main function to process the image
def process_image(image_path, cfg):
    """Run segmentation, YOLO detection and OCR on one image.

    Returns:
        A tuple of the segmented image, the first rendered YOLO image and the
        extracted text.
    """
    segmentation = run_mask_rcnn(image_path, cfg)
    detection = run_yolo_detection(image_path)
    ocr_text = extract_text_from_image(image_path)

    # segmentation is (image, raw outputs); detection is (boxes, rendered images)
    return segmentation[0], detection[1][0], ocr_text

# Streamlit UI
st.title("Image Processing App")
st.write("Upload an image for segmentation, detection, and text extraction.")

uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    # The file name comes from the client; keep only its last path component
    # so the upload cannot be written outside /content.
    safe_name = os.path.basename(uploaded_file.name)
    image_path = os.path.join("/content", safe_name)
    with open(image_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    st.image(image_path, caption='Uploaded Image', use_column_width=True)

    cfg = setup_mask_rcnn_cfg()
    segmented_result, yolo_result, text_result = process_image(image_path, cfg)

    # run_mask_rcnn returns the visualization in OpenCV's BGR order, while
    # st.image assumes RGB unless told otherwise.
    st.image(segmented_result, caption='Segmented Image', use_column_width=True, channels="BGR")
    st.image(yolo_result, caption='YOLO Detection', use_column_width=True)
    st.text_area("Extracted Text", value=text_result, height=300)



In [None]:
# Install required packages
!pip install streamlit
!pip install opencv-python-headless
!pip install pytesseract
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.12/index.html
!apt-get install -y tesseract-ocr
!apt-get install -y libtesseract-dev

# Create the Streamlit app script
with open('app.py', 'w') as f:
    f.write("""
import streamlit as st
import os
import cv2
import pytesseract
import torch
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.utils.visualizer import Visualizer
from detectron2.utils.visualizer import ColorMode
from detectron2.data import MetadataCatalog

# Function to set up the Mask R-CNN configuration for segmentation
def setup_mask_rcnn_cfg():
    '''Register the COCO train/val sets and build the Mask R-CNN config.

    Registration is skipped for names that already exist: Streamlit re-executes
    the script in the same process on every interaction, and registering a
    name twice is rejected by the dataset catalog.
    '''
    # Imported here because the script's import block does not provide them.
    from detectron2.data import DatasetCatalog
    from detectron2.data.datasets import register_coco_instances

    dataset_root = "/content/coco_2017_mini_dataset"
    train_images_path = os.path.join(dataset_root, "train2017")
    val_images_path = os.path.join(dataset_root, "val2017")
    annotation_train = os.path.join(dataset_root, "annotations", "instances_train2017.json")
    annotation_val = os.path.join(dataset_root, "annotations", "instances_val2017.json")

    # Register the COCO dataset for training and validation (once per process)
    already_registered = set(DatasetCatalog.list())
    for name, annotation_file, image_dir in (
        ("coco_train", annotation_train, train_images_path),
        ("coco_val", annotation_val, val_images_path),
    ):
        if name not in already_registered:
            register_coco_instances(name, {}, annotation_file, image_dir)

    # Set up the Mask R-CNN configuration
    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
    cfg.DATASETS.TRAIN = ("coco_train",)
    cfg.DATASETS.TEST = ("coco_val",)
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = 80
    cfg.SOLVER.BASE_LR = 0.00025
    cfg.SOLVER.MAX_ITER = 10000
    cfg.SOLVER.CHECKPOINT_PERIOD = 5000
    cfg.TEST.EVAL_PERIOD = 5000
    cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128
    cfg.SOLVER.IMS_PER_BATCH = 4
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
    cfg.OUTPUT_DIR = "./output"
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
    return cfg

# Function to run Mask R-CNN
def run_mask_rcnn(image_path, cfg):
    '''Run the model described by cfg on one image and draw its predictions.

    Returns a tuple (result_img, outputs): the rendered visualization (channel
    order flipped back to match the cv2.imread input) and the raw predictor
    output. Raises FileNotFoundError if the image cannot be read.
    '''
    # cv2.imread signals failure by returning None; check before building the predictor.
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    predictor = DefaultPredictor(cfg)
    outputs = predictor(img)

    v = Visualizer(img[:, :, ::-1],
                   MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
                   scale=1.2,
                   instance_mode=ColorMode.IMAGE_BW)
    out = v.draw_instance_predictions(outputs["instances"].to("cpu"))
    result_img = out.get_image()[:, :, ::-1]
    return result_img, outputs

# Function to extract text
def extract_text_from_image(image_path):
    '''Run Tesseract OCR on the image at image_path and return the text.

    Raises FileNotFoundError if the image cannot be read.
    '''
    # cv2.imread signals failure by returning None instead of raising.
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    extracted_text = pytesseract.image_to_string(gray)
    return extracted_text

# Main function to process the image
def process_image(image_path, cfg):
    '''Segment the image and OCR it; returns (segmented image, extracted text).'''
    mask_result = run_mask_rcnn(image_path, cfg)
    # mask_result is (image, raw outputs); only the image is passed on.
    return mask_result[0], extract_text_from_image(image_path)

# Streamlit UI
st.title("Image Processing App")
st.write("Upload an image for segmentation and text extraction.")

uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    # The file name comes from the client; keep only its last path component
    # so the upload cannot be written outside /content.
    safe_name = os.path.basename(uploaded_file.name)
    image_path = os.path.join("/content", safe_name)
    with open(image_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    st.image(image_path, caption='Uploaded Image', use_column_width=True)

    cfg = setup_mask_rcnn_cfg()
    segmented_result, text_result = process_image(image_path, cfg)

    # run_mask_rcnn returns the visualization in OpenCV's BGR order, while
    # st.image assumes RGB unless told otherwise.
    st.image(segmented_result, caption='Segmented Image', use_column_width=True, channels="BGR")
    st.text_area("Extracted Text", value=text_result, height=300)
""")

# Run Streamlit app with localtunnel
!streamlit run app.py & npx localtunnel --port 8501


## New trial: Streamlit app with explicit COCO class names


In [19]:
# Install required packages
!pip install streamlit
!pip install opencv-python-headless
!pip install pytesseract
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.12/index.html
!apt-get install -y tesseract-ocr
!apt-get install -y libtesseract-dev

# Create the Streamlit app script
with open('app.py', 'w') as f:
    f.write("""
import streamlit as st
import os
import cv2
import pytesseract
import torch
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.utils.visualizer import Visualizer
from detectron2.utils.visualizer import ColorMode
from detectron2.data import MetadataCatalog
from detectron2.data.datasets import register_coco_instances

# Function to set up the Mask R-CNN configuration for segmentation
def setup_mask_rcnn_cfg():
    '''Register the COCO train/val sets, set class names and build the Mask R-CNN config.

    Registration is skipped for names that already exist: Streamlit re-executes
    the script in the same process on every interaction, and registering a
    name twice is rejected by the dataset catalog.
    '''
    # Local import: only this function needs the catalog.
    from detectron2.data import DatasetCatalog

    dataset_root = "/content/coco_2017_mini_dataset"
    train_images_path = os.path.join(dataset_root, "train2017")
    val_images_path = os.path.join(dataset_root, "val2017")
    annotation_train = os.path.join(dataset_root, "annotations", "instances_train2017.json")
    annotation_val = os.path.join(dataset_root, "annotations", "instances_val2017.json")

    # Register the COCO dataset for training and validation (once per process)
    already_registered = set(DatasetCatalog.list())
    for name, annotation_file, image_dir in (
        ("coco_train", annotation_train, train_images_path),
        ("coco_val", annotation_val, val_images_path),
    ):
        if name not in already_registered:
            register_coco_instances(name, {}, annotation_file, image_dir)

    # Set up the Mask R-CNN configuration
    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
    cfg.DATASETS.TRAIN = ("coco_train",)
    cfg.DATASETS.TEST = ("coco_val",)
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = 80

    # Set the thing_classes for the COCO dataset.
    # Names follow the annotation file's spelling (the category is "tv", lower case).
    MetadataCatalog.get("coco_train").set(thing_classes=[
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
        "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
        "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
        "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
        "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
        "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
        "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop",
        "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
        "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
        "toothbrush"
    ])

    cfg.SOLVER.BASE_LR = 0.00025
    cfg.SOLVER.MAX_ITER = 10000
    cfg.SOLVER.CHECKPOINT_PERIOD = 5000
    cfg.TEST.EVAL_PERIOD = 5000
    cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128
    cfg.SOLVER.IMS_PER_BATCH = 4
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
    cfg.OUTPUT_DIR = "./output"
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
    return cfg

# Function to run Mask R-CNN
def run_mask_rcnn(image_path, cfg):
    '''Run the model described by cfg on one image and draw its predictions.

    Returns a tuple (segmented_img, pred_classes): the rendered visualization
    (channel order flipped back to match the cv2.imread input) and the
    predicted class indices as a NumPy array. Raises FileNotFoundError if the
    image cannot be read.
    '''
    # cv2.imread signals failure by returning None; check before building the predictor.
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    predictor = DefaultPredictor(cfg)
    outputs = predictor(img)

    # Move the predictions to the CPU once and reuse them below
    instances = outputs["instances"].to("cpu")
    pred_classes = instances.pred_classes.numpy()

    # Create a Visualizer object
    v = Visualizer(img[:, :, ::-1],
                   MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
                   scale=1.2,
                   instance_mode=ColorMode.IMAGE_BW)

    # Draw the predictions
    out = v.draw_instance_predictions(instances)
    segmented_img = out.get_image()[:, :, ::-1]

    return segmented_img, pred_classes

# Function to process the image
def process_image(image_path, cfg):
    '''Segment the image and extract its text; returns (segmented image, text).

    OCR runs on the uploaded image itself, not on the rendered visualization:
    the visualization is rescaled and overlaid with masks and labels, which
    would be read as text too.
    '''
    segmented_img, pred_classes = run_mask_rcnn(image_path, cfg)

    original = cv2.imread(image_path)
    gray = cv2.cvtColor(original, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray)
    return segmented_img, text

# Streamlit app interface
st.title("Image Segmentation and Text Extraction")
uploaded_file = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    # The file name comes from the client; keep only its last path component
    # so the upload is always written into the working directory.
    image_path = os.path.basename(uploaded_file.name)
    with open(image_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    cfg = setup_mask_rcnn_cfg()
    segmented_result, text_result = process_image(image_path, cfg)

    # run_mask_rcnn returns the visualization in OpenCV's BGR order, while
    # st.image assumes RGB unless told otherwise.
    st.image(segmented_result, caption="Segmented Image", use_column_width=True, channels="BGR")
    st.write("Extracted Text:")
    st.text(text_result)
""")


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113
Looking in links: https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.12/index.html
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libtesseract-dev is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [20]:
!streamlit run app.py



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.169.33.145:8501[0m
[0m
[34m  Stopping...[0m
[34m  Stopping...[0m
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/click/core.py", line 1088, in main
    ctx.exit()
  File "/usr/local/lib/python3.10/dist-packages/click/core.py", line 692, in exit
    raise Exit(code)
click.exceptions.Exit: 0

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/bin/streamlit", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.10/dist-packages/click/core.py", line 1157, in __call__
    return self.main(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/

# Final script

In [29]:
# Start Streamlit in the background (`&`) and run localtunnel in the foreground so
# that port 8501 is forwarded to a public loca.lt URL.
!streamlit run app.py & npx localtunnel --port 8501



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.169.33.145:8501[0m
[0m
your url is: https://eleven-houses-shine.loca.lt
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
2024-09-25 09:29:15.579 Uncaught app exception
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/streamlit/runtime/scriptrunner/exec_code.py", line 88, in exec_func_with_error_handling
    result = func()
  File "/usr/local/lib/python3.10/dist-packages/streamlit/runtime/scriptrunner/script_runner.py", line 590, in code_to_exec
    exec(code, module.__dict__)
  File "/content/app.py", line 102, in <module>
    segmented_result, text_result = process_image(image_path, cfg)
  File "/content/app.py", line 8

In [16]:
# Ask loca.lt for the tunnel password.
# NOTE(review): the value printed below equals the IP in Streamlit's "External URL",
# so this is presumably the host's public IP - confirm against the localtunnel docs.
!curl https://loca.lt/mytunnelpassword


34.169.33.145

In [23]:
# Correct final Install required packages
!pip install streamlit
!pip install opencv-python-headless
!pip install pytesseract
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.12/index.html
!apt-get install -y tesseract-ocr
!apt-get install -y libtesseract-dev

# Clone the YOLOv5 repository
!git clone https://github.com/ultralytics/yolov5.git
%cd yolov5

# Install required packages
!pip install -r requirements.txt

# Install additional libraries if needed (like opencv, etc.)
!pip install opencv-python-headless

# Download the pretrained weights (if needed)
!curl -L "https://github.com/ultralytics/yolov5/releases/download/v6.2/yolov5s.pt" -o yolov5s.pt


# Create the Streamlit app script
with open('app.py', 'w') as f:
    f.write("""
import streamlit as st
import os
import cv2
import pytesseract
import torch
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.utils.visualizer import Visualizer
from detectron2.utils.visualizer import ColorMode
from detectron2.data import MetadataCatalog
from detectron2.data.datasets import register_coco_instances
from yolov5 import YOLOv5

# Function to set up the Mask R-CNN configuration for segmentation.
#
# Registers the COCO mini dataset splits as "coco_train" / "coco_val" (names that
# are already registered are skipped), loads the model-zoo Mask R-CNN R50-FPN 3x
# config, applies the solver / evaluation settings below, points MODEL.WEIGHTS at
# the matching model-zoo checkpoint and creates cfg.OUTPUT_DIR if it is missing.
#
# Returns: the populated config object.
def setup_mask_rcnn_cfg():
    # Local import: the script-level imports do not bring in DatasetCatalog.
    from detectron2.data import DatasetCatalog

    dataset_root = "/content/coco_2017_mini_dataset"
    train_images_path = os.path.join(dataset_root, "train2017")
    val_images_path = os.path.join(dataset_root, "val2017")
    annotation_train = os.path.join(dataset_root, "annotations", "instances_train2017.json")
    annotation_val = os.path.join(dataset_root, "annotations", "instances_val2017.json")

    # Streamlit re-executes the whole script on every interaction inside the same
    # Python process, so names registered by a previous run are still in the
    # catalog. Registering an existing name raises, hence the membership check.
    registered = set(DatasetCatalog.list())
    if "coco_train" not in registered:
        register_coco_instances("coco_train", {}, annotation_train, train_images_path)
    if "coco_val" not in registered:
        register_coco_instances("coco_val", {}, annotation_val, val_images_path)

    # Set up the Mask R-CNN configuration
    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
    cfg.DATASETS.TRAIN = ("coco_train",)
    cfg.DATASETS.TEST = ("coco_val",)
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = 80

    cfg.SOLVER.BASE_LR = 0.00025
    cfg.SOLVER.MAX_ITER = 10000
    cfg.SOLVER.CHECKPOINT_PERIOD = 5000
    cfg.TEST.EVAL_PERIOD = 5000
    cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128
    cfg.SOLVER.IMS_PER_BATCH = 4
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
    cfg.OUTPUT_DIR = "./output"
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
    return cfg

# Segment one image file with Mask R-CNN.
#
# Returns (rendered image, predicted boxes as a NumPy array). The image is
# channel-reversed for the Visualizer and reversed back afterwards, so the result
# has the same channel order as what cv2.imread produced.
def run_mask_rcnn(image_path, cfg):
    model = DefaultPredictor(cfg)
    source = cv2.imread(image_path)
    instances = model(source)["instances"]

    boxes = instances.pred_boxes.to("cpu").tensor.numpy()

    # Grayscale background with coloured instances (IMAGE_BW), drawn at 1.2x scale.
    painter = Visualizer(
        source[:, :, ::-1],
        MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
        scale=1.2,
        instance_mode=ColorMode.IMAGE_BW,
    )
    rendered = painter.draw_instance_predictions(instances.to("cpu")).get_image()

    return rendered[:, :, ::-1], boxes

# Detect objects in an image file with the 'yolov5s.pt' weights.
# Returns the first entry of the result's `xyxy` list as a NumPy array.
def run_yolo(image_path):
    detector = YOLOv5('yolov5s.pt')
    first_image_detections = detector.predict(image_path).xyxy[0]
    return first_image_detections.cpu().numpy()

# Run the whole pipeline on one image file: OCR, Mask R-CNN segmentation, YOLO.
# Returns (segmented image, stripped OCR text, Mask R-CNN boxes, YOLO boxes).
def process_image(image_path, cfg):
    # OCR works on the image exactly as loaded from disk.
    ocr_text = pytesseract.image_to_string(cv2.imread(image_path)).strip()

    segmented, mask_rcnn_boxes = run_mask_rcnn(image_path, cfg)

    # YOLO is given the original file path as well, not the segmented rendering.
    return segmented, ocr_text, mask_rcnn_boxes, run_yolo(image_path)

# Streamlit app interface
st.title("Image Segmentation and Text Extraction")
uploaded_file = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    # The helpers below take a file path, so the upload is written to disk first.
    # Only the base name is used so a crafted upload name cannot point outside the
    # working directory.
    image_path = os.path.basename(uploaded_file.name)
    with open(image_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    cfg = setup_mask_rcnn_cfg()
    segmented_result, extracted_text, pred_boxes, yolo_boxes = process_image(image_path, cfg)

    # Create an outline image using the original image and predicted boxes
    original_img = cv2.imread(image_path)
    outline_img = original_img.copy()

    for box in pred_boxes:
        x1, y1, x2, y2 = map(int, box)
        cv2.rectangle(outline_img, (x1, y1), (x2, y2), (255, 0, 0), 2)  # Draw rectangle in blue

    # Draw YOLO boxes
    for box in yolo_boxes:
        x1, y1, x2, y2 = map(int, box[:4])  # Extract coordinates
        cv2.rectangle(outline_img, (x1, y1), (x2, y2), (0, 255, 0), 2)  # Draw rectangle in green

    # Both arrays are in OpenCV's BGR channel order. st.image assumes RGB unless
    # told otherwise, which would swap red and blue (the "blue" boxes would render
    # red), so the channel order is passed explicitly.
    st.image(segmented_result, caption="Segmented Image", use_column_width=True, channels="BGR")
    st.image(outline_img, caption="Outline Image with YOLO Detections", use_column_width=True, channels="BGR")
    st.write("Extracted Text:")
    st.text(extracted_text)

""")


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113
Looking in links: https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.12/index.html
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libtesseract-dev is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Cloning into 'yolov5'...
remote: Enumerating objects: 16965, done.[K
remote: Counting objects: 100% (160/160), done.[K
remote: Compressing objects: 100% (111/111), done.[K
remote: Total 16965 (delta 79), reused 99 (delta 49), pack-reused 16805 (from 1)[K
Receiving objects: 100% (16965/16965), 15.71 MiB | 19.48 MiB/s, done.
Resolving deltas: 100% (11612/11612), done

In [16]:
!git clone https://github.com/ultralytics/yolov5.git
!cd yolov5


Cloning into 'yolov5'...
remote: Enumerating objects: 16965, done.[K
remote: Counting objects: 100% (160/160), done.[K
remote: Compressing objects: 100% (110/110), done.[K
remote: Total 16965 (delta 79), reused 102 (delta 50), pack-reused 16805 (from 1)[K
Receiving objects: 100% (16965/16965), 15.71 MiB | 17.04 MiB/s, done.
Resolving deltas: 100% (11617/11617), done.


In [20]:
# Clone the YOLOv5 repository
!git clone https://github.com/ultralytics/yolov5.git
%cd yolov5

# Install required packages
!pip install -r requirements.txt

# Install additional libraries if needed (like opencv, etc.)
!pip install opencv-python-headless

# Download the pretrained weights (if needed)
!curl -L "https://github.com/ultralytics/yolov5/releases/download/v6.2/yolov5s.pt" -o yolov5s.pt


fatal: destination path 'yolov5' already exists and is not an empty directory.
/content/yolov5
Collecting thop>=0.1.1 (from -r requirements.txt (line 14))
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl.metadata (2.7 kB)
Downloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Installing collected packages: thop
Successfully installed thop-0.1.1.post2209072238
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 14.1M  100 14.1M    0     0  10.1M      0  0:00:01  0:00:01 --:--:-- 34.7M


In [21]:
# Create the Streamlit app script

# Clone the YOLOv5 repository
!git clone https://github.com/ultralytics/yolov5.git
%cd yolov5

# Install required packages
!pip install -r requirements.txt

# Install additional libraries if needed (like opencv, etc.)
!pip install opencv-python-headless

# Download the pretrained weights (if needed)
!curl -L "https://github.com/ultralytics/yolov5/releases/download/v6.2/yolov5s.pt" -o yolov5s.pt

with open('app.py', 'w') as f:
    f.write("""
import streamlit as st
import os
import cv2
import pytesseract
import torch
import numpy as np
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.utils.visualizer import Visualizer
from detectron2.utils.visualizer import ColorMode
from detectron2.data import MetadataCatalog
from detectron2.data.datasets import register_coco_instances
from yolov5 import YOLOv5

# Function to set up the Mask R-CNN configuration for segmentation.
#
# Registers the COCO mini dataset splits as "coco_train" / "coco_val" (names that
# are already registered are skipped), loads the model-zoo Mask R-CNN R50-FPN 3x
# config, applies the solver / evaluation settings below, points MODEL.WEIGHTS at
# the matching model-zoo checkpoint and creates cfg.OUTPUT_DIR if it is missing.
#
# Returns: the populated config object.
def setup_mask_rcnn_cfg():
    # Local import: the script-level imports do not bring in DatasetCatalog.
    from detectron2.data import DatasetCatalog

    dataset_root = "/content/coco_2017_mini_dataset"
    train_images_path = os.path.join(dataset_root, "train2017")
    val_images_path = os.path.join(dataset_root, "val2017")
    annotation_train = os.path.join(dataset_root, "annotations", "instances_train2017.json")
    annotation_val = os.path.join(dataset_root, "annotations", "instances_val2017.json")

    # Streamlit re-executes the whole script on every interaction inside the same
    # Python process, so names registered by a previous run are still in the
    # catalog. Registering an existing name raises, hence the membership check.
    registered = set(DatasetCatalog.list())
    if "coco_train" not in registered:
        register_coco_instances("coco_train", {}, annotation_train, train_images_path)
    if "coco_val" not in registered:
        register_coco_instances("coco_val", {}, annotation_val, val_images_path)

    # Set up the Mask R-CNN configuration
    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
    cfg.DATASETS.TRAIN = ("coco_train",)
    cfg.DATASETS.TEST = ("coco_val",)
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = 80

    cfg.SOLVER.BASE_LR = 0.00025
    cfg.SOLVER.MAX_ITER = 10000
    cfg.SOLVER.CHECKPOINT_PERIOD = 5000
    cfg.TEST.EVAL_PERIOD = 5000
    cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128
    cfg.SOLVER.IMS_PER_BATCH = 4
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
    cfg.OUTPUT_DIR = "./output"
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
    return cfg

# Segment one image file with Mask R-CNN.
#
# Returns (rendered image, predicted boxes as a NumPy array). The image is
# channel-reversed for the Visualizer and reversed back afterwards, so the result
# has the same channel order as what cv2.imread produced.
def run_mask_rcnn(image_path, cfg):
    model = DefaultPredictor(cfg)
    source = cv2.imread(image_path)
    instances = model(source)["instances"]

    boxes = instances.pred_boxes.to("cpu").tensor.numpy()

    # Grayscale background with coloured instances (IMAGE_BW), drawn at 1.2x scale.
    painter = Visualizer(
        source[:, :, ::-1],
        MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
        scale=1.2,
        instance_mode=ColorMode.IMAGE_BW,
    )
    rendered = painter.draw_instance_predictions(instances.to("cpu")).get_image()

    return rendered[:, :, ::-1], boxes

# Detect objects in an image file with the 'yolov5s.pt' weights.
# Returns the first entry of the result's `xyxy` list as a NumPy array.
def run_yolo(image_path):
    detector = YOLOv5('yolov5s.pt')
    first_image_detections = detector.predict(image_path).xyxy[0]
    return first_image_detections.cpu().numpy()

# Run the whole pipeline on one image file: OCR, Mask R-CNN segmentation, YOLO.
# Returns (segmented image, stripped OCR text, Mask R-CNN boxes, YOLO boxes).
def process_image(image_path, cfg):
    # OCR works on the image exactly as loaded from disk.
    ocr_text = pytesseract.image_to_string(cv2.imread(image_path)).strip()

    segmented, mask_rcnn_boxes = run_mask_rcnn(image_path, cfg)

    # YOLO is given the original file path as well, not the segmented rendering.
    return segmented, ocr_text, mask_rcnn_boxes, run_yolo(image_path)

# Streamlit app interface
st.title("Image Segmentation and Text Extraction")
uploaded_file = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    # The helpers below take a file path, so the upload is written to disk first.
    # Only the base name is used so a crafted upload name cannot point outside the
    # working directory.
    image_path = os.path.basename(uploaded_file.name)
    with open(image_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    cfg = setup_mask_rcnn_cfg()
    segmented_result, extracted_text, pred_boxes, yolo_boxes = process_image(image_path, cfg)

    # Create an outline image using the original image and predicted boxes
    original_img = cv2.imread(image_path)
    outline_img = original_img.copy()

    for box in pred_boxes:
        x1, y1, x2, y2 = map(int, box)
        cv2.rectangle(outline_img, (x1, y1), (x2, y2), (255, 0, 0), 2)  # Draw rectangle in blue

    # Draw YOLO boxes
    for box in yolo_boxes:
        x1, y1, x2, y2 = map(int, box[:4])  # Extract coordinates
        cv2.rectangle(outline_img, (x1, y1), (x2, y2), (0, 255, 0), 2)  # Draw rectangle in green

    # Both arrays are in OpenCV's BGR channel order. st.image assumes RGB unless
    # told otherwise, which would swap red and blue (the "blue" boxes would render
    # red), so the channel order is passed explicitly.
    st.image(segmented_result, caption="Segmented Image", use_column_width=True, channels="BGR")
    st.image(outline_img, caption="Outline Image with YOLO Detections", use_column_width=True, channels="BGR")
    st.write("Extracted Text:")
    st.text(extracted_text)


    """)


In [24]:
# Start Streamlit in the background (`&`) and run localtunnel in the foreground so
# that port 8501 is forwarded to a public loca.lt URL.
!streamlit run app.py & npx localtunnel --port 8501



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.169.33.145:8501[0m
[0m
your url is: https://cold-taxes-tell.loca.lt
2024-09-25 10:38:50.988 Uncaught app exception
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/streamlit/runtime/scriptrunner/exec_code.py", line 88, in exec_func_with_error_handling
    result = func()
  File "/usr/local/lib/python3.10/dist-packages/streamlit/runtime/scriptrunner/script_runner.py", line 590, in code_to_exec
    exec(code, module.__dict__)
  File "/content/yolov5/yolov5/app.py", line 14, in <module>
    from yolov5 import YOLOv5
ModuleNotFoundError: No module named 'yolov5'
[34m  Stopping...[0m
^C


In [33]:
# Step 1: Install Required Packages
# Python-side dependencies imported by app.py (UI, image handling, OCR wrapper, torch).
!pip install streamlit
!pip install opencv-python-headless
!pip install pytesseract
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
# System packages for Tesseract.
# NOTE(review): presumably the OCR engine that pytesseract drives - confirm.
!apt-get install -y tesseract-ocr
!apt-get install -y libtesseract-dev

# Install Detectron2
# Installed straight from the GitHub repository rather than from a wheel index.
!pip install 'git+https://github.com/facebookresearch/detectron2.git'



# Step 2: Create the Streamlit app script
with open('app.py', 'w') as f:
    f.write("""
import streamlit as st
import cv2
import pytesseract
import torch
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.utils.visualizer import Visualizer
from detectron2.utils.visualizer import ColorMode
from detectron2.data import MetadataCatalog
import tempfile

# Function to set up Detectron2 configuration.
#
# Loads the model-zoo Mask R-CNN R50-FPN 3x config, points MODEL.WEIGHTS at the
# matching model-zoo checkpoint and sets the class count, evaluation period and
# test-time score threshold.
#
# Returns: the populated config object.
def setup_cfg():
    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = 80  # Set number of classes
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
    # The evaluation period lives under TEST (the other app versions in this
    # notebook set cfg.TEST.EVAL_PERIOD); assigning cfg.MODEL.EVAL_PERIOD only
    # created an extra key that nothing reads.
    cfg.TEST.EVAL_PERIOD = 5000
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # Set the testing threshold
    return cfg

# Run Detectron2 instance segmentation on an image file.
#
# Returns (rendered image, predicted boxes as a NumPy array). The image is
# channel-reversed for the Visualizer and reversed back afterwards, so the result
# has the same channel order as what cv2.imread produced.
def run_inference(image_path):
    cfg = setup_cfg()
    model = DefaultPredictor(cfg)

    source = cv2.imread(image_path)
    instances = model(source)["instances"]

    # IMAGE_BW: render in black and white, at 1.2x scale.
    painter = Visualizer(
        source[:, :, ::-1],
        MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
        scale=1.2,
        instance_mode=ColorMode.IMAGE_BW,
    )
    rendered = painter.draw_instance_predictions(instances.to("cpu"))

    boxes = instances.pred_boxes.tensor.cpu().numpy()
    return rendered.get_image()[:, :, ::-1], boxes

# OCR every boxed region of an image.
#
# image: array indexed as image[row, col, ...].
# boxes: iterable of boxes whose first four values are x1, y1, x2, y2.
# Returns one stripped OCR string per box, in input order.
def extract_text(image, boxes):
    height, width = image.shape[0], image.shape[1]

    def ocr_region(box):
        left, top, right, bottom = (int(value) for value in box[:4])
        # Clamp the box to the image bounds before slicing.
        left, top = max(left, 0), max(top, 0)
        right, bottom = min(right, width), min(bottom, height)
        return pytesseract.image_to_string(image[top:bottom, left:right]).strip()

    return [ocr_region(box) for box in boxes]

# Streamlit app interface
import os  # needed only to remove the temporary copy of the upload

st.title("Text Extraction, Object Detection, and Segmentation with Detectron2")
uploaded_file = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    # Save the uploaded file temporarily: run_inference takes a path.
    # delete=False keeps the file on disk after close(); it is removed in the
    # finally block so uploads do not pile up in the temp directory.
    tfile = tempfile.NamedTemporaryFile(delete=False)
    try:
        tfile.write(uploaded_file.read())
        tfile.close()

        # Read the image for processing
        original_img = cv2.imread(tfile.name)

        if original_img is None:
            # cv2.imread returns None when the file cannot be read as an image.
            st.error("Could not read the uploaded file as an image.")
        else:
            # Run inference for object detection and segmentation
            segmented_img, pred_boxes = run_inference(tfile.name)

            # Extract text from detected regions
            text_results = extract_text(original_img, pred_boxes)

            # Display results. The rendered array is in OpenCV's BGR order, while
            # st.image assumes RGB unless told otherwise.
            st.image(segmented_img, caption="Detected Objects and Segmentation", use_column_width=True, channels="BGR")
            st.write("Extracted Texts:")
            for text in text_results:
                st.text(text)
    finally:
        tfile.close()  # no-op if already closed
        os.unlink(tfile.name)


""")

# Step 3: Run the Streamlit app
# Streamlit is put in the background with `&`; localtunnel stays in the foreground
# and forwards port 8501.
!streamlit run app.py & npx localtunnel --port 8501


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libtesseract-dev is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Collecting git+https://github.com/facebookresearch/detectron2.git
  Cloning https://github.com/facebookresearch/detectron2.git to /tmp/pip-req-build-d4pi48_m
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/detectron2.git /tmp/pip-req-build-d4pi48_m
  Resolved https://github.com/facebookresearch/detectron2.git to commit ebe8b45437f86395352ab13402ba45b75b4d1ddb
  Preparing metadata (setup.py) ... [?25l[?25hdone

Collec

In [36]:
# Install required packages
!pip install streamlit
!pip install opencv-python-headless
!pip install pytesseract
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.12/index.html
!apt-get install -y tesseract-ocr
!apt-get install -y libtesseract-dev

# Create the Streamlit app script
with open('app.py', 'w') as f:
    f.write("""
import streamlit as st
import os
import cv2
import pytesseract
import numpy as np
import torch
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.utils.visualizer import Visualizer
from detectron2.utils.visualizer import ColorMode
from detectron2.data import MetadataCatalog

# Build the inference config for the model-zoo COCO Mask R-CNN R50-FPN (3x) entry:
# merged config file, 80 ROI-head classes, model-zoo checkpoint as weights and a
# 0.5 test-time score threshold.
def setup_mask_rcnn_cfg():
    zoo_entry = "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"

    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file(zoo_entry))

    roi_heads = cfg.MODEL.ROI_HEADS
    roi_heads.NUM_CLASSES = 80  # COCO class count
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(zoo_entry)
    roi_heads.SCORE_THRESH_TEST = 0.5  # detections scoring below this are dropped
    return cfg

# Function to run Mask R-CNN on an already-decoded image array.
#
# Builds the config and a DefaultPredictor, runs inference and renders the
# predictions with IMAGE_BW colouring at 1.2x scale.
#
# Returns (rendered image, predicted boxes as a NumPy array). The image is
# channel-reversed for the Visualizer and reversed back afterwards, so the result
# has the same channel order as `image`.
def run_mask_rcnn(image):
    cfg = setup_mask_rcnn_cfg()
    predictor = DefaultPredictor(cfg)

    # Move the predictions to the CPU once; both the boxes and the drawing use
    # them. (The class ids were previously fetched as well but never used.)
    instances = predictor(image)["instances"].to("cpu")
    pred_boxes = instances.pred_boxes.tensor.numpy()

    # Create a Visualizer object
    v = Visualizer(image[:, :, ::-1],
                   MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
                   scale=1.2,
                   instance_mode=ColorMode.IMAGE_BW)

    # Draw the predictions
    out = v.draw_instance_predictions(instances)
    segmented_img = out.get_image()[:, :, ::-1]

    return segmented_img, pred_boxes

# Function to extract text from each boxed region of the image.
#
# image: array indexed as image[row, col, channel]; regions are converted with
#        COLOR_BGR2GRAY and binarised with Otsu thresholding before OCR.
# boxes: iterable of boxes whose first four values are x1, y1, x2, y2.
# Returns one stripped OCR string per box, in input order. A box that is empty
# after clamping to the image yields "" instead of being handed to OpenCV.
def extract_text(image, boxes):
    height, width = image.shape[:2]
    text_results = []
    for box in boxes:
        x1, y1, x2, y2 = map(int, box[:4])
        # Ensure box coordinates are within image dimensions
        x1, y1 = max(0, x1), max(0, y1)
        x2, y2 = min(width, x2), min(height, y2)

        if x2 <= x1 or y2 <= y1:
            # Degenerate region: cv2.cvtColor rejects an empty array, so record an
            # empty result (keeping the list aligned with `boxes`) and move on.
            text_results.append("")
            continue

        roi = image[y1:y2, x1:x2]
        gray_roi = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)

        # Apply thresholding for better text extraction
        threshold = cv2.threshold(gray_roi, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
        text = pytesseract.image_to_string(threshold)
        text_results.append(text.strip())
    return text_results

# Segment the image, then OCR every predicted box region of the original image.
# Returns (segmented image, list of OCR strings).
def process_image(image):
    segmented, boxes = run_mask_rcnn(image)
    return segmented, extract_text(image, boxes)

# Streamlit app interface
st.title("Image Segmentation and Text Extraction")
uploaded_file = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    # Decode the uploaded bytes into an OpenCV image (flag 1 = colour).
    file_bytes = np.asarray(bytearray(uploaded_file.read()), dtype=np.uint8)
    image = cv2.imdecode(file_bytes, 1)

    if image is None:
        # cv2.imdecode returns None when the buffer cannot be decoded.
        st.error("Could not decode the uploaded file as an image.")
    else:
        # Display user input image first. OpenCV arrays are BGR, while st.image
        # assumes RGB unless told otherwise (red and blue would be swapped).
        st.image(image, caption="Input Image", use_column_width=True, channels="BGR")

        # Process the image
        segmented_result, text_results = process_image(image)

        # Display processed image and extracted text
        st.image(segmented_result, caption="Segmented Image with Outlines", use_column_width=True, channels="BGR")
        st.write("Extracted Text:")
        if text_results:
            for text in text_results:
                st.text(text)
        else:
            st.text("No text extracted from the image.")

    """)

# Run the Streamlit app: Streamlit is put in the background with `&`, and
# localtunnel stays in the foreground forwarding port 8501.
!streamlit run app.py & npx localtunnel --port 8501


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113
Looking in links: https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.12/index.html
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libtesseract-dev is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.169.33.145:8501[0m
[0m
your url is: https://loose-yaks-

In [None]:
# Install required packages
!pip install streamlit
!pip install opencv-python-headless
!pip install pytesseract
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.12/index.html
!apt-get install -y tesseract-ocr
!apt-get install -y libtesseract-dev



# Create the Streamlit app script
with open('app.py', 'w') as f:
    f.write("""
import streamlit as st
import os
import cv2
import pytesseract
import numpy as np
import torch
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.utils.visualizer import Visualizer
from detectron2.utils.visualizer import ColorMode
from detectron2.data import MetadataCatalog

# Build the inference config for the model-zoo COCO Mask R-CNN R50-FPN (3x) entry:
# merged config file, 80 ROI-head classes, model-zoo checkpoint as weights and a
# 0.5 test-time score threshold.
def setup_mask_rcnn_cfg():
    zoo_entry = "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"

    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file(zoo_entry))

    roi_heads = cfg.MODEL.ROI_HEADS
    roi_heads.NUM_CLASSES = 80  # COCO class count
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(zoo_entry)
    roi_heads.SCORE_THRESH_TEST = 0.5  # detections scoring below this are dropped
    return cfg

# Run Mask R-CNN on an already-decoded image array.
#
# Returns a 3-tuple: (rendered image, predicted masks as a NumPy array, predicted
# boxes as a NumPy array). The image is channel-reversed for the Visualizer and
# reversed back afterwards, so the rendering has the same channel order as `image`.
def run_mask_rcnn(image):
    cfg = setup_mask_rcnn_cfg()
    instances = DefaultPredictor(cfg)(image)["instances"]

    # Draw masks and boxes on a 1.2x-scaled copy of the image.
    painter = Visualizer(image[:, :, ::-1], MetadataCatalog.get(cfg.DATASETS.TRAIN[0]), scale=1.2)
    rendered = painter.draw_instance_predictions(instances.to("cpu"))
    segmented_img = rendered.get_image()[:, :, ::-1]

    masks = instances.pred_masks.to("cpu").numpy()
    boxes = instances.pred_boxes.to("cpu").tensor.numpy()
    return segmented_img, masks, boxes

# Draw the outer contour of every mask in green on a black canvas.
#
# image: only its shape and dtype are used (the canvas is np.zeros_like(image)).
# masks: iterable of 2-D masks; each is scaled by 255 and cast to uint8 before
#        contour extraction.
# Returns the canvas with all contours drawn, 2 pixels thick.
def draw_segment_outlines(image, masks):
    canvas = np.zeros_like(image)
    green, thickness = (0, 255, 0), 2
    for binary_mask in ((m * 255).astype(np.uint8) for m in masks):
        found, _hierarchy = cv2.findContours(binary_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        cv2.drawContours(canvas, found, -1, green, thickness)
    return canvas

# OCR the whole input image.
# The image is converted with COLOR_BGR2GRAY and binarised with Otsu thresholding
# before being passed to Tesseract; the recognised text is returned stripped.
def extract_text_from_image(image):
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _level, binarized = cv2.threshold(grayscale, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    return pytesseract.image_to_string(binarized).strip()

# Segment the image and build the outline-only view of its masks.
# Returns (segmented image, outline image); the predicted boxes are not used here.
def process_image(image):
    segmented, instance_masks, _boxes = run_mask_rcnn(image)
    return segmented, draw_segment_outlines(image, instance_masks)

# Streamlit app interface
st.title("Image Segmentation, Object Detection, and Text Extraction")

uploaded_file = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])
if uploaded_file is not None:
    # Decode the uploaded bytes into an OpenCV image (flag 1 = colour).
    file_bytes = np.asarray(bytearray(uploaded_file.read()), dtype=np.uint8)
    image = cv2.imdecode(file_bytes, 1)

    if image is None:
        # cv2.imdecode returns None when the buffer cannot be decoded.
        st.error("Could not decode the uploaded file as an image.")
    else:
        # Display user input image first. OpenCV arrays are BGR, while st.image
        # assumes RGB unless told otherwise (red and blue would be swapped).
        st.image(image, caption="Input Image", use_column_width=True, channels="BGR")

        # Extract text from the input image (before any processing)
        extracted_text = extract_text_from_image(image)

        # Display the extracted text
        st.write("Extracted Text from Input Image:")
        if extracted_text:
            st.text(extracted_text)
        else:
            st.text("No text extracted from the image.")

        # Process the image (segmentation + object detection)
        segmented_img, outline_img = process_image(image)

        # Display segmented image (segmentation + object detection)
        st.image(segmented_img, caption="Segmented Image with Object Detection", use_column_width=True, channels="BGR")

        # Display outline image (only outlines of segments). The outlines are pure
        # green on black, so the channel order makes no visible difference here;
        # it is passed for consistency with the other OpenCV arrays.
        st.image(outline_img, caption="Outline of Segments", use_column_width=True, channels="BGR")
    """)

# Run the Streamlit app: Streamlit is put in the background with `&`, and
# localtunnel stays in the foreground forwarding port 8501.
!streamlit run app.py & npx localtunnel --port 8501




Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113
Looking in links: https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.12/index.html
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libtesseract-dev is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.169.33.145:8501[0m
[0m
your url is: https://heavy-jars-