# Import Libraries

In [54]:
# Mount Google Drive first: the dataset path used by this notebook lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Filesystem helpers used for building folder paths and copying files.
import os
import shutil
# cv2 decodes the images as grayscale; numpy computes the pixel statistics.
import cv2
import numpy as np
from pathlib import Path
# NOTE(review): tqdm, PIL.Image and matplotlib are not referenced in the cells
# visible here — confirm they are needed elsewhere before keeping them.
from tqdm import tqdm
from PIL import Image
import matplotlib.pyplot as plt

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# An Algorithm for Detecting and Removing Predominantly Low-Intensity Images

In [55]:
class ImageSeperator:
  """Copy image/mask pairs into new folders, leaving out mostly black images.

  Folder layout expected under ``dataset_path``::

      <source>/<lookfor>/<name>.jpg         images that are inspected
      <apply_to>/<lookfor>/<name>_m.jpg     file paired with each image
      <source>/<out>/ and <apply_to>/<out>/ destinations for the kept pairs

  Attributes:
    dataset_path: Root folder that contains the source and apply-to folders.
    source_word: Name of the sub-folder (``lookfor``) holding the input files.
    out: Name of the sub-folder that receives the kept files.
    valid_extension: Extension of the image files that are scanned.
  """

  def __init__(self, dataset_path, lookfor, out):
    """Store the dataset root and the input/output sub-folder names."""
    self.dataset_path = dataset_path
    self.source_word = lookfor
    self.out = out
    self.valid_extension = ".jpg"

  def is_mostly_black(self, img_path, mean_thresh=10, bright_pixel_ratio=0.2,
                      bright_thresh=50):
    """Decide whether the image at ``img_path`` is predominantly dark.

    Args:
      img_path: Path of the image; it is read as a grayscale array.
      mean_thresh: The image is dark when its mean intensity is below this.
      bright_pixel_ratio: The image is dark when the fraction of bright
        pixels is below this.
      bright_thresh: A pixel counts as bright when its value exceeds this.

    Returns:
      bool: ``True`` for dark images and for images that cannot be read or
      contain no pixels, ``False`` otherwise.
    """
    img = cv2.imread(str(img_path), cv2.IMREAD_GRAYSCALE)
    # Unreadable or empty images are reported as black so callers skip them.
    if img is None or img.size == 0:
      return True

    mean_intensity = img.mean()
    ratio = np.count_nonzero(img > bright_thresh) / img.size

    # Mostly black if the mean is very low OR almost all pixels are dark.
    # bool() keeps the return type consistent with the early return above.
    return bool(mean_intensity < mean_thresh or ratio < bright_pixel_ratio)

  def make_directory(self, name):
    """Create ``<dataset_path>/<name>/<out>`` if needed and return its path.

    Files already present in the folder are left in place.
    """
    no_black_folder = os.path.join(self.dataset_path, name, self.out)
    os.makedirs(no_black_folder, exist_ok=True)
    return no_black_folder

  def process_images(self, source, apply_to, mask_suffix="_m"):
    """Copy every non-black image and its paired file to the output folders.

    Args:
      source: Folder under ``dataset_path`` whose images are inspected.
      apply_to: Folder under ``dataset_path`` holding the paired files; the
        keep/drop decision made on the image is applied to its pair.
      mask_suffix: Text inserted between the image stem and its extension to
        build the paired file name (``name.jpg`` -> ``name_m.jpg``).

    Prints the folders in use and a summary of how many images were seen,
    removed as mostly black, and skipped because their pair was missing.
    """
    source_path = Path(self.dataset_path) / source / self.source_word
    out_folder = self.make_directory(source)

    apply_path = Path(self.dataset_path) / apply_to / self.source_word
    out_apply_to = self.make_directory(apply_to)

    print(f"Processing from: {source_path}")
    print(f"Applying to: {apply_path}")
    print(f"Outputting to: {out_folder}")
    print(f"Outputting to: {out_apply_to}")

    count_total = 0
    count_removed = 0
    count_unpaired = 0
    for img_path in source_path.glob(f"*{self.valid_extension}"):
      count_total += 1

      # Build the pair name from stem + suffix so that only the real
      # extension is touched, never a ".jpg" inside the file name.
      mask_path = apply_path / f"{img_path.stem}{mask_suffix}{img_path.suffix}"
      if not mask_path.exists():
        count_unpaired += 1
        continue  # skip broken pairs

      if self.is_mostly_black(img_path):
        count_removed += 1
        continue

      shutil.copy2(img_path, os.path.join(out_folder, img_path.name))
      shutil.copy2(mask_path, os.path.join(out_apply_to, mask_path.name))

    print(f"Processed {count_total} images, removed {count_removed} mostly black images.")
    # Broken pairs are neither copied nor "removed"; report them so the
    # numbers above can be reconciled with the files in the output folders.
    if count_unpaired:
      print(f"Skipped {count_unpaired} images with no matching mask file.")

## For Segmentation Images

In [56]:
# Root folder on the mounted Drive that holds the segmentation images and masks.
segmentation_dataset_path = "/content/drive/MyDrive/Colab Notebooks/Data Science Group Project/data/interim/segmentation/"

# Inputs are read from the "original" sub-folders; kept pairs go to "no_black".
image_seperator = ImageSeperator(
    segmentation_dataset_path,
    lookfor="original",
    out="no_black",
)

# The images decide which pairs are dropped; the same decision is applied to "mask".
image_seperator.process_images("images", "mask")

Processing from: /content/drive/MyDrive/Colab Notebooks/Data Science Group Project/data/interim/segmentation/images/original
Applying to: /content/drive/MyDrive/Colab Notebooks/Data Science Group Project/data/interim/segmentation/mask/original
Outputting to: /content/drive/MyDrive/Colab Notebooks/Data Science Group Project/data/interim/segmentation/images/no_black
Outputting to: /content/drive/MyDrive/Colab Notebooks/Data Science Group Project/data/interim/segmentation/mask/no_black
Processed 6380 images, removed 151 mostly black images.


In [57]:
# Sanity check: both filtered output folders should contain the same number of files.
no_black_images_dir = os.path.join(segmentation_dataset_path, "images/no_black")
no_black_masks_dir = os.path.join(segmentation_dataset_path, "mask/no_black")
no_black_images_len = len(os.listdir(no_black_images_dir))
no_black_masks_len = len(os.listdir(no_black_masks_dir))

# NOTE(review): the labels say "merged_*" but the values count the no_black
# folders — consider renaming the labels when the cell is next re-run.
print("merged_images_len:", no_black_images_len)
print("merged_masks_len:", no_black_masks_len)

merged_images_len: 6228
merged_masks_len: 6228
