# Image Cleanser Pipeline


In [1]:
# Top-level pipeline switches; all default to False.
params = dict(
    on_Google_drive=False,     # mount Google Drive and chdir into the project folder
    copy_from_original=False,  # master switch for the dataset-preparation cells
    download_dataset=False,    # referenced only by the commented-out download cell
    train_M=False,             # not read in the cells shown here; presumably gates training of M (confirm)
)

## R and R_prime

### Environment

In [2]:
# Colab only: mount Google Drive and make the project checkout the working directory.
if params["on_Google_drive"]:
  # Imported here because the notebook's import cell runs after this one.
  import os
  from google.colab import drive

  drive.mount('/content/drive')
  # Later cells use relative paths such as datasets/D, so work from the repo root.
  os.chdir('/content/drive/MyDrive/UCSD_COURSES/ECE253/ImageCleanser/repos/ImageCleanser')
else:
  print("Skipping mounting Google Drive")

Skipping mounting Google Drive


In [3]:
# Install third-party dependencies into the kernel's environment (fire and lmdb are
# imported in the next cell). Versions are not pinned; the recorded run below resolved
# parameters 0.2.1, fire 0.7.1, lmdb 1.7.5, nltk 3.9.1 and natsort 8.4.0.
%pip install parameters fire lmdb pillow torchvision nltk natsort

Collecting parameters
  Downloading parameters-0.2.1.tar.gz (38 kB)
Collecting fire
  Downloading fire-0.7.1-py3-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 5.4 MB/s eta 0:00:01
[?25hCollecting lmdb
  Downloading lmdb-1.7.5-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (292 kB)
[K     |████████████████████████████████| 292 kB 26.5 MB/s eta 0:00:01
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 31.2 MB/s eta 0:00:01
[?25hCollecting natsort
  Downloading natsort-8.4.0-py3-none-any.whl (38 kB)
Collecting typing-extensions>=4.8.0
  Using cached typing_extensions-4.13.2-py3-none-any.whl (45 kB)
Building wheels for collected packages: parameters
  Building wheel for parameters (setup.py) ... [?25ldone
[?25h  Created wheel for parameters: filename=parameters-0.2.1-py3-none-any.whl size=24744 sha256=31808fab85a6427e1c41695c2298aba69ed62bb89ced449acecdcf104b1b8

In [4]:
import os
import re
import shutil
import fire
import lmdb

### Dataset Preparation

In [5]:
# Per-step switches for the dataset-preparation cells. Each one is consulted
# only when params["copy_from_original"] is True.
copy_params = dict(
    copy_D=False,       # copy labelled IC13 images into datasets/D
    split_D=False,      # move the images into D_train / D_valid
    make_D_gt=False,    # write the D_train.txt / D_valid.txt label files
    make_D_lmdb=False,  # build an LMDB dataset for each split
)

In [6]:
# Build datasets/D: copy up to 600 labelled IC13 images, renamed "<index>_<label>.png".
if not params["copy_from_original"]:
    print("Skipping copying from original")
elif not copy_params["copy_D"]:
    print("Skipping copying D from IC13.")
else:
    # Point these two paths at your local copy of IC13, which is available from
    # https://www.dropbox.com/scl/fi/fa0l8twkbu73pzj0czrvr/image_release_190624.zip?e=3&file_subpath=%2Fimage_release_190624%2Fbenchmark_cleansed%2FIC13&rlkey=o2937dp9cvp8x0qor7jldruws&st=aa8vmuq9&dl=0
    src_dir = "/content/drive/MyDrive/UCSD_COURSES/ECE253/ImageCleanser/benchmark_cleansed/IC13/"
    txt_file = "/content/drive/MyDrive/UCSD_COURSES/ECE253/ImageCleanser/benchmark_cleansed/IC13/filtered_IC13_clean_1.txt"
    dst_dir = "datasets/D"

    # exist_ok=False: raise instead of copying into a folder that already exists.
    os.makedirs(dst_dir, exist_ok=False)

    # Load the annotation file, dropping blank lines.
    with open(txt_file, 'r', encoding='utf-8') as f:
        lines = [text for text in map(str.strip, f) if text]

    count = 0
    for line in lines:
        if count >= 600:
            break

        # An annotation reads "<image file> <label>", e.g. "something.png Tiredness".
        filename, *label_words = line.split()
        if not label_words:
            continue  # no label on this line: malformed, skip it
        label = " ".join(label_words)

        src_path = os.path.join(src_dir, filename)
        if not os.path.exists(src_path):
            print(f"[WARN] File not found: {src_path}")
            continue

        # Only images that are actually copied consume an index.
        count += 1
        dst_name = f"{count}_{label}.png"
        shutil.copy(src_path, os.path.join(dst_dir, dst_name))
        print(f"{count}: copied {filename} -> {dst_name}")

    print(f"\nDone. Copied {count} images to {dst_dir}")


Skipping copying from original


In [7]:
# Split datasets/D: the first 400 images (natural filename order) go to D_train,
# the last 200 go to D_valid.
if not params["copy_from_original"]:
    print("Skipping copying from original")
elif not copy_params["split_D"]:
    print("Skipping splitting D.")
else:
    src_dir = "datasets/D"
    train_dir = os.path.join(src_dir, "D_train")
    valid_dir = os.path.join(src_dir, "D_valid")

    # exist_ok=False: raise if either split folder already exists.
    for split_dir in (train_dir, valid_dir):
        os.makedirs(split_dir, exist_ok=False)

    def natural_key(name):
        """Sort key that compares digit runs as integers, so "2_x" sorts before "10_x"."""
        chunks = re.split(r"(\d+)", name)
        return [int(chunk) if chunk.isdigit() else chunk.lower() for chunk in chunks]

    # Collect the image files sitting directly in src_dir, in natural order.
    image_exts = ('.png', '.jpg', '.jpeg')
    images = sorted(
        (entry for entry in os.listdir(src_dir) if entry.lower().endswith(image_exts)),
        key=natural_key,
    )

    n = len(images)
    print(f"Total images found: {n}")

    train_count, valid_count = 400, 200
    if n < train_count + valid_count:
        raise ValueError(f"Not enough images ({n}) for 400/200 split!")

    # (files to move, destination folder, number reported) for each split.
    move_plan = (
        (images[:train_count], train_dir, train_count),
        (images[-valid_count:], valid_dir, valid_count),
    )
    for batch, target_dir, moved in move_plan:
        for fname in batch:
            shutil.move(os.path.join(src_dir, fname), os.path.join(target_dir, fname))
        print(f"Moved {moved} images to {target_dir}")

    print("✅ Split complete.")

Skipping copying from original


In [8]:
# Write one ground-truth file per split (datasets/D/D_train.txt and D_valid.txt),
# each line being "<filename>\t<label>" with the label taken from the filename.
if not params["copy_from_original"]:
    print("Skipping copying from original")
elif not copy_params["make_D_gt"]:
    print("Skipping making gt for D.")
else:
    base_dir = "datasets/D"
    splits = ["D_train", "D_valid"]

    def natural_key(name):
        """Split filename into text and number chunks for natural sorting."""
        return [int(t) if t.isdigit() else t.lower() for t in re.split(r"(\d+)", name)]

    for split in splits:
        folder = os.path.join(base_dir, split)
        txt_path = os.path.join(base_dir, f"{split}.txt")

        # All image files of this split, sorted numerically.
        images = sorted(
            [f for f in os.listdir(folder)
             if f.lower().endswith(('.png', '.jpg', '.jpeg'))],
            key=natural_key
        )

        # Count the lines actually written, so skipped files are not reported as entries.
        written = 0
        with open(txt_path, "w", encoding="utf-8") as f:
            for fname in images:
                # e.g. "123_Tiredness.png" → label = "Tiredness"
                name_no_ext = os.path.splitext(fname)[0]
                parts = name_no_ext.split("_", 1)
                if len(parts) < 2:
                    print(f"[WARN] Skipping malformed name: {fname}")
                    continue

                label = parts[1]
                f.write(f"{fname}\t{label}\n")
                written += 1

        print(f"✅ Wrote {written} entries in numerical order to {txt_path}")

Skipping copying from original


In [9]:
if not params["copy_from_original"]:
  print ("Skipping copying from original")
else:
  if not copy_params["make_D_lmdb"]:
      print("Skipping making lmdb for D.")
  else:
    image_directory_original = 'datasets/D/D_train'
    # IMPORTANT: Replace '/path/to/your/original/dataset/gt.txt' with the actual path to your original gt.txt file
    gt_file_original = 'datasets/D/D_train.txt'
    lmdb_output_dir = 'datasets/D/D_train_lmdb' # Output directory for the new LMDB

    # Create the output directory if it doesn't exist
    os.makedirs(lmdb_output_dir, exist_ok=False)


    # Ensure the create_lmdb_dataset.py script exists in your current directory
    if os.path.exists('create_lmdb_dataset.py'):
        print(f"Creating LMDB dataset from {image_directory_original} to {lmdb_output_dir} using {gt_file_original}...")
        # The script create_lmdb_dataset.py expects inputPath, gtFile, and outputPath positionally
        create_lmdb_command = f'python3 create_lmdb_dataset.py {image_directory_original} {gt_file_original} {lmdb_output_dir}'
        !{create_lmdb_command}
        print("LMDB dataset creation finished.")
    else:
        print("Error: create_lmdb_dataset.py not found. Please make sure it's in the current directory.")

    image_directory_original = 'datasets/D/D_valid'
    # IMPORTANT: Replace '/path/to/your/original/dataset/gt.txt' with the actual path to your original gt.txt file
    gt_file_original = 'datasets/D/D_valid.txt'
    lmdb_output_dir = 'datasets/D/D_valid_lmdb' # Output directory for the new LMDB

    # Create the output directory if it doesn't exist
    os.makedirs(lmdb_output_dir, exist_ok=False)


    # Ensure the create_lmdb_dataset.py script exists in your current directory
    if os.path.exists('create_lmdb_dataset.py'):
        print(f"Creating LMDB dataset from {image_directory_original} to {lmdb_output_dir} using {gt_file_original}...")
        # The script create_lmdb_dataset.py expects inputPath, gtFile, and outputPath positionally
        create_lmdb_command = f'python3 create_lmdb_dataset.py {image_directory_original} {gt_file_original} {lmdb_output_dir}'
        !{create_lmdb_command}
        print("LMDB dataset creation finished.")
    else:
        print("Error: create_lmdb_dataset.py not found. Please make sure it's in the current directory.")

Skipping copying from original


In [20]:
# Disabled: archives datasets/D_prime into D_prime.zip. The output below is from an
# earlier run; uncomment the command to regenerate the archive.
# !zip -r D_prime.zip datasets/D_prime

  adding: datasets/D_prime/ (stored 0%)
  adding: datasets/D_prime/D_prime_lmdb/ (stored 0%)
  adding: datasets/D_prime/D_prime_lmdb/D_prime/ (stored 0%)
  adding: datasets/D_prime/D_prime_lmdb/D_prime/data.mdb (deflated 2%)
  adding: datasets/D_prime/D_prime_lmdb/D_prime/lock.mdb (deflated 99%)
  adding: datasets/D_prime/D_prime.txt (deflated 58%)
  adding: datasets/D_prime/D_prime/ (stored 0%)
  adding: datasets/D_prime/D_prime/313_HERE.png (deflated 0%)
  adding: datasets/D_prime/D_prime/225_WALLACE.png (stored 0%)
  adding: datasets/D_prime/D_prime/160_GmbH.png (deflated 0%)
  adding: datasets/D_prime/D_prime/49_Essex.png (stored 0%)
  adding: datasets/D_prime/D_prime/327_for.png (deflated 0%)
  adding: datasets/D_prime/D_prime/127_unauthorised.png (deflated 0%)
  adding: datasets/D_prime/D_prime/238_AN.png (stored 0%)
  adding: datasets/D_prime/D_prime/61_SIGNAL.png (stored 0%)
  adding: datasets/D_prime/D_prime/311_YOU.png (stored 0%)
  adding: datasets/D_prime/D_prime/314_DEPART

### Download dataset directly

In [10]:
# Disabled: when params["download_dataset"] is set, install gdown, fetch the dataset
# archive from Google Drive as D_prime.zip and unpack it into the current directory.
# NOTE(review): the recorded output below saved the download as D.zip, whereas this
# command writes D_prime.zip; the output appears to predate the current command (confirm).
# if not params["download_dataset"]:
#   print ("Skipping downloading dataset")
# else:
#   %pip install -q gdown
#   !gdown --id 14Hveo70hTod8WOb3kh3_vDpkN6FC5f-T -O D_prime.zip
#   !unzip -q D_prime.zip -d .

Downloading...
From (original): https://drive.google.com/uc?id=14Hveo70hTod8WOb3kh3_vDpkN6FC5f-T
From (redirected): https://drive.google.com/uc?id=14Hveo70hTod8WOb3kh3_vDpkN6FC5f-T&confirm=t&uuid=c3bf2da2-c39e-4dd6-81f8-f14de0df4b22
To: /home/xun_liu/Projects/ImageCleanser/D.zip
100%|██████████████████████████████████████| 94.9M/94.9M [00:03<00:00, 27.0MB/s]
