In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import os
from numpy.typing import NDArray
from PIL import Image

# TODO: For this dataset, a 250k target for JPEG is too much. I tried it, and the pictures deviated a lot from the desired size.

In [17]:
def get_png_image(file_path:Path):
    """Open the image stored at ``file_path`` with PIL.

    Args:
        file_path: Location of the image file on disk.

    Returns:
        The opened PIL image, converted to "RGB" when its mode is "RGBA".
        ``None`` (after printing a short notice) when ``file_path`` is not an
        existing regular file.
    """
    # Guard clause: nothing to open when the path is missing or not a file.
    if not os.path.isfile(file_path):
        print("returned none")
        return None

    opened = Image.open(file_path)
    # Only RGBA images are converted; every other mode is returned untouched.
    return opened.convert("RGB") if opened.mode == "RGBA" else opened

def save_jpeg2000_img(img: "Image.Image", name: str, quality: float,
                      folder: "str | Path | None" = None) -> None:
    """Save ``img`` as ``<folder>/<name>.jp2`` with a single JPEG 2000 quality layer.

    Args:
        img: Image object exposing a PIL-style ``save`` method.
        name: Base file name without extension; ``.jp2`` is appended.
        quality: Value for the single entry of ``quality_layers``. With
            ``quality_mode='rates'`` Pillow documents this as an approximate
            size-reduction ratio (see the Pillow JPEG 2000 writer docs).
        folder: Output directory. When omitted, the notebook-level
            ``destinationFolder`` is used, which keeps existing three-argument
            calls working exactly as before.

    Raises:
        NameError: If ``folder`` is omitted and ``destinationFolder`` has not
            been defined yet (e.g. the configuration cell was not run).
    """
    # Resolve the global lazily so the helper does not require it when the
    # caller passes the folder explicitly (as the sibling helpers do).
    target_folder = destinationFolder if folder is None else folder
    img.save(
        os.path.join(target_folder, f"{name}.jp2"),
        format="JPEG2000",
        quality_mode="rates",
        quality_layers=[quality],
    )
    

def get_image_size(folder:Path,base_name:str,format:str) -> float:
    """Return the on-disk size of ``<folder>/<base_name>.<format>`` in kB.

    The byte count reported by the file system is divided by 1000 and rounded
    to two decimal places.

    Args:
        folder: Directory that contains the file.
        base_name: File name without its extension.
        format: File extension without the leading dot (e.g. ``'png'``).

    Returns:
        File size in units of 1000 bytes, rounded to 2 decimals.
    """
    size_in_bytes = os.path.getsize(os.path.join(folder, f"{base_name}.{format}"))
    return np.round(size_in_bytes / 1000, 2)

def remove_image(folder:Path,base_name:str,format:str) -> None:
    """Delete the file ``<folder>/<base_name>.<format>`` from disk.

    Args:
        folder: Directory that contains the file.
        base_name: File name without its extension.
        format: File extension without the leading dot (e.g. ``'jp2'``).
    """
    # os.unlink is the same operation as os.remove under a different name.
    os.unlink(os.path.join(folder, f"{base_name}.{format}"))
    
def calculate_jpeg2000_layer_quality(png_size:float, desired_jpeg2000_size:float,
                                     compression_factor:float = 1.515151) -> float:
    """Estimate the JPEG 2000 quality-layer value needed to hit a target file size.

    The estimate is ``compression_factor * png_size / desired_jpeg2000_size``,
    rounded to three decimals.

    Args:
        png_size: Size of the source PNG (same unit as ``desired_jpeg2000_size``).
        desired_jpeg2000_size: Target size of the compressed file; must be > 0.
        compression_factor: Scaling constant applied to the PNG/target size
            ratio. The default is the value this notebook has always used; the
            notebook's notes describe it as an approximation, so it is exposed
            here to allow tuning without editing the formula.

    Returns:
        The estimated quality-layer value, rounded to 3 decimals.

    Raises:
        ValueError: If ``desired_jpeg2000_size`` is not strictly positive.
    """
    # A zero or negative target would otherwise produce inf or a negative rate
    # that is only rejected later, far from the cause.
    if desired_jpeg2000_size <= 0:
        raise ValueError(
            f"desired_jpeg2000_size must be positive, got {desired_jpeg2000_size}"
        )
    return np.around(compression_factor*png_size/desired_jpeg2000_size,decimals=3)

In [18]:
# Input PNGs are read from sourceFolder; compressed .jp2 files are written to
# destinationFolder.
sourceFolder = "cropped_images"
destinationFolder = "compressed_draft"

# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(destinationFolder, exist_ok=True)

# Process files in numeric order of their base name (1.png, 2.png, 10.png, ...).
# Only PNG entries are kept: anything else in the folder (hidden files,
# checkpoint directories, ...) has no numeric stem and would make int() raise.
sorted_sourcefolder_walk = sorted(
    (entry for entry in os.listdir(sourceFolder) if entry.lower().endswith(".png")),
    key=lambda x: int(os.path.splitext(x)[0]),
)

In [19]:
# Target size on disk for each JPEG 2000 file, in kB (get_image_size reports bytes / 1000).
desired_jpeg2000_size_kb = 20
# TODO: For a 20k target, far more pictures were generated than for 100k. It seems
# that the more aggressive the compression, the better the 1.5151 approximation
# performs, which is better for hard drive storage space as well.

# Largest accepted absolute difference between actual and desired size, in kB.
tolerated_file_size_deviation = 2

# NOTE(review): not used anywhere in this cell; presumably intended for an
# iterative quality adjustment - confirm before removing.
jpeg2000_quality_step = .2

# Maps image base name -> quality-layer value used for the accepted file.
# (Despite the variable name, the stored values are quality values, not sizes.)
jpeg2000_file_sizes: dict[str, float] = {}


# TODO: Add or subtract a constant epsilon from 1.5151 so that the resulting file
# size is always lower than desired, to be on the safe side.
for file_name in sorted_sourcefolder_walk:
    png_path = os.path.join(sourceFolder, file_name)
    base_name = os.path.splitext(file_name)[0]
    png_size = get_image_size(sourceFolder, base_name, 'png')

    # Some pictures are too small, meaning even with the highest compression
    # ratio they do not reach our desired size on disk. Checking this before
    # opening the image avoids creating file handles that are never used.
    if png_size < 1.5 * desired_jpeg2000_size_kb:
        continue

    image = get_png_image(png_path)
    if image is None:
        # The helper already printed a notice; there is nothing to compress.
        continue

    # First attempt to compress to the desired file size.
    base_jpeg2000_quality = calculate_jpeg2000_layer_quality(png_size, desired_jpeg2000_size_kb)
    new_jpeg2000_quality = base_jpeg2000_quality
    try:
        save_jpeg2000_img(img=image, name=base_name, quality=base_jpeg2000_quality)
    finally:
        # Always release the image, whether or not the result is kept.
        image.close()

    jpeg2000_size = get_image_size(destinationFolder, base_name, 'jp2')

    if np.abs(jpeg2000_size - desired_jpeg2000_size_kb) < tolerated_file_size_deviation:
        jpeg2000_file_sizes[base_name] = new_jpeg2000_quality
        print(f"{base_name} was below threshold")
    else:
        # Outside the tolerated deviation: discard the compressed file.
        remove_image(destinationFolder, base_name, 'jp2')
        



1 was below threshold
2 was below threshold
3 was below threshold
5 was below threshold
9 was below threshold
10 was below threshold
11 was below threshold
12 was below threshold
15 was below threshold
16 was below threshold
19 was below threshold
22 was below threshold
25 was below threshold
30 was below threshold
34 was below threshold
35 was below threshold
38 was below threshold
42 was below threshold
45 was below threshold
49 was below threshold
51 was below threshold
52 was below threshold
53 was below threshold
56 was below threshold
59 was below threshold
66 was below threshold
69 was below threshold
70 was below threshold
73 was below threshold
75 was below threshold
76 was below threshold
78 was below threshold
82 was below threshold
83 was below threshold
86 was below threshold
88 was below threshold
92 was below threshold
93 was below threshold
97 was below threshold
98 was below threshold
101 was below threshold
102 was below threshold
103 was below threshold
104 was below

In [20]:
# Completion marker: printed once execution reaches this final cell.
print("Done !!!")

Done !!!
