In [1]:
from PIL import Image
import os
import pytesseract
from tqdm import tqdm
from joblib import Parallel, delayed

# Specify Tesseract path (example for Windows)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

#-------------------------------------------------------------------------------------JUPYTER NOTEBOOK SETTINGS-------------------------------------------------------------------------------------
from IPython.core.display import display, HTML                                    
display(HTML("<style>.container { width:100% !important; }</style>"))    

  from IPython.core.display import display, HTML


In [3]:
def extract_text_from_image(image_path, top_left_coordinates, dimension):
    try:
        img = Image.open(image_path)
        region = img.crop((top_left_coordinates[0], top_left_coordinates[1], top_left_coordinates[0] + dimension[0], top_left_coordinates[1] + dimension[1]))
        text = pytesseract.image_to_string(region).strip()
        return text
    except Exception as e:
        print(f"Error: {e}")
        return None

def delete_images_with_short_text(args):
    image_path, top_left_coordinates, dimension, min_text_length = args
    extracted_text = extract_text_from_image(image_path, top_left_coordinates, dimension)
    if extracted_text and len(extracted_text) < min_text_length:
        os.remove(image_path)
        return f"Deleted: {os.path.basename(image_path)}"
    return None

if __name__ == "__main__":
    folder_path = "pascal_voc_datasets/VOCdevkit/PlotsEnchanced_NoAugmentation/images/coinmarketcap.com/1429x909"
    top_left_x, top_left_y = 0, 0  # Define the top-left coordinates here
    top_left_coordinates = (top_left_x, top_left_y)
    region_dimension = (20, 150)  # Set the region dimension here
    min_text_length = 10  # Set the minimum text length here

    image_paths = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path)
                   if filename.lower().endswith((".jpg", ".png", ".jpeg"))]

    num_cpus = os.cpu_count()  # Get the number of available CPU cores

    results = Parallel(n_jobs=num_cpus)(
        delayed(delete_images_with_short_text)(args) for args in tqdm(
            [(image_path, top_left_coordinates, region_dimension, min_text_length) for image_path in image_paths],
            total=len(image_paths),
            desc="Processing images"
        )
    )

    for result in results:
        if result:
            print(result)



Processing images: 100%|███████████████████████████████████████████████████████████████████| 10406/10406 [01:08<00:00, 151.58it/s]


Deleted: 0_aave-link.png
Deleted: 0_aave-tusd.png
Deleted: 0_balancer-boosted-aave-usdt.png
Deleted: 0_cloakcoin.png
Deleted: 0_eco.png
Deleted: 0_educare.png
Deleted: 0_ethereum-name-service.png
Deleted: 0_ethereumfair.png
Deleted: 0_etherparty.png
Deleted: 0_ethlas.png
Deleted: 0_ethscape.png
Deleted: 0_evany.png
Deleted: 0_everest.png
Deleted: 0_everreflect.png
Deleted: 0_everycoin.png
Deleted: 0_exenpay-token.png
Deleted: 0_exobots.png
Deleted: 0_ezystayz.png
Deleted: 0_fable-of-the-dragon.png
Deleted: 0_fairface.png
Deleted: 0_fame-mma.png
Deleted: 0_fang-token.png
Deleted: 0_fantom-maker.png
Deleted: 0_fantom-oasis.png
Deleted: 0_fanzy.png
Deleted: 0_fathom-protocol.png
Deleted: 0_favor.png
Deleted: 0_feathercoin.png
Deleted: 0_felicette.png
Deleted: 0_ferro.png
Deleted: 0_fetch.png
Deleted: 0_fibos.png
Deleted: 0_fidira.png
Deleted: 0_filecoin-standard-hashrate-token.png
Deleted: 0_filestar.png
Deleted: 0_finance-ai.png
Deleted: 0_finschia.png
Deleted: 0_fintrux-network.png
Dele