### Find similar images in a corpus

In [13]:
import concurrent
import concurrent.futures
import itertools
import json
import os
import shutil
from io import BytesIO

import imagehash
import pandas as pd
import tqdm
from PIL import Image

### Test imagehash library with some image samples

In [None]:
# Compare the hash algorithms offered by imagehash on a handful of sample images.
# Rows are gathered in a plain list and converted to a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.
# NOTE(review): IMAGES_FOLDER is not defined in the visible cells - confirm it is
# set before this cell runs.
records = []

for img in os.listdir(IMAGES_FOLDER):
    # The context manager closes the underlying file once all hashes are computed.
    with Image.open(os.path.join(IMAGES_FOLDER, img)) as file:
        records.append({
            'image': img,
            'ahash': imagehash.average_hash(file),
            'phash': imagehash.phash(file),
            'dhash': imagehash.dhash(file),
            'whash': imagehash.whash(file),
            'colorhash': imagehash.colorhash(file),
        })

# Passing `columns` keeps the column order (and the columns themselves) even when
# the folder is empty.
df = pd.DataFrame(records, columns=['image', 'ahash', 'phash', 'dhash', 'whash', 'colorhash'])

In [13]:
# Display the per-image hashes side by side so the algorithms can be compared.
df

Unnamed: 0,image,ahash,phash,dhash,whash,colorhash
0,blurry.PNG,00ff19bd81a1ffff,af4ad42751510f5d,a5256951554f0b36,00d711698181ffff,06200000180
1,high.png,00ff99bd81a1ffff,bf0ad42751510f5d,e5256971556f0b16,00d791a98181efff,03200000180
2,medium.jpg,00ff99bd81a1ffff,bf0ad42751510f5d,e5256951556f0b16,00d791a98181efff,03200000180
3,other.PNG,00ffff9d990100d9,fbd0453b5c1d1345,a4a5313171597333,00ffff9db90100d9,0b2010000c0
4,small.jpg,00ff99bd81a1ffff,ff0ad42751110f5d,a5256971556f0b16,00d791a98181efff,03200000180


Ahash (average hash) seems to give the best results across different kinds of images

### Try average hash on real corpus

In [80]:
# Root directory that contains one sub-folder per image corpus.
prefix = '/home/tyra/Documents/CERES/PMA'
# Absolute path of each corpus folder, in the order TDG, PMA, MPT.
userpaths = list(
    os.path.join(prefix, corpus_dir)
    for corpus_dir in ('images_TDG', 'images_PMA', 'images_MPT')
)
# Destination of the generated files; the trailing slash matters because later
# cells build paths by plain string concatenation onto this value.
output_path = r'/home/tyra/Documents/CERES/PMA/multi_corpus/'

Some images were collected by mistake. They are listed in `useless.json` and are excluded from the current analysis.

In [None]:
# Load the identifiers of the images that were collected by mistake;
# `is_image` consults this collection to filter them out.
with open(
    os.path.join('/home/tyra/Documents/CERES/PMA', 'useless.json'),
    'r',
) as f:
    useless = json.load(f)

In [79]:
def is_image(filename):
    """Tell whether ``filename`` names an image that should be analysed.

    The comparison is case-insensitive. A name qualifies when it ends with a
    known image extension, or merely contains ``.jpg`` somewhere, and the part
    of the lower-cased name before its first dot is not listed in the
    module-level ``useless`` collection.
    """
    lowered = filename.lower()
    known_suffix = lowered.endswith((".png", ".jpg", ".jpeg", ".bmp", ".gif", ".svg"))
    if not known_suffix and '.jpg' not in lowered:
        return False
    # Only names that look like images are checked against the exclusion list.
    stem = lowered.split('.')[0]
    return stem not in useless
    
# Full paths of every usable image, corpus after corpus.
image_filenames = []

for userpath in userpaths:
    image_filenames.extend(
        os.path.join(userpath, entry)
        for entry in os.listdir(userpath)
        if is_image(entry)
    )
# Cell output: how many images will be hashed.
len(image_filenames)

32039

Compute all hashes with the specified precision

In [70]:
def hasher(path, dic, precision, bar):
    """Hash one image and record its path under that hash in ``dic``.

    Parameters
    ----------
    path : path of the image file to open.
    dic : dict mapping ``str(hash)`` to the list of paths that produced it;
        mutated in place.
    precision : forwarded to ``imagehash.average_hash`` as its second
        positional argument (the hash size).
    bar : progress bar; ``bar.update(1)`` is called exactly once per call,
        including when hashing fails.

    Any exception raised while opening or hashing the image propagates to the
    caller (or to the ``Future`` when run through an executor).
    """
    try:
        # Close the file handle as soon as the hash is computed instead of
        # leaving one open handle per image until garbage collection.
        with Image.open(path) as img:
            key = str(imagehash.average_hash(img, precision))
        # This function is run from several threads on the same dict. A
        # separate "check, then insert an empty list" lets one thread replace
        # the list another thread has just filled; setdefault performs the
        # lookup and the insertion as a single dict operation in CPython.
        dic.setdefault(key, []).append(path)
    finally:
        # Advance the bar even on failure so it reaches its total.
        bar.update(1)

In [73]:
# Hash every image in a thread pool, filling ahash_to_paths: hash string -> list of paths.
ahash_to_paths = {}
precision = 7
with tqdm.tqdm(total=len(image_filenames)) as pbar:
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Keep each Future together with its path: an exception raised inside a
        # worker stays hidden in the Future unless it is inspected afterwards.
        future_to_path = {
            executor.submit(hasher, path, ahash_to_paths, precision, pbar): path
            for path in image_filenames
        }

# Leaving the executor block waits for all tasks, so every Future is done here.
failed_paths = [
    path for future, path in future_to_path.items()
    if future.exception() is not None
]
if failed_paths:
    # Report files that could not be hashed instead of dropping them silently.
    print(f'{len(failed_paths)} file(s) could not be hashed, e.g. {failed_paths[:5]}')

 62%|███████████████████████▍              | 4085/6628 [00:11<00:07, 354.32it/s]


Create a dictionary sha1 -> ahash that will be used to update the database

In [42]:
# Invert ahash_to_paths into one entry per image: identifier -> average hash.
sha1_to_ahash = {}

for ahash, paths in ahash_to_paths.items():
    for path in paths:
        # The identifier is the file name up to its first dot (assumed to be the
        # image's sha1 - confirm against how the corpus files are named).
        # basename works with whatever separator os.path.join produced.
        sha1 = os.path.basename(path).split('.')[0]
        sha1_to_ahash[sha1] = str(ahash)

# Nothing before this cell creates the output folder, so make sure it exists
# on a fresh run before writing into it.
os.makedirs(output_path, exist_ok=True)
with open(os.path.join(output_path, f'sha1_to_ahash_{precision}.json'), 'w') as f:
    json.dump(sha1_to_ahash, f)

Group images with identical hashes in the same folder

In [77]:
# Materialise every hash bucket as <output_path>/<precision>/<hash>/ filled with
# symlinks to the original images, and link the buckets that mix the TDG corpus
# with another corpus under <output_path>/multi/.
multi_folder = os.path.join(output_path, 'multi')
# The 'multi' folder must exist before links can be created inside it.
os.makedirs(multi_folder, exist_ok=True)

for hash_, paths in ahash_to_paths.items():
    if len(paths) > 0:
        folder = os.path.join(output_path, str(precision), str(hash_))
        os.makedirs(folder, exist_ok=True)
        folders = []
        for path in paths:
            # The last two path components are the corpus folder and the file name.
            corpus, file = path.split('/')[-2:]
            folders.append(corpus)
            # Use a symlink instead of copying the file; skip links left by a
            # previous run so the cell can be re-executed without FileExistsError.
            link = os.path.join(folder, f'{corpus}-{file}')
            if not os.path.lexists(link):
                os.symlink(path, link)
        # Detect hashes whose files come both from TDG and from another corpus.
        if len(folders) > 1:
            # Corpus folders are named e.g. 'images_TDG', so compare case-insensitively;
            # a plain "'tdg' in name" never matches the upper-case folder name.
            in_tdg = any('tdg' in name.lower() for name in folders)
            in_other = any('tdg' not in name.lower() for name in folders)
            if in_tdg and in_other:
                multi_link = os.path.join(multi_folder, hash_)
                if not os.path.lexists(multi_link):
                    os.symlink(folder, multi_link)