In [2]:
import pandas as pd
import cv2
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm


In [3]:
df = pd.read_csv("../all_csv_images/filtered_file_ocr_no_duplicates.csv")


In [4]:
# load the csv file
df = pd.read_csv("../all_csv_images/WikiArt_images.csv")

# create an empty list to hold the images
art = []

# iterate through each row in the csv file
for index, row in df.iterrows():
    # get the relative path of the image
    path = row['relative_path']

    # load the image using cv2.imread
    piece = cv2.imread(f"{path}")

    # check if the image is not empty
    if piece is not None:
        # convert the image to RGB color space
        piece = cv2.cvtColor(piece, cv2.COLOR_BGR2RGB)

        # append the image to the art list
        art.append(piece)
    else:
        print(f"Empty image at {path}")


KeyboardInterrupt: 

In [None]:
def extract_features(images):
    hog = cv2.HOGDescriptor()
    features = []
    for image in images:
        fd = hog.compute(image)
        features.append(fd)
    return features

art_features = extract_features(art)


KeyboardInterrupt: 

In [None]:
results = {}
for i, row in df.iterrows():
    meme = cv2.imread(row["relative_path"])
    meme = cv2.cvtColor(meme, cv2.COLOR_BGR2RGB)
    meme_features = extract_features([meme])
    similarity = cosine_similarity(meme_features, art_features)
    match_index = similarity.argmax(axis=1)
    results[row["relative_path"]] = "art" + str(match_index[0] + 1)



In [None]:
df["matched_art"] = df["relative_path"].map(results)


In [None]:
df.to_csv("filtered_file_ocr_no_duplicates_with_matching.csv", index=False)


In [4]:
import pandas as pd
import cv2
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

BATCH_SIZE = 5

df = pd.read_csv("../all_csv_images/filtered_file_ocr_no_duplicates.csv")

image_paths = []

for index, row in df.iterrows():
    path = row['relative_path']
    
    image_paths.append(path)

results = {}

win_size = (64, 64)
block_size = (16, 16)
block_stride = (8, 8)
cell_size = (8, 8)
nbins = 9

hog = cv2.HOGDescriptor(win_size, block_size, block_stride, cell_size, nbins)


for i in tqdm(range(0, len(image_paths), BATCH_SIZE)):
    batch_paths = image_paths[i:i+BATCH_SIZE]
    batch_images = []
    for path in batch_paths:
        image = cv2.imread(path)
        if image is not None:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            image = cv2.resize(image, (64, 64))
            batch_images.append(image)
        else:
            print(f"Empty image at {path}")


    batch_features = np.zeros((len(batch_images), hog.getDescriptorSize()), dtype=np.float32)
    for j, image in enumerate(batch_images):
        if image is not None and image.shape == (64, 64, 3):
            fd = hog.compute(image)
            batch_features[j, :] = fd.T
        else:
            print(f"Invalid image at index {j} in batch")

    

    for j, row in df.iloc[i:i+BATCH_SIZE].iterrows():
        meme_path = row["relative_path"]
        meme_image = cv2.imread(meme_path)
        if meme_image is not None:
            meme_image = cv2.cvtColor(meme_image, cv2.COLOR_BGR2RGB)
            meme_image = cv2.resize(meme_image, (64, 64))
        else:
            print(f"Empty image at {meme_path}")
            continue

        meme_features = hog.compute(meme_image).T.reshape(1, -1)
        similarity = cosine_similarity(meme_features, batch_features)
        match_index = similarity.argmax(axis=1)
        results[meme_path] = "art" + str(match_index[0] + i + 1)


df["matched_art"] = df["relative_path"].map(results)


df.to_csv("filtered_file_ocr_no_duplicates_with_matching.csv", index=False)


  0%|          | 13/4675 [00:02<16:55,  4.59it/s]

Empty image at ../images/EDEZ6.jpg
Empty image at ../images/EDEZ6.jpg


 75%|███████▌  | 3515/4675 [13:14<02:47,  6.93it/s]

Empty image at ../images/5isQukU.jpg
Empty image at ../images/5isQukU.jpg


 76%|███████▌  | 3537/4675 [13:17<03:38,  5.20it/s]

Empty image at ../images/tkX4Zjh.jpg
Empty image at ../images/iWMFoLF.jpg
Empty image at ../images/tkX4Zjh.jpg
Empty image at ../images/iWMFoLF.jpg


 79%|███████▉  | 3697/4675 [13:48<01:54,  8.56it/s]

Empty image at ../images/smgbZ91.jpg
Empty image at ../images/smgbZ91.jpg


 91%|█████████ | 4256/4675 [15:40<01:34,  4.43it/s]

Empty image at ../images/Persée%2Btuant%2Ble%2Bdragon%2B-%2Bfélix%2BVallotton%2B2.png
Empty image at ../images/Persée%2Btuant%2Ble%2Bdragon%2B-%2Bfélix%2BVallotton%2B2.png


 97%|█████████▋| 4554/4675 [16:42<00:19,  6.26it/s]

Empty image at ../images/DokqfS0.jpg
Empty image at ../images/DokqfS0.jpg


100%|█████████▉| 4652/4675 [16:58<00:02,  7.74it/s]

Empty image at ../images/jdI5Wo3.jpg
Empty image at ../images/jdI5Wo3.jpg


100%|██████████| 4675/4675 [17:04<00:00,  4.56it/s]


In [49]:
# https://youtu.be/16s3Pi1InPU
"""
Comparing images using ORB/SIFT feature detectors
and structural similarity index. 

@author: Sreenivas Bhattiprolu
"""


from skimage.metrics import structural_similarity
import cv2


def orb_sim(img1, img2):

  orb = cv2.ORB_create()


  kp_a, desc_a = orb.detectAndCompute(img1, None)
  kp_b, desc_b = orb.detectAndCompute(img2, None)


  bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)

  matches = bf.match(desc_a, desc_b)

  similar_regions = [i for i in matches if i.distance < 40]  
  if len(matches) == 0:
    return 0
  return len(similar_regions) / len(matches)



def structural_sim(img1, img2):

  sim, diff = structural_similarity(img1, img2, full=True)
  return sim

img_test1 = cv2.imread('../test_images/Test5.jpg', 0)
img_test2 = cv2.imread('../test_images/Test6.png', 0)

img00 = cv2.imread('../images_art/5a02d7d6edc2c9410844bbf3.jpg', 0)

img01 = cv2.imread('../images_art/5a02d7d6edc2c9410844bbf3.jpg', 0)

img1 = cv2.imread('../images_art/5a02d7d6edc2c9410844bbf3.jpg', 0)  
img2 = cv2.imread('../images_art/5a4cf41dedc2c900bcea5fc9.jpg', 0)  
img3 = cv2.imread('../images_art/5a4cf42aedc2c900bcea8fbb.jpg', 0)  
img4 = cv2.imread('../images_art/5a4cf42cedc2c900bcea968d.jpg', 0)  

orb_similarity = orb_sim(img_test1, img_test2)  

print("Similarity using ORB is: ", orb_similarity)

from skimage.transform import resize
img5 = resize(img_test2, (img_test1.shape[0], img_test1.shape[1]), anti_aliasing=True, preserve_range=True)

ssim = structural_sim(img_test1, img5) 
print("Similarity using SSIM is: ", ssim)

Similarity using ORB is:  0.216
Similarity using SSIM is:  0.2397903110957828


In [57]:
import os
from skimage.metrics import structural_similarity
import cv2


def orb_sim(img1, img2):

    orb = cv2.ORB_create()


    kp_a, desc_a = orb.detectAndCompute(img1, None)
    kp_b, desc_b = orb.detectAndCompute(img2, None)


    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)


    matches = bf.match(desc_a, desc_b)

    similar_regions = [i for i in matches if i.distance < 40]
    if len(matches) == 0:
        return 0
    return len(similar_regions) / len(matches)



def structural_sim(img1, img2):
    sim, diff = structural_similarity(img1, img2, full=True)
    return sim

folder1 = '../images'
folder2 = '../images_art'

max_scores = {}  

for filename1 in os.listdir(folder1):
    try:
        img_path1 = os.path.join(folder1, filename1)
        img1 = cv2.imread(img_path1, 0)
        
        max_score = 0  
        
        for filename2 in os.listdir(folder2):
            img_path2 = os.path.join(folder2, filename2)
            img2 = cv2.imread(img_path2, 0)

            orb_similarity = orb_sim(img1, img2)  


            from skimage.transform import resize

            img5 = resize(img2, (img1.shape[0], img1.shape[1]), anti_aliasing=True, preserve_range=True)

            ssim = structural_sim(img1, img5)  

            avg_similarity = (orb_similarity + ssim) / 2
            
            if avg_similarity > 0.35 and avg_similarity > max_score:
                max_score = avg_similarity  
                
                max_scores[filename1] = (filename2, max_score)
                print(f"New max score for {filename1}: {filename2}")
    except:
        print("Error processing", filename1)

for filename1, (filename2, max_score) in max_scores.items():
    print(f"Highest match found for {filename1}: {filename2}")
    print("Highest similarity score is:", max_score)


  return func(*args, **kwargs)


Error processing 002yholifze81.jpg
Error processing 0032kpxx8fn11.png
New max score for 006rXbh.jpg: 57726e89edc2cb3880b75d96.jpg
Error processing 006rXbh.jpg
New max score for 00a14ud32du31.jpg: 57726e38edc2cb3880b6299e.jpg
Error processing 00a14ud32du31.jpg
