In [1]:
import glob
import hashlib
import imagehash
from PIL import Image
import pandas as pd
from tqdm import tqdm
from collections import Counter

In [2]:
hash_size = 24

filename = []
md5_hash = []
avg_hash = []

# Read 2 types of hash (md5 and avg) and store data in arrays
for file in tqdm(glob.glob(r"W:\Users\Goldian\Desktop\data\images\images\*.jpg")):
    file_name = file.split("\\")[-1]

    with open(file, 'rb') as f:
        file_hash = hashlib.md5(f.read()).hexdigest()
        
    with Image.open(file) as img:
        temp_hash = imagehash.average_hash(img, hash_size)

    filename.append(file_name)
    md5_hash.append(file_hash)
    avg_hash.append(temp_hash)


pd.DataFrame({'image': filename, 'md5': md5_hash, 'avg': avg_hash}).to_csv('allhash.csv', index=False)

100%|██████████| 86627/86627 [30:08<00:00, 47.89it/s] 


In [14]:
df = pd.read_csv(r"..\\beda.csv")
tr = pd.read_csv(r"..\\data\train.csv")
df_test = pd.read_csv(r"..\\data\test.csv")

In [15]:
print(f'The dimensionality of the data before cleaning: {tr.shape}')

# Removing dublicate by hash
df_mod = df.drop_duplicates("avg")
df_mod = df_mod.drop_duplicates('md5')

# Combine the known class_id from the train dataframe
df_mod["class_id"] = df_mod['image'].map(tr.set_index('image')['class_id'])

# Removing NA values
df_mod.dropna(inplace=True)

# Save dataframe
df_mod = df_mod[['class_id', 'image']]
df_mod.class_id = df_mod.class_id.astype(int)
df_mod.to_csv('clean_train.csv', index=False)

print(f'The final dimensionality of the data after cleaning: {df_mod.shape}')
display(df_mod.head())

The dimensionality of the data before cleaning: (47575, 2)
The final dimensionality of the data after cleaning: (40832, 2)


Unnamed: 0,class_id,image
0,1410,012IhTSMDC8Pp7NtwUJVlOao3HrcRk4Ad9xG5XFE.jpg
2,2871,018lTBUgtZ9pC4FVdbvOPwRNYG5Q2nkrADSuyhjJ.jpg
3,3434,019iQvjmJUFVLDSIdgz3thaGPR6oc5fETyCnAxqN.jpg
4,2986,019vfB7ykc2dOVYGTAMmDZuNbiIg8exsWjX6SPQo.jpg
5,582,01BaP8ewyHRTWGUKYOZmELxQuzc3NCMk7V9ghf5n.jpg


In [16]:
# Combine the known class_id from the common dataframe
df["class_id"] = df['image'].map(tr.set_index('image')['class_id'])

# Assign a hash to the test dataframe from the common dataframe
df_test["avg"] = df_test['image'].map(df.set_index('image')['avg'])
df_test['md5'] = df_test['image'].map(df.set_index('image')['md5'])

# Creating blank labels
df_test["label_md5"] = 0
df_test["label_avg"] = 0
df_test["class_id"] = 0       # final label

# Emulating the training dataframe, removing empty classes
df.dropna(inplace=True)
df.class_id = df.class_id.astype(int)


# Iterate through each line and save its index
for index, row in df_test.iterrows():

    # We take a data slice where the hash equals the row hash
    slice_df_avg = df[df['avg'] == row["avg"]]
    slice_df_md5 = df[df['md5'] == row["md5"]]

    # Access the line to change
    index_label_md5 = df_test.at[index, "label_md5"]
    index_label_avg = df_test.at[index, "label_avg"]

    # If the hash is unique, assign -1 class
    if len(slice_df_avg) == 0:
        index_label_avg  = -1

    # Otherwise, choose the most popular (mode) class among the entire data slice
    else:
        index_label_avg  = list(Counter(slice_df_avg['class_id']).keys())[0]
    
    # Similarly for the md5 hash
    if len(slice_df_md5) == 0:
        index_label_md5  = -1
    else:
        index_label_md5  = list(Counter(slice_df_md5['class_id']).keys())[0]



    if index_label_md5 != index_label_avg:

        # If the classes are different, then trust md5, because it is based on a perfect match
        if index_label_md5 != -1:
            df_test.at[index, 'class_id'] = index_label_md5

        # If no md5 hash matches are found (class -1), choose avg
        else:
            df_test.at[index, "class_id"] = index_label_avg

    else:
        df_test.at[index, 'class_id'] = index_label_md5


print(f'Total number of successfully found classes - {sum(df_test.class_id != -1)} \ {len(df_test)}')


# Save dataframe

df_test.class_id = df_test.class_id.astype(int)
sub = df_test[['class_id', 'image']]
sub.to_csv('predict_class_hash.csv', index=False)

Total number of successfully found classes - 5279 \ 31718
