In [37]:
#   Source: https://www.kaggle.com/stehai/duplicate-images-data-cleaning
#   Modified by team Pepper
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import collections
import imagehash
from os import path as os_path

TRAIN_IMG_PATH = r"train/train"
###############################################################################
# misc functions

def plot_images(path, imgs):
    assert(isinstance(imgs, collections.Iterable))
    imgs_list = list(imgs)
    nrows = len(imgs_list)
    if (nrows % 2 != 0):
        nrows = nrows + 1 

    plt.figure(figsize=(18, 6*nrows/2))
    for i, img_file in enumerate(imgs_list):
        with Image.open(os_path.join(path, img_file)) as img:
            ax = plt.subplot(nrows/2, 2, i+1)
            ax.set_title("#{}: '{}'".format(i+1, img_file))
            ax.imshow(img)
        
    plt.show()


###############################################################################
# load data

def getImageMetaData(file_path):
    with Image.open(file_path) as img:
        img_hash = imagehash.phash(img)
        return img.size, img.mode, img_hash

def get_train_input():
    train_input = pd.read_csv(r"train.csv")
    
    m = train_input.Image.apply(lambda x: getImageMetaData(os_path.join(TRAIN_IMG_PATH, x)))
    train_input["Hash"] = [str(i[2]) for i in m]
    train_input["Shape"] = [i[0] for i in m]
    train_input["Mode"] = [str(i[1]) for i in m]
    train_input["Length"] = train_input["Shape"].apply(lambda x: x[0]*x[1])
    train_input["Ratio"] = train_input["Shape"].apply(lambda x: x[0]/x[1])
    train_input["New_Whale"] = train_input.Id == "new_whale"
    
    return train_input

train_input = get_train_input()

###############################################################################
# data cleaning duplicate images

# determine duplicate images using the hash

t = train_input.Hash.value_counts()
t = t[t > 1]
duplicates_df = pd.DataFrame(t)

# get the Ids of the duplicate images
duplicates_df["Ids"] =list(map(
            lambda x: set(train_input.Id[train_input.Hash==x].values), 
            t.index))
duplicates_df["Ids"] =list(map(
            lambda x: set(train_input.Id[train_input.Hash==x].values), 
            t.index))
duplicates_df["Ids_count"] = duplicates_df.Ids.apply(lambda x: len(x))
duplicates_df["Ids_contain_new_whale"] = duplicates_df.Ids.apply(lambda x: "new_whale" in x)

print(duplicates_df.head(20))

###
# There are 3 types of data errors regarding duplicate images:
#
# 1) The same image with the corresponding Id appears multiple time.
# 2) The same image appears with an Id and as "new_whale".
# 3) The same image appears with different Ids (ambiguous classified). 
#

# Fix error type 1: The same image with the corresponding Id appears multiple time.

train_input.drop_duplicates(["Hash", "Id"], inplace = True)

# Fix error type 2: The same image appears with an Id and as "new_whale".
# => delete the "new_whale" entry

drop_hash = duplicates_df.loc[(duplicates_df.Ids_count>1) & (duplicates_df.Ids_contain_new_whale==True)].index
train_input.drop(train_input.index[(train_input.Hash.isin(drop_hash) & (train_input.Id=="new_whale"))], inplace=True)

# Fix error type 3: The same image appears with different Ids (ambiguous classified).
# => delete all of them

drop_hash = duplicates_df.loc[(duplicates_df.Ids_count>1) & ((duplicates_df.Ids_count - duplicates_df.Ids_contain_new_whale)>1)].index

#print("Ambiguous classified images:")
#for i in drop_hash:
    #plot_images(TRAIN_IMG_PATH, 
                #train_input[train_input.Hash==i].Image)

train_input.drop(train_input.index[train_input.Hash.isin(drop_hash)], inplace=True)

# check if there are still duplicate images
assert(np.sum(train_input.Hash.value_counts()>1) == 0)

                  Hash                     Ids  Ids_count  \
bb8ec43039cb663c     3  {w_cae7677, new_whale}          2   
9e1bc5d0bc4e0bc3     2  {new_whale, w_7185713}          2   
af8fd0fcd3702940     2  {new_whale, w_1f09cdd}          2   
e9889673ed9d5a02     2  {new_whale, w_a365757}          2   
ee9ac1f47a4b8470     2  {new_whale, w_17a2610}          2   
ed088dab92f0e3f0     2  {new_whale, w_ab4cae2}          2   
932c5ac3a4b9ac5b     2             {w_2f54c3c}          1   
8b90a4633b5cd29f     2             {w_dcb1f2a}          1   
84717a9ec1a4717d     2             {w_ee948c6}          1   
96bd2dc2c8c272f4     2  {w_5ba417d, new_whale}          2   
e159966ab5455a8e     2             {w_9ff699b}          1   
a19886a5f9d8bc27     2             {w_5c6215c}          1   
afdad0b5c0ace928     2             {w_242a05d}          1   
f688cd27921d28d7     2             {w_f19faeb}          1   
e86a9296343dcbca     2             {w_05b2ddd}          1   
e990966279da8667     2  

In [3]:
print("Ambiguous classified images:")
for i in drop_hash:
    plot_images(TRAIN_IMG_PATH, 
                train_input[train_input.Hash==i].Image)

Ambiguous classified images:


<Figure size 1296x0 with 0 Axes>

<Figure size 1296x0 with 0 Axes>

<Figure size 1296x0 with 0 Axes>

<Figure size 1296x0 with 0 Axes>

<Figure size 1296x0 with 0 Axes>

<Figure size 1296x0 with 0 Axes>

<Figure size 1296x0 with 0 Axes>

<Figure size 1296x0 with 0 Axes>

<Figure size 1296x0 with 0 Axes>

<Figure size 1296x0 with 0 Axes>

In [36]:
duplicates_df

Unnamed: 0,Hash,Ids,Ids_count,Ids_contain_new_whale
bb8ec43039cb663c,3,"{w_cae7677, new_whale}",2,True
9e1bc5d0bc4e0bc3,2,"{new_whale, w_7185713}",2,True
af8fd0fcd3702940,2,"{new_whale, w_1f09cdd}",2,True
e9889673ed9d5a02,2,"{new_whale, w_a365757}",2,True
ee9ac1f47a4b8470,2,"{new_whale, w_17a2610}",2,True
ed088dab92f0e3f0,2,"{new_whale, w_ab4cae2}",2,True
932c5ac3a4b9ac5b,2,{w_2f54c3c},1,False
8b90a4633b5cd29f,2,{w_dcb1f2a},1,False
84717a9ec1a4717d,2,{w_ee948c6},1,False
96bd2dc2c8c272f4,2,"{w_5ba417d, new_whale}",2,True


In [38]:
train_input

Unnamed: 0,Image,Id,Hash,Shape,Mode,Length,Ratio,New_Whale
0,00022e1a.jpg,w_e15442c,b362cc79b1a623b8,"(699, 500)",L,349500,1.398000,False
1,000466c4.jpg,w_1287fbc,b3cccc3331cc8733,"(1050, 700)",RGB,735000,1.500000,False
2,00087b01.jpg,w_da2efe0,bc4ed0f2a7e168a8,"(1050, 368)",RGB,386400,2.853261,False
3,001296d5.jpg,w_19e5482,93742d9a28b35b87,"(397, 170)",RGB,67490,2.335294,False
4,0014cfdf.jpg,w_f22f3e3,d4a1dab1c49f6352,"(700, 398)",L,278600,1.758794,False
5,0025e8c2.jpg,w_8b1ca89,ee1292c52c7bc627,"(1000, 652)",RGB,652000,1.533742,False
6,0026a8ab.jpg,w_eaad6a8,dacaa0f0bde91625,"(1050, 317)",RGB,332850,3.312303,False
8,0035632e.jpg,w_3d0bc7a,bb0ed0b6e5a1ac31,"(843, 400)",RGB,337200,2.107500,False
9,0037e7d3.jpg,w_50db782,b2cccd31f2c84ce3,"(1050, 700)",RGB,735000,1.500000,False
10,00389cd7.jpg,w_2863d51,fbc3a5782ec1c0c3,"(1050, 600)",RGB,630000,1.750000,False


In [48]:
adf = adf.drop(['Length', 'Mode', 'Shape', 'New_Whale', 'Ratio', 'Hash'], axis = 1)

In [49]:
adf

Unnamed: 0,Image,Id
0,00022e1a.jpg,w_e15442c
1,000466c4.jpg,w_1287fbc
2,00087b01.jpg,w_da2efe0
3,001296d5.jpg,w_19e5482
4,0014cfdf.jpg,w_f22f3e3
5,0025e8c2.jpg,w_8b1ca89
6,0026a8ab.jpg,w_eaad6a8
8,0035632e.jpg,w_3d0bc7a
9,0037e7d3.jpg,w_50db782
10,00389cd7.jpg,w_2863d51


In [51]:
adf.to_csv('new.csv', sep='\t', encoding='ascii', index = False)