In [1]:
from fastai.vision.all import *

In [2]:
# Root folder of the data; the two image folders live directly below it.
base_path = Path('src')
img_train, img_test = (base_path / folder for folder in ('train_images', 'test_images'))

In [3]:
# Read the train and test metadata tables that sit next to the image folders.
df_train, df_test = (pd.read_csv(base_path / csv_name) for csv_name in ('train.csv', 'test.csv'))

In [4]:
# Prefix every file name in the `image` column with the training image folder.
df_train['image'] = df_train['image'].map(lambda file_name: img_train / file_name)

Drop duplicate image files. TODO: rethink whether there is a smarter way to handle this, since the same picture occurs with different text.

In [5]:
# Keep the first row per image file and renumber the rows 0..n-1.
df_train = df_train.drop_duplicates(subset = ['image'], ignore_index = True)

Explore `image_phash`

In [6]:
# Compare the number of distinct perceptual hashes with the number of distinct files.
print(f"Unique images: {df_train['image_phash'].nunique()} of {df_train['image'].nunique()} unique image-files.")

Unique images: 28735 of 32412 unique image-files.


In [7]:
# For every label group, count how many distinct perceptual hashes it contains.
df_groupby_label = (
    df_train
    .groupby(['label_group'])['image_phash']
    .nunique()
    .to_frame()
)

In [8]:
# Label groups with fewer than two distinct phashes consist of one repeated picture only.
print(f"Number of labels, that only contain items with the same image_phash: {df_groupby_label['image_phash'].lt(2).sum()}")

Number of labels, that only contain items with the same image_phash: 991


Only keep labels that have at least 2 items with different image_phash.

In [None]:
# Label groups offering at least two distinct phashes ...
keep_labels = df_groupby_label.index[df_groupby_label['image_phash'] >= 2].tolist()
# ... are the only ones whose rows stay in the training frame.
df_train = df_train.loc[df_train['label_group'].isin(keep_labels)].reset_index(drop = True)

We can now make sure to draw image files with different phashes when creating the Siamese dataset.

## create smaller dataset

In [None]:
# Fraction of label groups kept in the reduced dataset. It must be defined
# before its first use in this cell; otherwise a fresh "Restart & Run All"
# stops here with a NameError.
small_pct = 0.4

# Sample whole label groups (not single rows), so every kept group stays complete.
x = df_train['label_group'].unique()
y = np.random.choice(x, size = int(len(x)*small_pct), replace = False)
df_train = df_train[df_train['label_group'].isin(y)]
# Renumber the rows 0..n-1 after the filtering.
df_train = df_train.reset_index(drop = True)

In [None]:
df_train.head()

In [None]:
class ImageTuple(fastuple):
    "A tuple of images that can be shown side by side."
    @classmethod
    def create(cls, fns):
        "Open every entry of `fns` with `PILImage.create` and wrap the results in an `ImageTuple`."
        return cls(tuple(map(PILImage.create, fns)))

    def show(self, ctx = None, **kwargs):
        "Show both images next to each other, separated by a 10 pixel wide strip of zeros."
        first, second = self
        both_tensors = isinstance(first, Tensor) and isinstance(second, Tensor)
        # Only equally shaped tensors can be concatenated; otherwise leave `ctx` untouched.
        if not (both_tensors and first.shape == second.shape):
            return ctx
        separator = first.new_zeros(first.shape[0], first.shape[1], 10)
        side_by_side = torch.cat([first, separator, second], dim=2)
        return show_image(side_by_side, ctx = ctx, **kwargs)

In [None]:
files = L(df_train.image.tolist())

In [None]:
img = ImageTuple.create((files[0], files[1]))
tst = ToTensor()(img)
type(tst[0]),type(tst[1])

In [None]:
img1 = Resize(224)(img)
tst = ToTensor()(img1)
tst.show();

In [None]:
def ImageTupleBlock():
    "`TransformBlock` that creates items with `ImageTuple.create` and uses `IntToFloatTensor` as batch transform."
    block = TransformBlock(type_tfms = ImageTuple.create, batch_tfms = IntToFloatTensor)
    return block

`splits` is a 2-tuple of L-lists. The first element is the list of indices of the training files, the second a list of the indices of the validation files. They are used to mask the files L-lists.

In [None]:
def split_by_label(df):
    """Split the rows of `df` into a training and a validation part along `label_group`.

    Whole label groups are assigned to one side, so no label is shared between
    the training and the validation set.

    Args:
        df: DataFrame with the columns `label_group` and `image`. A boolean
            column `is_valid` is added to it in place. The returned index lists
            are taken from `df.index` and are also used to index `files` by
            position, so `df` has to carry a default 0..n-1 index
            (e.g. after `reset_index(drop=True)`).

    Returns:
        Tuple `(files, train_labels, validation_labels, train_idx, validation_idx)`:
        the L-list of all `image` entries, the L-lists of the training and
        validation labels, and the lists of training and validation row indices.
    """
    # L-list of unique labels
    labels = L(df['label_group'].unique().tolist())
    # Randomly split the label positions 80/20
    train_pos, valid_pos = RandomSplitter(valid_pct=0.2)(labels)

    # Mask the labels to receive the train/validation labels
    train_labels = labels[train_pos]
    validation_labels = labels[valid_pos]

    # Mark every row as training or validation. This has to use the frame that
    # was passed in: reading the global `df_train` here produced a misaligned
    # (or NaN-filled) column for any other frame.
    df['is_valid'] = df['label_group'].isin(validation_labels)
    is_valid = df['is_valid']

    # Sanity check: no label group is spread over both sets
    assert df.groupby(['label_group'])['is_valid'].nunique().le(1).all()

    files = L(df['image'].tolist())
    train_idx = df.index[~is_valid].tolist()
    validation_idx = df.index[is_valid].tolist()

    # Sanity check: indexing `files` with the index lists selects exactly the marked rows
    assert set(files[train_idx]) == set(df.loc[~is_valid, 'image'])
    assert set(files[validation_idx]) == set(df.loc[is_valid, 'image'])

    return files, train_labels, validation_labels, train_idx, validation_idx

In [None]:
df_train.shape

In [None]:
# Split once, then bundle the row indices and the labels as (train, validation) pairs.
(files, train_labels, validation_labels,
 train_split, val_split) = split_by_label(df_train)
splits, labels = (train_split, val_split), (train_labels, validation_labels)

In [None]:
# The 80/20 split over labels should carry over to a roughly 80/20 split over files.
n = len(files)
print(*(len(part) / n for part in splits))

In [None]:
# Files of each split as L-lists, plus one set per split for membership tests.
splits_files = [files[split_idx] for split_idx in splits]
splits_sets = mapped(set, splits_files)

In [None]:
def get_split(f):
    "Return the position of the first set in `splits_sets` that contains `f`; raise `ValueError` if none does."
    found = next((pos for pos, members in enumerate(splits_sets) if f in members), None)
    if found is None:
        raise ValueError(f'File {f} is not presented in any split.')
    return found

In [None]:
def label_func(f):
    "Return the `label_group` of the first row in `df_train` whose `image` equals `f`."
    matching_labels = df_train.loc[df_train['image'] == f, 'label_group']
    return matching_labels.values[0]

In [None]:
# Sanity check, all labels belong to EITHER the training OR the validation set
# This does take some while on the whole dataset
# assert(set(mapped(label_func,splits_files[0])).intersection(set(mapped(label_func, splits_files[1]))) == set())

In [None]:
# The dict comprehension from the fastai tutorial (kept below for reference) does not
# scale to a large number of labels, so the mapping is built from the dataframe instead.
# splbl2files = [{l: [f for f in s if label_func(f) == l] for l in labels} for s in splits_sets]
# One dict per split: label_group -> list of the image files of that label in the split.
splbl2files = [
    df_train.loc[split_idx].groupby(['label_group'])['image'].apply(list).to_dict()
    for split_idx in splits
]

In [9]:
len(df_train)

32412

## Issue:

Some `image_phash`es occur in more than one `label_group`. When creating the dataloaders, whether two images are 'the same' is decided by their `label_group`. This is a problem: the same image (same `image_phash`) might be treated as 'not the same' just because the `label_group`s differ.

## Solution:
Pick one file for every `image_phash`. Remove all `label_group`s that contain less than one file.

In [10]:
# --- scratch: pick one representative file per (label_group, image_phash) pair ---

splbl2files_alt = []

# All image files of every (label_group, image_phash) combination, collected as lists.
df_group_label_phash = (
    df_train
    .groupby(['label_group', 'image_phash'])['image']
    .apply(list)
    .to_frame()
)

# Number of files that share the same label and phash.
df_group_label_phash['nr_images'] = df_group_label_phash['image'].apply(len)

#df_group_label_phash[df_group_label_phash['nr_images'] > 1]

# Randomly keep one of those files.
df_group_label_phash['chosen_image'] = df_group_label_phash['image'].apply(random.choice)
# --- end scratch ---

In [13]:
# --- scratch: compare the number of chosen files with the number of distinct phashes ---

# One chosen file per (label_group, image_phash) pair.
valid_files = df_group_label_phash['chosen_image'].tolist()

print(len(valid_files))

df_train['image_phash'].nunique()

# --- end scratch ---

28855


28735

In [22]:
##############
##############
##############
##############

# Rows of the chosen files only. This used to be stored in `df_test`, which
# silently overwrote the test metadata loaded from test.csv.
df_one_per_phash = df_train[df_train['image'].isin(valid_files)]

print(len(df_one_per_phash), df_one_per_phash['image_phash'].nunique())

# Number of phashes that still occur in more than one row, i.e. in several label groups.
(df_one_per_phash.groupby(['image_phash'])['label_group'].count() > 1).sum()
############################
############################

28855 28735


114

In [31]:
##############
##############
##############
##############

# Rows of the chosen files only. This used to be stored in `df_test`, which
# silently overwrote the test metadata loaded from test.csv.
df_one_per_phash = df_train[df_train['image'].isin(valid_files)]

print(len(df_one_per_phash), df_one_per_phash['image_phash'].nunique())

# For every phash, list the label groups it appears in ...
group_image_phash_label = df_one_per_phash.groupby(['image_phash'])['label_group'].apply(list).to_frame()

# ... and count them.
group_image_phash_label['len'] = group_image_phash_label['label_group'].apply(len)

# Show only the phashes that are spread over several label groups.
group_image_phash_label[group_image_phash_label['len'] > 1]
############################
############################

28855 28735


Unnamed: 0_level_0,label_group,len
image_phash,Unnamed: 1_level_1,Unnamed: 2_level_1
84b67e8525cf3f02,"[1876943817, 2829310561]",2
84b67f8525cf3f00,"[1876943817, 1424289463, 2829310561]",3
84eab151bbd44abc,"[1417997905, 327189920]",2
8e07e1fffe80e00c,"[3128161097, 3888975197]",2
9607293c2fc3bec1,"[4009508396, 1942259177]",2
...,...,...
fc57942c26b6e4c8,"[185142711, 1127243882]",2
fcb0701999c74bc3,"[1065450055, 4223656537]",2
fe7e898456893163,"[196545328, 191984645]",2
fefa48fa8283a185,"[3527837949, 863127146]",2


In [41]:
# --- scratch: open all images of two label groups that share a phash ---

label1, label2 = 1417997905, 327189920

# Image files of every row that belongs to one of the two label groups.
fx = df_train.loc[df_train['label_group'].isin([label1, label2]), 'image'].tolist()

imx = mapped(Image.open, fx)

for im in imx:
    im.show()

# --- end scratch ---

In [44]:
# --- scratch: open all images that carry one particular phash ---

ip = 'fcb0701999c74bc3'

# Image files of every row with this phash, regardless of the label group.
fs = df_train.loc[df_train['image_phash'] == ip, 'image'].tolist()

ims = mapped(Image.open, fs)

for im in ims:
    im.show()

# --- end scratch ---

In [None]:
def splitter(items):
    """Assign every `(f1, f2, same)` item to the split of its first file.

    Args:
        items: iterable of 3-element items whose first element is a file
            known to `get_split`.

    Returns:
        Tuple `(train_positions, valid_positions)` with the positions in
        `items` whose first file belongs to split 0 and split 1 respectively.

    Raises:
        ValueError: propagated from `get_split` if a first file is in no split.
    """
    buckets = ([], [])
    # Single pass: `get_split` is evaluated once per item instead of once per split.
    for pos, (first_file, _second_file, _same) in enumerate(items):
        split = get_split(first_file)
        if split in (0, 1):
            buckets[split].append(pos)
    return buckets

In [None]:
def get_phash(f):
    "Return the `image_phash` of the first row in `df_train` whose `image` equals `f`."
    matching_hashes = df_train.loc[df_train['image'] == f, 'image_phash']
    return matching_hashes.values[0]

-----------------

In [None]:
def draw_other(f):
    """Draw a partner file for `f` to form a Siamese pair.

    With probability 0.5 the partner comes from the label of `f`, otherwise
    from a randomly chosen different label of the same split. Files whose
    phash equals the phash of `f` are never drawn.

    Args:
        f: file known to `label_func`, `get_split` and `get_phash`.

    Returns:
        Tuple `(other_file, same)`; `same` tells whether the partner was drawn
        from the label of `f`.

    Raises:
        IndexError: from `random.choice` if the split has no other label or
            the chosen label has no file with a different phash.
    """
    same = random.random() < 0.5
    cls = label_func(f)
    split = get_split(f)
    if not same:
        # Pick a different label, but make sure it comes from the same split.
        cls = random.choice([l for l in labels[split] if l != cls])
    # The phash of `f` does not change while filtering: look it up once instead
    # of once per candidate (every lookup scans the dataframe).
    own_phash = get_phash(f)
    # Exclude the input file itself and every other file showing the same picture.
    candidates = [f2 for f2 in splbl2files[split][cls] if get_phash(f2) != own_phash]
    return random.choice(candidates), same

In [None]:
##############
##############
##############
##############

file = files[0]
print(file)

print(len(files))
############################
############################

In [None]:
%time draw_other(file)

In [None]:
%time cls = label_func(file)

In [None]:
%time split = get_split(file)

In [None]:
%time random.choice(L(l for l in labels[split] if l != cls))

In [None]:
%time random.choice([f2 for f2 in splbl2files[split][cls] if get_phash(f2) != get_phash(file)])

In [None]:
%time get_phash(file)

In [None]:
%time same = random.random() < 0.5
%time cls = label_func(file)
%time split = get_split(file)
%time if not same: cls = random.choice(L(l for l in labels[split] if l != cls))
%time random.choice([f2 for f2 in splbl2files[split][cls] if get_phash(f2) != get_phash(file)]), same

%time draw_other(file)

In [None]:
##############
##############
##############
##############

In [None]:
def get_tuples(files):
    "Build one list `[f, *draw_other(f)]` for every file in `files`."
    tuples = []
    for f in files:
        tuples.append([f, *draw_other(f)])
    return tuples

In [None]:
def get_x(t):
    "Return the first two elements of the item `t`."
    first_two = t[0:2]
    return first_two
def get_y(t):
    "Return the third element of the item `t`."
    third = t[2]
    return third

In [None]:
# DataBlock for the Siamese task: every item is a list [file, other_file, same]
# as produced by `get_tuples`.
siamese = DataBlock(
    blocks = (ImageTupleBlock, CategoryBlock),
    get_items = get_tuples,  # draws a partner file for every input file
    get_x = get_x,  # first two elements of an item (the two files)
    get_y = get_y,  # third element of an item (the `same` flag)
    splitter = splitter,  # train/validation assignment by the split of the first file
    item_tfms = Resize(224),
    batch_tfms = [Normalize.from_stats(*imagenet_stats)]
)

In [None]:
small_pct = 0.4

In [None]:
# Build the DataLoaders. This call was commented out in every cell, which left
# `dls` undefined for all later cells (`dls.one_batch()`, `dls.show_batch()`,
# `Learner(dls, ...)`) on a fresh run.
# Reference timing with small_pct=0.4:
#   CPU times: user 7min 25s, sys: 184 ms, total: 7min 25s
#   Wall time: 7min 25s
dls = siamese.dataloaders(files, bs = 4)

In [None]:
#%time dls = siamese.dataloaders(files, bs = 4)
''' small_pct=0.3
CPU times: user 4min 16s, sys: 119 ms, total: 4min 17s
Wall time: 4min 17s
'''

In [None]:
#%time dls = siamese.dataloaders(files, bs = 4)
''' small_pct=0.2
CPU times: user 2min 2s, sys: 72.1 ms, total: 2min 2s
Wall time: 2min 2s
'''

In [None]:
#%time dls = siamese.dataloaders(files, bs = 4)
''' small_pct=0.1
CPU times: user 39.5 s, sys: 20.1 ms, total: 39.5 s
Wall time: 39.5 s
'''

In [None]:
b = dls.one_batch()

In [None]:
explode_types(b)

In [None]:
@typedispatch
def show_batch(x:ImageTuple, y, samples, ctxs=None, max_n=6, nrows=None, ncols=2, figsize=None, **kwargs):
    "Show a batch of `ImageTuple`s on a grid; grid and figure size are derived from `max_n` and `ncols` unless given."
    figsize = (ncols*6, max_n//ncols * 3) if figsize is None else figsize
    if ctxs is None:
        # Never create more axes than there are samples to show.
        n_shown = min(len(samples), max_n)
        ctxs = get_grid(n_shown, nrows=nrows, ncols=ncols, figsize=figsize)
    # Delegate the drawing to the generic `object` implementation.
    return show_batch[object](x, y, samples, ctxs=ctxs, max_n=max_n, **kwargs)

In [None]:
dls.show_batch()

***Avoiding ImageTuples with the same phash seems to resolve this.***

***Note:*** Some instances that are not the same can still have the same picture. Check whether this needs attention.

In [None]:
#twins = []
#for i in range(2):
#    for label in splbl2files[i].keys():
#        remaining = splbl2files[i][label]
#        touched = []
#        for f1 in remaining:
#            touched.append(f1)
#            remaining = [g for g in remaining if g not in touched]
#            img1 = np.array(Image.open(f1)
#                            .convert('L')
#                            .resize(((224,224)))
#                           ).astype(np.int)
#            
#            for f2 in remaining:
#                img2 = np.array(Image.open(f2)
#                            .convert('L')
#                            .resize(((224,224)))
#                           ).astype(np.int)            
#                diff = np.abs(img1 - img2).sum()
#                twins.append([f1,f2,diff])
#                
#list_f1 = [x[0] for x in twins]
#list_f2 = [x[1] for x in twins]
#list_diff = [x[2] for x in twins]
#
#d = {'file_1': list_f1, 'file_2': list_f2, 'difference': list_diff}
#
#df_twins = pd.DataFrame(d)

In [None]:
#f1,f2,diff = df_twins.min().tolist()
#img1, img2 = Image.open(f1), Image.open(f2)
#
#plt.subplot(1,2,1)
#plt.imshow(img1)
#plt.subplot(1,2,2)
#plt.imshow(img2)

-----------------

In [None]:
class SiameseModel(Module):
    "Run both inputs through one shared `encoder` and pass the concatenated features to `head`."
    def __init__(self, encoder, head):
        self.encoder, self.head = encoder, head

    def forward(self, x):
        "Encode `x[0]` and `x[1]`, concatenate the results along dim 1 and apply the head."
        first_ftrs = self.encoder(x[0])
        second_ftrs = self.encoder(x[1])
        return self.head(torch.cat([first_ftrs, second_ftrs], dim = 1))

In [None]:
# Look up the `cut` entry that `model_meta` records for resnet34 and build the encoder body with it.
# NOTE(review): the expected first argument of `create_body` differs between fastai versions — confirm against the installed one.
cut = model_meta[resnet34]['cut']
encoder = create_body(resnet34, cut = cut)

In [None]:
encoder[-1]

In [None]:
# Head with 2 outputs and ps=0.5.
# NOTE(review): 512 * 2 presumably stands for the concatenated encoder features of both images;
# how `create_head` treats this number depends on the fastai version — confirm.
head = create_head(512 * 2, 2, ps=0.5)
# Combine the shared encoder and the head into the Siamese model.
model = SiameseModel(encoder, head)

In [None]:
head

In [None]:
def siamese_splitter(model):
    "Return two parameter groups: the parameters of `model.encoder`, then those of `model.head`."
    return [params(part) for part in (model.encoder, model.head)]

In [None]:
def loss_func(out, targ):
    "Apply `CrossEntropyLossFlat` to `out` and the target converted with `.long()`."
    criterion = CrossEntropyLossFlat()
    return criterion(out, targ.long())

In [None]:
# Learner over `dls` and the Siamese `model`, using the custom loss, the encoder/head
# parameter groups from `siamese_splitter` and accuracy as metric.
learn = Learner(dls,model,loss_func=loss_func, splitter=siamese_splitter, metrics = accuracy)

In [None]:
learn.freeze()

In [None]:
learn.lr_find()

In [None]:
learn.fit_one_cycle(10, 2e-4)

In [None]:
learn.unfreeze()

In [None]:
learn.lr_find()

In [None]:
learn.fit_one_cycle(25, slice(1e-6,1e-3))

In [None]:
learn.save('learner_resnet34_size224')

In [None]:
learn.lr_find()

In [None]:
learn.load('learner_resnet34_size224')

In [None]:
learn.unfreeze()

In [None]:
learn.lr_find()

In [None]:
learn.fit_one_cycle(50, slice(1e-4,1e-2))