In [1]:
from fastai.text.all import *
import codecs

In [2]:
# Root data folder; both image folders are direct children of it.
base_path = Path('src')

img_train = base_path.joinpath('train_images')
img_test = base_path.joinpath('test_images')

In [3]:
# Load the train and test tables from the CSV files located under base_path.
df_train = pd.read_csv(base_path / 'train.csv')
df_test = pd.read_csv(base_path / 'test.csv')

In [4]:
def _unescape_title(raw):
    """Resolve backslash escape sequences in `raw` and decode the resulting bytes as UTF-8."""
    decoded_bytes, _ = codecs.escape_decode(raw)
    return decoded_bytes.decode('utf8')

df_train['title'] = df_train['title'].apply(_unescape_title)

In [5]:
# Compare the number of distinct image_phash values with the number of distinct image file names.
n_phashes = df_train.image_phash.nunique()
n_image_files = df_train.image.nunique()
print(f'Unique images: {n_phashes} of {n_image_files} unique image-files.')

Unique images: 28735 of 32412 unique image-files.


Drop duplicate texts.

In [6]:
# Keep only the first row for every distinct title.
df_train = df_train.drop_duplicates(subset = ['title'])

In [7]:
# Number of distinct titles per label_group, kept as a one-column frame named 'title'.
titles_per_label = df_train.groupby(['label_group'])['title'].nunique()
df_groupby_label = titles_per_label.to_frame()

In [8]:
# Labels with fewer than two distinct titles cannot provide a positive pair.
n_single_title_labels = (df_groupby_label.title < 2).sum()
print(f'Number of labels, that contain only one title: {n_single_title_labels}')

Number of labels, that contain only one title: 309


Only keep instances that belong to a label_group that contains more than one unique text.

In [9]:
# Select the labels that have at least two distinct titles, then keep only their rows.
has_multiple_titles = df_groupby_label.title >= 2
keep_labels = df_groupby_label.loc[has_multiple_titles].index.tolist()
in_kept_label = df_train['label_group'].isin(keep_labels)
df_train = df_train.loc[in_kept_label].reset_index(drop = True)

In [10]:
# Sanity check: after filtering, no label_group should be left with fewer than two
# distinct titles and no title should occur twice. Both counts are computed on the
# 'title' column, so the messages name titles (not image hashes or files).
n_labels_with_one_title = (df_train.groupby(['label_group'])['title'].nunique() < 2).sum()
n_duplicate_titles = df_train.duplicated(['title']).sum()
print(f"Number of labels, that contain only one title: {n_labels_with_one_title}")
print(f"Number of duplicate titles: {n_duplicate_titles}")

Number of labels, that contain only one image_phash: 0
Number of duplicate files: 0


We can now make sure to draw image-files with different phashes when creating the Siamese dataset.

## Create a smaller dataset

In [11]:
# Fraction of the label groups that is kept when building the reduced dataset below.
small_pct = 0.1

In [12]:
# Draw a random subset of the label groups (without replacement) and keep only their rows.
# NOTE: no random seed is set, so the sampled subset differs between runs.
x = df_train['label_group'].unique()
n_sampled_labels = int(len(x)*small_pct)
y = np.random.choice(x, size = n_sampled_labels, replace = False)
df_train = df_train.loc[df_train['label_group'].isin(y)].reset_index(drop = True)

In [14]:
# Show the size of the reduced frame and its first rows.
print(df_train.shape)
df_train.head()

(3363, 5)


Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_1806152124,0014f61389cbaa687a58e38a97b6383d.jpg,eea7e1c0c04da33d,KULOT PLISKET SALUR /CANDY PLISKET /WISH KULOT PREMIUM /KULOT PELANGI PREMIUM/HIEKA KULOT,1565741687
1,train_1180155871,00369839c7d94411e54437b7cf59b603.jpg,bbc1c43ec3633ec0,[INCLUDE PAJAK] NCT DREAM RELOAD ALBUM,2224428568
2,train_3921931335,007ed1560a37f2fc76297ed06af5c4a5.jpg,facb8794856d52c2,KEDAS BEAUTY BODY SERUM BER BPOM,2217827677
3,train_2510999729,008f8aac1d118f0b318b918aa783e9ed.jpg,934f36b444d03d3b,Anti Blue Light Film Sony Xperia XZ1 XZ3 XZ/XZ2 Premium XA2 Plus/Ultra Xperia 1 10 Plus Screen Guard,1702111919
4,train_2962272674,00a8d112066ea78e33d5ef987091a6b4.jpg,ebe19ec4d31ea103,Set Boneka Jari Binatang Hewan Finget Puppet Mainan Edukasi Anak Bayi Balita PAUD Alat Mendongeng,1896239564


In [None]:
## Save tiny dataset
#df_exp = df_train.drop(['image','image_phash','label_group'], axis = 1)
#df_exp.to_csv('src/small_text_df.csv')

In [15]:
class SiameseText(fastuple):
    """Tuple of two texts, optionally followed by a flag telling whether they describe the same product."""
    def show(self, ctx = None, **kwargs):
        """Print both texts and the pair flag; `ctx` and `**kwargs` are accepted but not used.

        A tuple with only two elements is reported with the flag 'Undetermined'.
        """
        if len(self) <= 2:
            txt1, txt2 = self
            same_prod = 'Undetermined'
        else:
            txt1, txt2, same_prod = self

        summary = f'First text: {txt1}\nSecond text: {txt2}\nDescribes same: {same_prod}'
        return print(summary)

In [48]:
# Pick two random rows and build a demo pair; the flag is True when both rows share a label_group.
i, j = np.random.randint(0, high = len(df_train), size = 2)
txt1, txt2 = (df_train.loc[row, 'title'] for row in (i, j))
label_i = df_train.loc[i, 'label_group']
label_j = df_train.loc[j, 'label_group']
same = label_i == label_j

siamt = SiameseText(txt1, txt2, same)
siamt.show()

First text: Isomil Advance Soya (0-12 Bln) 850 gr
Second text: CUSSONS BABY CREAM 50GR
Describes same: False


In [50]:
def split_by_label(df, col, valid_pct = 0.2, seed = None):
    """Split `df` into training and validation rows so that no label_group spans both.

    Args:
        df: DataFrame with a `label_group` column and the column `col`. A boolean
            `is_valid` column is added to it in place. The returned index labels are
            also used as positions into `files`, so the index is assumed to be
            0..len(df)-1 (e.g. after `reset_index(drop=True)`).
        col: name of the column whose values are returned as `files`.
        valid_pct: fraction of the unique labels assigned to the validation set.
        seed: optional seed forwarded to `RandomSplitter`.

    Returns:
        Tuple `(files, train_labels, validation_labels, train_idx, validation_idx)`.

    Raises:
        AssertionError: if a label ends up in both parts or the index lists do not
            select exactly the rows flagged for the respective part.
    """
    # L-list of unique labels
    labels = L(df.label_group.unique().tolist())
    # Randomly split the labels (not the rows), so every label lands in exactly one part
    train_pos, valid_pos = RandomSplitter(valid_pct = valid_pct, seed = seed)(labels)

    # Mask labels to receive train/val labels
    train_labels = labels[train_pos]
    validation_labels = labels[valid_pos]

    # Mark each row as part of the training/validation set. This must be computed from
    # `df` itself; reading a global frame here would mislabel any other frame passed in.
    df['is_valid'] = df.label_group.isin(list(validation_labels))

    # Sanity check: every label has a single is_valid value
    assert (df.groupby(['label_group'])['is_valid'].nunique() > 1).sum() == 0, \
        'a label_group is present in both the training and the validation part'

    files = L(df[col].tolist())
    train_idx = df.index[~df['is_valid']].tolist()
    validation_idx = df.index[df['is_valid']].tolist()

    # Sanity check: indexing `files` with the index lists selects the flagged rows
    assert set(files[train_idx]) == set(df.loc[~df['is_valid'], col])
    assert set(files[validation_idx]) == set(df.loc[df['is_valid'], col])

    return files, train_labels, validation_labels, train_idx, validation_idx

In [51]:
# Split the titles by label and bundle the results as (train, validation) pairs.
files, train_labels, validation_labels, train_split, val_split = split_by_label(df_train, 'title')
splits = (train_split, val_split)
labels = (train_labels,validation_labels)

In [54]:
# Titles of each split as a list, and as a set for fast membership tests.
splits_files = [files[split_idx] for split_idx in splits]
splits_sets = mapped(set, splits_files)

In [56]:
# For each split: dictionary mapping a label_group to the list of its titles.
splbl2files = [
    df_train.loc[split_idx].groupby(['label_group'])['title'].apply(list).to_dict()
    for split_idx in splits
]

In [57]:
# Inspect the per-split mapping from label_group to its titles.
splbl2files

[{5949579: ['Kisonli Speaker Bluetooth Tenteng FojaxFJ-16DW Plus Mic Karaoke, Radio, USB, Micro SD',
   'Speaker Bluetooth Tenteng Portabable Unitech FJD-16DW Lampu LED Support USB MicroSD Aux FM Radio'],
  7657592: ['Tote bag wanita BAG TULIP / tas wanita murah/tas kuliah/ tas bahu',
   'Tote bag cantik TULIP BAG / tas wanita / tas bahu / tas kuliah / tas wanita murah /women bag'],
  10069919: ['Ellips Hair Vitamin (Pro Keratin Complex) Kemasan Botol Jar 50 kapsul 1 ml',
   'Ellips Hair Vitamin Rambut jar isi 50 pcs'],
  19273520: ['SHANNEN LIPSTICK CLP',
   'SHANNEN LIPSTICK CREAMY LIP PAINT',
   'SHANNEN CREAMY LIP PAINT',
   'Promo lipstik Shannen creamy lip paint ORI harga Grossir'],
  39139032: ['Speaker mini bluetooth usb Advance TP600 TP 600 (Tanpa micro sd)',
   'Speaker Advance TP 600 Batik Portable Digital TP-600BT Bluetooth Ori'],
  66057972: ['ROBOT TRANSFORM POLIIISI MOBIL BERUBAH JADI ROBOT - KADO MAINAN ANAK',
   'ROBOT TRANSFORM POLISI MOBIL BERUBAH JADI ROBOT - KADO M

In [58]:
def get_split(f):
    """Return the position of the first set in `splits_sets` that contains `f`.

    Raises:
        ValueError: if `f` is contained in none of the sets.
    """
    split_no = next((pos for pos, members in enumerate(splits_sets) if f in members), None)
    if split_no is None:
        raise ValueError(f'File {f} is not presented in any split.')
    return split_no

In [63]:
# Spot check: a title taken from each split's list must map back to that split.
# The positions 984 and 65 are arbitrary picks.
f_0 = splits_files[0][984]
f_1 = splits_files[1][65]

get_split(f_0) == 0, get_split(f_1) == 1

(True, True)

In [59]:
def label_func(f):
    """Return the label_group of the first row in `df_train` whose title equals `f`."""
    title_matches = df_train['title'] == f
    return df_train.loc[title_matches, 'label_group'].values[0]

In [94]:
# Spot check: a random title drawn from a given label must be mapped back to that label.
# The positions 654 and 69 are arbitrary picks.
l_0 = train_labels[654]
l_1 = validation_labels[69]

f_0 = np.random.choice(df_train[df_train['label_group'] == l_0]['title'])
f_1 = np.random.choice(df_train[df_train['label_group'] == l_1]['title'])

label_func(f_0) == l_0, label_func(f_1) == l_1

(True, True)

In [163]:
# Word tokenizer created with lang='id' (presumably Indonesian — confirm), wrapped in a Tokenizer.
spacy = WordTokenizer(lang = 'id')
tkn = Tokenizer(spacy)

In [162]:
class SiameseTransform(Transform):
    """Turn a title into a `SiameseText(title, partner, same)` pair.

    Titles of the validation split get one partner drawn at construction time that
    is reused on every call; every other title gets a freshly drawn partner per call.
    Relies on the notebook-level names `df_train`, `labels`, `label_func` and `get_split`.
    """
    def __init__(self, files, splits):
        # Per split: dictionary mapping a label_group to the list of its titles.
        self.splbl2files = [
            df_train.loc[splits[i]].groupby(['label_group'])['title'].apply(list).to_dict()
            for i in range(2)
        ]
        # Fixed partners for the validation titles.
        self.valid = {f: self._draw(f,1) for f in files[splits[1]]}

    def encodes(self, f):
        # Look the pair up first and draw only on a miss. Passing `self._draw(f, 0)` as the
        # default of `dict.get` would evaluate it on every call, i.e. draw (and discard)
        # a random pair even when the stored validation pair is used.
        pair = self.valid.get(f)
        if pair is None:
            pair = self._draw(f, 0)
        f2, same = pair
        return SiameseText(f, f2, same)

    def _draw(self, f, split = 0):
        """Draw a partner title for `f` and return `(partner, same)` with `same` as 0/1.

        With probability 0.5 the partner comes from the same label as `f`, otherwise
        from a randomly chosen different label of the same split; `f` itself is never
        returned. The split is looked up with `get_split(f)`, which overrides the
        `split` argument (kept in the signature for existing callers).

        Raises:
            IndexError: from `random.choice` if there is no candidate to choose from.
        """
        same = random.random() < 0.5

        cls = label_func(f)
        split = get_split(f)

        if not same:
            cls = random.choice([l for l in labels[split] if l != cls])

        candidates = [f2 for f2 in self.splbl2files[split][cls] if f2 != f]
        return random.choice(candidates), int(same)

In [164]:
# Build the pair transform, apply it together with tokenization and numericalization
# to the titles, and create dataloaders with batch size 8.
# NOTE(review): the recorded output of this cell is "TypeError: unhashable type: 'L'".
# Presumably the vocab setup of Numericalize receives the tokenized L objects inside the
# SiameseText tuples — confirm and rework the pipeline before relying on `dls`.
tfm = SiameseTransform(files, splits)
tls = TfmdLists(files, [tfm, tkn, Numericalize], splits = splits)
dls = tls.dataloaders(bs = 8)

TypeError: unhashable type: 'L'

In [157]:
# Fetch a single batch from the dataloaders for inspection.
b = dls.one_batch()

In [161]:
# Inspect the vocabulary attached to the dataloaders.
dls.vocab

['xxunk',
 'xxpad',
 'xxbos',
 'xxeos',
 'xxfld',
 'xxrep',
 'xxwrep',
 'xxup',
 'xxmaj',
 1,
 0,
 '✧YUKSHOPING✧ R103 Kaos Kaki Polos 6 Garis Unisex Fashion Wanita/Pria Kaos Kaki Import Murah Jakarta',
 '[NATURE REPUBLIC] - Fresh Herb Cleansing Foam 170ml',
 'Botol Susu + Drying Rack Set Disney Isi 4 Winnie The Pooh | Kado Bayi',
 'Tissue tisu paseo smart handky ( 12sheet )isi 6bungkus',
 'HANDBAG DOUBLE ZIPPER',
 'RANGKAIAN BUNGA ARTIFICIAL SEPEDA VESPA MINI BP014',
 'SALE!!! GOLDENFIL STRAWBERRY & BLUEBERRY ( EXP MARET 2021 )',
 'Anti Blue Light Film Sony Xperia XZ1 XZ3 XZ/XZ2 Premium XA2 Plus/Ultra Xperia 1 10 Plus Screen Guard',
 'Joran pancing capung relix nusantara  UL',
 'kitty plus shampoo jamur kutu anjing kucing250ml',
 'ASIN829 - D09 DOMPET LIPAT PENDEK FOREVER YOUNG IMPOR',
 'Celana dalam boxer pria fema sport (Paket isi 6)',
 'CARYN DRESS BY SHIERAKI INDONESIA',
 '45Pcs Stiker Kertas Motif Print untuk Dekorasi Album / Scrapbook DIY',
 'MIYAKO HM620 Hand Mixer / Handmixer H

In [158]:
# Display the batch fetched with dls.one_batch().
b

TensorText([[176, 280,  10],
        [  0,   0,   9],
        [  0,   0,   9],
        [641,   0,   9],
        [397, 338,   9],
        [  0,   0,  10],
        [  0,   0,  10],
        [  0,   0,   9]], device='cuda:0')

In [155]:
@typedispatch
def show_batch(x:SiameseText,
               y,
               samples,
               ctxs=None,
               max_n=6,
               nrows=None,
               ncols=2,
               figsize=None,
               **kwargs
              ):
    """Print up to `max_n` text pairs of a batch together with their label.

    `x` is expected to hold the first texts, the second texts and the labels of the
    batch at positions 0, 1 and 2 (assumed to be indexable per item — confirm against
    the dataloader output). `y`, `samples`, `nrows`, `ncols`, `figsize` and `**kwargs`
    are accepted for signature compatibility and are not used, because
    `SiameseText.show` prints text instead of drawing on a context.
    """
    # Without this, iterating the default `ctxs=None` raises a TypeError.
    # The contexts only determine how many pairs are shown.
    if ctxs is None:
        ctxs = [None] * min(len(x[0]), max_n)
    label_names = ['Not similar','Similar']
    for i,ctx in enumerate(ctxs):
        # int(...) keeps the lookup valid when the label is stored as a float.
        label = label_names[int(x[2][i].item())]
        SiameseText(x[0][i], x[1][i], label).show(ctx=ctx)

In [None]:
# Show a batch of pairs from the dataloaders.
dls.show_batch()

In [None]:
class SiameseModel(Module):
    """Apply one shared encoder to both inputs and pass the concatenated features to a head."""
    def __init__(self, encoder, head):
        self.encoder = encoder
        self.head = head

    def forward(self, x1, x2):
        """Encode `x1` and `x2` with the same encoder, concatenate along dim 1 and apply the head."""
        branch_features = [self.encoder(inp) for inp in (x1, x2)]
        joined = torch.cat(branch_features, dim=1)
        return self.head(joined)

In [None]:
# Build the encoder from resnet34, cut at -2.
# NOTE(review): resnet34 is an image architecture and is not obviously provided by
# `fastai.text.all` — confirm this cell is intended for the text pipeline.
encoder = create_body(resnet34, cut=-2)

In [None]:
# Head created with 512*2 and 2 as its first two arguments and ps=0.5; presumably
# 512 features per branch, two output classes and dropout 0.5 — confirm against create_head.
head = create_head(512*2, 2, ps=0.5)
model = SiameseModel(encoder, head)

In [None]:
def siamese_splitter(model):
    """Return two parameter groups: the encoder's parameters first, then the head's."""
    encoder_group = params(model.encoder)
    head_group = params(model.head)
    return [encoder_group, head_group]

In [None]:
def loss_func(out, targ):
    """Apply a freshly created `CrossEntropyLossFlat` to `out` and `targ` cast to long."""
    criterion = CrossEntropyLossFlat()
    long_targ = targ.long()
    return criterion(out, long_targ)

In [None]:
# Assemble the Learner from the dataloaders, the Siamese model, a flattened cross-entropy
# loss, the two-group splitter and accuracy as metric.
learn = Learner(dls,
                model,
                #loss_func = loss_func,
                loss_func = CrossEntropyLossFlat(),
                splitter = siamese_splitter,
                metrics = accuracy
               )

In [None]:
# Freeze before the first training phase (presumably leaves only the head group trainable — confirm).
learn.freeze()

In [None]:
# Run the learning-rate finder for the frozen model.
learn.lr_find()

In [None]:
# Train the frozen model: 7 epochs, learning rate 3e-3.
learn.fit_one_cycle(7, 3e-3)

In [None]:
# Save the state reached after the frozen training phase.
learn.save('full_df_224_frozen_7epos')

In [None]:
# Reload the saved state so the following phase can be rerun from it.
learn.load('full_df_224_frozen_7epos')

In [None]:
# Unfreeze for the second training phase.
learn.unfreeze()

In [None]:
# Run the learning-rate finder again, now for the unfrozen model.
learn.lr_find()

In [None]:
# Train the unfrozen model: 5 epochs with a learning-rate slice from 5e-7 to 1e-4.
learn.fit_one_cycle(5, slice(5e-7,1e-4))

In [None]:
@typedispatch
def show_results(x:SiameseImage,
                 y,
                 samples,
                 outs,
                 ctxs=None,
                 max_n=6,
                 nrows=None,
                 ncols=2,
                 figsize=None,
                 **kwargs
                ):
    """Show up to `max_n` pairs of a batch, each titled with its actual and predicted label.

    The actual label is read from `x[2]`, the prediction is the argmax of `y[2]`.
    When `ctxs` is None, a grid of contexts is created with `get_grid`.
    """
    figsize = (ncols*6, max_n//ncols * 3) if figsize is None else figsize

    if ctxs is None:
        n_shown = min(x[0].shape[0], max_n)
        ctxs = get_grid(n_shown, nrows=None, ncols=ncols, figsize=figsize)

    class_names = ["Not similar","Similar"]
    for i,ctx in enumerate(ctxs):
        actual = class_names[int(x[2][i].item())]
        predicted = class_names[y[2][i].argmax().item()]
        title = f'Actual: {actual} \n Prediction: {predicted}'
        SiameseImage(x[0][i], x[1][i], title).show(ctx=ctx)

In [None]:
# Show predictions next to the actual labels.
learn.show_results()

In [None]:
# Collect the predictions returned by the learner.
preds = learn.get_preds()

In [None]:
# Check the type of the second element returned by get_preds.
print(type(preds[1]))

In [None]:
# Number of entries in preds[1] and their sum (presumably the count of positive targets — confirm).
len(preds[1]), preds[1].sum()

In [None]:
@patch
def siampredict(self:Learner, item, rm_type_tfms=None, with_input=False):
    """Predict whether the two elements of `item` are similar, show the pair and return the result.

    Args:
        item: pair whose elements at positions 0 and 1 are shown with the prediction.
        rm_type_tfms: forwarded to `Learner.predict`.
        with_input: forwarded to `Learner.predict`.

    Returns:
        The unchanged result of `Learner.predict`.
    """
    # Forward the caller's arguments; they used to be replaced by hard-coded defaults.
    res = self.predict(item, rm_type_tfms=rm_type_tfms, with_input=with_input)
    # With with_input=True, predict puts the decoded input first, which moves the
    # decoded prediction from position 0 to position 1.
    decoded_pred = res[1] if with_input else res[0]
    if decoded_pred == tensor(0):
        SiameseImage(item[0], item[1], 'Prediction: Not similar').show()
    else:
        SiameseImage(item[0], item[1], 'Prediction: Similar').show()
    return res

In [None]:
# Build a test pair from the entries 0 and 100 of `files` and show it.
# NOTE(review): `files` is built from the 'title' column in this notebook, so these are
# presumably texts rather than image paths — confirm before running this cell.
imgtest = PILImage.create(files[0])
imgval = PILImage.create(files[100])
siamtest = SiameseImage(imgval, imgtest)
siamtest.show();

In [None]:
# Predict on the test pair; siampredict also shows the pair with the predicted label.
res = learn.siampredict(siamtest)