In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import tensorflow as tf
import re
import math
import os
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold 


# Number of stratified folds; one TFRecord file is written per fold
FOLDS = 15
# Alternative fold count, left commented out:
# FOLDS = 2
# Random seed passed to StratifiedKFold so the fold assignment is reproducible
SEED = 123
# Target image size; index 0 is used as the target width and index 1 as the target height
IMAGE_SIZE = (512, 512)

In [2]:
# Function to get our f1 score
def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of whitespace-separated id strings.

    Each string is split on whitespace and treated as a set of ids. For every
    row the score is ``2 * |true & pred| / (|true| + |pred|)``.

    Parameters
    ----------
    y_true : pandas.Series of str
        Ground-truth ids for each row, separated by whitespace.
    y_pred : pandas.Series of str
        Predicted ids for each row, separated by whitespace.

    Returns
    -------
    numpy.ndarray
        One F1 value per row, in the order of the inputs.
    """
    def _as_id_set(text):
        return set(text.split())

    true_sets = y_true.apply(_as_id_set)
    pred_sets = y_pred.apply(_as_id_set)

    # Set sizes per row, as plain numpy arrays
    n_true = true_sets.apply(len).values
    n_pred = pred_sets.apply(len).values

    # Number of ids shared by truth and prediction; rows are paired by position
    n_shared = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])

    return 2 * n_shared / (n_pred + n_true)

In [3]:
# Function to read and preprocess our data
def preprocess(data_dir = '../input/shopee-product-matching'):
    """Load ``train.csv`` and derive integer labels, ground-truth matches and a baseline F1.

    Parameters
    ----------
    data_dir : str, optional
        Directory that contains ``train.csv``. Defaults to the Kaggle input
        path, so calling ``preprocess()`` with no argument reads the same file
        as before.

    Returns
    -------
    pandas.DataFrame
        The training frame with a fresh ``0..n-1`` index, one row per distinct
        ``image`` value, and these columns added or rewritten:

        * ``label_group`` -- re-encoded as consecutive integers, numbered in
          order of first appearance;
        * ``matches`` -- space-separated ``posting_id`` values that share the
          row's label group;
        * ``f1`` -- the score obtained when the only prediction for a row is
          its own ``posting_id``.
    """
    train = pd.read_csv(os.path.join(data_dir, 'train.csv'))

    # Keep a single row per image file to avoid leakage between folds.
    # NOTE(review): whether dropping these rows is the right call is an open
    # question carried over from the original author.
    train = train.drop_duplicates(subset = ['image']).reset_index(drop = True)

    # Encode the raw label_group ids as 0..n-1 in order of first appearance.
    label_mapper = {label: index for index, label in enumerate(train['label_group'].unique())}
    train['label_group'] = train['label_group'].map(label_mapper)
    N_CLASSES = train['label_group'].nunique()

    # Ground truth in submission format: all posting ids of the row's group.
    # This is built after the de-duplication above, so dropped postings do not
    # appear in any `matches` string.
    postings_by_group = train.groupby(['label_group'])['posting_id'].unique().to_dict()
    train['matches'] = train['label_group'].map(postings_by_group).apply(lambda ids: ' '.join(ids))

    # Naive baseline: predict only the posting itself.
    train['f1'] = f1_score(train['matches'], train['posting_id'])
    score = train['f1'].mean()
    print(f'Using the same posting id as prediction our f1 score is {score}')
    print(f'We have a multiclass problem with {N_CLASSES} classes')
    return train

In [4]:
# Build the de-duplicated, label-encoded training frame used by the cells below
train = preprocess()

Using the same posting id as prediction our f1 score is 0.48310682247119097
We have a multiclass problem with 11011 classes


In [5]:
# train.head()

In [6]:
# def check(row):
#     if len(row['matches'].split())>1:
#         return row
#     else:
#         row['matches'] = None
#         return row

In [7]:
# print(train.shape)
# data = train.apply(check, axis=1)
# data.shape

In [8]:
# # data['matches'].isnull().values.any()
# print(data['matches'].isnull().sum())
# # data['matches'].isnull().values
# print(data.shape)
# print(data.dropna(inplace=True))
# print(data.shape)
# train=data
# train.reset_index(drop = True, inplace = True)
# print(train['label_group'].nunique())

In [9]:
# # Function to read and preprocess our data
# def preprocess2(train):
#     train.drop_duplicates(subset = ['image'], inplace = True)
#     train.reset_index(drop = True, inplace = True)
#     label_mapper = dict(zip(train['label_group'].unique(), np.arange(len(train['label_group'].unique()))))
#     label_mapper_inv = dict(zip(np.arange(len(train['label_group'].unique())), train['label_group'].unique()))
#     train['label_group'] = train['label_group'].map(label_mapper)
#     # Number of classes
#     N_CLASSES = train['label_group'].nunique()
#     # Get ground truth labels format
#     tmp = train.groupby(['label_group'])['posting_id'].unique().to_dict()
# #     return train.groupby(['label_group'])
# #     print(train.groupby(['label_group'])['posting_id'].unique())
#     train['matches'] = train['label_group'].map(tmp)
#     train['matches'] = train['matches'].apply(lambda x: ' '.join(x))
#     # Calculate naive score using self-post
#     train['f1'] = f1_score(train['matches'], train['posting_id'])
#     score = train['f1'].mean()
#     print(f'Using the same posting id as prediction our f1 score is {score}')
#     print(f'We have a multiclass problem with {N_CLASSES} classes')
#     return train

In [10]:
# train = preprocess2(train)

In [11]:
# Assign every row to exactly one validation fold, stratified on label_group.
kfold = StratifiedKFold(n_splits = FOLDS, shuffle = True, random_state = SEED)

# kfold.split yields *positional* row indices. Collect them in a plain array
# rather than writing through the label-based `train.loc`, so the assignment
# does not depend on `train` having a 0..n-1 index and never passes through a
# float/NaN column.
fold_of_row = np.full(len(train), -1, dtype = int)
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train['label_group'])):
    fold_of_row[val_ind] = fold

# Each row must appear in the validation part of one split.
assert (fold_of_row >= 0).all(), 'some rows were not assigned to any fold'
train['fold'] = fold_of_row



In [12]:
# Preview the first rows, including the derived matches, f1 and fold columns
train.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,matches,f1,fold
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,0,train_129225211 train_2278313361,0.666667,1
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",1,train_3386243561 train_3423213080,0.666667,2
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2,train_2288590299 train_3803689425,0.666667,5
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,3,train_2406599165 train_3342059966,0.666667,7
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,4,train_3369186413 train_921438619,0.666667,9


In [13]:
# train.groupby('label_group')['posting_id'].unique()

In [14]:
# Distinct label groups present in each fold; the same label can show up in more than one fold
train.groupby('fold')['label_group'].unique()

fold
0     [12, 39, 43, 56, 48, 86, 91, 98, 107, 110, 114...
1     [0, 29, 34, 40, 49, 54, 62, 71, 87, 119, 121, ...
2     [1, 30, 44, 50, 59, 66, 78, 89, 92, 99, 102, 1...
3     [13, 17, 24, 35, 67, 75, 82, 122, 160, 166, 18...
4     [6, 45, 57, 72, 100, 24, 123, 139, 145, 151, 1...
5     [2, 7, 41, 51, 68, 83, 88, 93, 111, 127, 134, ...
6     [21, 46, 63, 76, 84, 94, 103, 135, 146, 196, 2...
7     [3, 25, 52, 115, 152, 192, 211, 227, 254, 289,...
8     [8, 14, 18, 26, 31, 95, 112, 124, 141, 162, 16...
9     [4, 9, 22, 36, 47, 48, 69, 79, 101, 104, 116, ...
10    [19, 32, 37, 58, 60, 80, 96, 136, 129, 142, 14...
11    [5, 27, 64, 77, 61, 85, 48, 90, 105, 109, 118,...
12    [10, 20, 33, 53, 131, 143, 163, 168, 170, 194,...
13    [11, 15, 23, 28, 38, 42, 65, 73, 81, 97, 106, ...
14    [16, 55, 61, 70, 74, 59, 120, 137, 153, 165, 1...
Name: label_group, dtype: object

In [15]:
def _bytes_feature(value):
    """Wrap one bytes value in a ``tf.train.Feature`` holding a ``BytesList``.

    If ``value`` is an eager tensor it is first unwrapped with ``.numpy()``;
    anything else is stored as given.
    """
    eager_tensor_cls = type(tf.constant(0))
    payload = value.numpy() if isinstance(value, eager_tensor_cls) else value
    bytes_list = tf.train.BytesList(value=[payload])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap one float / double in a ``tf.train.Feature`` holding a ``FloatList``."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap one bool / enum / int / uint in a ``tf.train.Feature`` holding an ``Int64List``."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def serialize_example(posting_id, image, label_group, matches):
    """Serialize one training row into a ``tf.train.Example`` byte string.

    Parameters
    ----------
    posting_id : bytes
        Stored under the ``posting_id`` key as a bytes feature.
    image : bytes
        Stored under the ``image`` key as a bytes feature.
    label_group : int
        Stored under the ``label_group`` key as an int64 feature.
    matches : bytes
        Stored under the ``matches`` key as a bytes feature.

    Returns
    -------
    bytes
        The result of ``SerializeToString()`` on the assembled example.
    """
    features = tf.train.Features(feature={
        'posting_id': _bytes_feature(posting_id),
        'image': _bytes_feature(image),
        'label_group': _int64_feature(label_group),
        'matches': _bytes_feature(matches),
    })
    return tf.train.Example(features=features).SerializeToString()


# Directory holding the raw training images referenced by train['image']
TRAIN_IMAGE_DIR = '../input/shopee-product-matching/train_images/'


def _load_and_encode_image(image_path, image_size):
    """Read an image, resize it with padding to ``image_size`` and return JPEG bytes.

    Parameters
    ----------
    image_path : str
        Path of the image file to read with ``cv2.imread``.
    image_size : tuple
        ``image_size[0]`` is used as the target width and ``image_size[1]`` as
        the target height.

    Returns
    -------
    bytes
        The resized image encoded as a quality-100 JPEG.

    Raises
    ------
    OSError
        If the file cannot be read or the JPEG encoding fails.
    """
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising.
    if image is None:
        raise OSError(f'cv2.imread could not read image: {image_path}')

    image = tf.image.resize_with_pad(image, target_width = image_size[0], target_height = image_size[1])

    # The resized tensor is floating point but still on the 0..255 scale.
    # It must NOT be divided by 255 here: the JPEG encoder works on 8-bit
    # pixels, so a [0, 1] image would be stored as values 0/1, i.e. an almost
    # black picture. Normalisation belongs in the training input pipeline,
    # after the JPEG has been decoded.
    image = np.clip(np.rint(np.asarray(image)), 0, 255).astype(np.uint8)

    success, encoded = cv2.imencode('.jpg', image, [cv2.IMWRITE_JPEG_QUALITY, 100])
    if not success:
        raise OSError(f'cv2.imencode failed for image: {image_path}')
    return encoded.tobytes()


# Write one TFRecord file per fold; the file name carries the fold number and
# the number of examples it contains.
for fold in range(FOLDS):
    print('\n')
    print('-'*50)
    print(f'Writing TFRecord {fold} of {FOLDS - 1}...')
    train_ = train[train['fold'] == fold]
    with tf.io.TFRecordWriter('train%.2i-%i.tfrec'%(fold, train_.shape[0])) as writer:
        for k, row in enumerate(train_.itertuples(index = False)):
            image = _load_and_encode_image(TRAIN_IMAGE_DIR + row.image, IMAGE_SIZE)
            example = serialize_example(str.encode(row.posting_id),
                                        image,
                                        row.label_group,
                                        str.encode(row.matches))
            writer.write(example)
            # Lightweight progress indicator, one tick per 100 rows
            if k%100==0: print(k,', ',end='')



--------------------------------------------------
Writing TFRecord 0 of 14...
0 , 100 , 200 , 300 , 400 , 500 , 600 , 700 , 800 , 900 , 1000 , 1100 , 1200 , 1300 , 1400 , 1500 , 1600 , 1700 , 1800 , 1900 , 2000 , 2100 , 

--------------------------------------------------
Writing TFRecord 1 of 14...
0 , 100 , 200 , 300 , 400 , 500 , 600 , 700 , 800 , 900 , 1000 , 1100 , 1200 , 1300 , 1400 , 1500 , 1600 , 1700 , 1800 , 1900 , 2000 , 2100 , 

--------------------------------------------------
Writing TFRecord 2 of 14...
0 , 100 , 200 , 300 , 400 , 500 , 600 , 700 , 800 , 900 , 1000 , 1100 , 1200 , 1300 , 1400 , 1500 , 1600 , 1700 , 1800 , 1900 , 2000 , 2100 , 

--------------------------------------------------
Writing TFRecord 3 of 14...
0 , 100 , 200 , 300 , 400 , 500 , 600 , 700 , 800 , 900 , 1000 , 1100 , 1200 , 1300 , 1400 , 1500 , 1600 , 1700 , 1800 , 1900 , 2000 , 2100 , 

--------------------------------------------------
Writing TFRecord 4 of 14...
0 , 100 , 200 , 300 , 400 ,