In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Add project code
import sys
sys.path.append('/eai/project/')

In [3]:
import os 
import numpy as np
from fuel.datasets.hdf5 import H5PYDataset
import matplotlib.pyplot as plt

BATH_SIZE = 100
IMG_SIZE = 256

# Img resizing stuff
LR_HR_RATIO = 4
BIG_SIZE = int(IMG_SIZE * 76 / 64)
SMALL_SIZE = int(BIG_SIZE / LR_HR_RATIO)

FASHION_PATH  = '/fashion/'
DATA_PATH     = '/data/'
DATA_TEMPLATE =  os.path.join(FASHION_PATH, 'ssense_%i_%i.h5')

  from ._conv import register_converters as _register_converters


In [4]:
! ls { FASHION_PATH }

ssense_128_128.h5  ssense_256_256.h5  ssense_512_512.h5  ssense_full_size.h5


## Load categories

In [85]:
import json
category2idx = json.load(open(os.path.join(DATA_PATH, 'categories.json'), 'rt'))

## Load all the data into memory

In [40]:
dataset = H5PYDataset(DATA_TEMPLATE % (IMG_SIZE, IMG_SIZE), 
                      sources = ['input_category', 'input_description', 'input_image'],
                      which_sets=('all',),  load_in_memory=True)

In [45]:
classes, texts, images = dataset.data_sources

In [79]:
print("There are %i examples" % dataset.num_examples)

There are 73761 examples


In [80]:
print(texts.shape, images.shape, classes.shape)

((73761, 1), (73761, 256, 256, 3), (73761, 1))


# Preprocessing

In [48]:
import string
from collections import Counter

BLACK_LIST = string.punctuation.replace('%', '') + '\n'

def normalize(text_array, 
    black_list = BLACK_LIST,
    vocab=None, lowercase =  True, tokenize = False):
    text = text_array[0]
    if black_list:
        text = text.translate(string.maketrans(BLACK_LIST, ' '*len(BLACK_LIST)))
    if lowercase:
        text = text.lower()
    if vocab:
        text = ' '.join([word for word in text.split() if word in vocab])
        
    if tokenize:
        return text.split()
    else:
        return ' '.join(text.split())

### Create class info

In [49]:
print("N. examples: %i, fst: %s" %(len(classes), classes[0]))

N. examples: 73761, fst: ['TOPS']


### Create captions and filenames

In [88]:
def create_captions(
    classes, texts, category2idx, 
    verbose = True, save=True):
    '''
    helper function to create text_c10 folder
    '''
    cls2count = {k.replace(" ", "_"): 1 for k in category2idx}
    filenames = []
    for index, (cls, text) in enumerate(zip(classes, texts)):
        category = cls[0].replace(" ", "_").replace("&", 'AND')
        cls = cls[0].replace("&", 'AND')
        
        dirname  = "%.3i.%s"   % (category2idx[cls], category)
        filename = "%s_%i.txt" % (category, cls2count[category])

        directory = os.path.join(DATA_PATH, "text_c10/%s" % dirname)
        if not os.path.exists(directory):
            os.makedirs(directory)

        if verbose and (index % 1000) == 0:
            print("%i - %s" % (index, filename))
        
        if save:
            with open(os.path.join(directory, filename), 'wt') as f:
                f.write("%s\n" % normalize(text))    

        filenames.append(os.path.join(dirname, filename))
        cls2count[category] += 1       
        
    return filenames

In [91]:
# prepare filenames
filenames = create_captions(classes, texts, category2idx, False, False)

In [92]:
print("N. files: %i, fst: %s" % (len(filenames), filenames[0]))

N. files: 73761, fst: 001.TOPS/TOPS_1.txt


### Create embedding

In [17]:
from embedding.model import Model

MODEL_PATH = '/models/fashion/'

model = Model(
    os.path.join(MODEL_PATH, 'frozen_model.pb'),
    os.path.join(MODEL_PATH, 'tokenizer.pickle'))

Using TensorFlow backend.


Loading the graph


In [41]:
def get_batch(list_, batch_size):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(list_), batch_size):
        yield list_[i:i + batch_size]
        
def prepare_embeddings(texts, model, limit=None, batch_size=128):
    # normalize texts
    text_list = [normalize(text) for text in texts[:limit].reshape(-1, 1)]
    
    hs = []
    for index, batch in enumerate(get_batch(text_list, batch_size)):
        if index and index % 100:
            print("Processing batch number %i" % index)
        
        hs.extend([h.reshape(1, -1) for h in model.embed(batch)])

    return hs

## Stratified split

Due to the fact, that we have high clas imbalance we have to split the dataset using stratified samplig

In [22]:
from sklearn.model_selection import StratifiedShuffleSplit

In [23]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=0)
train_idx, test_idx = list(sss.split(class_info, class_info))[0]

In [24]:
# check if splits are behaving the way we expect

def split_hist(data, split):
    c = Counter(np.array(data)[split]).most_common()
    N = float(sum([v for (k, v) in c]))
    cp = [(k, v/ N) for (k, v) in c ]
    return np.array([v for (k, v ) in sorted(cp, key = lambda (k, v): k)])

print("MSE of diffrences: %0.5f" % np.sum(np.power(
    split_hist(class_info, train_idx) - split_hist(class_info, test_idx
    ), 2)))

MSE of diffrences: 0.00000


In [60]:
images[test_idx].shape

(46690, 256, 256, 3)

In [69]:
test_idx[:10]

array([107372, 137760,  88487,  38586, 109923, 171122,  75016,  85465,
       106474,   3371])

In [110]:
from scipy.misc import imresize

def dump_all(class_info, filenames, images, texts, split, model, outdir):
    
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    
    print('Selecting splits')
    imgs = images[split]
    txts = texts[split]
    
    class_info_ = np.array(class_info)[split].tolist()
    filenames_ = np.array(filenames)[split].tolist()

    print("N. files: %i, fst: %s" % (len(filenames_), filenames_[0]))
    print("N. examples: %i, fst: %s" %(len(class_info_), class_info_[0]))
    
    print("Saving class info")
    with open(os.path.join(outdir, 'class_info.pickle '), 'wb') as f:
        pickle.dump(class_info_, f)
        
    print("Saving filenames")
    with open(os.path.join(outdir, 'filenames.pickle '), 'wb') as f:
        pickle.dump(filenames_, f)

    print('Creating 76x76 images')
    img_76  = [imresize(img, [SMALL_SIZE, SMALL_SIZE], 'bicubic') for img in imgs]
    print("Small images: %i, %s" % (len(img_76),  img_76[0].shape))
    
    print("Saving 76x76 images")
    with open(os.path.join(outdir, '76images.pickle'), 'wb') as f:
        pickle.dump(img_76, f)
        
    print('Creating 304x304 images')
    img_304 = [imresize(img, [BIG_SIZE, BIG_SIZE], 'bicubic') for img in imgs]
    print("Big images: %i, %s" % (len(img_304), img_304[0].shape))
    
    print("Saving 304x304 images")
    with open(os.path.join(outdir, '304images.pickle'), 'wb') as f:
        pickle.dump(img_304, f)
    
    print("Creating text embeddings")
    embeddings = prepare_embeddings(txts, model)
    print("Embeddings %i, %s" % (len(embeddings), embeddings[0].shape))
    
    print("Saving embeddings")
    with open(os.path.join(outdir, 'custom_embeddings.pickle'), 'wb') as f:
        pickle.dump(embeddings, f)

In [None]:
dump_all(class_info, filenames, images, texts, test_idx, model, '/data/fashion/test')

Selecting splits
N. files: 46690, fst: 087.Zip_Up_&_Buckled_Boots/Zip_Up_&_Buckled_Boots_582.txt
N. examples: 46690, fst: 87
Saving class info
Saving filenames
Creating 76x76 images
Small images: 46690, (76, 76, 3)
Saving 76x76 images
Creating 304x304 images
Big images: 46690, (304, 304, 3)
Saving 304x304 images


In [115]:
dump_all(class_info, filenames, images, texts, train_idx, model, '/data/fashion/train')

Selecting splits
N. files: 140069, fst: 026.Hoodies_&_Zipups/Hoodies_&_Zipups_844.txt
N. examples: 140069, fst: 26
Saving class info
Saving filenames
Creating 76x76 images
Small images: 140069, (76, 76, 3)
Saving 76x76 images
Creating 304x304 images
Big images: 140069, (304, 304, 3)
Saving 304x304 images
Creating text embeddings
Processing batch number 1
Processing batch number 2
Processing batch number 3
Processing batch number 4
Processing batch number 5
Processing batch number 6
Processing batch number 7
Processing batch number 8
Processing batch number 9
Processing batch number 10
Processing batch number 11
Processing batch number 12
Processing batch number 13
Processing batch number 14
Processing batch number 15
Processing batch number 16
Processing batch number 17
Processing batch number 18
Processing batch number 19
Processing batch number 20
Processing batch number 21
Processing batch number 22
Processing batch number 23
Processing batch number 24
Processing batch number 25
Pro

# Birds dataset

In [44]:
# print some statistics to know how to split fashion dataset
train_size = 8855
test_size = 2933
print("%.02f" % (1.0 * train_size / (train_size + test_size)))

0.75


In [45]:
data_path = '/data/birds/test/'

In [46]:
!ls { data_path }

304images.pickle  char-CNN-RNN-embeddings.pickle  custom-embeddings.pickle
76images.pickle   class_info.pickle		  filenames.pickle


In [47]:
import os
import pickle

In [48]:
birds_class_info        = pickle.load(open(os.path.join(data_path, 'class_info.pickle'), 'rb'))
birds_filenames         = pickle.load(open(os.path.join(data_path, 'filenames.pickle'), 'rb'))
birds_images76          = pickle.load(open(os.path.join(data_path, '76images.pickle'), 'rb'))
birds_custom_embeddings = pickle.load(open(os.path.join(data_path, 'custom-embeddings.pickle'), 'rb'))

In [49]:
len(birds_class_info), birds_class_info[0]

(2933, 1)

In [50]:
len(birds_filenames), birds_filenames[0]

(2933, u'001.Black_footed_Albatross/Black_Footed_Albatross_0046_18')

In [51]:
birds_filenames[:5]

[u'001.Black_footed_Albatross/Black_Footed_Albatross_0046_18',
 u'001.Black_footed_Albatross/Black_Footed_Albatross_0009_34',
 u'001.Black_footed_Albatross/Black_Footed_Albatross_0002_55',
 u'001.Black_footed_Albatross/Black_Footed_Albatross_0074_59',
 u'001.Black_footed_Albatross/Black_Footed_Albatross_0014_89']

In [55]:
type(birds_images76), len(birds_images76), birds_images76[0].shape

(list, 2933, (76, 76, 3))

In [53]:
type(birds_custom_embeddings), len(birds_custom_embeddings), birds_custom_embeddings[0].shape

(list, 2933, (10, 1024))

In [54]:
birds_custom_embeddings[0]

array([[ 0.        ,  0.84947538,  0.75146329, ...,  2.36164188,
         0.        ,  0.        ],
       [ 0.        ,  0.25183675,  0.57483685, ...,  0.09980994,
         0.        ,  0.        ],
       [ 0.        ,  0.53113759,  0.80126029, ...,  3.72051549,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.83978474,  0.56212908, ...,  1.83551645,
         0.        ,  0.        ],
       [ 0.        ,  0.47584864,  0.01191006, ...,  2.09561086,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.73810697, ...,  0.64446282,
         0.        ,  0.        ]], dtype=float32)

### text_c10 folder

In [None]:
! ls /data/birds/text_c10/ | head 

In [None]:
! ls /data/birds/text_c10/001.Black_footed_Albatross  | head

In [None]:
! cat /data/birds/text_c10/001.Black_footed_Albatross/Black_Footed_Albatross_0001_796111.txt

## To be consistent with StackGAN

In [None]:
img_304 = scipy.misc.imresize(img, [BIG_SIZE, BIG_SIZE], 'bicubic')
img_76 = scipy.misc.imresize(img, [SMALL_SIZE, SMALL_SIZE], 'bicubic')

In [None]:
clean_plot_dpi(img_76, SMALL_SIZE) ; clean_plot_dpi(img_304, BIG_SIZE);

In [None]:
from tensorflow.python.client import device_lib

In [None]:
print(device_lib.list_local_devices())