In [3]:
from __future__ import print_function
import pandas as pd
import numpy as np
import os
import time
from tqdm import tqdm
import scipy.misc
import utils
from collections import Counter
import img_augm

In [20]:

PATH_TO_WIKIART = '/export/home/asanakoy/workspace/wikiart/images'


ARTISTS_LIST = ['paul-cezanne', 'vincent-van-gogh', 'amedeo-modigliani', 'camille-pissarro', 'pierre-auguste-renoir',
                'childe-hassam', 'paul-gauguin', 'alfred-sisley', 'claude-monet', 'berthe-morisot']
TECHNIQUES_LIST = ['oil', 'watercolor', 'chalk', 'pastel']


class ArtDataset():
    def __init__(self, path_to_art_dataset, artists_list):
        self.artists_list = artists_list
        # Read dataset with paintings and corresponding labels.
        dataset = pd.read_csv(filepath_or_buffer=path_to_art_dataset,
                                  index_col='image_id',
                                  sep='\t',
                                  encoding='utf-8')
        self.dataset = dataset[dataset['artist_slug'].isin(artists_list)]
        print("Art dataset size for:", artists_list, "is", self.dataset.shape[0])

    def extract_digits(self, s):
        return int(''.join(c for c in s if c.isdigit()))

    def get_df_statistics_given_column(self, df_orig, column_name, verbose=True):
        if column_name not in df_orig.columns:
            print("Column not found in dataframe.")
            return 0
        df = df_orig.copy()
        df.fillna(value='NaN', inplace=True)
        a = df[column_name].values
        col_vals_counts = Counter(a)
        if verbose:
            print(col_vals_counts)
        return col_vals_counts

    def prepare_dataset(self,
                        input_df,
                        wikiart_dir=PATH_TO_WIKIART,
                        to_normalize_date=True,
                        verbose=True):
        print("Prepare given dataset.csv...")
        if 'image_id' not in input_df.columns:
            input_df['image_id'] = input_df.index

        if verbose: print("Input. input_df.shape:", input_df.shape)
        input_df = input_df[['artist_slug', 'technique', 'date', 'image_id']]

        if verbose: print("Leave only relevant columns. Dataframe.shape:", input_df.shape)
        # Leave only specified artists
        input_df = input_df[input_df['artist_slug'].isin(self.artists_list)]
        if verbose: print("Leave only relevant artists. Dataframe.shape:", input_df.shape)

        # Get rid of images without date
        input_df = input_df[~input_df['date'].isnull()]
        if verbose: print("Get rid of unspecified dates. Dataframe.shape:", input_df.shape)

        # Leave only digits in date
        if verbose: input_df['date'] = input_df['date'].apply(self.extract_digits)

        # Normalize dates for each artist separately.

        def date_normalization(df):
            df['date'] = (df['date'] - df['date'].min()) / (df['date'].max() - df['date'].min())
            return df
        if to_normalize_date:
            input_df = pd.concat([date_normalization(input_df[input_df['artist_slug'] == artis_slug])
                                  for artis_slug in self.artists_list])

        # generate column with path to image
        input_df['path'] = input_df['image_id'].apply(lambda x: os.path.join(wikiart_dir, x+'.jpg'))
        input_df.drop(['image_id'], axis=1, inplace=True)

        if verbose:
            print("Dataset statistics for each artist separately.")
            for artist in self.artists_list:
                print("%s:" % artist)
                self.get_df_statistics_given_column(
                    input_df[input_df['artist_slug'] == artist],
                    column_name='technique')
        print("Art dataset preprocessing completed.")

        return input_df

    def get_batch(self, augmentor, batch_size=1):
        """
        Reads data from dataframe data containing path to images in column 'path' and, in case of dataframe,
         also containing artist name, technique name, and period of creation for given artist.
         In case of content images we have only the 'path' column.
        Args:
            data: dataframe with columns ['path', 'artist_slug', 'technique', 'period']
            augmentor: Augmentor object responsible for augmentation pipeline
            batch_size: size of batch
        Returns:
            dictionary with fields: image, artist_slug, artist_slug_onehot
            each containing a batch of corresponding values
        """
    
        batch_image = []
        batch_artist_slug = []
        batch_artist_slug_onehot = []
    
        for _ in range(batch_size):
            row = self.dataset.sample(n=1)
            image = scipy.misc.imread(name=row['path'].values[0], mode='RGB')

            if max(image.shape) > 1800.:
                image = scipy.misc.imresize(image, size=1800./max(image.shape))
            if max(image.shape) < 800:
                # Resize the smallest side of the image to 800px
                alpha = 800. / float(min(image.shape))
                if alpha < 4.:
                    image = scipy.misc.imresize(image, size=alpha)
                    image = np.expand_dims(image, axis=0)
                else:
                    image = scipy.misc.imresize(image, size=[800, 800])
    
            batch_image.append(augmentor(utils.enhance_image(image)).astype(np.float32))
            batch_artist_slug.append(self.artists_list.index(row['artist_slug'].values[0]))
            batch_artist_slug_onehot.append(
                utils.get_one_hot_encoded_vector(l=len(self.artists_list),
                                                 i=self.artists_list.index(row['artist_slug'].values[0]))
            )

        # Now return a batch in correct form
        batch_image = np.asarray(batch_image)
        batch_artist_slug_onehot = np.asarray(batch_artist_slug_onehot)

        return {"image": batch_image,
                "artist_slug": batch_artist_slug,
                "artist_slug_onehot": batch_artist_slug_onehot}

    def initialize_batch_worker(self, queue, augmentor, batch_size=1, seed=228):
        np.random.seed(seed)
        while True:
            batch = self.get_batch(augmentor=augmentor, batch_size=batch_size)
            queue.put(batch)


class PlacesDataset():
    categories_names = \
        ['/a/abbey', '/a/arch', '/a/amphitheater', '/a/aqueduct', '/a/arena/rodeo', '/a/athletic_field/outdoor',
         '/b/badlands', '/b/balcony/exterior', '/b/bamboo_forest', '/b/barn', '/b/barndoor', '/b/baseball_field',
         '/b/basilica', '/b/bayou', '/b/beach', '/b/beach_house', '/b/beer_garden', '/b/boardwalk', '/b/boathouse',
         '/b/botanical_garden', '/b/bullring', '/b/butte', '/c/cabin/outdoor', '/c/campsite', '/c/campus',
         '/c/canal/natural', '/c/canal/urban', '/c/canyon', '/c/castle', '/c/church/outdoor', '/c/chalet',
         '/c/cliff', '/c/coast', '/c/corn_field', '/c/corral', '/c/cottage', '/c/courtyard', '/c/crevasse',
         '/d/dam', '/d/desert/vegetation', '/d/desert_road', '/d/doorway/outdoor', '/f/farm', '/f/fairway',
         '/f/field/cultivated', '/f/field/wild', '/f/field_road', '/f/fishpond', '/f/florist_shop/indoor',
         '/f/forest/broadleaf', '/f/forest_path', '/f/forest_road', '/f/formal_garden', '/g/gazebo/exterior',
         '/g/glacier', '/g/golf_course', '/g/greenhouse/indoor', '/g/greenhouse/outdoor', '/g/grotto', '/g/gorge',
         '/h/hayfield', '/h/herb_garden', '/h/hot_spring', '/h/house', '/h/hunting_lodge/outdoor', '/i/ice_floe',
         '/i/ice_shelf', '/i/iceberg', '/i/inn/outdoor', '/i/islet', '/j/japanese_garden', '/k/kasbah',
         '/k/kennel/outdoor', '/l/lagoon', '/l/lake/natural', '/l/lawn', '/l/library/outdoor', '/l/lighthouse',
         '/m/mansion', '/m/marsh', '/m/mausoleum', '/m/moat/water', '/m/mosque/outdoor', '/m/mountain',
         '/m/mountain_path', '/m/mountain_snowy', '/o/oast_house', '/o/ocean', '/o/orchard', '/p/park',
         '/p/pasture', '/p/pavilion', '/p/picnic_area', '/p/pier', '/p/pond', '/r/raft', '/r/railroad_track',
         '/r/rainforest', '/r/rice_paddy', '/r/river', '/r/rock_arch', '/r/roof_garden', '/r/rope_bridge',
         '/r/ruin', '/s/schoolhouse', '/s/sky', '/s/snowfield', '/s/swamp', '/s/swimming_hole',
         '/s/synagogue/outdoor', '/t/temple/asia', '/t/topiary_garden', '/t/tree_farm', '/t/tree_house',
         '/u/underwater/ocean_deep', '/u/utility_room', '/v/valley', '/v/vegetable_garden', '/v/viaduct',
         '/v/village', '/v/vineyard', '/v/volcano', '/w/waterfall', '/w/watering_hole', '/w/wave',
         '/w/wheat_field', '/z/zen_garden', '/a/alcove', '/a/apartment-building/outdoor', '/a/artists_loft',
         '/b/building_facade', '/c/cemetery']
    categories_names = [x[1:] for x in categories_names]

    def __init__(self, path_to_dataset):
        paths = []
        categories = []

        nmbr_skipped = 0
        categories_skipped = []
        start_time = time.time()
        for category_idx, category_name in enumerate(tqdm(self.categories_names)):
            #category_name = category_name[1:]
            #print("Process %d/%d category." % (category_idx + 1, len(categories_names)))
            if os.path.exists(os.path.join(path_to_dataset, category_name)):
                for file_name in tqdm(os.listdir(os.path.join(path_to_dataset, category_name))):
                    paths.append(os.path.join(path_to_dataset, category_name, file_name))
                    categories.append(category_name)
            else:
                print("Category %s can't be found in path %s. Skip it." %
                      (category_name, os.path.join(path_to_dataset, category_name)))
                nmbr_skipped += 1
                categories_skipped.append(category_name)

        self.dataset = pd.DataFrame(np.array([paths, categories]).T, columns=['path', 'category'])
        print("\n")
        print("Finished. Constructed Places2 dataset of %d images." % len (self.dataset))
        print("Time elapsed: %fs. Categories skipped: %d." % (time.time() - start_time, nmbr_skipped))
        print("Following categories are skipped:", categories_skipped, '\n' * 1)

    def get_batch(self, augmentor, batch_size=1):
        """
        Generate bathes of images with attached labels(place category) in two different formats:
        textual and one-hot-encoded.
        Args:
            augmentor: Augmentor object responsible for augmentation pipeline
            batch_size: size of batch we return
        Returns:
            dictionary with fields: image, label_text, label_onehot
            each containing a batch of corresponding values
        """

        batch_image = []
        batch_class = []
        for _ in range(batch_size):
            row = self.dataset.sample(n=1)
            image = scipy.misc.imread(name=row['path'].values[0], mode='RGB')
            image_class = row['category'].values[0]
            image = scipy.misc.imresize(image, size=2.)
            image_shape = image.shape

            if max(image_shape) > 1800.:
                image = scipy.misc.imresize(image, size=1800. / max(image_shape))
            if max(image_shape) < 800:
                # Resize the smallest side of the image to 800px
                alpha = 800. / float(min(image_shape))
                if alpha < 4.:
                    image = scipy.misc.imresize(image, size=alpha)
                    image = np.expand_dims(image, axis=0)
                else:
                    image = scipy.misc.imresize(image, size=[800, 800])

            batch_image.append(augmentor(utils.enhance_image(image)).astype(np.float32))
            batch_class.append(image_class)

        return {"image": np.asarray(batch_image),
                "label_text": batch_class,
                "label_onehot": np.array(
                    [utils.get_one_hot_encoded_vector(l=len(self.categories_names),
                                                      i=self.categories_names.index(x)) for x in batch_class])
                }

    def initialize_batch_worker(self, queue, augmentor, batch_size = 1, seed = 228):
        np.random.seed(seed)
        while True:
            batch = self.get_batch(augmentor=augmentor, batch_size=batch_size)
            queue.put(batch)


class CocoDataset():
    def __init__(self):
        pass


In [17]:
content_dataset_places = PlacesDataset(path_to_dataset='/export/home/dkotoven/workspace/Places2_dataset/data_large')
art_dataset = ArtDataset(path_to_art_dataset='./datasets/relevant_wikiart_plus2.csv', 
                         artists_list=ARTISTS_LIST)

  0%|          | 0/132 [00:00<?, ?it/s]
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 264188.16it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 250337.46it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 256673.64it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
  4%|▍         | 5/132 [00:00<00:03, 39.84it/s].70it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 258196.82it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 233338.38it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A

Category a/abbey can't be found in path /export/home/dkotoven/workspace/Places2_dataset/data_large/a/abbey. Skip it.



100%|██████████| 5000/5000 [00:00<00:00, 258222.25it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
  7%|▋         | 9/132 [00:00<00:03, 37.77it/s].40it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 235415.51it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 130110.00it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
  9%|▉         | 12/132 [00:00<00:03, 32.22it/s]64it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 223196.25it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 203650.49it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
 13%|█▎        | 17/132 [00:00<00:03, 35.10it/s]63it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 217051.54it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 227644.48it/s][A

Category b/basilica can't be found in path /export/home/dkotoven/workspace/Places2_dataset/data_large/b/basilica. Skip it.
Category b/bayou can't be found in path /export/home/dkotoven/workspace/Places2_dataset/data_large/b/bayou. Skip it.



  0%|          | 0/5000 [00:00<?, ?it/s][A
 15%|█▌        | 20/132 [00:00<00:03, 33.22it/s]37it/s][A
  0%|          | 0/4681 [00:00<?, ?it/s][A
100%|██████████| 4681/4681 [00:00<00:00, 222342.81it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 225733.23it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
 17%|█▋        | 23/132 [00:00<00:03, 32.11it/s]19it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 228321.09it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 236181.72it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
 20%|█▉        | 26/132 [00:00<00:03, 30.63it/s]62it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 199801.07it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 197184.15it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
 22%|██▏       | 29/132 [00:

Category f/fairway can't be found in path /export/home/dkotoven/workspace/Places2_dataset/data_large/f/fairway. Skip it.



100%|██████████| 5000/5000 [00:00<00:00, 175783.68it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
 39%|███▊      | 51/132 [00:01<00:02, 29.46it/s]39it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 232753.10it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 263226.52it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
 41%|████      | 54/132 [00:01<00:02, 29.46it/s]63it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 243843.54it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 201946.32it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 263726.36it/s][A
  0%|          | 0/4939 [00:00<?, ?it/s][A
 44%|████▍     | 58/132 [00:01<00:02, 29.60it/s]57it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 224467.18it/s][A
  0%|        

Category g/gorge can't be found in path /export/home/dkotoven/workspace/Places2_dataset/data_large/g/gorge. Skip it.
Category h/herb_garden can't be found in path /export/home/dkotoven/workspace/Places2_dataset/data_large/h/herb_garden. Skip it.


  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 184808.55it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 229400.02it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 220254.37it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
 55%|█████▍    | 72/132 [00:02<00:01, 31.14it/s]93it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 313321.08it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 213822.73it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 268414.84it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
 58%|█████▊    | 76/132 [00:02<00:01, 30.76it/s]29it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 158887.19it/s][A
  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [0

Category a/apartment-building/outdoor can't be found in path /export/home/dkotoven/workspace/Places2_dataset/data_large/a/apartment-building/outdoor. Skip it.


Finished. Constructed Places2 dataset of 624077 images.
Time elapsed: 5.220313s. Categories skipped: 7.
Following categories are skipped: ['a/abbey', 'b/basilica', 'b/bayou', 'f/fairway', 'g/gorge', 'h/herb_garden', 'a/apartment-building/outdoor'] 

Art dataset size for: ['paul-cezanne', 'vincent-van-gogh', 'amedeo-modigliani', 'camille-pissarro', 'pierre-auguste-renoir', 'childe-hassam', 'paul-gauguin', 'alfred-sisley', 'claude-monet', 'berthe-morisot'] is 2498


In [8]:
augmentor = img_augm.Augmentor(crop_size=[256, 256],
                                       vertical_flip_prb=0.,
                                       hsv_augm_prb=1.0,
                                       hue_augm_shift=0.10,
                                       saturation_augm_shift=0.10, saturation_augm_scale=0.10,
                                       value_augm_shift=0.10, value_augm_scale=0.10,
                                       affine_trnsfm_prb=1.0, affine_trnsfm_range=0.1)

In [21]:
augmentor_empty = img_augm.Augmentor(crop_size=[256, 256],
                                    scale_augm_prb=0., scale_augm_range=0.2,
                                     rotation_augm_prb=0., rotation_augm_range=0.15,

                                     hsv_augm_prb=0., 
                                     hue_augm_shift=0.05,

                                     saturation_augm_shift=0.05, saturation_augm_scale=0.05,
                 value_augm_shift=0.05, value_augm_scale=0.05,
                 affine_trnsfm_prb=0.0, affine_trnsfm_range=0.05,
                 horizontal_flip_prb=0.0,
                 vertical_flip_prb=0.0)

In [22]:
batch_size=8
start_time = time.time()
for _ in range(10):
    batch = art_dataset.get_batch(augmentor=augmentor, batch_size=batch_size)
print("Time elapsed to extract %d batches of %d elements: %.4fs." % (10, batch_size, time.time() - start_time ))

Time elapsed to extract 10 batches of 8 elements: 9.2197s.


In [23]:
batch_size=8
start_time = time.time()
for _ in range(10):
    batch = art_dataset.get_batch(augmentor=augmentor_empty, batch_size=batch_size)
print("Time elapsed to extract %d batches of %d elements with augmentor_empty: %.4fs." % (10, batch_size, time.time() - start_time ))

Time elapsed to extract 10 batches of 8 elements with augmentor_empty: 6.5652s.


In [19]:
batch_size=8
start_time = time.time()
for _ in range(10):
    batch = art_dataset.get_batch(augmentor=augmentor, batch_size=batch_size)
print("Time elapsed to extract %d batches of %d elements without resizing: %.4fs." % (10, batch_size, time.time() - start_time ))

Time elapsed to extract 10 batches of 8 elements without resizing: 8.3197s.
