In [85]:
# -*- coding: utf-8 -*-
# IMPORTS

import numpy as np
import os
import codecs  # Needs to be imported because of chinese characters
import pandas as pd
from PIL import *
import pickle
import time
import cv2
import sys
pd.set_option('display.max_columns', 100)
import matplotlib.pyplot as plt
import h5py
%matplotlib inline
# GLOBAL

# Project root. The default is the path the notebook was originally written
# against; set the DEEP_FASHION_ROOT environment variable to run the notebook
# on another machine without editing this cell.
#root = os.getcwd()
root = os.environ.get('DEEP_FASHION_ROOT', '/home/joao/Projeto/Deep_Fashion_Project/')
dataset_folder_path = os.path.join(root, 'Dataset')

# Files under Dataset/Anno
Anno_path = os.path.join(dataset_folder_path, 'Anno')
list_attr_cloth = os.path.join(Anno_path, 'list_attr_cloth.txt')
list_attr_items = os.path.join(Anno_path, 'list_attr_items.txt')
list_attr_type = os.path.join(Anno_path, 'list_attr_type.txt')
list_bbox_consumer2shop = os.path.join(Anno_path, 'list_bbox_consumer2shop.txt')
list_item_consumer2shop = os.path.join(Anno_path, 'list_item_consumer2shop.txt')
list_landmarks_consumer2shop = os.path.join(Anno_path, 'list_landmarks_consumer2shop.txt')

# Files under Dataset/Eval
Eval_path = os.path.join(dataset_folder_path, 'Eval')
list_eval_partition = os.path.join(Eval_path, 'list_eval_partition.txt')

# Image paths found in the annotation files are joined onto this folder
Img_path = os.path.join(dataset_folder_path, 'Img')
# NOTE(review): presumably the array layout convention ('tf' = channels last);
# not referenced by any visible cell -- confirm before relying on it.
data_order = 'tf'
#img_dtype = tables.UInt8Atom()  # dtype in which the images will be saved

In [2]:
# -*- coding: utf-8 -*-
#from config import *

# Returns a list of lists: [category_id, 'category_name_string', category_type_id]

def get_category_id_name_type(path_list_attr_cloth):
    """Parse a fixed-width, UTF-8 attribute list into ``[id, NAME, type_id]`` rows.

    The first two lines of the file are skipped. Every following line is
    numbered from 1; its name is taken from characters 24..65 (stripped,
    spaces replaced by underscores, upper-cased) and its type id is the
    integer found in the last five characters of the line.

    Raises ``ValueError`` if the trailing field of a line is not an integer.
    """
    def parse_row(number, text):
        name = text[24:66].strip().replace(' ', '_').upper()
        type_id = int(text[-5:].strip())
        return [number, name, type_id]

    with codecs.open(path_list_attr_cloth, 'r', 'utf-8') as handle:
        # Skip the two header lines.
        next(handle)
        next(handle)
        return [parse_row(number, text) for number, text in enumerate(handle, 1)]


# Returns a dictionary {category_type_id: 'category_type_name'}
def generate_attr_type_dict(path_list_attr_type):
    """Map 1-based line numbers to normalised type names read from a UTF-8 file.

    The first two lines of the file are skipped. For each remaining line the
    name is the last 37 characters, stripped, with spaces replaced by
    underscores and upper-cased; the key is the line's position counted from 1.
    """
    with codecs.open(path_list_attr_type, 'r', 'utf-8') as handle:
        # Skip the two header lines.
        next(handle)
        next(handle)
        return {
            number: text[-37:].strip().replace(' ', '_').upper()
            for number, text in enumerate(handle, 1)
        }


# Merges into a list of lists:
# [category_id, 'category_name_string', category_type_id, 'category_type_name']
def merge_attr_types_names(attr_type_dict, category_list):
    """Append to every row the name that ``attr_type_dict`` maps its last element to.

    The rows of ``category_list`` are extended in place and the same list
    object is returned. Raises ``KeyError`` if a row's last element is not a
    key of ``attr_type_dict``.
    """
    for row in category_list:
        type_id = row[-1]
        row.append(attr_type_dict[type_id])
    return category_list


# Builds three sets of unique item ids according to the train/val/test partition
def get_item_ids_partition_sets(path_list_eval_partition):
    """Collect the third field of every line into a set chosen by the fourth field.

    The first two lines of the UTF-8 file are skipped. Lines are split on
    whitespace; a fourth field of 'train' or 'val' selects the matching set
    and any other value falls into the test set.

    Returns the tuple ``(train_ids, val_ids, test_ids)``.
    """
    train_ids, val_ids, test_ids = set(), set(), set()
    bucket_for = {'train': train_ids, 'val': val_ids}
    with codecs.open(path_list_eval_partition, 'r', 'utf-8') as handle:
        # Skip the two header lines.
        next(handle)
        next(handle)
        for row in handle:
            fields = row.split()
            status = fields[3]
            # Anything that is neither 'train' nor 'val' counts as test.
            bucket_for.get(status, test_ids).add(fields[2])
    return train_ids, val_ids, test_ids

# Creates a table with the path of every image and its partition group
def gen_processed_list_eval_partition(path_list_eval_partition):
    """Turn the pair-wise partition file into one row per distinct image.

    The file is whitespace-delimited; its second line holds the column names
    (the first line is skipped). Each row names two images
    (``image_pair_name_1`` and ``image_pair_name_2``). For each of those two
    columns the other one is dropped, duplicate rows are removed and the
    column is renamed to ``image_name``; the two results are stacked (first
    the ``image_pair_name_1`` rows, then the ``image_pair_name_2`` rows) with
    a fresh 0..n-1 index.

    Returns the combined ``pandas.DataFrame``.
    """
    # sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
    pairs = pd.read_table(path_list_eval_partition,
                          sep=r'\s+', skiprows=0, header=1)

    def one_side(keep, drop):
        # Keep one image column of the pair, de-duplicate, and normalise its name.
        return (pairs.drop(drop, axis=1)
                     .drop_duplicates()
                     .rename(columns={keep: 'image_name'}))

    consumer_files = one_side('image_pair_name_1', 'image_pair_name_2')
    shop_files = one_side('image_pair_name_2', 'image_pair_name_1')
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    return pd.concat([consumer_files, shop_files], ignore_index=True)

# Generates a table with all annotations
def gen_full_anno(path_list_eval_partition, path_list_landmarks_consumer2shop,
                  path_list_bbox_consumer2shop, path_list_attr_items,
                  bbox=True, item_features = True):
    """Join the per-image partition table with landmark, bbox and item-attribute files.

    Parameters
    ----------
    path_list_eval_partition : path passed to ``gen_processed_list_eval_partition``.
    path_list_landmarks_consumer2shop : whitespace-delimited file whose second
        line is the header; inner-joined on ``image_name``.
    path_list_bbox_consumer2shop : same layout; inner-joined on ``image_name``
        when ``bbox`` is true.
    path_list_attr_items : whitespace-delimited file read without a header
        (first two lines skipped) as ``item_id`` followed by ``Attr1``..``Attr303``;
        merged on ``item_id`` when ``item_features`` is true.
    bbox, item_features : switches for the two optional merges.

    Returns the merged ``pandas.DataFrame``. The attribute merge raises
    ``pandas.errors.MergeError`` if ``item_id`` is not unique in the attribute
    file (``validate="m:1"``).
    """
    # sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
    processed_list_eval_partition = gen_processed_list_eval_partition(path_list_eval_partition)
    landmarks_consumer2shop = pd.read_table(path_list_landmarks_consumer2shop,
                                            sep=r'\s+', skiprows=0, header=1)
    full_anno = processed_list_eval_partition.merge(landmarks_consumer2shop,
                                                    how='inner', on='image_name')
    if bbox:
        bbox_consumer2shop = pd.read_table(path_list_bbox_consumer2shop,
                                           sep=r'\s+', skiprows=0, header=1)
        full_anno = full_anno.merge(bbox_consumer2shop, how='inner', on='image_name')
    if item_features:
        col = ['item_id'] + ['Attr' + str(i) for i in range(1, 304)]
        attr_consumer2shop = pd.read_table(path_list_attr_items,
                                           sep=r'\s+', skiprows=2, header=None, names=col)
        # NOTE(review): this is the only outer merge here; unmatched item_ids on
        # either side yield rows with NaNs -- confirm 'outer' is intended.
        full_anno = full_anno.merge(attr_consumer2shop, how='outer', on='item_id', validate="m:1")
    return full_anno
# Selects which image folders to keep
def gets_from_scope(full_anno,CLOTHING = True, DRESSES=True, TOPS=True, TROUSERS=True ):
    """Return the annotations with a ``folders`` column, minus the switched-off folders.

    ``folders`` is the second '/'-separated component of ``image_name``. Rows
    whose folder equals 'CLOTHING', 'DRESSES', 'TOPS' or 'TROUSERS' are removed
    when the flag of the same name is false; every other row is kept.

    The input frame is left untouched: the column is added to a copy, which
    avoids mutating the caller's data (and the SettingWithCopyWarning that an
    in-place assignment triggers when ``full_anno`` is itself a slice).
    """
    scoped = full_anno.assign(folders=full_anno.image_name.str.split('/').str[1])
    switches = (('CLOTHING', CLOTHING), ('DRESSES', DRESSES),
                ('TOPS', TOPS), ('TROUSERS', TROUSERS))
    excluded = [folder for folder, keep in switches if not keep]
    if excluded:
        scoped = scoped[~scoped.folders.isin(excluded)]
    return scoped
# Splits the table into train, val and test
def split_full_anno(df_full_anno):
    """Return the rows whose ``evaluation_status`` is 'train', 'val' and 'test'.

    The result is a 3-tuple of DataFrames in that order; rows with any other
    status appear in none of them.
    """
    status = df_full_anno.evaluation_status
    return tuple(df_full_anno[status == label] for label in ('train', 'val', 'test'))


# Builds the list of paths used to read the images
def generate_path_list(data):
    """Return the values of ``data.image_name`` as a plain Python list."""
    image_names = data.image_name
    return image_names.tolist()


# Finds the maximum size of the pictures
def get_maximal_sizes(data_train):
    """Return ``(max shape[0], max shape[1])`` over the items of ``data_train``.

    Every item must expose a ``shape`` with at least two entries. The result
    is ``(0, 0)`` when ``data_train`` is empty.
    """
    tallest = widest = 0
    for picture in data_train:
        rows, cols = picture.shape[0], picture.shape[1]
        tallest = max(tallest, rows)
        widest = max(widest, cols)
    return tallest, widest


# Builds the list of np.arrays for the images
def generate_list_images(image_path_list):
    """Read every path in ``image_path_list`` (relative to ``Img_path``) with ``cv2.imread``.

    Returns a list with one ``cv2.imread`` result per input path, in order.
    """
    # Iterate the argument: the previous version looped over the global
    # `train` and silently ignored `image_path_list`.
    return [cv2.imread(os.path.join(Img_path, fname)) for fname in image_path_list]


# Adds black bands below and to the right of the pictures
def _load_padded_rgb(path, i_max, j_max):
    """Read one image as RGB and zero-pad it on the right and bottom to (i_max, j_max, 3)."""
    img = cv2.imread(path)
    # cv2 loads images as BGR, convert to RGB.
    # (cv2.imread returns None for an unreadable file; cvtColor then raises.)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    i_img, j_img = img.shape[0], img.shape[1]
    # np.zeros is float64, so the padded result is float64 as well.
    img = np.hstack((img, np.zeros((i_img, j_max - j_img, 3))))
    return np.vstack((img, np.zeros((i_max - i_img, j_max, 3))))


def create_numpy_data(image_path_list, Img_path, i_max = 301, j_max = 301):
    """Load, pad and vertically stack the images named in ``image_path_list``.

    Parameters
    ----------
    image_path_list : sequence of file names, each joined onto ``Img_path``.
    Img_path : folder the file names are relative to.
    i_max, j_max : height and width every image is zero-padded to; an image
        larger than this makes the padding shape negative and ``np.zeros`` raise.

    Returns
    -------
    A float array of shape ``(len(image_path_list) * i_max, j_max, 3)``: the
    padded images are concatenated along axis 0, not stacked on a new axis.
    An empty ``image_path_list`` yields an array of shape ``(0, j_max, 3)``.
    """
    if len(image_path_list) == 0:
        return np.zeros((0, j_max, 3))
    # Collect first and stack once; stacking inside the loop re-copies the
    # whole accumulated array for every image.
    padded = [_load_padded_rgb(os.path.join(Img_path, fname), i_max, j_max)
              for fname in image_path_list]
    return np.vstack(padded)
'''
def save_hdf5_file(file, path):
    with h5py.File('name-of-file.h5', 'w') as hf:
    hf.create_dataset("name-of-dataset",  data=file)
'''

'\ndef save_hdf5_file(file, path):\n    return 1\n'

In [4]:
if __name__ == '__main__':

    # Category metadata steps (currently disabled):
    #category_list = get_category_id_name_type(list_attr_cloth)

    #attr_type_dict = generate_attr_type_dict(list_attr_type)

    #category_data = merge_attr_types_names(attr_type_dict, category_list)

    #train_ids_set, val_ids_set, test_ids_set = get_item_ids_partition_sets(list_eval_partition)

    # Full annotation table: partition + landmarks + bounding boxes + item attributes.
    df_full_anno = gen_full_anno(
        path_list_eval_partition=list_eval_partition,
        path_list_landmarks_consumer2shop=list_landmarks_consumer2shop,
        path_list_bbox_consumer2shop=list_bbox_consumer2shop,
        path_list_attr_items=list_attr_items,
        item_features=True,
    )

    # Keep only the images stored under the CLOTHING folder.
    clothing_full = gets_from_scope(df_full_anno, CLOTHING=True, DRESSES=False,
                                    TOPS=False, TROUSERS=False)

    df_train, df_val, df_test = split_full_anno(clothing_full)

    # Image loading steps (currently disabled):
    #clothing_train_list_path = generate_path_list(df_train)

    #clothing_val_list_path = generate_path_list(df_val)

    #clothing_test_list_path = generate_path_list(df_test)

    #data_clothing_train = create_numpy_data(clothing_train_list_path ,i_max = 301,\
    #                                        j_max = 301, Img_path = Img_path)

    #data_clothing_val = create_numpy_data(clothing_val_list_path ,i_max = 301,\
    #                                      j_max = 301, Img_path = Img_path)

    #data_clothing_test = create_numpy_data(clothing_test_list_path , i_max = 301,\
    #                                       j_max = 301, Img_path = Img_path)
    

In [86]:
# Upper-body clothing has no landmarks 7 and 8, so only the columns up to
# landmark 6 are exported.

def _label_matrix(frame, first_col, last_col, dtype):
    """Return columns ``first_col``..``last_col`` (inclusive) of ``frame`` as an array of ``dtype``."""
    # `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    return frame.loc[:, first_col:last_col].values.astype(dtype=dtype)


def _save_labels(split, landmark, bbox, features):
    """Write the three label arrays of one split to ``clothing_data_y_<split>_np.h5``."""
    # The context manager closes the file even if a dataset write fails.
    with h5py.File('clothing_data_y_%s_np.h5' % split, 'w') as hf:
        hf.create_dataset('landmark_%s' % split, data=landmark)
        hf.create_dataset('bbox_%s' % split, data=bbox)
        hf.create_dataset('features_%s' % split, data=features)


# Landmark labels: clothes_type_x .. landmark_location_y_6
clothing_train_y_landmark = _label_matrix(df_train, 'clothes_type_x', 'landmark_location_y_6', np.int16)
clothing_test_y_landmark = _label_matrix(df_test, 'clothes_type_x', 'landmark_location_y_6', np.int16)
clothing_val_y_landmark = _label_matrix(df_val, 'clothes_type_x', 'landmark_location_y_6', np.int16)

# Bounding-box labels: clothes_type_y .. y_2
clothing_train_y_bbox = _label_matrix(df_train, 'clothes_type_y', 'y_2', np.int16)
clothing_test_y_bbox = _label_matrix(df_test, 'clothes_type_y', 'y_2', np.int16)
clothing_val_y_bbox = _label_matrix(df_val, 'clothes_type_y', 'y_2', np.int16)

# Item attribute labels: Attr1 .. Attr303
clothing_train_y_features = _label_matrix(df_train, 'Attr1', 'Attr303', np.int8)
clothing_test_y_features = _label_matrix(df_test, 'Attr1', 'Attr303', np.int8)
clothing_val_y_features = _label_matrix(df_val, 'Attr1', 'Attr303', np.int8)

# One HDF5 file per split, written to the current working directory.
_save_labels('train', clothing_train_y_landmark, clothing_train_y_bbox, clothing_train_y_features)
_save_labels('test', clothing_test_y_landmark, clothing_test_y_bbox, clothing_test_y_features)
_save_labels('val', clothing_val_y_landmark, clothing_val_y_bbox, clothing_val_y_features)

In [4]:
# Number of rows per clothes_type_y value in the selected annotations
clothing_full['clothes_type_y'].value_counts()

1    7628
2    4949
Name: clothes_type_y, dtype: int64

In [27]:
# Paths (relative to Img_path) of all training images
train = df_train['image_name'].tolist()
#del(df_train, df_full_anno,df_eval,df_test

# Time how long it takes to read the first ten training images
t0 = time.time()
data_train = list(map(cv2.imread,
                      (os.path.join(Img_path, fname) for fname in train[:10])))
#with open('data_train.pkl', 'wb') as f:
 #   pickle.dump(data_train, f)
time.time() - t0

0.011518001556396484

In [80]:
#df_train.iloc[:,].astype('int16').memory_usage(deep.apply(pd.to_numeric,downcast ='unsigned')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
