In [1]:
# imports
import os
os.sys.path.append(os.path.abspath(".."))

import random

from copy import deepcopy

import pandas as pd
import numpy as np

import keras

from matplotlib import pyplot as plt

import json

import notebooks_utils.analysis as utils

from IPython.display import display

import project.download_content as content

# directory with the dataset metadata files read and written by this notebook
# (class descriptions, label hierarchy and bounding-box annotation CSVs)
METAPATH = os.path.join(content.DATAPATH, 'METADATA')

import cv2

Using TensorFlow backend.


## Loading data

Inspired by the EDA notebook in this project:
http://htmlpreview.github.io/?https://github.com/BAlmeidaS/capstone-udacity-mle/blob/master/EDA.html

### Loading metadata and annotation files

In [2]:
# metadata general files
def load_annotations(split, csv_name, classes):
    """Load the bounding-box annotations of one dataset split.

    The annotation CSV is restricted to the images that were downloaded
    (inner join on ``ImageID``) and gains a ``LabelSemantic`` column holding
    the class name looked up in ``classes``.

    Args:
        split: split identifier forwarded to ``utils.images_downloaded``
            (``'TRAIN'``, ``'VALIDATION'`` or ``'TEST'``).
        csv_name: file name of the annotation CSV inside ``METAPATH``.
        classes: DataFrame with ``class_encode`` and ``class_name`` columns.

    Returns:
        DataFrame with the annotation rows plus the columns contributed by
        ``utils.images_downloaded`` and the ``LabelSemantic`` column.
    """
    return (pd.read_csv(os.path.join(METAPATH, csv_name))
              .merge(utils.images_downloaded(split), on='ImageID', how='inner')
              # left join: annotations without a class description are kept
              .merge(classes, left_on='LabelName', right_on='class_encode',
                     how='left')
              .drop(columns=['class_encode'])
              .rename(columns={'class_name': 'LabelSemantic'}))


# the class description file has no header row, so column names are given here
df_classes_raw = pd.read_csv(os.path.join(METAPATH, "class-descriptions-boxable.csv"),
                             names=['class_encode', 'class_name'])
with open(os.path.join(METAPATH, "bbox_labels_600_hierarchy.json")) as f:
    dict_hierarchy_raw = json.load(f)

df_train_raw = load_annotations('TRAIN', "train-annotations-bbox.csv",
                                df_classes_raw)
df_val_raw = load_annotations('VALIDATION', "validation-annotations-bbox.csv",
                              df_classes_raw)
df_test_raw = load_annotations('TEST', "test-annotations-bbox.csv",
                               df_classes_raw)

In [3]:
# peek at the first two rows of the merged training annotations
df_train_raw.head(2)

Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside,Path,LabelSemantic
0,000002b66c9c498e,xclick,/m/01g317,1,0.0125,0.195312,0.148438,0.5875,0,1,0,0,0,data/TRAIN/train_0/000002b66c9c498e.jpg,Person
1,000002b66c9c498e,xclick,/m/01g317,1,0.025,0.276563,0.714063,0.948438,0,1,0,0,0,data/TRAIN/train_0/000002b66c9c498e.jpg,Person


### Wrangling

In [4]:
# cleaning: remove fully duplicated annotation rows from every split
df_train_raw, df_val_raw, df_test_raw = (
    frame.drop_duplicates()
    for frame in (df_train_raw, df_val_raw, df_test_raw)
)

In [5]:
# creating a hierarchical dict with semantic labels
def changing_to_semantic(tree, *args):
    """Rewrite the ``LabelName`` of ``tree`` and all its descendants, in place.

    Each node's ``LabelName`` is replaced by ``utils.semantic_name`` of its
    current value. When that lookup (or the access to ``LabelName``) raises
    ``KeyError`` the node is labelled ``'Entity'`` instead.

    Args:
        tree: dict node of the hierarchy; children, when present, are listed
            under the ``'Subcategory'`` key.
        *args: labels of the ancestors, accumulated while recursing. They are
            only forwarded to the recursive calls and not otherwise used.

    Returns:
        None. ``tree`` is mutated.
    """
    try:
        semantic = utils.semantic_name(tree['LabelName'])
    except KeyError:
        semantic = 'Entity'
    tree['LabelName'] = semantic

    # nodes without a 'Subcategory' key have no children to descend into
    for child in tree.get('Subcategory', ()):
        changing_to_semantic(child, *args, semantic)
            
# changing_to_semantic mutates its argument, so work on a deep copy and keep
# dict_hierarchy_raw untouched
dict_hierarchy = deepcopy(dict_hierarchy_raw)
changing_to_semantic(dict_hierarchy)

In [6]:
# adding the label of the first (root) node, then the classes loaded from the
# description file
root_encode = pd.DataFrame([['/m/0bl9f', 'Entity']],
                           columns=['class_encode', 'class_name'])
df_encodes = pd.concat([root_encode, df_classes_raw], ignore_index=True)

In [7]:
# tabular view of the parent/child relation between classes
hierarchy_columns = ['Id', 'Label', 'IdParent', 'LabelParent', 'Depth', 'Leaf']
hierarchy_rows = utils.tabularize_hierarchy_dict(dict_hierarchy, df_encodes)
df_hierarchy = pd.DataFrame(hierarchy_rows, columns=hierarchy_columns)


In [8]:
# creating a classes df: the hierarchy table plus the encode of each class and
# of its parent
# step 1 - attach the encode of the class itself
hierarchy_with_encode = (
    df_hierarchy
    .merge(df_encodes, left_on='Label', right_on='class_name')
    .drop(columns=['class_name'])
    .rename(columns={'class_encode': 'Encode'})
)
# step 2 - attach the encode of the parent class
hierarchy_with_parent = (
    hierarchy_with_encode
    .merge(df_encodes, left_on='LabelParent', right_on='class_name')
    .drop(columns=['class_name'])
    .rename(columns={'class_encode': 'EncodeParent'})
)
# step 3 - keep the columns in a fixed, readable order
classes_columns = ['Id', 'Label', 'Encode',
                   'IdParent', 'LabelParent', 'EncodeParent',
                   'Depth', 'Leaf']
df_classes = hierarchy_with_parent[classes_columns]

### Visualize DFs

In [9]:
# show the first two rows of every frame built so far
for frame in (df_train_raw, df_val_raw, df_test_raw, df_classes):
    display(frame.head(2))

Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside,Path,LabelSemantic
0,000002b66c9c498e,xclick,/m/01g317,1,0.0125,0.195312,0.148438,0.5875,0,1,0,0,0,data/TRAIN/train_0/000002b66c9c498e.jpg,Person
1,000002b66c9c498e,xclick,/m/01g317,1,0.025,0.276563,0.714063,0.948438,0,1,0,0,0,data/TRAIN/train_0/000002b66c9c498e.jpg,Person


Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside,Path,LabelSemantic
0,0001eeaf4aed83f9,xclick,/m/0cmf2,1,0.022673,0.9642,0.071038,0.800546,0,0,0,0,0,data/VALIDATION/validation/0001eeaf4aed83f9.jpg,Airplane
1,000595fe6fee6369,xclick,/m/02wbm,1,0.0,1.0,0.0,1.0,0,0,1,0,0,data/VALIDATION/validation/000595fe6fee6369.jpg,Food


Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside,Path,LabelSemantic
0,000026e7ee790996,xclick,/m/07j7r,1,0.071875,0.145313,0.20625,0.391667,0,1,1,0,0,data/TEST/test/000026e7ee790996.jpg,Tree
1,000026e7ee790996,xclick,/m/07j7r,1,0.439063,0.571875,0.264583,0.435417,0,1,1,0,0,data/TEST/test/000026e7ee790996.jpg,Tree


Unnamed: 0,Id,Label,Encode,IdParent,LabelParent,EncodeParent,Depth,Leaf
0,132,Coin,/m/0242l,0,Entity,/m/0bl9f,1,True
1,211,Flag,/m/03120,0,Entity,/m/0bl9f,1,True


### Preprocess data

In [10]:
%%time
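# NOTE: disabled experiment, kept for reference - collects the bbox coordinates
# of each image into lists with a groupby/apply
# df_train = df_train_raw.groupby('ImageID')[['Path']].last()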

# for col in ['XMin', 'XMax', 'YMin', 'YMax']:
#     df_train = df_train.join(df_train_raw.groupby('ImageID')
#                                          [col]
#                                          .apply(list))
                     
# df_train.head(2)

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.34 µs


In [11]:
%%time
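# NOTE: disabled experiment, kept for reference - builds per-image lists of bbox
# coordinates with np.unique/np.split rather than a groupby/apply
# df_train = df_train_raw.groupby('ImageID')[['Path']].last()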

# for col in ['XMin', 'XMax', 'YMin', 'YMax']:
#     keys, values = df_train_raw[['ImageID', col]].sort_values('ImageID').values.T
#     ukeys, index = np.unique(keys, True)
#     arrays = np.split(values, index[1:])
#     df_train = df_train.join(pd.DataFrame({'ImageID': ukeys,
#                                            col:[list(a) for a in arrays]})
#                                .set_index('ImageID'))

# df_train.head(2)

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 2.86 µs


In [12]:
%%time
# write every enriched annotation frame into the metadata directory
for frame, csv_suffix in ((df_train_raw, "/enriched_train_bbox.csv"),
                          (df_val_raw, "/enriched_val_bbox.csv"),
                          (df_test_raw, "/enriched_test_bbox.csv")):
    frame.to_csv(METAPATH + csv_suffix, index=False)

CPU times: user 1min 47s, sys: 1.16 s, total: 1min 48s
Wall time: 1min 48s
