In [None]:
# imports
import os
import random

import pandas as pd
import numpy as np

from copy import deepcopy

from matplotlib import pyplot as plt
import seaborn as sns

sns.set()

import json

import project.download_content as content

from collections import Counter

import notebooks_utils.analysis as utils
from notebooks_utils import visuals

from IPython.display import display

from plotly import graph_objects as go

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

sns.set_palette(sns.diverging_palette(255, 133, l=60, n=12, center="dark"))

%matplotlib inline

# Directory holding the dataset metadata files; the class-description CSV,
# the hierarchy JSON and the bbox annotation CSVs are all read from here.
METAPATH = os.path.join(content.DATAPATH, 'METADATA')

## Prepare data

Inspired by the EDA notebook in this project.

### Wrangling

In [None]:
# Load the general metadata files from METAPATH.
# Paths are built with os.path.join (as METAPATH itself is) instead of a
# hard-coded "/" separator.

# `names=` is passed explicitly, so pandas treats every row of the file as
# data and labels the two columns as given here.
df_classes_raw = pd.read_csv(
    os.path.join(METAPATH, "class-descriptions-boxable.csv"),
    names=['class_encode', 'class_name'],
)

# JSON is UTF-8 by specification; pin the encoding so the read does not
# depend on the platform's locale default.
with open(os.path.join(METAPATH, "bbox_labels_600_hierarchy.json"),
          encoding="utf-8") as f:
    dict_hierarchy_raw = json.load(f)

# Bounding-box annotations, one frame per dataset split.
df_train = pd.read_csv(os.path.join(METAPATH, "train-annotations-bbox.csv"))
df_val = pd.read_csv(os.path.join(METAPATH, "validation-annotations-bbox.csv"))
df_test = pd.read_csv(os.path.join(METAPATH, "test-annotations-bbox.csv"))

In [None]:
# Cleaning: remove fully duplicated annotation rows from every split.
# Re-running this cell is harmless, since de-duplicating twice changes nothing.
df_train, df_val, df_test = (
    split.drop_duplicates() for split in (df_train, df_val, df_test)
)

In [None]:
# creating a hierarchical dict with semantic labels
def changing_to_semantic(tree, *args):
    """Replace every node's 'LabelName' with its semantic name, in place.

    The node's current 'LabelName' is passed to ``utils.semantic_name``; if
    that lookup (or reading the key itself) raises ``KeyError`` the node is
    labelled 'Entity' instead. The function then recurses into each entry of
    the node's 'Subcategory' list, when that key is present.

    Parameters
    ----------
    tree : dict
        Hierarchy node; it is modified in place.
    *args
        Labels of the ancestors visited so far. They are forwarded to the
        recursive calls but are not otherwise used by this function.

    Returns
    -------
    None
    """
    try:
        label = utils.semantic_name(tree['LabelName'])
    except KeyError:
        label = 'Entity'
    tree['LabelName'] = label

    # A node without children simply has no 'Subcategory' key.
    for child in tree.get('Subcategory', ()):
        changing_to_semantic(child, *args, label)
            
# Convert a deep copy so the raw hierarchy stays unchanged:
# changing_to_semantic rewrites each node's 'LabelName' in place.
dict_hierarchy = deepcopy(dict_hierarchy_raw)
changing_to_semantic(dict_hierarchy)

In [None]:
# Prepend an explicit ('/m/0bl9f', 'Entity') row to the class descriptions so
# the first node of the hierarchy also has an encode/name pair.
# NOTE(review): presumably '/m/0bl9f' is the hierarchy root's code -- confirm.
df_encodes = pd.concat(
    [
        pd.DataFrame({'class_encode': ['/m/0bl9f'], 'class_name': ['Entity']}),
        df_classes_raw,
    ],
    ignore_index=True,
)

In [None]:
# Tabular view of the relation between classes: the rows come from
# utils.tabularize_hierarchy_dict and are labelled with the columns below.
df_hierarchy = pd.DataFrame(
    data=utils.tabularize_hierarchy_dict(dict_hierarchy, df_encodes),
    columns=[
        'Id', 'Label',
        'IdParent', 'LabelParent',
        'Depth', 'Leaf',
    ],
)


In [None]:
# Classes table: the hierarchy rows plus the encode of each class ('Encode')
# and of its parent ('EncodeParent'), both looked up by name in df_encodes.
# Both merges use pandas' default inner join, so a row whose Label or
# LabelParent has no match in df_encodes is dropped.
df_classes = (
    df_hierarchy
    .merge(df_encodes.rename(columns={'class_encode': 'Encode'}),
           left_on='Label', right_on='class_name')
    .drop(columns=['class_name'])
    .merge(df_encodes.rename(columns={'class_encode': 'EncodeParent'}),
           left_on='LabelParent', right_on='class_name')
    .drop(columns=['class_name'])
    .loc[:, ['Id', 'Label', 'Encode',
             'IdParent', 'LabelParent', 'EncodeParent',
             'Depth', 'Leaf']]
)

### Visualize DFs

In [None]:
# Show the first two rows of every frame, in order, with rich rendering.
display(
    df_train.head(2),
    df_val.head(2),
    df_test.head(2),
    df_classes.head(2),
)