# Analysis - Object Detection

This notebook contains the exploratory data analysis (EDA) for the Object Detection project.

In [None]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

import json

import project.download_content as content

from IPython.display import display

%matplotlib inline

## Wrangling

### Gather

In [None]:
# Stop right away unless the project helper reports that the metadata is present.
if content.does_metadata_exist():
    print('all files exist...')
else:
    raise OSError(f'There are metadata file(s) that did not downloaded yet...')

# Folder holding every metadata file read below.
METAPATH = content.DATAPATH + "/METADATA"

# --- general metadata: class lookup table and class hierarchy ---
print('Gathering all metadata files...', end='')

# The class description file is read with explicit column names; the class
# code becomes the index so names can be looked up by code.
df_classes_raw = pd.read_csv(
    f"{METAPATH}/class-descriptions-boxable.csv",
    names=['class_encode', 'class_name'],
    index_col='class_encode',
)
with open(f"{METAPATH}/bbox_labels_600_hierarchy.json") as f:
    dict_hierarchy_raw = json.load(f)
print('OK!')


# --- train split: bounding boxes and image-level labels ---
print('Gathering all train files...', end='')
df_train_bbox_raw = pd.read_csv(
    f"{METAPATH}/train-annotations-bbox.csv")
df_train_labels_raw = pd.read_csv(
    f"{METAPATH}/train-annotations-human-imagelabels-boxable.csv")
print('OK!')


# --- validation split: bounding boxes and image-level labels ---
print('Gathering all validation files...', end='')
df_val_bbox_raw = pd.read_csv(
    f"{METAPATH}/validation-annotations-bbox.csv")
df_val_labels_raw = pd.read_csv(
    f"{METAPATH}/validation-annotations-human-imagelabels-boxable.csv")
print('OK!')


# --- test split: bounding boxes and image-level labels ---
print('Gathering all test files...', end='')
df_test_bbox_raw = pd.read_csv(
    f"{METAPATH}/test-annotations-bbox.csv")
df_test_labels_raw = pd.read_csv(
    f"{METAPATH}/test-annotations-human-imagelabels-boxable.csv")
print('OK!')

In [None]:
# Inspect the interactive namespace: %whos lists the variables defined so far.
%whos

### Assess

#### Programmatic Visualization

In [None]:
# Preview two rows from each raw annotation frame.
# The fixed random_state keeps the preview identical between runs.
for title, frame in (
    ('Train Bounding Boxes', df_train_bbox_raw),
    ('Train Labels', df_train_labels_raw),
    ('Validation Bounding Boxes', df_val_bbox_raw),
    ('Validation Labels', df_val_labels_raw),
    ('Test Bounding Boxes', df_test_bbox_raw),
    ('Test Labels', df_test_labels_raw),
):
    print(f'{title}:')
    display(frame.sample(n=2, random_state=37))

In [None]:
# number of images by dataset
#
# The bounding-box annotation frames hold one row per box, and a single image
# usually carries several boxes, so `shape[0]` reports the number of boxes
# rather than the number of images. Count distinct image ids instead.
# NOTE(review): assumes the image id column is named 'ImageID', as in the
# Open Images bbox annotation files -- confirm against the sample rows
# displayed by the previous cell.
n_train_images = df_train_bbox_raw['ImageID'].nunique()
n_val_images = df_val_bbox_raw['ImageID'].nunique()
n_test_images = df_test_bbox_raw['ImageID'].nunique()

# Total over the three splits, followed by a blank line.
print(f"Number of images: {n_train_images + n_val_images + n_test_images:,}",
      end="\n"*2)

print(f"images in train: {n_train_images:,}")
print(f"images in validation: {n_val_images:,}")
print(f"images in test: {n_test_images:,}")

In [None]:
# Preview three rows of the class-code -> class-name lookup table
# (fixed seed so the same rows are shown on every run).
print("Classes Mapping: ")
display(df_classes_raw.sample(n=3, random_state=17))

In [None]:
# Walk a small part of the class hierarchy: the root node, then the entry at
# index 17 of the root's 'Subcategory' list, then that entry's own
# 'Subcategory' list.
print("Showing a sample of the hierarchy among the classes:", end="\n\n")
print("The first node class encode: ", dict_hierarchy_raw['LabelName'], end="\n\n")

# Looked up only after the prints above so the output order is unchanged.
picked_node = dict_hierarchy_raw['Subcategory'][17]

print("the 17th son encode of the first node:", picked_node['LabelName'])
print("The sons of the 17th son of the first node:", picked_node['Subcategory'])

In [None]:
# Look up the class name stored for one class code in the lookup table.
df_classes_raw.loc['/m/02wv84t', 'class_name']

### Clean

## EDA