# Analysis - Object Detection

This notebook contains the exploratory data analysis for the Object Detection project.

In [None]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

import json

import project.download_content as content

from IPython.display import display

%matplotlib inline

This analysis starts by downloading all the data. To do this, you can use the project's makefile: open a terminal, go to the directory where you cloned this project, and run `make download-content`, then follow the instructions to download the files.

**To run and reproduce this analysis, you must download the METADATA files.** In addition, some cells need the images themselves (the TRAIN, TEST, and VALIDATION image files), so to reproduce the analysis in full you should download those as well (around 550 GB). If you have not downloaded them, those cells will not run completely, but each one will notify you and the rest of the notebook will continue.

The analysis produced with all the image files downloaded is available as an HTML file in the project, called "analysis.html". You do not need to download the images to see it; just open the file in your browser.

## Wrangling

### Gather

In [None]:
# Gathering all files
if not content.does_metadata_exist():
    # Fail early with an actionable message rather than a pandas read error below.
    raise OSError('Some metadata files have not been downloaded yet; '
                  'run `make download-content` and download the METADATA files first.')
print('all files exist...')

METAPATH = content.DATAPATH + "/METADATA"


def _read_split_annotations(split: str) -> tuple:
    """Read the two annotation CSVs of one dataset split from METAPATH.

    Args:
        split: file-name prefix of the split ('train', 'validation' or 'test').

    Returns:
        A ``(bbox, labels)`` pair of DataFrames, read from
        ``<split>-annotations-bbox.csv`` and
        ``<split>-annotations-human-imagelabels-boxable.csv``.
    """
    print(f'Gathering all {split} files...', end='')
    bbox = pd.read_csv(f"{METAPATH}/{split}-annotations-bbox.csv")
    labels = pd.read_csv(
        f"{METAPATH}/{split}-annotations-human-imagelabels-boxable.csv")
    print('OK!')
    return bbox, labels


# metadata general files
print('Gathering all metadata files...', end='')

# The CSV has no header row, so the column names are supplied here.
df_classes_raw = pd.read_csv(METAPATH + "/class-descriptions-boxable.csv",
                             names=['class_encode', 'class_name'],
                             index_col='class_encode')
# Explicit encoding so the result does not depend on the platform default.
with open(METAPATH + "/bbox_labels_600_hierarchy.json", encoding="utf-8") as f:
    dict_hierarchy_raw = json.load(f)
print('OK!')


# train, validation and test files
df_train_bbox_raw, df_train_labels_raw = _read_split_annotations('train')
df_val_bbox_raw, df_val_labels_raw = _read_split_annotations('validation')
df_test_bbox_raw, df_test_labels_raw = _read_split_annotations('test')

### Assess

#### Explaining data

#### Assessing Data

In [None]:
# number of images by dataset
# Each row of a bbox file describes one bounding box, and an image can carry
# several boxes, so images are counted through their distinct ImageID values
# rather than through the row count.
# NOTE(review): relies on the 'ImageID' column of the Open Images bbox CSV
# schema; only images that have at least one box are counted here.
n_images_train = df_train_bbox_raw['ImageID'].nunique()
n_images_val = df_val_bbox_raw['ImageID'].nunique()
n_images_test = df_test_bbox_raw['ImageID'].nunique()

print(f"Number of images: {n_images_train + n_images_val + n_images_test:,}",
      end="\n"*2)

print(f"images in train: {n_images_train:,}")
print(f"images in validation: {n_images_val:,}")
print(f"images in test: {n_images_test:,}")

#### Explaining more about the data

In [None]:
# Class mapping: total number of rows plus a reproducible two-row sample
classes_total = len(df_classes_raw)
classes_preview = df_classes_raw.sample(n=2, random_state=17)

print("####### CLASSES MAPPING #######")
print(f"total classes: {classes_total}")
display(classes_preview)

In [None]:
# Profile every raw annotation frame: size, duplicated rows, and for each
# column its dtype, null count and number of distinct values.
frames_by_title = {
    'Train Bounding Boxes': df_train_bbox_raw,
    'Train Labels': df_train_labels_raw,
    'Validation Bounding Boxes': df_val_bbox_raw,
    'Validation Labels': df_val_labels_raw,
    'Test Bounding Boxes': df_test_bbox_raw,
    'Test Labels': df_test_labels_raw,
}
section_rule = '_' * 80

for k, df in frames_by_title.items():
    n_rows, n_cols = df.shape
    # rows that repeat an earlier row (the first occurrence is not counted)
    n_duplicated = df.duplicated(keep='first').sum()

    print(f"####### {k.upper()} #######", end="\n\n")
    print(f"shape: {n_rows:,} rows, {n_cols} columns")
    print(f"duplicated values: {n_duplicated} records", end="\n\n")

    print("Unique Values:")
    for col in df.columns:
        column = df[col]
        col_label = str(col) + ' '
        dtype_label = str(column.dtype).upper() + ' '
        n_nulls = column.isna().sum()
        n_uniques = column.nunique()
        print(
            f"   {col_label:-<15} {dtype_label:-<10} Nulls = {n_nulls:,} | Uniques = {n_uniques:,}")
    display(df.sample(n=2, random_state=37))
    print(section_rule, end="\n\n")

In [None]:
# classes hierarchy
print("####### CLASSES Hierarchy #######", end="\n"*2)

def count_recursive(tree: dict) -> tuple:
    """Count the nodes of a class hierarchy and collect their labels.

    Every node is a dict with a 'LabelName' entry and, optionally, a
    'Subcategory' entry holding a list of child nodes of the same shape.

    Args:
        tree: root node of the hierarchy.

    Returns:
        A ``(count, labels)`` tuple: the number of nodes visited and their
        'LabelName' values, parent first and children in list order. A label
        that occurs in several nodes is counted and listed each time.

    Raises:
        KeyError: if a node has no 'LabelName' entry.
    """
    nodes = []

    def recursion(node: dict) -> None:
        # Record the parent before descending so labels come out in pre-order.
        nodes.append(node['LabelName'])
        for subcat in node.get('Subcategory', []):
            recursion(subcat)

    recursion(tree)
    # One label is appended per visited node, so the length is the node count.
    return len(nodes), nodes

# Sanity check of count_recursive on a small hand-built hierarchy
def test_count_recursive():
    """Check node count and visiting order on an eight-node sample tree."""

    def make_node(label, *children):
        # Leaves carry no 'Subcategory' key at all, only inner nodes do.
        node = {'LabelName': label}
        if children:
            node['Subcategory'] = list(children)
        return node

    sample = make_node(
        '1',
        make_node('2'),
        make_node('3', make_node('4')),
        make_node('5', make_node('6', make_node('7'))),
        make_node('8'),
    )

    total, labels = count_recursive(sample)
    assert total == 8
    assert labels == ['1', '2', '3', '4', '5', '6', '7', '8']
test_count_recursive()

num_classes, classes = count_recursive(dict_hierarchy_raw)

print(f"There are {num_classes} classes in the JSON hierarchy", end="\n\n")

# encode stored at the root of the hierarchy
root_encode = dict_hierarchy_raw['LabelName']
print(f"The first node class encode is: {root_encode}", end="\n\n")

# defining a node to consult (position inside the root's 'Subcategory' list)
i = 17
picked_node = dict_hierarchy_raw['Subcategory'][i]

print(f"the {i}th son encode of the first node: {picked_node['LabelName']}")
print(f"The sons of the {i}th node:")
for subcat in picked_node['Subcategory']:
    print("   " + str(subcat))

In [None]:
# Print "<encode> - <name>" for every hierarchy node; encodes that are missing
# from df_classes_raw are collected in errs and printed with an empty name.
errs = []
for encoded_name in classes:
    semantic_name = ''
    try:
        semantic_name = df_classes_raw.loc[encoded_name, 'class_name']
    except KeyError:
        errs.append(encoded_name)
    print(f"{encoded_name} - {semantic_name}")

In [None]:
# hierarchy encodes whose lookup in df_classes_raw raised KeyError
errs

### Clean

## EDA