# Exploratory Data Analysis

This notebook is partly adapted from: 
* https://www.kaggle.com/code/jeffaudi/eda-airbus-oil-storage-tanks-dataset

In [None]:
import os
from ast import literal_eval

import pandas as pd
import numpy as np
import PIL.ImageDraw
from IPython.display import display
from matplotlib import pyplot as plt
import seaborn as sns

# Disable TensorFlow logging by raising the minimum log level of its native backend.
# NOTE(review): this presumably has to be set before TensorFlow is first imported — confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

# Load the Kedro IPython extension.
# NOTE(review): `catalog` is not defined in the visible cells, so it is presumably
# injected by this extension — confirm against the installed Kedro version.
%load_ext kedro.extras.extensions.ipython

# Read the "annotations" dataset through the Kedro data catalog.
annotations = catalog.load("annotations")

In [None]:
# Show the number of annotated objects per image.
# The per-image counts are built with explicit column names so that the merge
# below does not depend on how the installed pandas version labels the output
# of `value_counts()` (the labels changed in pandas 2.0).
object_counts = (
    annotations["image_id"]
    .value_counts()
    .rename_axis("image_id")
    .reset_index(name="object_count")
)

# One row per annotation, ordered so images with the fewest objects come first;
# countplot then draws the bars in that order.
annotations_by_count = (
    annotations.merge(object_counts, on="image_id")
    .sort_values(by="object_count")
    .reset_index(drop=True)
)

with sns.axes_style("white"):
    plt.figure(figsize=(25, 15))
    plt.title("Number of Oil Storage Tanks per image")
    g = sns.countplot(
        x="image_id",
        data=annotations_by_count,
        palette="bright",
    )
    plt.xlabel("image_id")
    plt.ylabel("n row per image_id")

    # Rotate x labels so the long image ids stay readable
    g.set_xticklabels(labels=g.get_xticklabels(), rotation=90)

    # Persist the figure; the target directory is created on first use.
    os.makedirs("../data/08_reporting/plots", exist_ok=True)
    plt.savefig("../data/08_reporting/plots/number_objects_per_image.png", transparent=True)

In [None]:
# Names of the four coordinate columns extracted from each `bounds` entry.
BBOX_COLUMNS = ["bbox_x1", "bbox_y1", "bbox_x2", "bbox_y2"]

# convert bounds to python object
# Going through `str(x)` keeps this step re-runnable: an already parsed value is
# stringified and parsed again instead of failing.
annotations["bounds"] = annotations["bounds"].apply(
    lambda x: literal_eval(str(x).rstrip("\r\n"))
)

# explode bounds col
# Any bbox columns left over from a previous run of this cell are dropped first,
# so re-running replaces them instead of appending duplicate columns (duplicate
# labels would make `row["bbox_x1"]` return a Series instead of a scalar).
annotations = pd.concat(
    [
        annotations.drop(columns=BBOX_COLUMNS, errors="ignore"),
        pd.DataFrame(
            annotations["bounds"].tolist(),
            index=annotations.index,
            columns=BBOX_COLUMNS,
        ),
    ],
    axis=1,
)

# NOTE(review): not referenced in the visible cells; kept in case later cells use it.
geometry1 = []

def draw_bboxes(
    image_id: str,
    labels: pd.DataFrame,
    images_dir: str = "../data/01_raw/images",
) -> "PIL.Image.Image":
    """Draws the rectangular bounding boxes of one image onto that image.

    Args:
        image_id (str): Identifier of the image. The file
            ``<images_dir>/<image_id>.jpg`` is opened.
        labels (pd.DataFrame): Labels dataframe containing an ``image_id``
            column and the box corner columns ``bbox_x1``, ``bbox_y1``,
            ``bbox_x2`` and ``bbox_y2``.
        images_dir (str): Directory that contains the ``.jpg`` images.

    Returns:
        PIL.Image.Image: RGB copy of the image with every box whose
        ``image_id`` matches outlined in red. If no row matches, the image
        is returned without any box.
    """
    image_path = os.path.join(images_dir, image_id + ".jpg")

    # Open the file once. Converting to RGB loads the pixel data (so the file
    # can be closed right away) and guarantees that the (R, G, B) outline
    # colour is valid whatever the mode of the source image is.
    with PIL.Image.open(image_path) as source:
        img = source.convert("RGB")

    # A single drawing context is reused so that all boxes land on `img`.
    draw = PIL.ImageDraw.Draw(img)

    for _, row in labels[labels["image_id"] == image_id].iterrows():
        x1, y1, x2, y2 = (
            float(row[column])
            for column in ("bbox_x1", "bbox_y1", "bbox_x2", "bbox_y2")
        )
        # Corners in drawing order; `polygon` closes the outline itself.
        draw.polygon(
            [(x1, y1), (x1, y2), (x2, y2), (x2, y1)],
            outline=(255, 0, 0),
        )
    return img

In [None]:
# Render the bounding boxes for one sample image and show the result inline.
sample_image_id = "1fcb9fee-da89-43f8-83d9-b5d17575f5e6"
img = draw_bboxes(sample_image_id, annotations)
display(img)