### Import dependencies

In [None]:
import pandas as pd
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import os
import dotenv
from dotenv import load_dotenv

In [None]:
# Let python-dotenv populate the process environment before it is queried.
load_dotenv()

# Folder that the loaders below read train.csv / test.csv from.
# This is None when the DATA_DIR variable is not set.
DATA_DIR = os.environ.get("DATA_DIR")

### Load data

In [None]:
def load_train_df() -> pd.DataFrame:
    """Load the training annotations from ``{DATA_DIR}/train.csv``.

    The first CSV column is used as the index, the listed columns are parsed
    as integers, and the short column names are replaced by descriptive ones
    (``img`` -> ``image_id``, ``w`` -> ``x2``, ``h`` -> ``y2``,
    ``rot`` -> ``is_rotten``).

    NOTE(review): the inline comments describe ``w``/``h`` as the width and
    height of the box, yet they are renamed to ``x2``/``y2`` as if they were
    the opposite corner's coordinates. Confirm which reading matches the data.

    Returns:
        The training table with the renamed columns.
    """
    column_dtypes = {
        'img': int,  # Image ID
        'x': int,  # X coordinate of the top-left corner of the bounding box
        'y': int,  # Y coordinate of the top-left corner of the bounding box
        'w': int,  # Width of the bounding box
        'h': int,  # Height of the bounding box
        'rot': int,  # Whether the image shows a rotten pear (1) or not (0)
    }
    renamed_columns = {
        'img': 'image_id',
        'w': 'x2',
        'h': 'y2',
        'rot': 'is_rotten',
    }
    raw_df = pd.read_csv(f'{DATA_DIR}/train.csv', dtype=column_dtypes, index_col=0)
    return raw_df.rename(columns=renamed_columns)

def load_test_df() -> pd.DataFrame:
    """Load the test annotations from ``{DATA_DIR}/test.csv``.

    The first CSV column is used as the index, the listed columns are parsed
    as integers, and the short column names are replaced by descriptive ones
    (``img`` -> ``image_id``, ``w`` -> ``x2``, ``h`` -> ``y2``). Unlike the
    training loader, no ``rot`` label column is declared or renamed here.

    NOTE(review): the inline comments describe ``w``/``h`` as the width and
    height of the box, yet they are renamed to ``x2``/``y2`` as if they were
    the opposite corner's coordinates. Confirm which reading matches the data.

    Returns:
        The test table with the renamed columns.
    """
    column_dtypes = {
        'img': int,  # Image ID
        'x': int,  # X coordinate of the top-left corner of the bounding box
        'y': int,  # Y coordinate of the top-left corner of the bounding box
        'w': int,  # Width of the bounding box
        'h': int,  # Height of the bounding box
    }
    renamed_columns = {
        'img': 'image_id',
        'w': 'x2',
        'h': 'y2',
    }
    raw_df = pd.read_csv(f'{DATA_DIR}/test.csv', dtype=column_dtypes, index_col=0)
    return raw_df.rename(columns=renamed_columns)

In [None]:
# Read both annotation tables once; the following cells work on these frames.
train_df = load_train_df()
test_df = load_test_df()

### Data understanding

In [None]:
def add_dimensions(df: pd.DataFrame) -> None:
    """Add ``width`` and ``height`` columns to ``df`` in place.

    ``width`` is ``x2 - x`` and ``height`` is ``y2 - y``, computed row by row.
    The frame is modified directly and nothing is returned.
    """
    # (new column, column to subtract from, column to subtract)
    derived_columns = (
        ('width', 'x2', 'x'),
        ('height', 'y2', 'y'),
    )
    for target, far_edge, near_edge in derived_columns:
        df[target] = df[far_edge] - df[near_edge]

In [None]:
# Attach the derived 'width' and 'height' columns to both frames (in place).
add_dimensions(train_df)
add_dimensions(test_df)

In [None]:
# One entry per subplot cell of the 2x2 grid:
# (row, col, subplot title, source frame, column, trace name, x-axis title)
panels = [
    (1, 1, "Training Set (Width)", train_df, 'width', 'Width (train)', "Width"),
    (1, 2, "Training Set (Height)", train_df, 'height', 'Height (train)', "Height"),
    (2, 1, "Test Set (Width)", test_df, 'width', 'Width (test)', "Width"),
    (2, 2, "Test Set (Height)", test_df, 'height', 'Height (test)', "Height"),
]

# Grid layout: training set on the first row, test set on the second
fig = make_subplots(rows=2, cols=2, subplot_titles=tuple(panel[2] for panel in panels))

# One probability-normalised histogram per cell
for row, col, _, frame, column, trace_name, _ in panels:
    fig.add_trace(
        go.Histogram(x=frame[column], name=trace_name, opacity=0.7, nbinsx=50, histnorm='probability'),
        row=row,
        col=col,
    )

# Axis titles: the measured dimension on x, "Probability" on every y axis
for row, col, _, _, _, _, axis_title in panels:
    fig.update_xaxes(title_text=axis_title, row=row, col=col)
for row, col, *_ in panels:
    fig.update_yaxes(title_text="Probability", row=row, col=col)

# Figure-level title; the legend is hidden
fig.update_layout(
    title='Distribution of the width and height of the bounding boxes in the training and test sets',
    showlegend=False,
)

# Render the figure
fig.show()

In [None]:
# Bare expression at the end of the cell: shows the training frame, which now
# also carries the 'width' and 'height' columns added above.
train_df