### Tasks
0. Unify the code: data loading, data structures, imports.

 - 0.1. Add a quick EDA section before the viz tasks

1. Make pie plots of the number of keypoints and the number of images / annotations for each dataset, like the following:

2. Animal / image size analysis on each bird dataset (histogram), like the example below.

3. keypoints vs. bounding box analysis (how many boxes look unusual? how many keypoints are occluded vs. not occluded? Try to make histogram and nice analysis)

4. visualize the keypoints and bounding box on 10 images from each bird dataset

5. Run a k-means clustering to detect outliers and visualize outliers vs. centroid keypoints

### TODOs
- methods => into scripts
    - data loading scripts: data.py
    - visualisation/plotting scripts: visualisation.py
    - stats script: stats.py
    - utilities script: utils.py

In [8]:
# imports
import matplotlib.ticker as ticker
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from src.data import *
from src.calcs import *
from src.utils import *
from src.visualisation import *

In [None]:
# Data loading: locate the merged COCO-style dataset on disk and read both splits.
# NOTE(review): `root` is an absolute, machine-specific path — adjust it before
# running this notebook on another machine.
root = "/home/dikra/media/dikra/PhD/DATA/DLC24_Data/tiny_all_bird_merged_coco"
images = "/".join((root, "images"))
annotations = "/".join((root, "annotations"))
# One annotation file per split, both stored under the annotations folder.
train_file, test_file = (f"{annotations}/{split}.json" for split in ("train", "test"))

# Read the train split first, then the test split.
train_data = load_data(train_file)
test_data = load_data(test_file)

# EDA: Exploratory Data Analysis 
Print the number of labeled images, the number of annotated keypoints, the labels of the body parts, ...

# Task1

In [None]:
# Count annotations and labelled keypoints in the training split.
# The split loaded by the data-loading cell is `train_data`; no cell defines a
# bare `data`, so passing it would raise NameError on a fresh kernel.
# NOTE(review): both results are used as mappings below (.keys() / .values());
# presumably keyed by source dataset name — confirm in src.calcs.
train_annotations_by_dataset, train_keypoints_by_dataset = count_keypoints_annotations(train_data)
# Last expression of the cell: display both results.
train_annotations_by_dataset, train_keypoints_by_dataset

In [None]:
# Split each per-dataset mapping into parallel label/size lists for the pie charts.
# Keys become the wedge labels and values the wedge sizes; both lists follow the
# mapping's own iteration order, so labels and sizes stay aligned.
train_keypoints_labels = [*train_keypoints_by_dataset.keys()]
train_keypoints_sizes = [*train_keypoints_by_dataset.values()]

train_annotations_labels = [*train_annotations_by_dataset.keys()]
train_annotations_sizes = [*train_annotations_by_dataset.values()]

In [None]:
# Plot the training-split pie charts in the left column of a 2x2 grid.
# NOTE(review): `plt` and `autopct_format` are not imported explicitly in this
# notebook; they are presumably provided by the `from src... import *` lines.
fig, axs = plt.subplots(2, 2, figsize=(18, 12))

# Adjust the spacing at the top to avoid overlap
fig.subplots_adjust(top=0.85)

# One entry per chart: (axes, wedge sizes, wedge labels, title).
pie_specs = [
    (axs[0, 0], train_keypoints_sizes, train_keypoints_labels,
     'Distribution of Labelled Train Keypoints in Bird Datasets'),
    (axs[1, 0], train_annotations_sizes, train_annotations_labels,
     'Distribution of Annotated Images in Train  Bird Datasets'),
]
for pie_ax, pie_sizes, pie_labels, pie_title in pie_specs:
    pie_ax.pie(pie_sizes, labels=pie_labels, autopct=autopct_format(pie_sizes), startangle=140, textprops={'fontsize': 10})
    pie_ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    pie_ax.set_title(pie_title, pad=20)

# Only the left column is populated: remove the right-hand axes so they are not
# rendered as empty frames (same handling as the histogram cells below).
for unused_ax in axs[:, 1]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

# Task2

In [None]:
# Extract bounding-box proportions for both splits.
# NOTE(review): the next cell treats each result as a mapping whose values are
# lists (they are concatenated with `+`) — confirm against src.calcs.
train_proportions_by_dataset = extract_bbox_proportions(train_data)
test_proportions_by_dataset = extract_bbox_proportions(test_data)
# Display the two results computed in this cell (the train entry previously
# showed the Task 1 annotation counts by mistake).
train_proportions_by_dataset, test_proportions_by_dataset

In [None]:
# Combine train and test proportions into one list per dataset.
# Dataset names are sorted so the subplot order is identical on every run;
# iterating a bare set of strings is not order-stable across sessions.
combined_proportions_by_dataset = {
    dataset: train_proportions_by_dataset.get(dataset, []) + test_proportions_by_dataset.get(dataset, [])
    for dataset in sorted(set(train_proportions_by_dataset) | set(test_proportions_by_dataset), key=str)
}

# Plot histograms for each dataset in a grid-like format.
# The grid keeps the 2x2 layout for up to four datasets and grows by whole rows
# beyond that, so no dataset is silently dropped by zip() below.
n_datasets = len(combined_proportions_by_dataset)
n_cols = 2
n_rows = max(2, (n_datasets + n_cols - 1) // n_cols)
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 5 * n_rows))
axes = axes.flatten()

for ax, (dataset, proportions) in zip(axes, combined_proportions_by_dataset.items()):
    ax.hist(proportions, bins=30, edgecolor='black')
    ax.set_title(f'Histogram of Bounding Box Proportions in {dataset}')
    ax.set_xlabel('Proportion of Image Size')
    ax.set_ylabel('Frequency')
    ax.ticklabel_format(style='plain', axis='x')  # Disable scientific notation on x-axis
    # A fresh formatter per axes: tick labels are shown with two decimals.
    ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: '{:.2f}'.format(x)))
    plt.setp(ax.get_xticklabels(), rotation=45)

# Hide any unused subplots
for unused_ax in axes[n_datasets:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


In [None]:
# Compute the summed bounding-box proportions for both splits.
# NOTE(review): the next cell treats each result as a mapping whose values are
# lists; presumably one summed proportion per image — confirm in src.calcs.
train_sum_proportions_by_dataset = sum_bbox_proportions(train_data)
test_sum_proportions_by_dataset = sum_bbox_proportions(test_data)
# Last expression of the cell: display both results.
train_sum_proportions_by_dataset, test_sum_proportions_by_dataset

In [None]:
# Combine train and test summed proportions into one list per dataset.
# Dataset names are sorted so the subplot order is identical on every run;
# iterating a bare set of strings is not order-stable across sessions.
combined_sum_proportions_by_dataset = {
    dataset: train_sum_proportions_by_dataset.get(dataset, []) + test_sum_proportions_by_dataset.get(dataset, [])
    for dataset in sorted(set(train_sum_proportions_by_dataset) | set(test_sum_proportions_by_dataset), key=str)
}

# Plot histograms for each dataset in a grid-like format.
# The grid keeps the 2x2 layout for up to four datasets and grows by whole rows
# beyond that, so no dataset is silently dropped by zip() below.
n_datasets = len(combined_sum_proportions_by_dataset)
n_cols = 2
n_rows = max(2, (n_datasets + n_cols - 1) // n_cols)
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 5 * n_rows))
axes = axes.flatten()

# NOTE(review): title and axis labels are identical to the per-box histogram
# figure; consider wording that says these are summed proportions.
for ax, (dataset, proportions) in zip(axes, combined_sum_proportions_by_dataset.items()):
    ax.hist(proportions, bins=30, edgecolor='black')
    ax.set_title(f'Histogram of Bounding Box Proportions in {dataset}')
    ax.set_xlabel('Proportion of Image Size')
    ax.set_ylabel('Frequency')
    ax.ticklabel_format(style='plain', axis='x')  # Disable scientific notation on x-axis
    # A fresh formatter per axes: tick labels are shown with two decimals.
    ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: '{:.2f}'.format(x)))
    plt.setp(ax.get_xticklabels(), rotation=45)

# Hide any unused subplots
for unused_ax in axes[n_datasets:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


# Task 3

In [None]:
# Bounding-box width/height analysis on the training split.
# NOTE(review): assumes calculate_width_height returns two parallel collections
# (widths, heights) — confirm in src.calcs.
bbox_widths, bbox_heights = calculate_width_height(train_data)
plot_width_height(bbox_widths, bbox_heights)

In [None]:
# Derive bounding-box areas from the widths and heights computed in the
# width/height cell, then plot them.
bbox_area = calculate_area(bbox_widths, bbox_heights)
plot_area(bbox_area)

In [None]:
# Derive a per-box ratio from the widths and heights, then plot it.
# NOTE(review): presumably the width/height aspect ratio — confirm the
# orientation (w/h vs. h/w) in src.calcs.
bbox_ratio = calculate_ratio(bbox_widths, bbox_heights)
plot_ratio(bbox_ratio)

In [None]:
# Keypoint visibility breakdown on the training split: three percentages
# (visible, occluded, unlabeled) are computed and passed to the plotting helper.
# NOTE(review): whether these are overall or per-keypoint percentages is not
# visible here — confirm in src.calcs.
percent_visible, percent_occluded, percent_unlabeled = compute_keypoints(train_data)
plot_keypoint_percent(percent_visible, percent_occluded, percent_unlabeled)

# Task 5

In [None]:
# Gather the training-split keypoints and parse them into poses for Task 5.
all_keypoints = get_all_keypoints(train_data)  # load all keypoint annotations
parsed_poses = keypoint2pose(all_keypoints)  # parse keypoint annotations into (x, y, visibility) triples
parsed_poses.shape  # cell output: shape of the parsed poses

In [None]:
# NOTE(review): `poses_df` is not assigned in any cell visible in this notebook;
# on a fresh top-to-bottom run this raises NameError unless it is created
# elsewhere (presumably a DataFrame built from `parsed_poses` — confirm).
poses_df

In [11]:
# Print the index of the last slot in each consecutive group of three, for 36
# flat positions: 2, 5, ..., 35 (one per line).
# NOTE(review): presumably the visibility slot of 12 flattened
# (x, y, visibility) keypoints — confirm against the pose layout.
print(*range(2, 36, 3), sep="\n")

2
5
8
11
14
17
20
23
26
29
32
35
