In [78]:
from collections import Counter
from glob import glob
import os

import numpy as np
import pandas as pd
import skimage as ski

In [None]:
# Glob pattern: every PNG exactly one directory level below the images folder.
path = "../../data/images/*/*.png"

# Sorted so that positional indices into file_paths are stable between runs;
# later cells pair other lists with file_paths by position.
file_paths = glob(path)
file_paths.sort()

In [18]:
# One record per image path: "type" is the name of the directory that directly
# contains the file, "file" is the file name itself.
#
# os.path.basename/dirname understand the platform's separators (including the
# mixed "/" and "\\" paths glob returns on Windows) and, unlike
# os.path.normcase, do not lowercase the names.
type_to_file = [
    {
        "type": os.path.basename(os.path.dirname(file_path)),
        "file": os.path.basename(file_path),
    }
    for file_path in file_paths
]

# Preview the first few records.
type_to_file[:5]

[{'type': 'bug', 'file': '010.png'},
 {'type': 'bug', 'file': '011.png'},
 {'type': 'bug', 'file': '012.png'},
 {'type': 'bug', 'file': '013.png'},
 {'type': 'bug', 'file': '014.png'}]

In [37]:
# Read every file in file_paths with scikit-image; image_list[k] is the image
# loaded from file_paths[k].
image_list = list(map(ski.io.imread, file_paths))

In [33]:
# Shape of every loaded image, in the same order as image_list / file_paths.
# (Counter is imported in the notebook's top import cell.)
dimensions = [img.shape for img in image_list]

# Tally how many images share each shape to spot outliers.
Counter(dimensions)

Counter({(475, 475, 4): 1024, (474, 475, 4): 1})

In [34]:
# Locate the file whose shape differs from the rest. dimensions and file_paths
# are parallel lists, so the position of the odd shape in one is the position
# of its path in the other. list.index returns the first match only.
anomalous_shape = (474, 475, 4)
anomalous_index = dimensions.index(anomalous_shape)
anomalous_path = file_paths[anomalous_index]
anomalous_path

'../../data/images\\grass\\1010.png'

In [38]:
# Keep only the images with the common shape, then stack them into one array.
expected_shape = (475, 475, 4)
filtered_image_list = []
for img in image_list:
    if img.shape == expected_shape:
        filtered_image_list.append(img)
image_array = np.asarray(filtered_image_list)

In [86]:
# Names of the per-channel histogram fields; the name at position k labels
# channel k of each image's last axis.
distribution_types = ["distribution_red", "distribution_green", "distribution_blue", "distribution_alpha"]

# Records and images are paired purely by position, so a stale or re-filtered
# list would silently attach histograms to the wrong file. Fail loudly instead.
assert len(image_list) == len(type_to_file), (
    f"image_list ({len(image_list)}) and type_to_file ({len(type_to_file)}) are out of sync"
)

# Attach one 256-bin value histogram per channel to each record (in place).
for record, image in zip(type_to_file, image_list):
    for channel_id, column in enumerate(distribution_types):
        # bincount counts occurrences of each value; minlength=256 guarantees a
        # slot for every value 0..255, including values absent from the image.
        # Assumes channel values are integers in 0..255 (8-bit images) —
        # TODO confirm if non-uint8 images are ever loaded.
        record[column] = np.bincount(image[:, :, channel_id].ravel(), minlength=256)

In [87]:
# One row per image record: its type, file name, and the four per-channel
# histogram arrays stored as object cells.
distribution_df = pd.DataFrame(data=type_to_file)
distribution_df

Unnamed: 0,type,file,distribution_red,distribution_green,distribution_blue,distribution_alpha
0,bug,010.png,"[86585, 304, 273, 124, 123, 123, 221, 107, 138...","[86078, 606, 263, 122, 217, 115, 101, 198, 104...","[84122, 2596, 351, 159, 265, 142, 147, 240, 13...","[123452, 37, 34, 25, 22, 21, 20, 14, 19, 11, 2..."
1,bug,011.png,"[84649, 3971, 2765, 861, 660, 167, 82, 69, 77,...","[92201, 484, 118, 95, 95, 65, 62, 64, 60, 59, ...","[88664, 3179, 895, 276, 112, 102, 102, 119, 89...","[150712, 22, 21, 21, 25, 14, 14, 13, 11, 16, 1..."
2,bug,012.png,"[61895, 15544, 280, 168, 135, 120, 125, 125, 1...","[53494, 23902, 266, 202, 130, 121, 118, 119, 1...","[56717, 20622, 283, 158, 146, 110, 108, 102, 1...","[120121, 51, 39, 34, 36, 27, 32, 26, 19, 18, 2..."
3,bug,013.png,"[88217, 113, 95, 78, 76, 73, 54, 57, 88, 57, 7...","[88385, 156, 94, 89, 80, 72, 79, 62, 69, 76, 6...","[88303, 172, 129, 136, 100, 93, 94, 77, 78, 80...","[163016, 25, 31, 22, 28, 21, 23, 26, 13, 19, 1..."
4,bug,014.png,"[32960, 559, 151, 109, 69, 81, 70, 67, 51, 76,...","[33582, 150, 113, 77, 81, 76, 82, 74, 79, 72, ...","[33567, 212, 140, 125, 115, 117, 131, 130, 110...","[154283, 19, 10, 23, 15, 13, 10, 10, 10, 13, 1..."
...,...,...,...,...,...,...
1020,water,961.png,"[2496, 162, 175, 17949, 35398, 90, 47, 56, 40,...","[56118, 73, 48, 55, 58, 61, 51, 47, 52, 46, 55...","[56109, 55, 52, 44, 44, 49, 41, 51, 42, 39, 49...","[167319, 12, 12, 9, 9, 11, 8, 13, 3, 9, 7, 10,..."
1021,water,963.png,"[128, 13, 21, 9, 38, 21, 19, 17, 18, 25, 26, 2...","[169, 13, 18, 17, 18, 17, 14, 10, 13, 15, 12, ...","[194, 15, 17, 22, 16, 13, 10, 14, 14, 20, 16, ...","[180678, 6, 6, 8, 11, 10, 9, 5, 8, 5, 4, 6, 1,..."
1022,water,964.png,"[133, 10, 22, 10, 34, 20, 19, 20, 19, 23, 27, ...","[174, 13, 16, 15, 19, 21, 15, 8, 12, 17, 13, 1...","[198, 13, 15, 20, 18, 15, 10, 13, 15, 17, 20, ...","[180677, 7, 5, 9, 11, 10, 9, 5, 6, 6, 3, 7, 1,..."
1023,water,976.png,"[1414, 59, 62, 60, 61, 66, 68, 73, 103, 69, 68...","[1474, 84, 72, 86, 67, 74, 71, 91, 64, 84, 83,...","[1481, 71, 76, 77, 59, 84, 70, 83, 82, 68, 73,...","[181923, 16, 11, 11, 13, 13, 10, 12, 10, 9, 11..."


In [80]:
# NOTE(review): this cell repeats the DataFrame construction of the previous
# cell and its saved output comes from an earlier execution (In [80] vs
# In [87]), so the table shown below it may be stale — consider removing it.
distribution_df = pd.DataFrame(data=type_to_file)
distribution_df

Unnamed: 0,type,file,distribution_red,distribution_green,distribution_blue,distribution_alpha
0,bug,010.png,"[86585, 304, 273, 124, 123, 123, 221, 107, 138...","[86078, 606, 263, 122, 217, 115, 101, 198, 104...","[84122, 2596, 351, 159, 265, 142, 147, 240, 13...","[123452, 37, 34, 25, 22, 21, 20, 14, 19, 11, 2..."
1,bug,011.png,"[84649, 3971, 2765, 861, 660, 167, 82, 69, 77,...","[92201, 484, 118, 95, 95, 65, 62, 64, 60, 59, ...","[88664, 3179, 895, 276, 112, 102, 102, 119, 89...","[150712, 22, 21, 21, 25, 14, 14, 13, 11, 16, 1..."
2,bug,012.png,"[61895, 15544, 280, 168, 135, 120, 125, 125, 1...","[53494, 23902, 266, 202, 130, 121, 118, 119, 1...","[56717, 20622, 283, 158, 146, 110, 108, 102, 1...","[120121, 51, 39, 34, 36, 27, 32, 26, 19, 18, 2..."
3,bug,013.png,"[88217, 113, 95, 78, 76, 73, 54, 57, 88, 57, 7...","[88385, 156, 94, 89, 80, 72, 79, 62, 69, 76, 6...","[88303, 172, 129, 136, 100, 93, 94, 77, 78, 80...","[163016, 25, 31, 22, 28, 21, 23, 26, 13, 19, 1..."
4,bug,014.png,"[32960, 559, 151, 109, 69, 81, 70, 67, 51, 76,...","[33582, 150, 113, 77, 81, 76, 82, 74, 79, 72, ...","[33567, 212, 140, 125, 115, 117, 131, 130, 110...","[154283, 19, 10, 23, 15, 13, 10, 10, 10, 13, 1..."
...,...,...,...,...,...,...
1020,water,961.png,"[128, 13, 21, 9, 38, 21, 19, 17, 18, 25, 26, 2...","[169, 13, 18, 17, 18, 17, 14, 10, 13, 15, 12, ...","[194, 15, 17, 22, 16, 13, 10, 14, 14, 20, 16, ...","[180678, 6, 6, 8, 11, 10, 9, 5, 8, 5, 4, 6, 1,..."
1021,water,963.png,"[133, 10, 22, 10, 34, 20, 19, 20, 19, 23, 27, ...","[174, 13, 16, 15, 19, 21, 15, 8, 12, 17, 13, 1...","[198, 13, 15, 20, 18, 15, 10, 13, 15, 17, 20, ...","[180677, 7, 5, 9, 11, 10, 9, 5, 6, 6, 3, 7, 1,..."
1022,water,964.png,"[1414, 59, 62, 60, 61, 66, 68, 73, 103, 69, 68...","[1474, 84, 72, 86, 67, 74, 71, 91, 64, 84, 83,...","[1481, 71, 76, 77, 59, 84, 70, 83, 82, 68, 73,...","[181923, 16, 11, 11, 13, 13, 10, 12, 10, 9, 11..."
1023,water,976.png,"[3409, 348, 485, 9672, 32875, 583, 293, 372, 2...","[45359, 406, 127, 147, 235, 72, 92, 71, 90, 80...","[45350, 231, 100, 131, 142, 120, 71, 75, 89, 5...","[152698, 11, 8, 12, 21, 17, 12, 10, 9, 12, 14,..."


In [117]:
# Step 1: average the histogram arrays of all images that share a type
# (one array per type and channel).
mean_by_type = distribution_df.groupby("type")[distribution_types].mean()

# Step 2: unpack each averaged array into one row per array element, keeping
# "type" as the index; all four channel columns are exploded together.
per_bin_wide = mean_by_type.explode(column=distribution_types)

# Step 3: go from one column per channel to long form
# (type, channel, distribution) and turn the "type" index back into a column.
per_bin_long = per_bin_wide.melt(
    ignore_index=False,
    var_name="channel",
    value_name="distribution",
)
avg_distribution_df = per_bin_long.reset_index()
avg_distribution_df

Unnamed: 0,type,channel,distribution
0,bug,distribution_red,26043.771084
1,bug,distribution_red,2296.26506
2,bug,distribution_red,1900.168675
3,bug,distribution_red,3342.409639
4,bug,distribution_red,6626.385542
...,...,...,...
18427,water,distribution_alpha,26.507463
18428,water,distribution_alpha,29.186567
18429,water,distribution_alpha,33.029851
18430,water,distribution_alpha,42.126866


In [134]:
import plotly.express as px
import plotly.io as pio

# Use plotly's "browser" renderer for fig.show().
pio.renderers.default = "browser"

# Recover the histogram bin of every row instead of hard-coding
# "256 bins x 18 types x 4 channels": rows of each (type, channel) group appear
# in bin order, so a running count within the group is the bin index.
plot_df = avg_distribution_df.assign(
    intensity=avg_distribution_df.groupby(["type", "channel"]).cumcount()
)

# One facet row per type; size the figure from the data rather than a constant.
n_types = plot_df["type"].nunique()

fig = px.line(
    plot_df,
    x="intensity",
    y="distribution",
    facet_row="type",
    facet_col="channel",
    title="Average Color Distribution per Type",
    log_y=True,
)
fig.update_layout(height=n_types * 200)
fig.show()