# Descriptive analysis

## Imports and settings

In [None]:
#!pip3 install plotly

In [None]:
import os
import numpy as np
import pandas as pd
import cv2 as cv
import PIL
import matplotlib.pyplot as plt
import plotly.express as px
    
from os import path
from glob import glob

In [None]:
# input directory that is searched for the *.png mask images
PATH_IN = './in/'
# output directory that receives the generated class CSV files
PATH_OUT = './out/'

In [None]:
# list the contents of the input directory (IPython shell escape, needs `ls`)
!ls $PATH_IN

## Single sample check

In [None]:
# pick one sample from the sorted input files
# NOTE: index 11 selects the 12th file, not the first one
sample_idx = 11
f = sorted(glob(path.join(PATH_IN, "*.png")))[sample_idx]
print('file:', f)

# read as rgb image
mask_in = cv.imread(f, cv.IMREAD_COLOR)
if mask_in is None:
    # cv.imread reports unreadable files by returning None instead of raising
    raise IOError('could not read image: ' + f)
mask_in = cv.cvtColor(mask_in, cv.COLOR_BGR2RGB)

# set as binary mask: every pixel with a gray value in [1, 255] is foreground
# (the image is RGB at this point, so the RGB conversion code has to be used)
gray = cv.cvtColor(mask_in, cv.COLOR_RGB2GRAY)
mask = cv.inRange(gray, 1, 255)

# copy of the input that receives the box overlays
mask_out = mask_in.copy()

# detect contours and analyze
print('detected contours')
contours, _ = cv.findContours(mask, cv.RETR_TREE, cv.CHAIN_APPROX_SIMPLE)
for c in contours:
    print('-' * 80)

    # area (truncated to an integer), arc length of the closed contour
    area = int(cv.contourArea(c))
    arcl = cv.arcLength(c, True)
    print('area:', area)
    print('arc length:', arcl)

    # rectangle (bbox), drawn in green
    x, y, w, h = cv.boundingRect(c)
    bbox_min, bbox_max = min(w, h), max(w, h)
    bbox_ratio = bbox_max / bbox_min
    print('bbox short:', bbox_min)
    print('bbox long:', bbox_max)
    print('bbox ratio:', bbox_ratio)
    cv.rectangle(mask_out, (x, y), (x + w, y + h), (0, 255, 0), 2)

    # min area rotated rectangle (rbox), drawn in red
    # sloppy opencv definitions lead to slightly inconsistent results between boundingRect and minAreaRect
    # https://stackoverflow.com/questions/69911364/whats-the-difference-in-results-of-cvboundingrect-and-cvminarearect
    rect = cv.minAreaRect(c)
    # np.int0 was removed in NumPy 2.0; np.intp is the type it was an alias of
    rbox = cv.boxPoints(rect).astype(np.intp)
    a, b = cv.norm(rbox[0], rbox[1]), cv.norm(rbox[1], rbox[2])
    rbox_min, rbox_max = min(a, b), max(a, b)
    # a degenerate contour (single point, straight line) can collapse the
    # integer box to a zero-length side; avoid a ZeroDivisionError in that case
    rbox_ratio = rbox_max / rbox_min if rbox_min > 0 else float('nan')
    print('rbox short:', rbox_min)
    print('rbox long:', rbox_max)
    print('rbox ratio:', rbox_ratio)
    cv.drawContours(mask_out, [rbox], 0, (255, 0, 0), 2)

# plot results: binary mask (left) and input with box overlays (right)
# (the figure gets its own name so that `f` keeps holding the file path)
fig, ax = plt.subplots(1, 2)
fig.set_size_inches(16, 8)
ax[0].imshow(mask)
ax[1].imshow(mask_out)
plt.show()

## Bulk processing

In [None]:
# containers: one row per file and one row per detected instance (contour)
rows_file = []
rows_inst = []

# process all
files = sorted(glob(path.join(PATH_IN, "*.png")))
for f in files:
    name = path.basename(f)

    # read as color image (OpenCV delivers the channels in BGR order)
    mask_in = cv.imread(f, cv.IMREAD_COLOR)
    if mask_in is None:
        # cv.imread reports unreadable files by returning None instead of raising
        raise IOError('could not read image: ' + f)

    # append file row: name, width, height
    rows_file.append([name, mask_in.shape[1], mask_in.shape[0]])

    # set as binary mask: every pixel with a gray value in [1, 255] is foreground
    # (the conversion code matches the BGR channel order of the loaded image)
    gray = cv.cvtColor(mask_in, cv.COLOR_BGR2GRAY)
    mask = cv.inRange(gray, 1, 255)

    # detect contours and analyze; instances are numbered starting at 1
    contours, _ = cv.findContours(mask, cv.RETR_TREE, cv.CHAIN_APPROX_SIMPLE)
    for instance, c in enumerate(contours, start=1):

        # area (truncated to an integer), arc length of the closed contour
        area = int(cv.contourArea(c))
        arcl = cv.arcLength(c, True)

        # rectangle (bbox)
        x, y, w, h = cv.boundingRect(c)
        bbox_min, bbox_max = min(w, h), max(w, h)

        # min area rotated rectangle (rbox)
        # sloppy opencv definitions lead to slightly inconsistent results between boundingRect and minAreaRect
        # https://stackoverflow.com/questions/69911364/whats-the-difference-in-results-of-cvboundingrect-and-cvminarearect
        rect = cv.minAreaRect(c)
        # np.int0 was removed in NumPy 2.0; np.intp is the type it was an alias of
        rbox = cv.boxPoints(rect).astype(np.intp)
        a, b = cv.norm(rbox[0], rbox[1]), cv.norm(rbox[1], rbox[2])
        rbox_min, rbox_max = min(a, b), max(a, b)

        # append instance row
        rows_inst.append([name, instance, area, arcl, bbox_min, bbox_max, rbox_min, rbox_max])

    # exceptional case of no contour: placeholder row with instance number 0
    if not contours:
        rows_inst.append([name, 0, None, None, None, None, None, None])

# create dfs
df_file = pd.DataFrame(rows_file, columns=[
    'file', 'width', 'height'])
df_inst = pd.DataFrame(rows_inst, columns=[
    'file', 'inst', 'area', 'arc_len', 'bbox_min', 'bbox_max', 'rbox_min', 'rbox_max'])

In [None]:
# view files
#df_file

In [None]:
# view instances
#df_inst

## Descriptive analysis

### Feature description

In [None]:
# basic description: summary statistics of the numeric file columns (width, height)
df_file.describe()

In [None]:
# attach the image size of each file to its instance rows
df_inst = df_file.join(df_inst.set_index('file'), on='file')

# derived features, computed up front from the joined frame
area_sqrt = np.sqrt(df_inst['area'])                                         # square root of the area
area_perc = df_inst['area'] / (df_inst['width'] * df_inst['height']) * 100   # area percentage
bbox_ratio = df_inst['bbox_max'] / df_inst['bbox_min']                       # bbox side ratio
rbox_ratio = df_inst['rbox_max'] / df_inst['rbox_min']                       # rbox side ratio

# place every derived column directly behind the column it belongs to
df_inst.insert(df_inst.columns.get_loc('area') + 1, 'area_sqrt', area_sqrt)
df_inst.insert(df_inst.columns.get_loc('area_sqrt') + 1, 'area_perc', area_perc)
df_inst.insert(df_inst.columns.get_loc('bbox_max') + 1, 'bbox_ratio', bbox_ratio)
df_inst.insert(df_inst.columns.get_loc('rbox_max') + 1, 'rbox_ratio', rbox_ratio)

# the image size was only needed for the area percentage
df_inst = df_inst.drop(columns=['width', 'height'])

# basic description
df_inst.describe()

### Visual description

In [None]:
# instances per image
# df_inst holds one row per instance, so plotting its 'inst' column directly
# would show the distribution of instance numbers; the count per image is the
# highest instance number of each file (0 for files without any contour)
df_count = df_inst.groupby('file', as_index=False)['inst'].max()
fig = px.histogram(df_count, x="inst", marginal="box")
fig.update_traces(xbins = dict(start=0.0, size=1.0), selector=dict(type='histogram'))
fig.show()

In [None]:
# distribution of the instance area, in percent of the image area (bin width 0.1)
fig = px.histogram(
    df_inst,
    x='area_perc',
    marginal='box',
    hover_data=df_inst.columns,
).update_traces(
    xbins={'start': 0.0, 'size': 0.1},
    selector={'type': 'histogram'},
)
fig.show()

In [None]:
# instance arc length (bin width 10)
fig = px.histogram(df_inst, x="arc_len", marginal="box", hover_data=df_inst.columns)
fig.update_traces(xbins = dict(start=0.0, size=10), selector=dict(type='histogram'))
fig.show()

In [None]:
# long format: one 'ratio' column plus a 'box' column naming the box type
df_bbr = pd.DataFrame(df_inst['bbox_ratio'].tolist(), columns=['ratio']).assign(box='standard')
df_rbr = pd.DataFrame(df_inst['rbox_ratio'].tolist(), columns=['ratio']).assign(box='rotated')

# stack both box types; reset_index keeps the old row number as 'index' column
df_ratio = pd.concat([df_bbr, df_rbr]).reset_index()

# standard/rotated box ratios, colored by box type (bin width 0.1)
fig = px.histogram(
    df_ratio,
    x='ratio',
    color='box',
    marginal='box',
    hover_data=df_ratio.columns,
).update_traces(
    xbins={'start': 0.0, 'size': 0.1},
    selector={'type': 'histogram'},
)
fig.show()

## Classification by property

In [None]:
# total instance area per image (percent): sum over all instances of a file
df_img = (
    df_inst
    .groupby('file')['area_perc']
    .sum()
    .reset_index()
)
df_img

In [None]:
# distribution of the summed instance area per image, in percent (bin width 0.1)
fig = px.histogram(
    df_img,
    x='area_perc',
    marginal='box',
    hover_data=df_img.columns,
).update_traces(
    xbins={'start': 0.0, 'size': 0.1},
    selector={'type': 'histogram'},
)
fig.show()

In [None]:
# deciles of the per-image area percentage;
# the resulting series is indexed by the quantile levels 0.0, 0.1, ..., 1.0
dcs = df_img['area_perc'].quantile([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
dcs

In [None]:
# classes based on deciles of the per-image area percentage
df_dcs = df_img.copy()
df_dcs['class'] = None
perc = df_dcs['area_perc']

# lowest class: everything up to and including the first decile
df_dcs.loc[perc <= dcs[0.1], 'class'] = '<=10'

# inner classes: half-open intervals (lower, upper] between neighboring deciles
decile_classes = [
    (0.1, 0.2, '>10_20'),
    (0.2, 0.3, '>20_30'),
    (0.3, 0.4, '>30_40'),
    (0.4, 0.5, '>40_50'),
    (0.5, 0.6, '>50_60'),
    (0.6, 0.7, '>60_70'),
    (0.7, 0.8, '>70_80'),
    (0.8, 0.9, '>80_90'),
]
for lower, upper, label in decile_classes:
    df_dcs.loc[(perc > dcs[lower]) & (perc <= dcs[upper]), 'class'] = label

# highest class: everything above the ninth decile
df_dcs.loc[perc > dcs[0.9], 'class'] = '>90'

# write the file/class mapping without the raw area column;
# create the output directory first so that to_csv does not fail on a fresh checkout
df_dcs = df_dcs.drop(columns='area_perc')
os.makedirs(PATH_OUT, exist_ok=True)
df_dcs.to_csv(path.join(PATH_OUT, 'classes_decile.csv'), index=False)

# show
df_dcs

In [None]:
# quartiles: summary statistics of the per-image area percentage;
# the '25%', '50%' and '75%' entries serve as quartile boundaries
qrs = df_img['area_perc'].describe()
qrs

In [None]:
# classes based on quartiles of the per-image area percentage
df_qrs = df_img.copy()
df_qrs['class'] = None
perc = df_qrs['area_perc']

# lowest class: everything up to and including the first quartile
df_qrs.loc[perc <= qrs['25%'], 'class'] = '<=25'

# inner classes: half-open intervals (lower, upper] between neighboring quartiles
quartile_classes = [
    ('25%', '50%', '>25_50'),
    ('50%', '75%', '>50_75'),
]
for lower, upper, label in quartile_classes:
    df_qrs.loc[(perc > qrs[lower]) & (perc <= qrs[upper]), 'class'] = label

# highest class: everything above the third quartile
df_qrs.loc[perc > qrs['75%'], 'class'] = '>75'

# write the file/class mapping without the raw area column;
# create the output directory first so that to_csv does not fail on a fresh checkout
df_qrs = df_qrs.drop(columns='area_perc')
os.makedirs(PATH_OUT, exist_ok=True)
df_qrs.to_csv(path.join(PATH_OUT, 'classes_quartile.csv'), index=False)

# show
df_qrs