# Introduction

This notebook calculates the proportion of moving area in each video and some simple summary statistics for each group of videos in a single experiment. It also generates visualizations of the summary statistics. The segmentation of the moving area is carried out in the notebook [1_segment_moving_regions](1_segment_moving_regions.ipynb), and its results are visualised here.

NOTE: the only files this notebook writes to disk are the two summary charts, which are saved as HTML files in the current working directory

# Imports

In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import re
import os
from matplotlib import pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid
from tqdm import tqdm
import skimage
import altair as alt
import pandas as pd

from fam13a import utils

# Declare constants

In [None]:
# root of the project, as reported by the project's own `utils.here()` helper
PROJ_ROOT = utils.here()
# directory holding the interim HBEC input data
HBEC_ROOT = os.path.join(PROJ_ROOT, *('data', 'interim', 'hbec'))

# show what is available in the input directory
available_entries = os.listdir(HBEC_ROOT)
print(available_entries)

In [None]:
# identifier of the experiment whose processed outputs are summarised in this notebook
EXP_ID = 'ELN14186_8_perc'
EXP_ROOT = os.path.join(PROJ_ROOT, 'data', 'processed', 'hbec', EXP_ID)

# sub-directories of EXP_ROOT holding the arrays generated during the segmentation process
_SEGMENTED_ROOT = os.path.join(EXP_ROOT, 'segmented')
ROI_ROOT = os.path.join(EXP_ROOT, 'roi')
MAX_FRAME_ROOT = os.path.join(EXP_ROOT, 'max_frame')
SEG_ROOT = os.path.join(_SEGMENTED_ROOT, 'movement')
NOISY_ROOT = os.path.join(_SEGMENTED_ROOT, 'noisy')

# file names are parsed with this pattern to recover the IDs of each video:
#   group 1 -> experimental setup
#   group 2 -> batch ID (a single digit, an underscore, then further digits)
REGEX = r'([a-zA-Z_0-9]*?)_([0-9]_[0-9]*?)_.*'

# Setup

Define some convenience functions. These functions are not put into `src` because they depend heavily on the structure of the DataFrame constructed in this notebook and are only relevant here.

In [None]:
def calc_seg_ratio(row):
    """Return the sum of a video's segmentation mask divided by the sum of its ROI mask.

    Parameters
    ----------
    row
        Object exposing ``seg_root``, ``roi_root`` and ``filename``
        attributes (e.g. one row of the notebook's DataFrame). Both masks
        are read with ``np.load`` from ``<root>/<filename>``.

    Returns
    -------
    The ratio ``seg_mask.sum() / roi_mask.sum()``.
    """
    name = row.filename
    moving_total = np.load(os.path.join(row.seg_root, name)).sum()
    roi_total = np.load(os.path.join(row.roi_root, name)).sum()
    return moving_total / roi_total

def load_imgs(row):
    """Load the four arrays saved for one video.

    Parameters
    ----------
    row
        Object exposing ``max_root``, ``noisy_root``, ``seg_root``,
        ``roi_root`` and ``filename`` attributes. Each array is read with
        ``np.load`` from ``<root>/<filename>``.

    Returns
    -------
    tuple
        ``(max_frame, roi_mask, noisy_mask, seg_mask)`` in that order.
    """
    name = row.filename

    def _read(root):
        # every array of a video shares the same file name, only the directory differs
        return np.load(os.path.join(root, name))

    max_projection = _read(row.max_root)
    noisy = _read(row.noisy_root)
    segmented = _read(row.seg_root)
    roi = _read(row.roi_root)

    return max_projection, roi, noisy, segmented

In [None]:
# get all file names (sorted so the row order of the DataFrame is reproducible)
filenames = sorted(_f for _f in os.listdir(ROI_ROOT) if _f.endswith('npy'))
# compile regex pattern to make repeated use more convenient
pattern = re.compile(REGEX)

# identify the group ID and batch ID for each file
# every entry has exactly 3 fields (filename, grp_id, batch_id); files whose name does not
# match the pattern are kept with missing IDs, so the frame below can always be built with
# its 3 columns, even when no file name matches
file_ids = []
for filename in filenames:
    match = pattern.match(filename)
    grp_id, batch_id = match.groups() if match is not None else (None, None)
    file_ids.append((filename, grp_id, batch_id))

# construct a DataFrame with the extracted IDs and the various data directories
# the directory columns hold the same value in every row; they let the row-wise helper
# functions build full paths from a single row
df = pd.DataFrame(file_ids, columns=['filename', 'grp_id', 'batch_id'])
df['seg_root'] = SEG_ROOT
df['roi_root'] = ROI_ROOT
df['max_root'] = MAX_FRAME_ROOT
df['noisy_root'] = NOISY_ROOT

# Visualization

In [None]:
# every video contributes 4 arrays, returned by `load_imgs` in this fixed order:
# max projection frame, ROI, noisy mask, clean mask
per_video_imgs = df.apply(load_imgs, axis=1)
# flatten into a single list; an image is identified purely by its position in it
imgs = []
for video_imgs in per_video_imgs:
    imgs.extend(video_imgs)

In [None]:
# visualize the 4 images from 1 video as example
imgs_per_video = 4
# position (row of df) of the video used as the example
example_video = 3
# `imgs` is flat, so the example video occupies this slice of it
first_img = example_video * imgs_per_video
last_img = first_img + imgs_per_video

# repeat the IDs so that they line up 1:1 with the flat list of images
grps = np.repeat(df.grp_id.to_list(), imgs_per_video)
batches = np.repeat(df.batch_id.to_list(), imgs_per_video)
fig = plt.figure(figsize=(50, 50))
grid = ImageGrid(fig, 111, nrows_ncols=(2, 2), axes_pad=(0.1, 0.1))
# enumerate from `first_img` so the titles are looked up at the same flat positions as the
# images, i.e. each title carries the IDs of the video that is actually displayed
for idx, (ax, im) in enumerate(zip(grid, imgs[first_img:last_img]), start=first_img):
    ax.imshow(im)
    ax.set_title(f'{grps[idx]} {batches[idx]}')

In [None]:
# visualize the images from all videos: one grid row per video, one column per image type
imgs_per_video = 4
n_videos = len(df)

# repeat the IDs so that they line up 1:1 with the flat list of images
grps = np.repeat(df.grp_id.to_list(), imgs_per_video)
batches = np.repeat(df.batch_id.to_list(), imgs_per_video)
fig = plt.figure(figsize=(500, 100))
# the number of rows follows the number of videos so that no video is dropped by `zip`
# below (and no empty axes are left over); at least 1 row is requested to keep the grid valid
grid = ImageGrid(fig, 111, nrows_ncols=(max(n_videos, 1), imgs_per_video), axes_pad=(0.02, 0.4))
for idx, (ax, im) in enumerate(zip(grid, imgs)):
    ax.imshow(im)
    ax.set_title(f'{grps[idx]} {batches[idx]}')

# Processing

In [None]:
# for every video (row), divide the sum of its segmentation mask by the sum of its ROI mask
seg_ratios = df.apply(calc_seg_ratio, axis='columns')
df['ratio'] = seg_ratios

In [None]:
# the replicate ID is the part of the batch ID that precedes the first underscore
batch_parts = df.batch_id.str.split('_')
df['replicate_id'] = batch_parts.str.get(0)

# Summarise results

In [None]:
# order in which the knockout types (`grp_id` values) are laid out along the
# group axis of the summary charts
knockout_order = [
    'NT',
    'g1',
    'DNAI1',
    'gAA',
]

In [None]:
x_axis_label = 'proportion of moving region'


def produce_porportion_summary_plot(df, x_axis_label, sort_order=None):
    """Build a layered Altair chart summarising `ratio` for each `grp_id`.

    The chart overlays three layers that share the same axes:

    * a large, unfilled black square at the mean `ratio` of each group,
    * one small filled square per row of `df`, coloured by `replicate_id`
      (also shown as tooltip),
    * an error bar per group drawn with ``extent='stdev'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `ratio`, `grp_id` and `replicate_id`.
    x_axis_label : str
        Title of the axis showing `ratio`. NOTE: despite its name, this is
        applied to the y axis; the x axis is always titled 'experiment'.
    sort_order : list, optional
        Order of the `grp_id` values along the x axis. Defaults to the
        notebook-level `knockout_order`.

    Returns
    -------
    The three layers combined with ``+`` into a single layered chart.
    """
    if sort_order is None:
        sort_order = knockout_order

    base = alt.Chart(df).properties(width=300, height=400)
    color_palette = 'dark2'

    # Every layer declares the same axis titles. Layers of one chart share their
    # axes, and titles that differ between layers (including the field-name
    # default used when no title is given) end up merged into a single,
    # comma-separated axis title.
    def group_axis():
        return alt.X('grp_id:N', axis=alt.Axis(title='experiment'), sort=sort_order)

    def ratio_axis(**kwargs):
        return alt.Y('ratio:Q', axis=alt.Axis(title=x_axis_label), **kwargs)

    # mean of each group
    points = base.mark_square(filled=False, color='black', size=300).encode(
        y=ratio_axis(aggregate='mean'),
        x=group_axis(),
    )

    # individual videos, coloured by replicate
    all_points = base.mark_square(size=25, filled=True).encode(
        y=ratio_axis(),
        x=group_axis(),
        color=alt.Color('replicate_id', scale=alt.Scale(scheme=color_palette)),
        tooltip='replicate_id:N',
    )

    # spread of each group
    mean_error_bars = base.mark_errorbar(extent='stdev').encode(
        y=ratio_axis(),
        x=group_axis(),
    )

    return points + all_points + mean_error_bars

In [None]:
# build the summary chart of the raw proportions, save it as a standalone HTML file
# next to the notebook's working directory, then display it as the cell output
proportion_chart_file = f'{EXP_ID}_proportion_moving_region.html'
visualise_chart = produce_porportion_summary_plot(df=df, x_axis_label='proportion of moving region')
visualise_chart.save(proportion_chart_file)
visualise_chart

### Normalised NT average to 1

In [None]:
# work on a copy so that the un-normalised ratios in `df` stay untouched
normalised_mask_summary_df = df.copy()
# rows belonging to a group whose ID contains 'NT'; `na=False` counts rows with a missing
# group ID as "not NT" instead of yielding a NaN entry in the mask, which `.loc` rejects
is_nt = normalised_mask_summary_df.grp_id.str.contains('NT', na=False)
nt_average = normalised_mask_summary_df.loc[is_nt, 'ratio'].mean()
# rescale every ratio so that the NT rows average to 1
# NOTE: if no row matches 'NT', `nt_average` is NaN and so is every normalised ratio
normalised_mask_summary_df['ratio'] = normalised_mask_summary_df['ratio'] / nt_average

In [None]:
# same summary chart, built from the NT-normalised ratios; saved as a standalone HTML
# file and displayed as the cell output
normalised_chart_file = f'{EXP_ID}_proportion_moving_region_normalised_to_nt.html'
visualise_chart = produce_porportion_summary_plot(
    df=normalised_mask_summary_df,
    x_axis_label='proportion of moving region (NT  normalised)',
)
visualise_chart.save(normalised_chart_file)
visualise_chart