# Checking conservation of fluorescence

In [9]:
import numpy as np
import pandas as pd
import scipy.io
import bokeh.plotting
import bokeh.io
import holoviews as hv
import mscl.image
import mscl.plotting
import mscl.mcmc
import tqdm
import skimage.io
import scipy.signal
import skimage.segmentation
import skimage.morphology
import theano.tensor as tt
import statsmodels.tools.numdiff as smnd
import pymc3 as pm
import glob

# Use MATLAB python API to read in files. 
# import matlab.engine as matlab
# try:
#     eng = matlab.connect_matlab()
# except:
#     eng = matlab.start_matlab()
bokeh.io.output_notebook()
# hv.extension('bokeh')
# Define the root directory for the experiment.
root_dir = '../../data/images/20171026_sfGFP_10ngmL_dilution/'

## Measuring the mean autofluorescence 

In [15]:
field_files = glob.glob('../../data/images/20171026_sfGFP_10ngmL_dilution/20171026_fluorescent_slide/*/*.tif')
field_im = skimage.io.ImageCollection(field_files)
field_avg = np.mean(field_im, axis=0)

In [16]:
# Flatten the image.
def im_to_float(im): 
    im_float = (im - im.min()) / (im.max() - im.min())
    im_blur = skimage.filters.gaussian(im_float, sigma=5)
    im_sub = im_float - im_blur
    return im_sub

def otsu_thresh(im):
    im_sub = im_to_float(im)
    thresh = skimage.filters.threshold_otsu(im_sub)
    seg = skimage.segmentation.clear_border(im_sub < thresh)
    seg = skimage.morphology.remove_small_objects(seg)
    mask = skimage.measure.label(seg)
    selem = skimage.morphology.disk(2)
    mask = skimage.morphology.dilation(mask, selem) 
    return mask

In [17]:
# Load the images.
auto_bf_f = np.sort(glob.glob('{0}*auto_snaps/Pos*/*Brightfield*.tif'.format(root_dir)))
auto_bf_im = skimage.io.ImageCollection(auto_bf_f)
auto_gfp_f  = np.sort(glob.glob('{0}*auto_snaps/Pos*/*GFP*.tif'.format(root_dir)))
auto_gfp_im = skimage.io.ImageCollection(auto_gfp_f)

# Segment via Otsu.
auto_df = pd.DataFrame([], columns=['total_fluorescence', 'area'])
for bf, gfp in zip(auto_bf_im, auto_gfp_im):
    # Segment
    mask = otsu_thresh(bf)
    flat = mscl.image.generate_flatfield(gfp, field_avg)
    
    props = skimage.measure.regionprops(mask, flat)
    cell_dict = {}
    for prop in props:
        cell_dict['area'] = prop.area * 0.16**2
        cell_dict['total_fluorescence'] = prop.mean_intensity * prop.area 
        auto_df = auto_df.append(cell_dict, ignore_index=True)

In [38]:
p = mscl.plotting.imshow(field_avg)
bokeh.io.show(p)

In [18]:
mean_auto = np.mean(auto_df['total_fluorescence'])
mean_auto

161916.86585617642

## Parsing SuperSegger `cell.mat` fies

In [19]:
def cell_to_dict(file, add_props=None, excluded_props=None):
    """
    Reads a single cell file and produces a dictionary containing
    the properties of interest. 
    
    The returned properties are 
    * birth - frame number at which the cell was born.
    * death - frame number at which the cell died.
    * divide - bool for an observed cell division.
    * ID - integer ID number of the cell.
    * motherID - integer ID number of the mother cell.
    * sisterID - integer ID number of the sister cell.
    * birth_fluo - fluorescence value at the cell's birth.
    * death_fluo - fluorescence value at the cell's death.
    * daughter_1_ID - integer ID number of the first daughter.
    * daughter_2_ID - integer ID number of the second daughter.
    
    
    Parameters
    ----------
    file: str
        Path of the cell file. This must be in a `.mat` format.
    add_props : dict, default None
        Dictionary of additional properties (not found in the mat file)
        to be included in the returned dictionary.
    excluded_props: list of str
        Properties of cell.mat file to be ignored. These must be 
        exactly how they are defined in the cell file.
    
    Returns
    -------
    cell_dict : dictionary
        Dictionary of all extracted properties from the cell files. 
    """
    
    # Ensure the supplied file is actually a .mat and other types are correct. 
    if file.split('.')[-1] != 'mat':
        raise TypeError("supplied file {0} is not a `.mat` file.".format(file))
    if add_props is not None and type(add_props) is not dict:
        raise TypeError("add_props is {0} and not dict.".format(type(add_props)))                  
    if excluded_props is not None and type(excluded_props) is not list:
        raise TypeError("add_props must be list. Type is currently {0}.".format(type(excluded_props)))
                        
    # Define the values of interest.
    vals = ['birth', 'death', 'divide', 'ID', 'motherID', 'sisterID', 
             'daughter_1_ID', 'daughter_2_ID', 'birth_fluo', 'death_fluo'] 
    
    # Load the mat file using MATLAB.
    mat = scipy.io.loadmat(file, squeeze_me=True,
                      chars_as_strings=True,
                      struct_as_record=True)
    
    # Assemble the dictionary for constant properties. 
    cell_dict = {v: mat[v] for v in vals[:-4]}
    daughters = mat['daughterID']
    
    # Determine  if daughters were produced. If not, change ID to NaN.
    if len(daughters) == 0:
        daughter_1, daughter_2 = None,  None
    else:
        daughter_1, daughter_2 = daughters 
    cell_dict['daughter_1_ID'] = daughter_1
    cell_dict['daughter_2_ID'] = daughter_2
     
    # Extract fluorescence information -- This is a bit gross but checked.
    try:
        fluo = [mat['CellA'][i]['fl1'].flatten()[0].flatten()[0][0] for i, _ in enumerate(mat['CellA'])]
        nonzero = [f for f in fluo if f != 0] 
        num_exposures = len(nonzero)
        cell_dict['birth_fluo'] = nonzero[0] 
        cell_dict['death_fluo'] = nonzero[-1]
    except:
        cell_dict['birth_fluo'] = 0
        cell_dict['death_fluo'] = 0
        num_exposures = 0 
    cell_dict['num_exposures'] = num_exposures
    # Deal with exclusion and addition of props. 
    if excluded_props is not None:
        new_dict = {}
        keys = cell_dict.keys()
        for key in keys:
            if key not in excluded_props:
                new_dict[key] = cell_dict[key]
        cell_dict = new_dict
    if add_props is not None:
        for key in add_props.keys(): 
            cell_dict[key] = add_props[key]
                        
    # Return the cell dictionary.
    return cell_dict

def parse_cell_files(files, **kwargs):
    """
    Executes cell_to_dict across a list of files and returns a Pandas DataFrame.
    """
    if type(files) is not list:
        raise TypeError("'files' is type {0} not list.".format(type(files)))
    for i, f in enumerate(files):
        cell_dict = cell_to_dict(f, **kwargs)  
        if i == 0:
            keys = cell_dict.keys()
            df = pd.DataFrame([], columns=keys)
            df = df.append(cell_dict, ignore_index=True)
        else:
            df = df.append(cell_dict, ignore_index=True)
    return df            

Compute the autofluorescence distribution

In [20]:
# Restrict area to reasonable bounds.
cells = auto_df.loc[auto_df['area'] < 4]
mean_auto = cells['total_fluorescence'].mean()
mean_auto

114116.71557484045

Parse all position files 

In [21]:
# Assemble the entire DataFrame.
data_dir = '{0}growth/xy*'.format(root_dir)

# Grab all of the positions.
positions = glob.glob(data_dir)

# Loop through each position and generate the DataFrame.
for i, pos in enumerate(positions):
    # Figure out the position number.
    num = int(pos.split('xy')[-1])
    
    # Grab all of the files.
    files = glob.glob('{0}/cell/*.mat'.format(pos))
    
    # Generate the DataFrame.
    _df = parse_cell_files(files, add_props={'position': num})
    if i == 0:
        df = _df
    else:
        df = df.append(_df, ignore_index=True) 

In [33]:
# Restrict data frame to only those that divided or died on last frame. 
producers = df[(df['divide'] == 1) | (df['death']==37)].copy()

# Remove anomolous zeros.
measured = producers.loc[(producers['birth_fluo'] > 0) & (producers['death_fluo'] > 0)]
measured['birth_fluo'] -= mean_auto
measured['death_fluo'] -= mean_auto

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Group by the mother ID's  and position and look at the conservation of fluorescence. 

In [34]:
grouped = measured.groupby(['position', 'motherID'])
# Iterate through and compute the total fluorescence and mother fluorescence.

summed = []
mother = []
for g, d in grouped:
    mother_fluo = (measured[(measured['ID']==g[1]) & (measured['position']==g[0])]['death_fluo']).sum()
    if mother_fluo != 0:
        mother.append(mother_fluo)
        summed.append(np.sum(d['birth_fluo']))

In [35]:
# compute the expected linear trend.
theo = np.linspace(np.min(mother), np.max(mother), 300)
p = mscl.plotting.boilerplate()
p.circle(mother, summed, color='slategray', alpha=0.5, legend='data',
        size=1)
p.line(theo, theo, color='dodgerblue', line_width=2, legend='conserved fluorescence')
bokeh.io.show(p)

## Measuring the fluctuations. 

In [30]:
dilution_df = pd.DataFrame([], columns=['I_tot', 'I_1', 'I_2', 'sq_diff'])
for g, d in grouped:
    if len(d) == 2:
#         sum_total = d['death_fluo'].sum() 
        sum_total = measured.loc[(measured['ID']==g[1]) & (measured['position']==g[0])]['death_fluo'].values.sum()
        I1 = d.iloc[0]['birth_fluo']
        I2 = d.iloc[1]['birth_fluo']
        sq_diff = (np.diff(d['birth_fluo'])**2)[0]
        dilution_df = dilution_df.append(dict(I_tot=sum_total,
                                             sq_diff=sq_diff, I_1=I1, I_2=I2), ignore_index=True)

###  Deterministic solution

In [31]:
def log_post(alpha, I1, I2, p=0.5):
    # Determine the length.
    k = len(I1)
    
    # Set the deterministic parameters. 
    n1 = I1 / alpha
    n2 = I2 / alpha
    ntot = n1 + n2
    # Compute the binomial coefficient using gamma functions.
    binom = scipy.special.gammaln(ntot + 1).sum() - scipy.special.gammaln(n1 + 1).sum()\
            -scipy.special.gammaln(n2 + 1).sum()
    # Compute the probability
    prob = n1.sum() * np.log(p) + (ntot - n1).sum() * np.log(1 - p)
    
    # Define the prior
    prior = -k * np.log(alpha)
    
    # Compute the log posterior
    return prior + prob + binom
            
def neg_log_post(alpha, I1, I2, p=0.5):
    return -log_post(alpha, I1, I2, p)

In [32]:
# get the best-fit value.
x0 = [0, 2**16]
popt = scipy.optimize.minimize_scalar(neg_log_post, args=(dilution_df['I_1'], dilution_df['I_2']))
alpha = popt.x
hess = smnd.approx_hess([alpha], log_post, args=(dilution_df['I_1'], dilution_df['I_2']))
cov = np.linalg.inv(-hess)[0]
sd = np.sqrt(cov)[0]
print('{0} +/- {1}'.format(alpha, sd))

2340.286380283431 +/- 78.43079820246652


In [54]:
# Bin the data to compute the means. 
bin_width = 85 
bins = np.arange(0, len(dilution_df) + bin_width, bin_width)
sum_total_means, sq_diff_means = [], []
dilution_df.sort_values(by=['I_tot'], inplace=True)
for i in range(1, len(bins)):
    slc = dilution_df.iloc[bins[i-1]: bins[i] + 1]
    sum_total_means.append(slc['I_tot'].mean())
    sq_diff_means.append(slc['sq_diff'].mean())
 


# Plot the raw data and the computed means.
p = mscl.plotting.boilerplate(x_axis_type='log', y_axis_type='log',
                     x_axis_label='I\u2081 + I\u2082',
                     y_axis_label='(I\u2081 + I\u2082)\u00B2',
                     width=600, height=450)
p.circle(dilution_df['I_tot'], dilution_df['sq_diff'], 
         color='slategray', alpha=0.5, size=2, legend='data')
p.circle(sum_total_means, sq_diff_means, color='tomato', size=4,
         legend='avg. of {0} measurements'.format(bin_width))
I_tot_range = np.linspace(2E4,6E5, 500)
# p.line(I_tot_range, popt[0] * I_tot_range, color='dodgerblue', legend='α = {0}'.format(int(popt[0])),
#       line_width=2)
upper_bound = (alpha + sd) * I_tot_range
lower_bound = (alpha - sd) * I_tot_range
p.line(I_tot_range, alpha * I_tot_range, color='dodgerblue', line_width=2)
mscl.plotting.fill_between(p, I_tot_range, lower_bound, upper_bound, color='dodgerblue',
                          alpha=0.3)
p.legend.location = 'bottom_right'
bokeh.io.show(p)

In [74]:
# Plot the posterior.
alpha_range = np.linspace(1500, 3000, 2000)
args = (dilution_df['I_1'], dilution_df['I_2'])
post = [log_post(a, *args) for a in alpha_range]
norm_post = np.exp(post - scipy.special.logsumexp(post))

# Compute the gaussian approximation.
gauss_appx = scipy.stats.norm.pdf(alpha_range, loc=alpha, scale=sd)
gauss_appx = gauss_appx / np.sum(gauss_appx)

# Plot them. 
p = mscl.plotting.boilerplate(width=600, height=500, x_axis_label='α (a.u.)',
                             y_axis_label='g(α | [I\u2081, I\u2082])')
p.line(alpha_range, norm_post, color='dodgerblue', legend='normalized posterior',
      line_width=3)
mscl.plotting.fill_between(p, alpha_range, np.zeros_like(norm_post), norm_post,
                           alpha=0.3)
p.line(alpha_range, gauss_appx, color='tomato', line_dash='dashed', 
      legend='gaussian approximation', line_width=3)
p.ray(x=alpha, y=0, angle=np.pi/2, length=0, color='firebrick', 
     line_width=3, legend='best-fit value')

p.xaxis.axis_label_text_font_size = '1.5em'
p.yaxis.axis_label_text_font_size = '1.5em' 
p.legend.location = 'top_left'
bokeh.io.show(p)