# Tracking Lineages with SuperSegger 

In [28]:
import numpy as np
import pandas as pd
import mscl_utils as mscl
import pymc3 as pm 
import theano.tensor as tt
import mscl.plotting
import mscl.stats
import bokeh.io
import glob
import bokeh.plotting
import skimage.io
import scipy.io
import matlab.engine as matlab
# Start a MATLAB session through the MATLAB Engine. The resulting `eng` object
# is what the cells below use to load the SuperSegger `.mat` files.
eng = matlab.start_matlab()
# Configure Bokeh so that figures are rendered inside the notebook.
bokeh.io.output_notebook()

This notebook demonstrates how the output from a SuperSegger session is parsed and read into Python. 

## The `Cell` File 

While SuperSegger outputs several useful things, the files we will be the most interested in are the `c/Cell.mat` files in each position. The structure of these outputs is described in great detail [on the SuperSegger Wiki page](). Each individual cell has a corresponding `.mat` file which contains information about the time of birth, time of death, fluorescence information, etc. Most importantly, it contains information about its assigned ID as well as its mother's and sister's IDs. While these files exist as Matlab `.mat` files, we can easily load them into Python using the `scipy.io.loadmat` function or, as is done in this notebook, through the MATLAB Engine for Python.

We can load an example `cell.mat` file to get the key information. 

In [2]:
# Define the data directory for a single image position (xy00 of the growth
# movie). `root_dir` is reused by later cells to locate the other samples.
root_dir = '../../data/images/20171026_sfGFP_10ngmL_dilution/'
data_dir = '{0}growth/xy00/cell/'.format(root_dir)

# Grab all of the mat files. Note that these exist as cell and Cell.
files = glob.glob('{0}*.mat'.format(data_dir))
# Pass the first path to the MATLAB workspace as `f` and load it there; the
# loaded struct comes back as a dictionary-like object (see its keys below).
# NOTE(review): `files[0]` raises IndexError if the glob matches nothing.
eng.workspace['f'] = files[0]
mat_file = eng.eval('load(f)')

# Print out the keys of the mat_file.
mat_file.keys()

dict_keys(['CellA', 'death', 'birth', 'divide', 'sisterID', 'motherID', 'daughterID', 'ID', 'neighbors', 'stat0', 'ehist', 'contactHist'])

In [3]:
# The daughter IDs come back nested; take the first entry and view it as a
# flat NumPy array holding both daughter IDs.
np.array(mat_file['daughterID'][0])

array([ 128.,  129.])

While this looks like a very short list, the `CellA` value is also a dictionary containing information about the segmentation mask, fluorescence, area, etc. for every frame in the time-lapse. We would like to have all cell information for a single position in a tidy pandas DataFrame. Below are two functions which do exactly that. 

In [4]:
def cell_to_dict(file, engine, add_props=None, excluded_props=None):
    """
    Reads a single cell file and produces a dictionary containing
    the properties of interest.

    The returned properties are
    * birth - frame number at which the cell was born.
    * death - frame number at which the cell died.
    * divide - bool for an observed cell division.
    * ID - integer ID number of the cell.
    * motherID - integer ID number of the mother cell.
    * sisterID - integer ID number of the sister cell.
    * daughter_1_ID - integer ID number of the first daughter.
    * daughter_2_ID - integer ID number of the second daughter.
    * birth_fluo - fluorescence value at the cell's birth.
    * death_fluo - fluorescence value at the cell's death.


    Parameters
    ----------
    file: str
        Path of the cell file. This must be in a `.mat` format.
    engine: matlab Engine object
        Engine object of a Matlab instance. Only its `workspace` mapping
        and its `eval` method are used.
    add_props : dict, default None
        Dictionary of additional properties (not found in the mat file)
        to be included in the returned dictionary. These are applied last
        and therefore override extracted properties of the same name.
    excluded_props: list of str, default None
        Properties of cell.mat file to be ignored. These must be
        exactly how they are defined in the cell file.

    Returns
    -------
    cell_dict : dictionary
        Dictionary of all extracted properties from the cell files.

    Raises
    ------
    TypeError
        If `file` does not have a `.mat` extension, `add_props` is not a
        dict, or `excluded_props` is not a list.
    """

    # Ensure the supplied file is actually a .mat and other types are correct.
    if file.split('.')[-1] != 'mat':
        raise TypeError("supplied file {0} is not a `.mat` file.".format(file))
    if add_props is not None and not isinstance(add_props, dict):
        raise TypeError("add_props is {0} and not dict.".format(type(add_props)))
    if excluded_props is not None and not isinstance(excluded_props, list):
        raise TypeError("excluded_props must be list. Type is currently {0}.".format(type(excluded_props)))

    # Properties that are copied verbatim from the mat file.
    constant_props = ['birth', 'death', 'divide', 'ID', 'motherID', 'sisterID']

    # Load the mat file using MATLAB: the path is handed over through the
    # workspace variable `f` and loaded on the MATLAB side.
    engine.workspace['f'] = file
    mat = engine.eval('load(f)')

    # Assemble the dictionary for constant properties.
    cell_dict = {prop: mat[prop] for prop in constant_props}

    # Determine if daughters were produced. If not, the IDs are set to None.
    daughters = mat['daughterID']
    if len(daughters) == 0:
        daughter_1, daughter_2 = None, None
    else:
        # The two IDs are stored together in the first entry.
        daughter_1, daughter_2 = np.array(daughters[0])
    cell_dict['daughter_1_ID'] = daughter_1
    cell_dict['daughter_2_ID'] = daughter_2

    # Extract fluorescence information from the first and last entries of
    # `CellA` (one entry per frame in which the cell exists).
    cell_dict['birth_fluo'] = mat['CellA'][0]['fl1']['sum']
    cell_dict['death_fluo'] = mat['CellA'][-1]['fl1']['sum']

    # Deal with exclusion and addition of props.
    if excluded_props is not None:
        cell_dict = {key: value for key, value in cell_dict.items()
                     if key not in excluded_props}
    if add_props is not None:
        cell_dict.update(add_props)

    # Return the cell dictionary.
    return cell_dict

def parse_cell_files(files, engine, **kwargs):
    """
    Executes cell_to_dict across a list of files and returns a Pandas DataFrame.

    Parameters
    ----------
    files : list of str
        Paths of the cell `.mat` files to parse.
    engine : matlab Engine object
        Engine object of a Matlab instance, handed to `cell_to_dict`.
    **kwargs :
        Forwarded unchanged to `cell_to_dict` (e.g. `add_props`,
        `excluded_props`).

    Returns
    -------
    df : pandas DataFrame
        One row per file. The columns follow the key order of the first
        parsed file. An empty DataFrame is returned for an empty list.

    Raises
    ------
    TypeError
        If `files` is not a list.
    """
    if not isinstance(files, list):
        raise TypeError("'files' is type {0} not list.".format(type(files)))

    # Collect all rows first and build the frame once; growing a DataFrame
    # row-by-row copies it on every iteration.
    records = [cell_to_dict(f, engine, **kwargs) for f in files]
    if not records:
        return pd.DataFrame()

    # Pin the column order to that of the first record.
    return pd.DataFrame(records, columns=list(records[0].keys()))

We can test these functions on the list of files defined above.

In [5]:
# Execute function on a single position. `files` is still the glob of cell
# files for position xy00 defined above.
cell_df = parse_cell_files(files, eng)
# Show only the first rows rather than the whole table.
cell_df.head()

Unnamed: 0,birth,death,divide,ID,motherID,sisterID,daughter_1_ID,daughter_2_ID,birth_fluo,death_fluo
0,11.0,12.0,1.0,117.0,62.0,118.0,128.0,129.0,0.0,0.0
1,7.0,22.0,1.0,103.0,63.0,102.0,221.0,222.0,0.0,0.0
2,37.0,37.0,0.0,473.0,220.0,472.0,,,41546.0,41546.0
3,30.0,37.0,0.0,315.0,127.0,316.0,,,0.0,62298.0
4,29.0,37.0,0.0,301.0,68.0,302.0,,,54448.0,90081.0


With this tidy dataframe, we can extract a wide variety of useful properties and even track cell lineages. An important quantity to know is the autofluorescence of a single cell. We can use this script to parse all of the images from an autofluorescence sample (also segmented via SuperSegger).

In [6]:
# Locate every position directory of the autofluorescence sample.
auto_dir = '{0}auto/xy*'.format(root_dir)
auto_positions = glob.glob(auto_dir)

# Only the ID and the fluorescence at birth are kept for this sample; every
# other extracted property is dropped.
auto_excluded = ['daughter_1_ID', 'daughter_2_ID', 'death_fluo', 'motherID',
                 'sisterID', 'divide', 'birth', 'death']

# Parse the positions one at a time.
auto_dfs = []
for pos in auto_positions:
    # The position number is the integer following the last 'xy' in the path.
    num = int(pos.split('xy')[-1])

    # Glob the cell files and generate the DataFrame, tagged with its position.
    files = glob.glob('{0}/cell/*.mat'.format(pos))
    auto_dfs.append(parse_cell_files(files, eng, add_props={'position': num},
                                     excluded_props=auto_excluded))

# Stack all of the positions into a single DataFrame.
auto_df = pd.concat(auto_dfs, axis=0)

# Report the mean autofluorescence value.
mean_auto = auto_df['birth_fluo'].mean()
print('The mean autofluorescence is {0:.2f} a.u. per cell.'.format(mean_auto))

The mean autofluorescence is 57556.51 a.u. per cell.


For sanity, we can plot the autofluorescence distribution.  

In [7]:
# Compute the ECDF of the data.
# NOTE(review): the import cell binds `mscl` twice (`import mscl_utils as mscl`
# followed by `import mscl.plotting`, which rebinds the name to the `mscl`
# package) -- confirm which module is meant to provide `ecdf` and
# `bokeh_boiler`.
x, y = mscl.ecdf(auto_df['birth_fluo'])

# Set up the figure axis. 
p = mscl.bokeh_boiler(width=800, height=600,
                     x_axis_label='fluorescence (a.u.)',
                     y_axis_label='ECDF')

# Plot the data. 
p.circle(x, y, color='dodgerblue', size=3, alpha=0.5, legend='data')

# Show the mean as a vertical ray (angle of pi / 2) starting at y = 0.
# NOTE(review): length=0 is presumably Bokeh's convention for an unbounded
# ray -- confirm against the Bokeh documentation for the installed version.
p.ray(x=mean_auto, y=[0], length=0, angle=np.pi / 2, line_width=5, color='tomato', legend='mean',
     alpha=0.5)
bokeh.io.show(p)

## Mapping the lineages 

To ensure that the experiment is working as advertised, it's important to show that the total fluorescence is conserved across a lineage. This means that the total fluorescence of a mother cell should be equivalent to the sum of the total fluorescence of all of its daughters. To ensure this is the case, we must be able to identify lineages. As we have information about the mother cell ID, the sister ID, and the daughter IDs, piecing together the family tree should be relatively straightforward.

For this section, we will use all of the data available.

In [8]:
# Glob pattern matching every position directory of the growth movie.
data_dir = '{0}growth/xy*'.format(root_dir)
positions = glob.glob(data_dir)

# Parse each position into its own DataFrame, printing the loop index as a
# simple progress indicator.
dfs = []
for i, pos in enumerate(positions):
    print(i)

    # The position number is the integer following the last 'xy' in the path.
    num = int(pos.split('xy')[-1])

    # Parse every cell file of this position and tag the rows with `num`.
    files = glob.glob('{0}/cell/*.mat'.format(pos))
    dfs.append(parse_cell_files(files, eng, add_props={'position': num}))

0
1
2
3
4
5
6
7
8
9
10
11
12
13


As we only care about lineages, we can consider only the cells that divided and the cells that died on the last frame. 

In [9]:
# Stack all positions into one DataFrame. The per-position row labels are kept
# (no `ignore_index`), so index labels repeat across positions.
df = pd.concat(dfs, axis=0)
# Keep only the cells that divided or that died on frame 26.
# NOTE(review): the final frame is hard-coded as 26 here, but the rows
# displayed below show cells with death == 37 and a later cell filters on
# death == 37 -- confirm which frame index is the last one for this data set.
producers = df.loc[(df['divide']==1.0) | (df['death']==26)].copy()

In [10]:
# Display the combined DataFrame of all positions.
df

Unnamed: 0,birth,death,divide,ID,motherID,sisterID,daughter_1_ID,daughter_2_ID,birth_fluo,death_fluo,position
0,14.0,16.0,0.0,117.0,0.0,0.0,,,0.0,0.0,4.0
1,11.0,25.0,1.0,103.0,21.0,104.0,220.0,221.0,0.0,117126.0,4.0
2,34.0,37.0,0.0,315.0,162.0,316.0,,,0.0,66113.0,4.0
3,33.0,37.0,0.0,301.0,165.0,302.0,,,58259.0,71744.0,4.0
4,34.0,37.0,0.0,329.0,143.0,330.0,,,0.0,65595.0,4.0
5,30.0,37.0,0.0,261.0,109.0,260.0,,,0.0,84260.0,4.0
6,31.0,37.0,0.0,275.0,161.0,274.0,,,0.0,67601.0,4.0
7,29.0,37.0,0.0,249.0,101.0,250.0,,,52451.0,70504.0,4.0
8,9.0,23.0,1.0,88.0,38.0,89.0,187.0,188.0,58777.0,0.0,4.0
9,1.0,16.0,1.0,63.0,0.0,0.0,140.0,141.0,65948.0,0.0,4.0


The original mother cells are those already present in the first frame, which therefore have no recorded mother (a `motherID` of 0). As these produced all offspring, we can assign a new column to the DataFrame indicating their ID as a lineage identifier, hereafter called a `tree_number`.

In [13]:
# Define the tree number for those without a clear parent.
producers.loc[:, 'tree_number'] = np.nan
producers.loc[producers['motherID']==0, 'tree_number'] = producers.loc[producers['motherID']==0]['ID']

We can now separate the DataFrame into two groups $-$ `assigned` and `unassigned` $-$ and change the tree number of their daughters to the correct lineage. As we have mixed positions with degenerate ID numbers, we will have to do this separately for each position.

In [15]:
# Frame index whose cells are exempt from the orphan check, and the maximum
# number of propagation passes per position (a killswitch).
# NOTE(review): 26 is kept from the original code, but a later cell treats 37
# as the final frame -- confirm which value applies to this data set.
final_frame = 26
max_iterations = 50

# Group the DataFrame by position, since cell IDs repeat across positions.
grouped = producers.groupby(['position'])
position_dfs = []  # Position dataframes will be added here.
for _, d in grouped:
    # Break producers into assigned and unassigned.
    assigned = d.loc[~d['tree_number'].isnull()].copy()
    unassigned = d.loc[d['tree_number'].isnull()].copy()

    # Build the tree up front so that a position without any orphans is still
    # stored correctly (instead of reusing the previous position's result).
    tree = pd.concat([assigned, unassigned], axis=0, ignore_index=True)

    # Determine if there are orphans: cells that did not die on the final
    # frame and have no tree number yet.
    orphans = tree.loc[tree['death'] != final_frame, 'tree_number'].isnull().values.any()

    it_counter = 0
    while orphans and it_counter < max_iterations:
        # Get a list of all of the progeny of the already assigned cells.
        daughters = assigned.loc[:, ['daughter_1_ID', 'daughter_2_ID']].values
        tree_numbers = assigned.loc[:, 'tree_number'].values

        # Iterate through all of the unassigned and inherit the tree number
        # of the assigned cell that lists them as a daughter.
        n_new = 0
        for i in range(len(unassigned)):
            cell_id = unassigned.iloc[i]['ID']
            ind = np.where([cell_id in v for v in daughters])[0]
            if len(ind) != 0:
                # Use a scalar (first match) so the assignment cannot fail on
                # a length mismatch.
                unassigned.loc[unassigned['ID'] == cell_id, 'tree_number'] = tree_numbers[ind[0]]
                n_new += 1

        # Concatenate the assigned and unassigned into a new producers dataframe.
        tree = pd.concat([assigned, unassigned], axis=0, ignore_index=True)

        # Determine if there are any remaining orphans.
        orphans = tree.loc[tree['death'] != final_frame, 'tree_number'].isnull().values.any()

        # Redefine the assignments.
        assigned = tree.loc[~tree['tree_number'].isnull()].copy()
        unassigned = tree.loc[tree['tree_number'].isnull()].copy()
        it_counter += 1

        # A pass that assigned nothing cannot be improved by repeating it.
        if n_new == 0:
            break

    # Add the resultant DataFrame to the storage list.
    position_dfs.append(tree)

lineages = pd.concat(position_dfs, axis=0)

With all of the lineage numbers assigned, we can now loop through each pairing and compute the quantities of interest.

In [16]:
# Keep only the cells that have a fluorescence measurement at birth or death.
has_fluo = (lineages['birth_fluo'] > 0) | (lineages['death_fluo'] > 0)
fluo_data = lineages.loc[has_fluo].copy()

# One group per lineage; tree numbers are only unique within a position.
grouped = fluo_data.groupby(['position', 'tree_number'])

# For every lineage with more than one member and non-zero totals, record the
# summed fluorescence at birth and the summed fluorescence at death.
mother_cell, summed_daughters = [], []
for _, tree_df in grouped:
    birth_total = tree_df['birth_fluo'].sum()
    death_total = tree_df['death_fluo'].sum()
    if len(tree_df) != 1 and birth_total > 0 and death_total > 0:
        mother_cell.append(birth_total)
        summed_daughters.append(death_total)
mother_cell = np.array(mother_cell)
summed_daughters = np.array(summed_daughters)

To see if things make sense, we can plot the two quantities against each other. If the fluorescence is conserved, we would expect a 
linear relationship.

In [17]:
# Compute an expected linear predictor: a range spanning the measured mother
# intensities, used to draw the identity line.
x = np.linspace(mother_cell.min() - 100, mother_cell.max() + 100, 300)

# Set up the figure axis.
p = mscl.bokeh_boiler(width=800, height=600,
                      x_axis_label='mother cell intensity (a.u.)',
                      y_axis_label='summed daughter cell intensity (a.u.)')

# Plot the data and the prediction (y = x) for conserved fluorescence.
p.circle(mother_cell, summed_daughters, color='slategray', alpha=0.5,
         legend='data')
p.line(x, x, color='dodgerblue', line_width=2, legend='conserved fluorescence')

# Show the plot.
bokeh.io.show(p)

##  Computing the calibration factor

As a reminder, we predict that the intensity of a single cell $I$ should be proportional to the number of fluorescent proteins per cell $N$ multiplied by some calibration factor $\alpha$,

$$
I = \alpha N.
$$

To estimate the value of $\alpha$, we can look at how the intensity fluctuates between any two daughter cells, revealing information about the partitioning of proteins during a division event. Once the mathematical dust settles, we find that relationship to be

$$
\langle (I_1 - I_2)^2 \rangle = \alpha I_\text{tot}
$$

where $I_1$ and $I_2$ are the intensities of the two daughter cells and $I_\text{tot}$ is the sum $I_1 + I_2$.  

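To get a better sense of the value, we can compute the summed intensity and the squared intensity difference for every pair of sister cells present on the final frame and plot the two quantities against each other.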

In [21]:
# Display the combined DataFrame of all positions again for reference.
df

Unnamed: 0,birth,death,divide,ID,motherID,sisterID,daughter_1_ID,daughter_2_ID,birth_fluo,death_fluo,position
0,14.0,16.0,0.0,117.0,0.0,0.0,,,0.0,0.0,4.0
1,11.0,25.0,1.0,103.0,21.0,104.0,220.0,221.0,0.0,117126.0,4.0
2,34.0,37.0,0.0,315.0,162.0,316.0,,,0.0,66113.0,4.0
3,33.0,37.0,0.0,301.0,165.0,302.0,,,58259.0,71744.0,4.0
4,34.0,37.0,0.0,329.0,143.0,330.0,,,0.0,65595.0,4.0
5,30.0,37.0,0.0,261.0,109.0,260.0,,,0.0,84260.0,4.0
6,31.0,37.0,0.0,275.0,161.0,274.0,,,0.0,67601.0,4.0
7,29.0,37.0,0.0,249.0,101.0,250.0,,,52451.0,70504.0,4.0
8,9.0,23.0,1.0,88.0,38.0,89.0,187.0,188.0,58777.0,0.0,4.0
9,1.0,16.0,1.0,63.0,0.0,0.0,140.0,141.0,65948.0,0.0,4.0


In [25]:
# Only look at cells who died on the final frame.
# NOTE(review): the final frame is hard-coded as 37 here while the lineage
# cells above use 26 -- confirm and share a single value.
final_position = df[df['death']==37]

# Cells sharing a position and a mother are sisters.
# NOTE(review): cells with motherID == 0 (no recorded mother) also end up in
# one group per position; if exactly two of them survive to the final frame
# they are treated as a sister pair -- confirm whether they should be dropped.
grouped = final_position.groupby(['position', 'motherID'])

# Collect one record per complete sister pair and build the DataFrame once,
# rather than growing it row by row.
pair_records = []
for _, d in grouped:
    if len(d) == 2:
        daughter_fluo = d['death_fluo'].values
        I_tot = daughter_fluo.sum()
        sq_diff = np.diff(daughter_fluo)[0]**2
        pair_records.append(dict(I_1=daughter_fluo[0], I_2=daughter_fluo[1],
                                 I_tot=I_tot, sq_diff=sq_diff))
int_df = pd.DataFrame(pair_records, columns=['I_1', 'I_2', 'I_tot', 'sq_diff'])

# Compute the log of the values.
log_I_tot = np.log10(int_df['I_tot'].values)
log_sq_diff = np.log10(int_df['sq_diff'].values)

In [26]:
# Display the per-pair intensities, their sum and their squared difference.
int_df

Unnamed: 0,I_1,I_2,I_tot,sq_diff
0,66641.0,84257.0,150898.0,3.103235e+08
1,91609.0,77584.0,169193.0,1.967006e+08
2,50135.0,45477.0,95612.0,2.169696e+07
3,70390.0,103437.0,173827.0,1.092104e+09
4,77111.0,93022.0,170133.0,2.531599e+08
5,56605.0,57266.0,113871.0,4.369210e+05
6,134422.0,121418.0,255840.0,1.691040e+08
7,87737.0,101889.0,189626.0,2.002791e+08
8,78326.0,87168.0,165494.0,7.818096e+07
9,123631.0,96803.0,220434.0,7.197416e+08


In [27]:
# Plot the results. 
p = mscl.bokeh_boiler(width=600, height=400,
                      x_axis_label='log\u2081\u2080 (I\u2081 + I\u2082)',
                      y_axis_label='log\u2081\u2080 (I\u2081 - I\u2082)\u00B2')
p.circle(x=log_I_tot, y=log_sq_diff, color='slategray', alpha=0.5)
bokeh.io.show(p)

In [30]:
# Estimate the calibration factor from the paired sister intensities. The call
# returns two values, unpacked as `cal` and `sd`.
# NOTE(review): presumably the estimate and its standard deviation -- confirm
# against the documentation of mscl.stats.estimate_calibration_factor.
cal, sd = mscl.stats.estimate_calibration_factor(int_df['I_1'], int_df['I_2'])

In [31]:
# Display the two values returned by the calibration factor estimate.
cal, sd

(3154.7632404212172, 133.43950932397783)