In [1]:
# import necessary libraries
%reset
import os
import json
import copy
import datetime
import numpy as np
import pandas as pd

from scipy import interpolate
from thin_plate_spline_warp import thin_plate_spline_warp

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [2]:
# define some helper functions
def create_workspace_folders():
    """Create today's workspace folder for this run and return its path.

    The folder is ``workspace/<YYYY_MM_DD>-<name>``, where ``name`` is the
    module-level run label; it must be defined before this function is called.

    Returns:
        str: relative path of the run-specific workspace folder.
    """
    date = datetime.datetime.now()
    # os.path.join instead of a hard-coded "\": a literal backslash is an
    # ordinary filename character on POSIX (so the folder would not be nested),
    # and "\{" is an invalid escape sequence in a normal string literal.
    workspace = os.path.join(
        'workspace', "{}-{}".format(date.strftime("%Y_%m_%d"), name))

    # makedirs also creates the parent 'workspace' folder; exist_ok keeps
    # re-running the notebook on the same day from failing.
    os.makedirs(workspace, exist_ok=True)

    return workspace

def moving_average(a, window_size):
    """Smooth a 1-D series with a centered moving average that ignores zeros.

    A value of exactly 0 is treated as "no data": zero entries are passed
    through unchanged and are left out of every window mean.

    Args:
        a: 1-D array-like of numbers.
        window_size: nominal window width in samples. The window covers
            ``window_size // 2`` samples on each side of the current sample
            (clipped at the ends of the series), plus the sample itself.

    Returns:
        list: smoothed values, same length as ``a``.
    """
    a = np.asarray(a, dtype=float)
    n_samples = a.shape[0]
    # Integer division instead of round(): round() uses banker's rounding, so
    # the half-width would jump unevenly (1 -> 0, 3 -> 2, 5 -> 2).
    half_window = int(window_size) // 2

    new_a = []
    for idx, val in enumerate(a):
        if val == 0:
            new_a.append(0)
            continue

        start_idx = max(idx - half_window, 0)
        # Slices exclude their end, so +1 is needed to keep the right-hand
        # neighbours (and, at the last index, the sample itself) in the window.
        end_idx = min(idx + half_window + 1, n_samples)
        a_segment = a[start_idx:end_idx]

        a_seg_mean = np.nanmean(np.where(a_segment != 0, a_segment, np.nan))  # nonzero mean
        new_a.append(a_seg_mean)

    return new_a

def hampel_filter(input_series, window=5, n_sigmas=1):
    """Replace outliers in a 1-D series with the local median (Hampel filter).

    For each sample, the median and the scaled median absolute deviation (MAD)
    of the ``window`` samples on either side (plus the sample itself) are
    computed; a sample further than ``n_sigmas`` scaled MADs from the median is
    replaced by that median. The series is reflect-padded so the ends are
    filtered too.

    Args:
        input_series: array-like; flattened to 1-D before filtering.
        window: number of neighbours taken on each side of a sample.
        n_sigmas: detection threshold in units of the scaled MAD.

    Returns:
        tuple: ``(filtered_series, outliers_idxs, outliers_filtered)`` where
        ``filtered_series`` is a numpy array the same length as the input,
        ``outliers_idxs`` lists the positions that were replaced, and
        ``outliers_filtered`` lists the medians written at those positions.
    """
    # ensure the data is flattened
    input_series = np.array(input_series).flatten()

    n = len(input_series)
    if n == 0:
        # reflect-padding an empty array raises; there is nothing to filter
        return input_series, [], []
    k = 1.4826  # scale factor for Gaussian distribution

    outliers_idxs = []
    outliers_filtered = []
    extended_series = np.pad(input_series, (window, window), 'reflect')
    filtered_series = extended_series.copy()
    # Use the `window` argument throughout: the previous body read a global
    # `window_size`, which is undefined unless a later cell has already run.
    for i in range(window, window + n):
        # +1 because slices exclude their end: keeps the window symmetric
        neighbourhood = extended_series[(i - window):(i + window + 1)]
        x0 = np.median(neighbourhood)
        S0 = k * np.median(np.abs(neighbourhood - x0))
        if np.abs(extended_series[i] - x0) > n_sigmas * S0:
            filtered_series[i] = x0  # replaces outlier with median
            outliers_idxs.append(i - window)  # index in the un-padded series
            outliers_filtered.append(x0)  # the value that replaces outlier

    # drop the padding again
    filtered_series = filtered_series[window:window + n]

    return filtered_series, outliers_idxs, outliers_filtered

In [3]:
# Run label: create_workspace_folders() reads this module-level variable to name the workspace folder
name = "OD1599_NU+NerveRing"

In [4]:
# Make (or reuse) the dated workspace folder and report where it is
workspace_folderpath = create_workspace_folders()
workspace_message = 'Workspace folderpath: {}'.format(workspace_folderpath)
print(workspace_message)

Workspace folderpath: workspace\2020_08_16-OD1599_NU+NerveRing


In [5]:
# Read the pipeline configuration (data locations and settings) from config.json
with open('config.json') as config_file:
    config = json.load(config_file)

In [6]:
# STEP 1: Import all the necessary data
def parse_mipav_data(config, cell_key, strain_folderpath):
    """Load the per-volume MIPAV CSV exports of one position, grouped by cell.

    Every volume number from ``cell_key['start']`` to ``cell_key['end']``
    (inclusive) is looked up on disk. Volumes whose folder is missing or whose
    CSV files cannot be read are skipped. Problems found along the way are
    printed and collected in the returned ``errors`` list rather than raised.

    Args:
        config: parsed config.json. Reads ``settings.folderpaths`` (default
            folder/file layout; '#' in ``data_folderpath`` is replaced by the
            volume number) and ``settings.interpolation.seam_cells_on`` (its
            keys are the accepted seam cell names). Not modified.
        cell_key: dict with 'start', 'end', 'mapping' (annotation id -> cell
            name) and optionally 'folderpaths' (per-position layout overrides).
        strain_folderpath: root folder of the position.

    Returns:
        tuple: ``(seam_cell_output, annotation_output, errors)``. The first
        two map a cell name (seam cells upper-cased, annotations lower-cased)
        to ``{'timepoints': [...], 'coordinates': [[x, y, z], ...]}`` read from
        the straightened files; ``errors`` is a list of message strings.
    """
    coord_columns = ['x_voxels', 'y_voxels', 'z_voxels']

    # define output variables
    valid_seam_cells = set(config['settings']['interpolation']['seam_cells_on'].keys())
    seam_cell_output = {}
    annotation_output = {}  # cell_name -> {'timepoints': [...], 'coordinates': [...]}
    errors = []  # note potential errors in volumes

    def log_error(error_msg):
        # every problem is both recorded for the caller and echoed to the notebook
        errors.append(error_msg)
        print(error_msg)

    # Determine default or position-specific folder layout. Work on a copy:
    # writing overrides into config itself would leak one position's layout
    # into every position parsed afterwards.
    all_folderpaths = dict(config['settings']['folderpaths'])
    overrides = cell_key.get('folderpaths')
    if not isinstance(overrides, dict):
        overrides = {}
    for folder_data in all_folderpaths:
        if folder_data in overrides:
            all_folderpaths[folder_data] = overrides[folder_data]
            print('Using specific {} folder structure'.format(folder_data))

    # find the numbers of the folders
    start_vol = int(cell_key['start'])
    end_vol = int(cell_key['end'])
    for vol_num in range(start_vol, end_vol + 1):
        # define the specific volume number folder
        vol_folder = all_folderpaths['data_folderpath'].replace('#', str(vol_num))
        vol_folderpath = os.path.join(
            strain_folderpath, all_folderpaths['side'], vol_folder)

        # check if the folder exists
        if not os.path.isdir(vol_folderpath):
            continue

        # define individual, full filepaths ('side' and 'data_folderpath' are
        # folder components, every other entry is a file inside the volume)
        all_filepaths_folderpaths = {}
        for full_folderpaths_key, relative_path in all_folderpaths.items():
            if full_folderpaths_key in ('side', 'data_folderpath'):
                continue
            full_path = os.path.join(vol_folderpath, relative_path)
            all_filepaths_folderpaths[full_folderpaths_key] = full_path
            if not os.path.isfile(full_path):
                log_error("FILE DOES NOT EXIST: {}".format(full_path))

        ## get individual data by cell in pandas format
        try:
            # import twisted seam cells
            tw_seam_cells_fp = all_filepaths_folderpaths['twisted_seam_cells']
            tw_seam_cells = pd.read_csv(tw_seam_cells_fp)
            # import twisted annotations
            tw_annotations_fp = all_filepaths_folderpaths['twisted_annotations']
            tw_annotations = pd.read_csv(tw_annotations_fp)
            # import straightened seam cells
            st_seam_cells_fp = all_filepaths_folderpaths['straightened_seam_cells']
            st_seam_cells = pd.read_csv(st_seam_cells_fp)
            # import straightened annotations
            st_annotations_fp = all_filepaths_folderpaths['straightened_annotations']
            st_annotations = pd.read_csv(st_annotations_fp)
        except Exception:
            # missing files were logged above; an unreadable volume is skipped.
            # (Exception rather than a bare except so Ctrl-C still interrupts.)
            continue

        # ERROR CHECKING: Check for file content ----------------------
        if st_seam_cells.size == 0:  # check if empty
            log_error("DATA ERROR: Empty seam cell file at {}".format(st_seam_cells_fp))
        if st_annotations.size == 0:  # check if empty
            log_error("DATA ERROR: Empty annotation file at {}".format(st_annotations_fp))

        data_file_list = [  # use to check values of files
            (tw_seam_cells, tw_seam_cells_fp), (tw_annotations, tw_annotations_fp),
            (st_seam_cells, st_seam_cells_fp), (st_annotations, st_annotations_fp)
        ]
        for data, fp in data_file_list:
            coordinates = data[coord_columns]
            cell_ids = data['name'].values.tolist()
            # row_pos is positional so it always lines up with cell_ids
            for row_pos, (_, cell_coord) in enumerate(coordinates.iterrows()):
                try:
                    is_float_row = cell_coord.equals(cell_coord.astype(float))
                except (TypeError, ValueError):
                    # text that cannot be converted at all is also "non-float"
                    is_float_row = False
                if not is_float_row:
                    log_error("DATA ERROR: Non-float found for cell {} in {}".format(
                        cell_ids[row_pos], fp))

        seam_cells_mismatch = sorted(
            set(st_seam_cells['name'].values.tolist()) -
            set(tw_seam_cells['name'].values.tolist()), key=str)
        annotations_mismatch = sorted(
            set(st_annotations['name'].values.tolist()) -
            set(tw_annotations['name'].values.tolist()), key=str)
        if seam_cells_mismatch:
            log_error("DATA ERROR: Mis-match between seam cells ({}) found in {} and {}".format(
                ", ".join(map(str, seam_cells_mismatch)), st_seam_cells_fp, tw_seam_cells_fp))
        if annotations_mismatch:
            log_error("DATA ERROR: Mis-match between annotated cells ({}) found in {} and {}".format(
                ", ".join(map(str, annotations_mismatch)), st_annotations_fp, tw_annotations_fp))

        # check if extra seam cells exist/mismatch with settings
        extra_seam_cells = sorted(
            set(st_seam_cells['name'].values.tolist()) - valid_seam_cells, key=str)
        if extra_seam_cells:
            # drop them from the in-memory table only; the file is left untouched
            st_seam_cells = st_seam_cells[~st_seam_cells['name'].isin(extra_seam_cells)]
            log_error("DATA WARNING: Ignoring extra seam cells ({}) found in {}".format(
                ", ".join(map(str, extra_seam_cells)), st_seam_cells_fp))
        # END FILE ERROR CHECK -----------------------------------------------------------

        # SEAM CELLS: combine into a single data structure
        for _, seam_row in st_seam_cells.iterrows():
            seam_cell_name = seam_row['name'].upper()  # seam cell names are stored upper-case
            if seam_cell_name not in seam_cell_output:
                seam_cell_output[seam_cell_name] = {'timepoints': [], 'coordinates': []}

            # append to appropriate seam cell (first row wins for a repeated name)
            if vol_num not in seam_cell_output[seam_cell_name]['timepoints']:
                seam_cell_coords = seam_row[coord_columns].values.flatten().tolist()
                seam_cell_output[seam_cell_name]['coordinates'].append(seam_cell_coords)
                seam_cell_output[seam_cell_name]['timepoints'].append(vol_num)

        # ANNOTATIONS: combine into a single data structure and catch data errors
        try:
            for cell_id in cell_key['mapping'].keys():
                cell_name = cell_key['mapping'][cell_id].lower()
                if cell_name not in annotation_output:
                    annotation_output[cell_name] = {'timepoints': [], 'coordinates': []}

                # check if the cell id exists in this volume
                cell_row = st_annotations.loc[st_annotations['name'] == cell_id]
                if cell_row.empty:
                    continue

                # the same id annotated twice is ambiguous: skip this volume for the cell
                if cell_row.shape[0] > 1:
                    log_error("DATA ERROR: Ignoring identical cell IDs ({}) for cell {} in {}".format(
                        cell_id, cell_name, st_annotations_fp))
                    continue

                # if there is nothing wrong, then proceed
                if vol_num not in annotation_output[cell_name]['timepoints']:
                    cell_coords = cell_row[coord_columns].values.flatten().tolist()
                    annotation_output[cell_name]['coordinates'].append(cell_coords)
                    annotation_output[cell_name]['timepoints'].append(vol_num)
        except Exception:
            log_error("DATA ERROR: Failure to read file {}".format(st_annotations_fp))

    return seam_cell_output, annotation_output, errors

def convert_cell_key_csv2json(csv_filepath):
    """Convert a headerless CellKey.csv into the cell-key dictionary.

    Row layout read from the file (0-based):
        row 0: position name in column 0
        row 1: start volume (column 0) and end volume (column 1)
        row 2: outlier volume numbers (may be blank)
        row 3 onwards: annotation id (column 0) and cell name (column 1)

    Args:
        csv_filepath: path to the CellKey.csv file.

    Returns:
        dict: keys 'name' (str), 'start' (int), 'end' (int), 'outliers'
        (list of int) and 'mapping' (annotation id -> cell name, both str).
    """
    cell_key_json = {}
    # utf-8-sig strips a leading byte-order mark if the file has one;
    # otherwise the mark ends up glued to the front of the position name.
    cell_key = pd.read_csv(
        csv_filepath, header=None, engine='python', encoding='utf-8-sig')

    ## get necessary information
    cell_key_json['name'] = str(cell_key.iloc[0, 0])
    cell_key_json['start'] = int(cell_key.iloc[1, 0])
    cell_key_json['end'] = int(cell_key.iloc[1, 1])

    # Shorter rows are padded with NaN, so drop blanks/non-numbers cell by cell
    # instead of discarding the whole row when a single cell fails to convert.
    if cell_key.shape[0] > 2:
        outlier_row = pd.to_numeric(cell_key.iloc[2], errors='coerce').dropna()
        cell_key_json['outliers'] = outlier_row.astype(int).tolist()
    else:
        cell_key_json['outliers'] = []  # if there are none

    # grab mapping
    cell_key_json['mapping'] = {}
    for row_idx in range(3, cell_key.shape[0]):  # mapping starts row idx 3
        cell_id = cell_key.iloc[row_idx, 0]
        cell_name = cell_key.iloc[row_idx, 1]
        if pd.isna(cell_id) or pd.isna(cell_name):
            continue  # blank or half-filled row: would otherwise map 'nan'
        cell_key_json['mapping'][str(cell_id)] = str(cell_name)

    return cell_key_json
    
def get_cell_key(pos_folderpath):
    """Return the cell key of a position folder as a dict, or None if absent.

    ``cell_key.json`` is used when present. Otherwise ``CellKey.csv`` is
    converted, the result is written next to it as ``cell_key.json`` and
    returned. When neither file exists a message is printed and None returned.
    """
    json_path = os.path.join(pos_folderpath, 'cell_key.json')
    csv_path = os.path.join(pos_folderpath, 'CellKey.csv')

    # preferred source: an existing json cell key
    if os.path.isfile(json_path):
        with open(json_path) as json_file:
            return json.load(json_file)

    # nothing to fall back on
    if not os.path.isfile(csv_path):
        print('Missing cell key: {}'.format(json_path))
        return None

    # convert the csv once and cache it as json for the next run
    converted_key = convert_cell_key_csv2json(csv_path)
    with open(json_path, "w") as json_file:
        json_file.write(json.dumps(converted_key, sort_keys=True, indent=4))
    return converted_key
                
# Import every position of every strain flagged "include" in the config
strain_info = config['data']['strains']
compiled_data = {}
for strain in strain_info:
    if not strain['include']:
        continue

    strain_positions = compiled_data.setdefault(strain['name'], {})
    for pos_folderpath in strain['folderpaths']:
        print('CURRENTLY LOADING: {}'.format(pos_folderpath))

        # a position without a cell key cannot be parsed
        cell_key = get_cell_key(pos_folderpath)
        if not cell_key:
            continue

        seam_cells, annotations, strain_errors = parse_mipav_data(
            config, cell_key, pos_folderpath)

        # store everything for this position under its cell-key name
        pos_name = cell_key['name']
        strain_positions[pos_name] = {
            'cell_key': cell_key,
            'seam_cells': seam_cells,
            'annotations': annotations,
            'errors': strain_errors,
        }

# keep a snapshot of this step in the workspace
compiled_json_filepath = os.path.join(
    workspace_folderpath, '1_compiled_data.json')
compiled_json = json.dumps(compiled_data, sort_keys=True, indent=4)
with open(compiled_json_filepath, "w") as f:
    f.write(compiled_json)

CURRENTLY LOADING: Y:\RyanC\Cell Tracking Project\OD1599_NU\OD1599_MostRecent\120619_Pos2\Decon_reg
CURRENTLY LOADING: Y:\RyanC\Cell Tracking Project\OD1599_NU\OD1599_MostRecent\112719_Pos3\Decon_Reg
CURRENTLY LOADING: Y:\RyanC\Cell Tracking Project\OD1599_NU\OD1599_MostRecent\112619_Pos0\Decon_reg
CURRENTLY LOADING: Y:\RyanC\Cell Tracking Project\DCR6485_RPM1_NU\011419_Pos0\Decon_reg
CURRENTLY LOADING: Y:\RyanC\Cell Tracking Project\DCR6485_RPM1_NU\011419_Pos4\Decon_reg
CURRENTLY LOADING: Y:\RyanC\Cell Tracking Project\DCR6485_RPM1_NU\021020_Pos2\Decon_Reg
DATA ERROR: Ignoring identical cell IDs (A3) for cell nr_3 in Y:\RyanC\Cell Tracking Project\DCR6485_RPM1_NU\021020_Pos2\Decon_Reg\RegB\Decon_reg_8\Decon_reg_8_results\straightened_annotations\straightened_annotations.csv
DATA ERROR: Ignoring identical cell IDs (A7) for cell nr_7 in Y:\RyanC\Cell Tracking Project\DCR6485_RPM1_NU\021020_Pos2\Decon_Reg\RegB\Decon_reg_16\Decon_reg_16_results\straightened_annotations\straightened_annota

In [7]:
# STEP 2: Check for outliers, uses parameters defined in config.json
# Each coordinate axis of each cell is treated as its own time series and
# passed through hampel_filter, which replaces outliers with the local median.

window_size = config['settings']['outlier_removal']['window_size']
n_stdev = config['settings']['outlier_removal']['n_stdev']
compiled_data_no_outliers = copy.deepcopy(compiled_data)
for strain in compiled_data.keys():
    for pos in compiled_data[strain].keys():

        # filter both seam cells and annotations
        for cells_type in ['seam_cells', 'annotations']:
            for cell in compiled_data[strain][pos][cells_type].keys():
                # float dtype so a median is never truncated when written back
                coordinates = np.array(
                    compiled_data[strain][pos][cells_type][cell]['coordinates'],
                    dtype=float)

                # A mapped cell that was never observed has an empty coordinate
                # list, which cannot be indexed per axis; leave it untouched.
                if coordinates.size == 0:
                    continue

                # separate by x,y,z (index 0-2) and filter
                for dim_idx in range(3):
                    coord_dim_data = coordinates[:, dim_idx].copy()
                    data_filtered, _, _ = hampel_filter(
                        coord_dim_data, n_sigmas=n_stdev, window=window_size)
                    coordinates[:, dim_idx] = data_filtered

                # store the series with outliers replaced by the median
                compiled_data_no_outliers[strain][pos][cells_type][cell]['coordinates'] = coordinates.tolist()

print('Step 2 Completed.')
# save information as intermediate step in workspace
compiled_json = json.dumps(compiled_data_no_outliers, sort_keys=True, indent=4)
compiled_json_filepath = os.path.join(
    workspace_folderpath, '2_compiled_data_no_outliers.json')
with open(compiled_json_filepath, "w") as f:
    f.write(compiled_json)
print('Step 2 Data Logged.')

Step 2 Completed.
Step 2 Data Logged.


In [8]:
# STEP 3: Interpolate each to appropriate time scale
# Every cell's series is resampled onto a common axis of `total_len` points.
# A seam cell's series starts at the later of its configured switch-on
# fraction (seam_cells_on) and the fraction at which it is first observed;
# rows before that start stay [0, 0, 0].
compiled_data_interp = copy.deepcopy(compiled_data_no_outliers)
seam_cells_on = config['settings']['interpolation']['seam_cells_on']
total_len = config['settings']['interpolation']['total_min'] # in minutes
interp_method = config['settings']['interpolation']['method']
min_timepoints_required = config['settings']['interpolation']['min_timepoints_required']
new_timepoints = np.arange(total_len)  # common output time axis

for strain in compiled_data_no_outliers.keys():
    for pos in compiled_data_no_outliers[strain].keys():

        cell_key = compiled_data_no_outliers[strain][pos]['cell_key']
        # interpolate for both seam cells and annotations
        for cells_type in ['seam_cells', 'annotations']:
            for cell in compiled_data_no_outliers[strain][pos][cells_type].keys():
                timepoints = np.array(compiled_data_no_outliers[strain][pos][cells_type][cell]['timepoints'])
                coordinates = np.array(compiled_data_no_outliers[strain][pos][cells_type][cell]['coordinates'])

                # CHECKS if there are a sufficient number of timepoints. if not, log and skip.
                if timepoints.size < min_timepoints_required:
                    error_msg = "DATA ERROR: Insufficient timepoints in {}/{} for cell {} where only {} (volumes {}), less than the required {}, exist.".format(
                        strain, pos, cell, str(len(timepoints)),
                        ",".join(timepoints.astype(str).tolist()), str(min_timepoints_required))
                    compiled_data_interp[strain][pos]['errors'].append(error_msg)
                    print(error_msg)
                    continue

                # handle seam cells and annotations differently because
                # some seam cells appear after twitching begins
                if cells_type == 'seam_cells':
                    # fraction of the recording at which this cell is first observed
                    seam_cell_on_percent = (min(timepoints) - cell_key['start'])/(cell_key['end'] - cell_key['start'])
                    seam_cell_on_percent = max(seam_cells_on[cell], seam_cell_on_percent)
                    starting_idx = int(round(seam_cell_on_percent * total_len))

                    # volume number at which the cell turns on, then the
                    # observed timepoint closest to it
                    og_target = round((cell_key['end'] - cell_key['start']) * seam_cell_on_percent + cell_key['start'])
                    og_starting_idx = np.argmin(np.abs(timepoints - og_target)).astype(int)

                elif cells_type == 'annotations':
                    starting_idx = 0
                    og_starting_idx = 0

                # crop the original data to the part at/after the start
                cell_timepoints = timepoints[og_starting_idx:]
                if cell_timepoints.size:
                    time_span = cell_timepoints.max() - cell_timepoints.min()
                else:
                    time_span = 0

                # With a single timepoint left the rescaling below would divide
                # by zero; log it and skip, like the check above.
                if time_span == 0:
                    error_msg = "DATA ERROR: Insufficient timepoints in {}/{} for cell {} after cropping to its start ({} remain); interpolation skipped.".format(
                        strain, pos, cell, str(cell_timepoints.size))
                    compiled_data_interp[strain][pos]['errors'].append(error_msg)
                    print(error_msg)
                    continue

                # rescale the observed timepoints onto [0, total_len - starting_idx]
                cell_sp_rescaled = (cell_timepoints - cell_timepoints.min()) / time_span * (total_len - starting_idx)
                # new time scale for this cell (might be only part of the total)
                cell_sp_timepoints = np.arange(total_len - starting_idx)

                new_coordinates = np.zeros((total_len, 3))
                # interpolate one dimension at a time
                for dim_idx in range(3):
                    coord_dim_data = coordinates[og_starting_idx:, dim_idx].copy()
                    interp = interpolate.interp1d(cell_sp_rescaled, coord_dim_data,
                                     kind=interp_method) # interp as if from 0, shifted below
                    new_coordinates[starting_idx:, dim_idx] = interp(cell_sp_timepoints) # shifted

                # store the resampled series
                compiled_data_interp[strain][pos][cells_type][cell]['coordinates'] = new_coordinates.tolist()
                compiled_data_interp[strain][pos][cells_type][cell]['timepoints'] = new_timepoints.tolist()

# save information as intermediate step in workspace
print('Step 3 Completed.')
compiled_json = json.dumps(compiled_data_interp, sort_keys=True, indent=4)
compiled_json_filepath = os.path.join(
    workspace_folderpath, '3_compiled_data_interpolation.json')
with open(compiled_json_filepath, "w") as f:
    f.write(compiled_json)
print('Step 3 Data Logged.')

Step 3 Completed.
Step 3 Data Logged.


In [9]:
# STEP 4: Generate seam cell warping model
# The model is the per-timepoint average (then smoothed) position of each seam
# cell over the reference positions listed under config['data']['seam_cells'].
exclude_seam_cells = ['QL', 'QR']
seam_cell_strains = config['data']['seam_cells']
smoothing_window = config['settings']['smoothing']['window_size']

# collect the position name of every reference folder that has a cell_key.json
warp_strain_names = []
for seam_strain_folder in seam_cell_strains:
    cell_key_filepath_json = os.path.join(seam_strain_folder, 'cell_key.json')
    if not os.path.isfile(cell_key_filepath_json):
        continue
    with open(cell_key_filepath_json) as f:
        strain_cell_key = json.load(f)
    warp_strain_names.append(strain_cell_key['name'])

# NOTE(review): assigned but not read anywhere in this notebook
seam_warp_model_folderpath = config['data']['seam_cells']
print("Using the following strains for warping: {}".format(", ".join(warp_strain_names)))

# gather, per seam cell, one coordinate series from each reference position
warp_model_by_cell = {}
for strain, strain_positions in compiled_data_interp.items():
    for pos, pos_data in strain_positions.items():
        if pos not in warp_strain_names:
            continue

        for cell, cell_data in pos_data['seam_cells'].items():
            if cell in exclude_seam_cells:
                continue

            seam_cell_coordindates = np.array(cell_data['coordinates'])
            warp_model_by_cell.setdefault(cell, []).append(
                seam_cell_coordindates.tolist())

# average over positions, then smooth each axis over time
for cell in list(warp_model_by_cell):
    coordinates = np.average(warp_model_by_cell[cell], axis=0) # full coordinates
    for dim_idx in range(3):
        coordinates[:, dim_idx] = moving_average(
            coordinates[:, dim_idx].copy(), smoothing_window)
    warp_model_by_cell[cell] = coordinates.tolist()

# regroup from "per cell, all timepoints" to "per timepoint, all cells"
total_len = config['settings']['interpolation']['total_min'] # in minutes
new_timepoints = np.arange(0, total_len)
seam_cells_on = config['settings']['interpolation']['seam_cells_on']
sorted_seam_cells = sorted(warp_model_by_cell.keys()) # fixed cell order
warp_model_by_timepoint = [
    [np.array(warp_model_by_cell[seam_cell])[timepoint, :].tolist()
     for seam_cell in sorted_seam_cells]
    for timepoint in new_timepoints.astype(int)
]

print('Step 4 Completed.')
# the model pairs the ordered cell names with the per-timepoint coordinates
warping_model = {
    'seam_cells': sorted_seam_cells,
    'coordinates': warp_model_by_timepoint,
}
output_json = warping_model

# save information as intermediate step in workspace
compiled_json_filepath = os.path.join(
    workspace_folderpath, '4_seam_cell_warping_model.json')
compiled_json = json.dumps(output_json, sort_keys=True, indent=4)
with open(compiled_json_filepath, "w") as f:
    f.write(compiled_json)
print('Step 4 Data Logged.')

Using the following strains for warping: OD1599_NU_1206_Pos2, OD1599_NU_1127_Pos3, OD1599_NU_1126_Pos0
Step 4 Completed.
Step 4 Data Logged.


In [10]:
# STEP 5: Warp strains to warping model
# At every timepoint the position's own seam cells (warp_from) and the model's
# seam cells (warp_to) serve as control points for thin_plate_spline_warp,
# which is then applied to all seam cells and annotations of the position.
timepoints = np.arange(0, total_len).astype(int).tolist()
compiled_data_warped = copy.deepcopy(compiled_data_interp)
warp_to_seam_cell_names = warping_model['seam_cells']
warp_to_seam_cell_coords = warping_model['coordinates']
for strain in compiled_data_interp.keys():
    for pos in compiled_data_interp[strain].keys():
        print('Warping {}, Position {}'.format(strain, pos))
        pos_seam_cells = compiled_data_interp[strain][pos]['seam_cells']

        # Model seam cells this position also has. The same index list selects
        # both warp_from and warp_to so the two stay paired one-to-one; before,
        # a cell missing from the position shortened warp_from only.
        # NOTE(review): assumes thin_plate_spline_warp pairs control points by
        # list position - confirm against thin_plate_spline_warp.py.
        shared_seam_idxs = [
            model_idx for model_idx, seam_name in enumerate(warp_to_seam_cell_names)
            if seam_name in pos_seam_cells]

        for timepoint in timepoints:
            # get warp from/to model at timepoint (the seam cells)
            warp_to = [warp_to_seam_cell_coords[timepoint][model_idx]
                       for model_idx in shared_seam_idxs]
            warp_from = [
                pos_seam_cells[warp_to_seam_cell_names[model_idx]]['coordinates'][timepoint]
                for model_idx in shared_seam_idxs]

            # warp each cell at timepoint, including the seam cells themselves
            for cell_type in ['seam_cells', 'annotations']:
                cells_of_type = compiled_data_interp[strain][pos][cell_type]
                sorted_cell_names = list(cells_of_type.keys())
                if not sorted_cell_names:
                    continue  # nothing of this type to warp for this position

                # flatten each cell's coordinates at this timepoint to [x, y, z]
                ordered_coord_list = [
                    np.array([cells_of_type[cell_name]['coordinates'][timepoint]]).flatten().tolist()
                    for cell_name in sorted_cell_names]

                # warping
                new_coords = thin_plate_spline_warp(warp_from, warp_to, ordered_coord_list)
                new_coords = new_coords.tolist()

                # assigning
                for cell_idx, cell_name in enumerate(sorted_cell_names):
                    old_coords = cells_of_type[cell_name]['coordinates'][timepoint]

                    # [0, 0, 0] marks "no data at this timepoint": keep it that way
                    if old_coords == [0, 0, 0]:
                        assign_coords = old_coords
                    else:
                        assign_coords = new_coords[cell_idx]
                    compiled_data_warped[strain][pos][cell_type][cell_name]['coordinates'][timepoint] = \
                        assign_coords

print('Step 5 Completed.')
# save information as intermediate step in workspace
output_json = compiled_data_warped
compiled_json = json.dumps(output_json, sort_keys=True, indent=4)
compiled_json_filepath = os.path.join(
    workspace_folderpath, '5_compiled_data_warped.json')
with open(compiled_json_filepath, "w") as f:
    f.write(compiled_json)
print('Step 5 Data Logged.')

Warping OD1599_NU, Position OD1599_NU_1206_Pos2
Warping OD1599_NU, Position OD1599_NU_1127_Pos3
Warping OD1599_NU, Position OD1599_NU_1126_Pos0
Warping DCR6485_RPM1_NU, Position ï»¿DCR6485NU_011419_Pos0
Warping DCR6485_RPM1_NU, Position ï»¿DCR6485NU_011419_Pos4
Warping DCR6485_RPM1_NU, Position ï»¿DCR6485NU_021020_Pos2
Step 5 Completed.
Step 5 Data Logged.


In [11]:
# STEP 6: Reformat data to average points
# Regroup the warped data by cell (across all strains and positions) and
# average the coordinate series of each cell element-wise.
step_data = copy.deepcopy(compiled_data_warped)
data_by_cell = {'seam_cells':{}, 'annotations':{}}
for strain in step_data.keys():
    for pos in step_data[strain].keys():
        for cell_type in ['seam_cells', 'annotations']:
            for cell in step_data[strain][pos][cell_type].keys():
                cell_coordindates = np.array(
                    step_data[strain][pos][cell_type][cell]['coordinates'])
                # setdefault on the per-type dict: the old membership test
                # looked at data_by_cell's top-level keys, so the list was
                # recreated for every position and only the last one survived.
                data_by_cell[cell_type].setdefault(cell, []).append(
                    cell_coordindates.tolist())

# average all coordinates by cell
# NOTE(review): [0, 0, 0] "no data" rows take part in this mean like any other
# value - confirm that is intended.
for cell_type in ['seam_cells', 'annotations']:
    for cell in data_by_cell[cell_type].keys():
        # one (timepoints x 3) series per position -> mean over positions
        data_by_cell[cell_type][cell] = np.average(data_by_cell[cell_type][cell], axis=0).tolist()

print('Step 6 Completed.')
# save information as intermediate step in workspace
output_json = data_by_cell
compiled_json = json.dumps(output_json, sort_keys=True, indent=4)
compiled_json_filepath = os.path.join(
    workspace_folderpath, '6_cell_coordinates_by_timepoint.json')
with open(compiled_json_filepath, "w") as f:
    f.write(compiled_json)
print('Step 6 Data Logged.')

Step 6 Completed.
Step 6 Data Logged.


In [12]:
# STEP 7: Perform spatial moving average for each cell
# Each axis of each averaged cell trajectory is smoothed over time with
# moving_average, using the window size from the config.
spatial_averaged = copy.deepcopy(data_by_cell)
smoothing_window = config['settings']['smoothing']['window_size']

for cell_type in ['seam_cells', 'annotations']:
    cells_of_type = spatial_averaged[cell_type]
    for cell in cells_of_type.keys():
        coordinates = np.array(cells_of_type[cell])

        # smooth x, y and z independently, then write them back in place
        smoothed_axes = [
            moving_average(coordinates[:, dim_idx].copy(), smoothing_window)
            for dim_idx in range(3)
        ]
        for dim_idx, data_smoothed in enumerate(smoothed_axes):
            coordinates[:, dim_idx] = data_smoothed

        cells_of_type[cell] = coordinates.tolist()

print('Step 7 Completed.')
# save information as intermediate step in workspace
output_json = spatial_averaged
compiled_json_filepath = os.path.join(
    workspace_folderpath, '7_cell_coordinates_by_timepoint_smoothed.json')
compiled_json = json.dumps(output_json, sort_keys=True, indent=4)
with open(compiled_json_filepath, "w") as f:
    f.write(compiled_json)
print('Step 7 Data Logged.')

Step 7 Completed.
Step 7 Data Logged.


In [29]:
# STEP 8: Convert to csv in MIPAV format
# One headerless csv per cell with the columns: timepoint, x, y, z, R, G, B, A.

# rebuild the time axis here so this cell does not rely on an earlier one
total_len = config['settings']['interpolation']['total_min'] # in minutes
timepoints = np.arange(0, total_len)

# all files go into <workspace>/output
output_folder = os.path.join(workspace_folderpath, 'output')
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

for cell_type in ['seam_cells', 'annotations']:
    cells = spatial_averaged[cell_type]

    for cell_name, coordinates in cells.items():
        filepath_csv = os.path.join(output_folder, cell_name + '.csv')

        # 0.1625 rescales the voxel coordinates
        # (presumably the voxel size in physical units - TODO confirm)
        new_table = pd.DataFrame(coordinates) * 0.1625
        new_table.columns = ['x', 'y', 'z']
        new_table.insert(0, "timepoints", timepoints, True)

        # placeholder colour: R, G, B all 255 and A all 0
        for column_position, channel in enumerate(("R", "G", "B"), start=4):
            new_table.insert(column_position, channel, np.ones((total_len,)) * 255, True)
        new_table.insert(7, "A", np.zeros((total_len,)), True)

        new_table.to_csv(filepath_csv, index=False, header=False)

full_output_folderpath = os.path.join(os.getcwd(), output_folder)
print('Step 8 Completed.')
print('Output location: {}'.format(full_output_folderpath))

Step 8 Completed.
Output location: Y:\RyanC\model_building_code\workspace\2020_08_16-OD1599_NU+NerveRing\output
