Cyna Shirazinejad, 02/11/2022
# Notebook 1: loading data for model generation

outline:

* load all data, including:
    * movies from AP2-tagRFP-T, DNM2-tagGFP2 cell lines
* filter for 'valid' tracks
    * 'valid' tracks are tracks that appear and disappear 
      within the bounds of the movie with no more than 2 consecutive gaps
    * this is characterized when using AP2 as the primary channel for tracking
* create dataframes of features from tracked events, mapping fitted amplitude and position space to target feature space
    * each track will be decomposed into features, described in the notebook
    * the number of cell line tags will be included as a label (2 or 3)
    * the experiment number will be included as a label (1-8)
    * the date of the experiment will be included as a label
    * the cmeAnalysis classification as "DNM2-positive" (cmeAnalysisDNM2+) 
      or "DNM2-negative" will be included as a label (1 or 0)
* save dataframes and tracks for future notebooks

# user parameters to toggle plot-generation and/or dataframe construction and corresponding calculations

In [2]:
# Paths into the pooled working directory that holds all of the data.
# The folder that contains all data for this analysis is 'ap2dynm2arcp3_project'
# (this folder, containing all raw and tracking data, is available on GitHub).

# folder of merged tracked data; needs to be set for each user
# NOTE(review): this path is under '/Volumes/Google Drive' while the outputs path
# below is under '/Volumes/GoogleDrive' -- confirm both mount points are intended
unique_user_path_tracks_ada_wildtype = (
    '/Volumes/Google Drive/My Drive/Drubin Lab/ap2dynm2arcp3_project/'
    'ARPC3 manuscript/raw_data/airyscan/tracked_data_merged_for_analysis'
)

# folder of this notebook; its 'cmeAnalysisPostProcessingSimplified' subfolder is added to sys.path
unique_user_path_notebook = (
    '/Users/cynashirazinejad/Documents/GitHub/'
    'Jin_Shirazinejad_et_al_branched_actin_manuscript/analysis/simplified_workflow_airyscan'
)

# folder in which the 'plots' and 'dataframes' output subfolders are created
unique_user_saved_outputs = (
    '/Volumes/GoogleDrive/My Drive/Drubin Lab/ap2dynm2arcp3_project/'
    'stable_outputs_airyscan'
)

# import all necessary Python modules

In [3]:
%load_ext autoreload
%autoreload 2
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
sys.path.append(unique_user_path_notebook+'/cmeAnalysisPostProcessingSimplified') # add custom Python scripts to the local path
import import_tracks
import feature_extraction_modular

# auto directory adding for notebooks

In [4]:
unique_user_saved_outputs

'/Volumes/GoogleDrive/My Drive/Drubin Lab/ap2dynm2arcp3_project/stable_outputs_airyscan'

In [5]:
# make the 'plots' and 'dataframes' subfolders of the outputs folder
# whenever they are not already listed there
existing_entries = os.listdir(unique_user_saved_outputs)
for subfolder in ('plots', 'dataframes'):
    if subfolder not in existing_entries:
        os.mkdir(unique_user_saved_outputs + '/' + subfolder + '/')

In [6]:
# np.save(unique_user_path_notebook+'/unique_user_saved_outputs_simplified', unique_user_saved_outputs)

In [7]:
# unique_user_saved_outputs = str(np.load('unique_user_saved_outputs_simplified.npy'))

In [8]:
unique_user_saved_outputs

'/Volumes/GoogleDrive/My Drive/Drubin Lab/ap2dynm2arcp3_project/stable_outputs_airyscan'

In [9]:
# begin the shared metadata record with the outputs folder and write it to
# 'analysis_metadata.npy' (relative path, so it lands in the current working directory)
analysis_metadata = {'path_outputs': unique_user_saved_outputs}
np.save('analysis_metadata', analysis_metadata)

In [10]:
# read the saved metadata back; the dict is returned wrapped in an object-dtype
# array (hence allow_pickle=True), so its contents are reached through .item()
analysis_metadata = np.load('analysis_metadata.npy', allow_pickle=True)

In [11]:
analysis_metadata.dtype

dtype('O')

In [12]:
# analysis_metadata.item()['path_tracks_ad_wildtype'] = unique_user_path_tracks_ada_wildtype
# .item() gives the dict stored inside the array, so this assignment adds the key in place
analysis_metadata.item()['path_notebook'] = unique_user_path_notebook
# write the file again so that it includes the new key
np.save('analysis_metadata', analysis_metadata)

In [13]:
analysis_metadata

array({'path_outputs': '/Volumes/GoogleDrive/My Drive/Drubin Lab/ap2dynm2arcp3_project/stable_outputs_airyscan', 'path_notebook': '/Users/cynashirazinejad/Documents/GitHub/Jin_Shirazinejad_et_al_branched_actin_manuscript/analysis/simplified_workflow_airyscan'},
      dtype=object)

# all track feature options:

conventions:
1. intensities: fitted amplitude of fluorescence (excluding fitted local background)
2. positions: fitted positions (x,y) of two dimensional point-spread-functions per frame in track
3. voxel-width = 108 nm

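features (written here with AP2 as channel 0 and DNM2 as channel 1; the labels used in the code below put DNM2 in channel 0 and ARPC3 in channel 1):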

1. 'lifetime': time between the first and last frame of detected AP2 (seconds)
2. 'max_int_ap2': maximum intensity (a.u. fluorescence)
3. 'max_int_dnm2': maximum intensity (a.u. fluorescence)
4. 'dist_traveled_ap2': track start-to-finish net movement (pixels)
5. 'dist_traveled_dnm2': track start-to-finish net movement (pixels)
6. 'max_dist_between_ap2_dnm2': the maximum frame-to-frame separation between AP2 and DNM2 (pixels)
7. 'md_ap2': mean displacement (pixels)
8. 'md_dnm2': mean displacement (pixels)
9. 'time_to_peak_ap2': time for the intensity to reach its peak (seconds) [0 if peak is first frame]
10. 'time_to_peak_dnm2': time for the intensity to reach its peak (seconds) [0 if peak is first frame]
11. 'time_after_peak_ap2': time for intensity to decay from its peak (seconds) [0 if peak is last frame]
12. 'time_after_peak_dnm2': time for intensity to decay from its peak (seconds) [0 if peak is last frame]
13. 'time_between_peaks_ap2_dnm2': time between peaks of two channels (seconds)
14. 'avg_int_change_to_peak_ap2': average change in intensity to the peak (a.u. fluorescence) [0 if peak is first frame]
15. 'avg_int_change_to_peak_dnm2': average change in intensity to the peak (a.u. fluorescence) [0 if peak is first frame]
16. 'avg_int_change_after_peak_ap2': average change in intensity after the peak (a.u. fluorescence) [0 if peak is last frame]
17. 'avg_int_change_after_peak_dnm2': average change in intensity after the peak (a.u. fluorescence) [0 if peak is last frame]
18. 'peak_int_diff_ap2_dnm2': difference between maximum intensity of channel 0 and channel 1 (a.u. fluorescence)
19. 'ratio_max_int_ap2_dnm2': ratio between maximum intensity of channel 0 and channel 1 (unitless)
20. 'mean_ap2': average of fluorescence (a.u. fluorescence)
21. 'mean_dnm2': average of fluorescence (a.u. fluorescence)
22. 'variation_ap2': variation of fluorescence (a.u. fluorescence^2)
23. 'variation_dnm2': variation of fluorescence (a.u. fluorescence^2)
24. 'skewness_ap2': skewness of fluorescence (unitless)
25. 'skewness_dnm2': skewness of fluorescence (unitless)
26. 'kurtosis_ap2': kurtosis of fluorescence (unitless)
27. 'kurtosis_dnm2': kurtosis of fluorescence (unitless)
28. 'number_significant_dnm2': number of significant detections with p-val lower than provided threshold (counts) [p-val < 0.01]
29. 'max_consecutive_significant_dnm2': maximum number of consecutive significant detections with p-val lower than provided threshold (counts) [p-val < 0.01]
30. 'fraction_significant_dnm2': fraction of event with significant detections with p-val lower than provided threshold (unitless) [p-val < 0.01]
31. 'fraction_peak_ap2': fraction of the event where the peak is located (unitless)
32. 'fraction_peak_dnm2': fraction of the event where the peak is located (unitless)

In [14]:
# the physical units of each track feature
feature_units = ['seconds',
                 'a.u. fluorescence',
                 'a.u. fluorescence',
                 'pixels',
                 'pixels',
                 'pixels',
                 'pixels',
                 'pixels',
                 'seconds',
                 'seconds',
                 'seconds',
                 'seconds',
                 'seconds',
                 'a.u. fluorescence',
                 'a.u. fluorescence',
                 'a.u. fluorescence',
                 'a.u. fluorescence',
                 'a.u. fluorescence',
                 'unitless',
                 'a.u. fluorescence',
                 'a.u. fluorescence',
                 'a.u. fluorescence**2',
                 'a.u. fluorescence**2',
                 'unitless',
                 'unitless',
                 'unitless',
                 'unitless',
                 'counts',
                 'counts',
                 'unitless',
                 'unitless',
                 'unitless']

In [15]:
# labels for every track feature written with the protein names of this dataset;
# entries are index-aligned with possible_track_features, with 'dnm2' standing in
# for 'ch0' and 'arpc3' for 'ch1'
possible_track_features_labels = ['lifetime',
                                 'max_int_dnm2',
                                 'max_int_arpc3',
                                 'dist_traveled_dnm2',
                                 'dist_traveled_arpc3',
                                 'max_dist_between_dnm2_arpc3',
                                 'md_dnm2',
                                 'md_arpc3',
                                 'time_to_peak_dnm2',
                                 'time_to_peak_arpc3',
                                 'time_after_peak_dnm2',
                                 'time_after_peak_arpc3',
                                 'time_between_peaks_dnm2_arpc3',
                                 'avg_int_change_to_peak_dnm2',
                                 'avg_int_change_to_peak_arpc3',
                                 'avg_int_change_after_peak_dnm2',
                                 'avg_int_change_after_peak_arpc3',
                                 'peak_int_diff_dnm2_arpc3',
                                 'ratio_max_int_dnm2_arpc3',
                                 'mean_dnm2',
                                 'mean_arpc3',
                                 'variation_dnm2',
                                 'variation_arpc3',
                                 'skewness_dnm2',
                                 'skewness_arpc3',
                                 'kurtosis_dnm2',
                                 'kurtosis_arpc3',
                                 'number_significant_arpc3',
                                 'max_consecutive_significant_arpc3',
                                 'fraction_significant_arpc3',
                                 'fraction_peak_dnm2',
                                 'fraction_peak_arpc3']

In [16]:
# positions, within possible_track_features_labels, of a hand-picked subset of features
features_to_keep = [
    possible_track_features_labels.index(label)
    for label in ('lifetime',
                  'max_int_dnm2',
                  'max_int_arpc3',
                  'md_dnm2',
                  'fraction_significant_arpc3')
]

In [17]:
features_to_keep

[0, 1, 2, 6, 29]

In [18]:
# channel-indexed (ch0 / ch1) names for every track feature; entries are
# index-aligned with possible_track_features_labels and feature_units, and the
# list is later passed to import_tracks.upload_tracks_and_metadata
possible_track_features = ['lifetime',
                            'max_int_ch0',
                            'max_int_ch1',
                            'dist_traveled_ch0',
                            'dist_traveled_ch1',
                            'max_dist_between_ch0-ch1',
                            'md_ch0',
                            'md_ch1',
                            'time_to_peak_ch0',
                            'time_to_peak_ch1',
                            'time_after_peak_ch0',
                            'time_after_peak_ch1',
                            'time_between_peaks_ch0-ch1',
                            'avg_int_change_to_peak_ch0',
                            'avg_int_change_to_peak_ch1',
                            'avg_int_change_after_peak_ch0',
                            'avg_int_change_after_peak_ch1',
                            'peak_int_diff_ch0-ch1',
                            'ratio_max_int_ch0-ch1',
                            'mean_ch0',
                            'mean_ch1',
                            'variation_ch0',
                            'variation_ch1',
                            'skewness_ch0',
                            'skewness_ch1',
                            'kurtosis_ch0',
                            'kurtosis_ch1',
                            'number_significant_ch1',
                            'max_consecutive_significant_ch1',
                            'fraction_significant_ch1',
                            'fraction_peak_ch0',
                            'fraction_peak_ch1']

In [19]:
# attach the unit list and both feature-name lists to the dict held inside the
# metadata array (in memory only; this cell does not write the file)
analysis_metadata.item().update({
    'feature_units': feature_units,
    'possible_track_features': possible_track_features,
    'possible_track_features_labels': possible_track_features_labels,
})

In [20]:
# analysis_metadata.item()['feature_units'] = [feature_units[idx] for idx in features_to_keep]
# analysis_metadata.item()['possible_track_features'] = [possible_track_features[idx] for idx in features_to_keep]
# analysis_metadata.item()['possible_track_features_labels'] = [possible_track_features_labels[idx] for idx in features_to_keep]

# extract features from all tracks, labeled by experiment (0-7), number of imaging channels/labels, and date of experiment

In [21]:
# description of the single experiment group used in this notebook:
# where its tracks are read from, and the names given to its dataframe and tracks
ad_wildtype_group = {
    'path': unique_user_path_tracks_ada_wildtype,
    'df': 'df_ada_wildtype_merged_features',
    'tracks': 'merged_ada_wildtype_valid_tracks',
}

# all experiment groups, keyed by group name
experiment_groups = {'ada_wildtype': ad_wildtype_group}

In [22]:
# add the experiment-group descriptions to the dict held inside the metadata array
analysis_metadata.item()['experiment_groups'] = experiment_groups

In [23]:
# write the updated metadata array to 'analysis_metadata.npy' before the import step below
np.save('analysis_metadata', analysis_metadata)

In [24]:
# run the project's track import for the 'ada_wildtype' experiment group,
# handing it the metadata array and both feature-name lists
# NOTE(review): the meaning of the [1] and ['Cell'] arguments is set by
# import_tracks.upload_tracks_and_metadata and is not visible in this notebook;
# 'Cell' does occur in every folder name printed below -- confirm against that module
import_tracks.upload_tracks_and_metadata(analysis_metadata,
                                         'ada_wildtype',
                                         [1],
                                         ['Cell'],
                                         possible_track_features,
                                         possible_track_features_labels)


folders to mine:
61_20211208_ap2-dnm2-arpc3_dnm2-arpc3_wildtype-airyscan_no-treatment_Cell1_0.2s
62_20211208_ap2-dnm2-arpc3_dnm2-arpc3_wildtype-airyscan_no-treatment_Cell2_0.2s
63_20211208_ap2-dnm2-arpc3_dnm2-arpc3_wildtype-airyscan_no-treatment_Cell3_0.2s
64_20211214_ap2-dnm2-arpc3_dnm2-arpc3_wildtype-airyscan_no-treatment_Cell1_0.2s
65_20211214_ap2-dnm2-arpc3_dnm2-arpc3_wildtype-airyscan_no-treatment_Cell2_0.2s
66_20211214_ap2-dnm2-arpc3_dnm2-arpc3_wildtype-airyscan_no-treatment_Cell3_0.2s
67_20211214_ap2-dnm2-arpc3_dnm2-arpc3_wildtype-airyscan_no-treatment_Cell4_0.2s
68_20211214_ap2-dnm2-arpc3_dnm2-arpc3_wildtype-airyscan_no-treatment_Cell5_0.2s
69_20211222_ap2-dnm2-arpc3_dnm2-arpc3_wildtype-airyscan_no-treatment_Cell1_0.2s
70_20211222_ap2-dnm2-arpc3_dnm2-arpc3_wildtype-airyscan_no-treatment_Cell2_0.2s
71_20211222_ap2-dnm2-arpc3_dnm2-arpc3_wildtype-airyscan_no-treatment_Cell3_0.2s
72_20211222_ap2-dnm2-arpc3_dnm2-arpc3_wildtype-airyscan_no-treatment_Cell4_0.2s
73_20211222_ap2-dnm2-a

In [25]:
# read 'analysis_metadata.npy' from disk again after the import step
# (allow_pickle=True is required because the array holds a Python dict)
analysis_metadata = np.load('analysis_metadata.npy', allow_pickle=True)

In [26]:
analysis_metadata

array({'path_outputs': '/Volumes/GoogleDrive/My Drive/Drubin Lab/ap2dynm2arcp3_project/stable_outputs_airyscan', 'path_notebook': '/Users/cynashirazinejad/Documents/GitHub/Jin_Shirazinejad_et_al_branched_actin_manuscript/analysis/simplified_workflow_airyscan', 'feature_units': ['seconds', 'a.u. fluorescence', 'a.u. fluorescence', 'pixels', 'pixels', 'pixels', 'pixels', 'pixels', 'seconds', 'seconds', 'seconds', 'seconds', 'seconds', 'a.u. fluorescence', 'a.u. fluorescence', 'a.u. fluorescence', 'a.u. fluorescence', 'a.u. fluorescence', 'unitless', 'a.u. fluorescence', 'a.u. fluorescence', 'a.u. fluorescence**2', 'a.u. fluorescence**2', 'unitless', 'unitless', 'unitless', 'unitless', 'counts', 'counts', 'unitless', 'unitless', 'unitless'], 'possible_track_features': ['lifetime', 'max_int_ch0', 'max_int_ch1', 'dist_traveled_ch0', 'dist_traveled_ch1', 'max_dist_between_ch0-ch1', 'md_ch0', 'md_ch1', 'time_to_peak_ch0', 'time_to_peak_ch1', 'time_after_peak_ch0', 'time_after_peak_ch1', 'time