Cyna Shirazinejad, 12/28/21

# outline of notebook 18:

* load data from cell lines:
* * AP2-tagRFP-T, tagGFP2-DNM2, ARPC3-HaloTag (prior to shock)
* * AP2-tagRFP-T, tagGFP2-DNM2, ARPC3-HaloTag (after shock)
* extract features from tracks
* * use existing feature scaler, decomposition axes, and mixture model to predict the identity of each new event
* merge the new data with existing tracks, features, and model cluster identities

# import all necessary Python modules

In [66]:
%load_ext autoreload
%autoreload 2
import sys
import os
import pandas as pd
import numpy as np
import sklearn.preprocessing as preprocessing
from sklearn.decomposition import PCA
import pickle
from scipy.fft import rfft, rfftfreq
from scipy import signal
unique_user_path_notebook = str(np.load('unique_user_path_notebook.npy'))
unique_user_saved_outputs = str(np.load('unique_user_saved_outputs.npy'))
unique_user_path_tracks = str(np.load('unique_user_path_tracks.npy'))
sys.path.append(unique_user_path_notebook+'/cmeAnalysisPostProcessingPythonScripts') # add custom Python scripts to the local path
import display_tracks
import merge_tools
import return_track_attributes
import generate_index_dictionary
import feature_extraction_with_buffer

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# load dataframe from notebook 3 containing normal-pdf scaled features: PCs and GMM-predicted clusters, and dataframe with cmeAnalysis labels

In [7]:
# Load the outputs written by earlier notebooks; everything lives under
# <unique_user_saved_outputs>/dataframes.
# NOTE(review): the meaning of each array below is inferred from its file and
# variable name only — confirm against the notebook that saved it.
# The np.load calls return numpy arrays (possibly 0-d for scalar values such as
# number_of_track_splits), not plain Python scalars.
df_pcs_normal_scaled_with_gmm_cluster = pd.read_csv(unique_user_saved_outputs+'/dataframes/df_new_incorporated_data_pcs_gmm_clusters.zip')
df_merged_features = pd.read_csv(unique_user_saved_outputs+'/dataframes/df_new_incorporated_data_merged_features.zip')
feature_units = np.load(unique_user_saved_outputs+'/dataframes/feature_units.npy')
index_DNM2positive = np.load(unique_user_saved_outputs+'/dataframes/cluster_dnm2_positive.npy')
number_of_track_splits = np.load(unique_user_saved_outputs+'/dataframes/number_of_track_splits.npy')
number_of_clusters = np.load(unique_user_saved_outputs+"/dataframes/number_of_clusters.npy")
best_fit_peak_params = np.load(unique_user_saved_outputs+'/dataframes/parameters_best_fit_peak_finding.npy')
ccp_predictions = np.load(unique_user_saved_outputs+'/dataframes/merged_ccp_predictions.npy')

In [8]:
df_merged_features

Unnamed: 0,lifetime,max_int_ap2,max_int_dnm2,dist_traveled_ap2,dist_traveled_dnm2,max_dist_between_ap2_dnm2,md_ap2,md_dnm2,time_to_peak_ap2,time_to_peak_dnm2,...,number_significant_dnm2,max_consecutive_significant_dnm2,fraction_significant_dnm2,fraction_peak_ap2,fraction_peak_dnm2,experiment_number,number_of_channels,date,cell_condition,cmeAnalysis_dynamin2_prediction
0,216.0,1796.284550,740.516756,7.203812,7.203812,2.630656,0.566262,0.725913,159.0,186.0,...,167.0,43.0,0.738938,0.703540,0.823009,0.0,2.0,200804.0,no_treatment,1.0
1,201.0,2215.532695,1505.433273,17.220726,14.507135,3.797526,0.424574,0.912671,111.0,107.0,...,115.0,89.0,0.545024,0.526066,0.507109,0.0,2.0,200804.0,no_treatment,1.0
2,201.0,864.976087,421.405691,17.621866,17.130473,3.472332,0.566588,0.776404,42.0,44.0,...,167.0,70.0,0.791469,0.199052,0.208531,0.0,2.0,200804.0,no_treatment,1.0
3,192.0,509.795166,356.302521,10.804211,13.292842,5.888569,0.428736,1.058177,44.0,190.0,...,112.0,57.0,0.554455,0.217822,0.940594,0.0,2.0,200804.0,no_treatment,1.0
4,188.0,1636.422386,883.606436,11.255090,7.793074,5.531006,0.415792,0.753372,174.0,169.0,...,169.0,80.0,0.853535,0.878788,0.853535,0.0,2.0,200804.0,no_treatment,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154359,2.0,71.707995,35.951988,5.059682,5.014421,2.181787,0.766658,1.257316,5.0,4.0,...,0.0,0.0,0.000000,0.416667,0.333333,20.0,3.0,200722.0,no_treatment,0.0
154360,2.0,25.338391,38.053364,2.261592,2.067468,4.632292,1.427061,1.486410,6.0,2.0,...,7.0,3.0,0.583333,0.500000,0.166667,20.0,3.0,200722.0,no_treatment,1.0
154361,2.0,207.539790,166.846303,2.391855,2.634261,4.270496,1.536834,1.356576,5.0,0.0,...,5.0,3.0,0.416667,0.416667,0.000000,20.0,3.0,200722.0,no_treatment,1.0
154362,2.0,16.610995,36.530651,3.546351,2.761683,3.700703,1.630578,1.907946,5.0,9.0,...,2.0,1.0,0.166667,0.416667,0.750000,20.0,3.0,200722.0,no_treatment,0.0


# load all valid tracks

In [9]:
# Load every saved chunk of the previously validated tracks (chunks are numbered
# 0 .. number_of_track_splits-1) and join them with a single concatenation.
# Collecting the chunks first avoids re-copying the growing array on each iteration.
valid_track_chunks = [
    np.load(unique_user_saved_outputs+'/dataframes/all_experiments_merged_all_valid_tracks_'+str(chunk_index)+'.npy',
            allow_pickle=True)
    # chunk 0 is always loaded, matching the original cell even if the saved split count is < 1
    for chunk_index in range(max(int(number_of_track_splits), 1))
]
merged_all_valid_tracks = np.concatenate(valid_track_chunks)

# load new hypotonic-shock imaging data, create a dataframe of merged features

In [10]:
# upload only AP2 and DNM2 data for now
# NOTE(review): absolute, machine-specific path — consider moving it to a configurable location.
new_tracks_prefix = '/Volumes/Google Drive/My Drive/Drubin Lab/ap2dynm2arcp3_project/revision_tracking/hypotonic_tracking_data_ap2dnm2'
processed_tracks_suffix = '_1s/Ch1/Tracking/ProcessedTracks.mat'

# experiment set: prior to shock (cells 1-5, loaded in order)
(tracks_Cell1before_1s,
 tracks_Cell2before_1s,
 tracks_Cell3before_1s,
 tracks_Cell4before_1s,
 tracks_Cell5before_1s) = [display_tracks.load_tracks(new_tracks_prefix+'/Cell'+str(cell_number)+'before'+processed_tracks_suffix)
                           for cell_number in range(1, 6)]

# experiment set: following shock (cells 1-5, loaded in order)
(tracks_Cell1after_1s,
 tracks_Cell2after_1s,
 tracks_Cell3after_1s,
 tracks_Cell4after_1s,
 tracks_Cell5after_1s) = [display_tracks.load_tracks(new_tracks_prefix+'/Cell'+str(cell_number)+'after'+processed_tracks_suffix)
                          for cell_number in range(1, 6)]

# a list of all the track objects; each value is one experiment
# (the five pre-shock experiments first, then the five post-shock experiments)
all_tracks = [tracks_Cell1before_1s,
              tracks_Cell2before_1s,
              tracks_Cell3before_1s,
              tracks_Cell4before_1s,
              tracks_Cell5before_1s,
              tracks_Cell1after_1s,
              tracks_Cell2after_1s,
              tracks_Cell3after_1s,
              tracks_Cell4after_1s,
              tracks_Cell5after_1s]

In [73]:
# number of distinct experiment numbers already present in the loaded feature dataframe;
# used below as the starting experiment number for the newly added experiments
num_old_experiments = len(set(df_merged_features['experiment_number']))

In [79]:
def upload_tracks_and_metadata(path_to_tracks,
                               track_categories,
                               identifier_string,
                               experiment_number_adjustment,
                               features,
                               labels):
    """Load tracked experiments from a directory and build their labeled feature matrix.

    Every entry of ``path_to_tracks`` whose name contains ``identifier_string`` is
    treated as one experiment. Entry names are split on ``'_'`` and must provide at
    least seven fields, of which the following are used:

    * field 0: date
    * field 1: ``'-'``-separated tags of the cell line (their count is the
      "number of channels" label)
    * field 4: experimental condition

    Parameters
    ----------
    path_to_tracks : str
        Directory containing one entry per experiment.
    track_categories : list
        Passed as ``track_category`` to ``display_tracks.remove_tracks_by_criteria``.
    identifier_string : str
        Substring an entry name must contain to be loaded.
    experiment_number_adjustment : int
        Offset added to the 0-based experiment index, so new experiments can be
        numbered after already-analyzed ones.
    features : sequence of str
        Feature names handed to ``TrackFeatures.add_features``.
    labels :
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(merged_all_tracks, merged_features)``: the merged tracks returned by
        ``merge_tools.merge_experiments`` and a 2-D array whose columns are the
        extracted features followed by experiment number, number of tags, date,
        condition and the cmeAnalysis DNM2 significance flag.

    Raises
    ------
    ValueError
        If no entry matches ``identifier_string`` or an entry name has fewer than
        seven ``'_'``-separated fields.
    """
    # os.listdir returns entries in arbitrary order; sort so experiment numbers are reproducible
    all_track_paths = sorted(exp for exp in os.listdir(path_to_tracks) if identifier_string in exp)
    print(all_track_paths)

    if not all_track_paths:
        raise ValueError("no entries containing '" + str(identifier_string) + "' found in " + str(path_to_tracks))

    tracks = []  # one track set per experiment
    dates = []
    number_of_tags = []
    condition = []
    experiment_number = []

    for exp_number, exp in enumerate(all_track_paths):

        metadata = exp.split('_')
        if len(metadata) < 7:
            raise ValueError("experiment name '" + exp + "' does not have the 7 expected '_'-separated fields")

        # NOTE(review): the other cells of this notebook pass a .../ProcessedTracks.mat file to
        # load_tracks, whereas this passes the experiment entry itself — confirm what load_tracks expects.
        current_tracks = display_tracks.load_tracks(os.path.join(path_to_tracks, exp))
        current_tracks = display_tracks.remove_tracks_by_criteria(current_tracks, track_category=track_categories)
        # keep `tracks` a list of track sets: merge_experiments below indexes into each set
        tracks.append(current_tracks)

        num_tracks = len(current_tracks)

        # per-track labels, repeated once for every track of this experiment
        dates += [metadata[0]]*num_tracks
        number_of_tags += [len(metadata[1].split('-'))]*num_tracks  # count tags, not characters
        condition += [metadata[4]]*num_tracks
        experiment_number += [exp_number+experiment_number_adjustment]*num_tracks

    merged_all_tracks = merge_tools.merge_experiments(tracks,[list(range(len(track_set))) for track_set in tracks])

    # an index map for ProcessedTracks.mat attributes for 2 color tracking experiments from cmeAnalysis
    index_dictionary = generate_index_dictionary.return_index_dictionary()

    # extract the output of cmeAnalysis' predictions on whether a track is DNM2 positive or negative
    significant_dynamin2_cmeAnalysis_prediction = [track[index_dictionary['index_significantSlave']][1]
                                                   for track in merged_all_tracks]

    all_track_features = feature_extraction_with_buffer.TrackFeatures(merged_all_tracks) # an instance of a to-be feature matrix of tracks
    all_track_features.add_features(features) # set the features to be extracted
    all_track_features.extract_features() # extract all features

    # merge features with labels; each label list becomes one extra column, appended in this order
    merged_features = np.array(all_track_features.feature_matrix)
    for label_values in (experiment_number,
                         number_of_tags,
                         dates,
                         condition,
                         significant_dynamin2_cmeAnalysis_prediction):
        merged_features = np.concatenate((merged_features,
                                          np.array(label_values).reshape(merged_features.shape[0],-1)), axis=-1)

    return merged_all_tracks, merged_features

In [78]:
# The function takes six required arguments; the original call supplied only three.
# NOTE(review): hard-coded local test directory; the offset and feature list below are
# assumed to match what the manual cells in this notebook use — confirm before relying on the output.
upload_test_output = upload_tracks_and_metadata('/Users/cynashirazinejad/Desktop/test/',
                                                [1],
                                                'Cell',
                                                experiment_number_adjustment=num_old_experiments,
                                                features=np.load(unique_user_saved_outputs+'/dataframes/possible_track_features.npy'),
                                                labels=None)

['211203_ap2-dnm2-arpc3_ap2-dnm2_hypotonic_before_Cell1_1s', '211203_ap2-dnm2-arpc3_ap2-dnm2_hypotonic_after_Cell1_1s']


In [26]:
# extract tracks and metadata for hypotonic treatment data
valid_tracks_separate_experiments_3_color = [
    display_tracks.remove_tracks_by_criteria(track_set, track_category=[1])
    for track_set in all_tracks
]
# merge all valid tracks into one tracks array
merged_all_valid_tracks_3_color = merge_tools.merge_experiments(
    valid_tracks_separate_experiments_3_color,
    [list(range(len(track_set))) for track_set in valid_tracks_separate_experiments_3_color])

tracks_per_experiment = [len(track_set) for track_set in valid_tracks_separate_experiments_3_color]
total_new_tracks = sum(tracks_per_experiment)

# per-track experiment number (continuing after the previously loaded experiments)
# and per-track experimental treatment: the first 5 experiments are pre-shock, the rest post-shock
experiment_number_3_channel_label = []
cell_condition = []
for experiment_offset, n_tracks in enumerate(tracks_per_experiment):
    experiment_number_3_channel_label += [num_old_experiments + experiment_offset] * n_tracks
    if experiment_offset < 5:
        cell_condition += ['hypotonic_no_treatment'] * n_tracks
    else:
        cell_condition += ['hypotonic_treatment'] * n_tracks

# every new track gets the same imaging date and the same channel count
date_of_experiment_3_channel = [211203] * total_new_tracks
number_of_channels_label = [3] * total_new_tracks

The number of tracks returned: 7399

The number of tracks returned: 8653

The number of tracks returned: 7902

The number of tracks returned: 8067

The number of tracks returned: 7786

The number of tracks returned: 10522

The number of tracks returned: 8030

The number of tracks returned: 9899

The number of tracks returned: 9887

The number of tracks returned: 10165



# load new CK666-treatment imaging data, create a dataframe of merged features

In [75]:
# Recompute from the source values instead of using `+=`, so re-running this cell
# cannot double-count the hypotonic experiments.
num_old_experiments = (len(set(df_merged_features['experiment_number']))
                       + len(valid_tracks_separate_experiments_3_color))

In [76]:
num_old_experiments

31

In [None]:
# upload only AP2 and DNM2 data for now
all_tracks = [] # a list of all the track objects; each value is one experiment

new_tracks_prefix = '/Volumes/Google Drive/My Drive/Drubin Lab/ap2dynm2arcp3_project/revision_tracking/ck666_tracking_data_ap2dnm2'
# this cell is for the following experiment set: 211210, DMSO
# NOTE(review): the sub-directory names below were copied from the hypotonic cell
# ('CellNbefore_1s'); confirm they match the CK666 data layout.
tracks_211210DMSOCell1_1s=display_tracks.load_tracks(new_tracks_prefix+'/Cell1before_1s/Ch1/Tracking/ProcessedTracks.mat')
tracks_211210DMSOCell2_1s=display_tracks.load_tracks(new_tracks_prefix+'/Cell2before_1s/Ch1/Tracking/ProcessedTracks.mat')
# previously assigned to tracks_211210DMSOCell2_1s, which silently discarded the Cell2 tracks
tracks_211210DMSOCell3_1s=display_tracks.load_tracks(new_tracks_prefix+'/Cell3before_1s/Ch1/Tracking/ProcessedTracks.mat')

# append the DMSO tracks loaded above (the original appended the hypotonic
# pre-shock tracks left over from the earlier cell)
all_tracks.append(tracks_211210DMSOCell1_1s)
all_tracks.append(tracks_211210DMSOCell2_1s)
all_tracks.append(tracks_211210DMSOCell3_1s)

# NOTE(review): the rest of this cell is still the hypotonic "following shock" template:
# it reuses (and overwrites) the tracks_Cell*after_1s names from the hypotonic cell and
# assumes 'CellNafter_1s' directories exist under the CK666 prefix — confirm the intended
# treatment set, directory names and number of cells.
tracks_Cell1after_1s=display_tracks.load_tracks(new_tracks_prefix+'/Cell1after_1s/Ch1/Tracking/ProcessedTracks.mat')
tracks_Cell2after_1s=display_tracks.load_tracks(new_tracks_prefix+'/Cell2after_1s/Ch1/Tracking/ProcessedTracks.mat')
tracks_Cell3after_1s=display_tracks.load_tracks(new_tracks_prefix+'/Cell3after_1s/Ch1/Tracking/ProcessedTracks.mat')
tracks_Cell4after_1s=display_tracks.load_tracks(new_tracks_prefix+'/Cell4after_1s/Ch1/Tracking/ProcessedTracks.mat')
tracks_Cell5after_1s=display_tracks.load_tracks(new_tracks_prefix+'/Cell5after_1s/Ch1/Tracking/ProcessedTracks.mat')

all_tracks.append(tracks_Cell1after_1s)
all_tracks.append(tracks_Cell2after_1s)
all_tracks.append(tracks_Cell3after_1s)
all_tracks.append(tracks_Cell4after_1s)
all_tracks.append(tracks_Cell5after_1s)

# save all valid tracks

In [27]:
# write each experiment's valid tracks to its own .npy file, numbered by experiment position
for experiment_index, track_set in enumerate(valid_tracks_separate_experiments_3_color):

    save_path = unique_user_saved_outputs+"/dataframes/valid_arpc3_shock_tracks_"+str(experiment_index)
    np.save(save_path, np.array(list(track_set)))

In [28]:
# an index map for ProcessedTracks.mat attributes for 2 color tracking experiments from cmeAnalysis
index_dictionary = generate_index_dictionary.return_index_dictionary()
significant_slave_index = index_dictionary['index_significantSlave']

# cmeAnalysis' prediction of whether each track is DNM2 positive or negative:
# element 1 of every track's 'significantSlave' attribute, in track order
significant_dynamin2_cmeAnalysis_prediction = [track[significant_slave_index][1]
                                               for track in merged_all_valid_tracks_3_color]

In [29]:
print('total number of valid tracks: ' + str(len(merged_all_valid_tracks_3_color)))

total number of valid tracks: 88310


In [30]:
possible_track_features = np.load(unique_user_saved_outputs+'/dataframes/possible_track_features.npy')

In [31]:
possible_track_features

array(['lifetime', 'max_int_ch0', 'max_int_ch1', 'dist_traveled_ch0',
       'dist_traveled_ch1', 'max_dist_between_ch0_ch1', 'md_ch0',
       'md_ch1', 'time_to_peak_ch0', 'time_to_peak_ch1',
       'time_after_peak_ch0', 'time_after_peak_ch1',
       'time_between_peaks_ch0_ch1', 'avg_int_change_to_peak_ch0',
       'avg_int_change_to_peak_ch1', 'avg_int_change_after_peak_ch0',
       'avg_int_change_after_peak_ch1', 'peak_int_diff_ch0_ch1',
       'ratio_max_int_ch0_ch1', 'mean_ch0', 'mean_ch1', 'variation_ch0',
       'variation_ch1', 'skewness_ch0', 'skewness_ch1', 'kurtosis_ch0',
       'kurtosis_ch1', 'number_significant_ch1',
       'max_consecutive_significant_ch1', 'fraction_significant_ch1',
       'fraction_peak_ch0', 'fraction_peak_ch1'], dtype='<U31')

In [32]:
# Build the feature matrix for the new tracks with the project's TrackFeatures helper:
# construct it from the merged tracks, register the feature names loaded above, run the
# extraction, then read the result from `.feature_matrix`.
# NOTE(review): presumably one row per track and one column per requested feature — confirm
# against feature_extraction_with_buffer.
all_track_features_3_color = feature_extraction_with_buffer.TrackFeatures(merged_all_valid_tracks_3_color) # an instance of a to-be feature matrix of tracks
all_track_features_3_color.add_features(possible_track_features) # set the features to be extracted
all_track_features_3_color.extract_features() # extract all features
extracted_features_all_tracks_3_color = all_track_features_3_color.feature_matrix # feature matrix for all tracks

In [33]:
# merge features with labels: each per-track label list is appended as one extra column,
# in the order experiment number, number of channels, date, cell condition, cmeAnalysis prediction
extracted_features_all_tracks_3_color = np.array(extracted_features_all_tracks_3_color)

merged_features = extracted_features_all_tracks_3_color
for label_values in (experiment_number_3_channel_label,
                     number_of_channels_label,
                     date_of_experiment_3_channel,
                     cell_condition,
                     significant_dynamin2_cmeAnalysis_prediction):
    label_column = np.array(label_values).reshape(merged_features.shape[0],-1)
    merged_features = np.concatenate((merged_features, label_column), axis=-1)

In [34]:
merged_features.shape

(88310, 37)

# use prefit scaler, PCA model, and GMM to assign new dataset to clusters

In [35]:
# Unpickle the three previously fitted models from the saved-outputs directory.
# NOTE(review): pickle.load executes arbitrary code from the file — only load files you produced yourself.
prefit_models = []
for model_file_name in ('/dataframes/normal_scaler_model',
                        '/dataframes/pca_model_fit',
                        '/dataframes/gmm_trained'):
    with open(unique_user_saved_outputs+model_file_name, 'rb') as model_file:
        prefit_models.append(pickle.load(model_file))

scaler, pca_model, gmm_model = prefit_models

In [36]:
# merged_features became a string array when the text-valued cell_condition column was
# concatenated, so cast the leading feature columns back to float explicitly rather than
# relying on scikit-learn to convert (or reject) string input.
feature_values_new_data = merged_features[:,:len(feature_units)].astype(float)
scaled_features_new_data = scaler.transform(feature_values_new_data) # scale features to normal distribution, taking into account all previously scaled data
pcs_new_data = pca_model.transform(scaled_features_new_data) # find projections of newly scaled data on previous PC axes
gmm_predictions_new_data = gmm_model.predict(pcs_new_data) # find gmm cluster assignments using previously fit model

# run DNM2 positive events through smoothing and single-peak selection

In [37]:
# get DNM2 positive events: keep the tracks whose GMM cluster equals the DNM2-positive cluster index
dnm2_positive_track_indices = np.flatnonzero(gmm_predictions_new_data==index_DNM2positive)
all_new_tracks_array = np.array(list(merged_all_valid_tracks_3_color))
dnm2_positive_events = all_new_tracks_array[dnm2_positive_track_indices]

In [38]:
len(dnm2_positive_events)

5962

In [39]:
# stack all DNM2 intensities: one list of channel-1 amplitudes per DNM2-positive event
all_dnm2_signal = [
    list(return_track_attributes.return_track_amplitude_no_buffer_channel(dnm2_positive_events, event_index, 1))
    for event_index in range(len(dnm2_positive_events))
]

In [40]:
sos = signal.butter(4, 0.2, 'lp', fs=1, output='sos') # low-pass 4-th order Butterworth filter

# filtered DNM2 traces per track of interest; five zeros are added to the end of each
# trace to account for phase shift of near-track-end peaks
end_padding = [0, 0, 0, 0, 0]
filtered_amplitudes = [list(signal.sosfilt(sos, raw_trace)) + end_padding
                       for raw_trace in all_dnm2_signal]

# per-track flag: 1 when exactly one peak is found with the best-fit peak-finding parameters
# and at least one DNM2 detection p-value is below 0.01; 0 otherwise
current_param_outputs = []

for event_index, filtered_trace in enumerate(filtered_amplitudes): # iterate through all filtered amplitudes

    pvals_dnm2 = return_track_attributes.return_pvals_detection_no_buffer(dnm2_positive_events, event_index, 1)

    peak_positions = signal.find_peaks(filtered_trace,
                                       distance=best_fit_peak_params[0],
                                       height=best_fit_peak_params[1],
                                       width=best_fit_peak_params[2])[0]

    has_single_peak = len(peak_positions)==1
    has_significant_detection = np.count_nonzero(np.array(pvals_dnm2)<0.01)>0

    current_param_outputs.append(1 if (has_single_peak and has_significant_detection) else 0)

In [41]:
len(np.where(np.array(current_param_outputs)==1)[0])

1889

In [43]:
# stack the new principal components with their GMM cluster label as the last column
data_add_pc_gmm_dataframe = np.hstack((pcs_new_data, gmm_predictions_new_data.reshape(pcs_new_data.shape[0], 1)))
df_new_pcs_gmm_clusters = pd.DataFrame(data_add_pc_gmm_dataframe,
                                       columns=df_pcs_normal_scaled_with_gmm_cluster.columns)
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (rows of the new frame after the old ones, original index values kept)
df_new_incorporated_data_pcs_gmm_clusters = pd.concat([df_pcs_normal_scaled_with_gmm_cluster,
                                                       df_new_pcs_gmm_clusters])

In [44]:
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (new rows after the old ones, original index values kept)
df_new_merged_features = pd.DataFrame(merged_features, columns=df_merged_features.columns)
df_new_incorporated_data_merged_features = pd.concat([df_merged_features, df_new_merged_features])
# second name for the same dataframe, used by the cells below
df_new_incorporated_data_merged_features_including_shock = df_new_incorporated_data_merged_features

In [45]:
# save the dataframe
# archive_name is the name of the CSV *inside* the zip archive, so it is a bare file name;
# passing the full output path there would embed the local directory tree in the archive
compression_opts = dict(method='zip',
                        archive_name='df_new_incorporated_data_merged_features_including_shock.csv')

df_new_incorporated_data_merged_features.to_csv(unique_user_saved_outputs+'/dataframes/df_new_incorporated_data_merged_features_including_shock.zip', index=False,
                                                compression=compression_opts)

In [46]:
df_new_incorporated_data_merged_features_including_shock

Unnamed: 0,lifetime,max_int_ap2,max_int_dnm2,dist_traveled_ap2,dist_traveled_dnm2,max_dist_between_ap2_dnm2,md_ap2,md_dnm2,time_to_peak_ap2,time_to_peak_dnm2,...,number_significant_dnm2,max_consecutive_significant_dnm2,fraction_significant_dnm2,fraction_peak_ap2,fraction_peak_dnm2,experiment_number,number_of_channels,date,cell_condition,cmeAnalysis_dynamin2_prediction
0,216,1796.28,740.517,7.20381,7.20381,2.63066,0.566262,0.725913,159,186,...,167,43,0.738938,0.70354,0.823009,0,2,200804,no_treatment,1
1,201,2215.53,1505.43,17.2207,14.5071,3.79753,0.424574,0.912671,111,107,...,115,89,0.545024,0.526066,0.507109,0,2,200804,no_treatment,1
2,201,864.976,421.406,17.6219,17.1305,3.47233,0.566588,0.776404,42,44,...,167,70,0.791469,0.199052,0.208531,0,2,200804,no_treatment,1
3,192,509.795,356.303,10.8042,13.2928,5.88857,0.428736,1.05818,44,190,...,112,57,0.554455,0.217822,0.940594,0,2,200804,no_treatment,1
4,188,1636.42,883.606,11.2551,7.79307,5.53101,0.415792,0.753372,174,169,...,169,80,0.853535,0.878788,0.853535,0,2,200804,no_treatment,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88305,2.0,28.05485953173697,16.586817708708633,2.0835307029948082,5.014687853620793,4.483773037323796,1.4131950317371305,2.791557900089774,5.0,8.0,...,0.0,0.0,0.0,0.4166666666666667,0.6666666666666666,30.0,3.0,211203.0,hypotonic_treatment,0.0
88306,2.0,50.57941223248445,8.44103422317398,6.6831155386852386,2.285076746706151,4.231455668600671,1.775795978871691,0.67534818946296,5.0,1.0,...,0.0,0.0,0.0,0.4166666666666667,0.08333333333333333,30.0,3.0,211203.0,hypotonic_treatment,0.0
88307,2.0,46.81864135262841,-1.9837735485018282,1.533789621829017,2.4092783497472157,3.6604108109348874,1.3990229335486142,1.9535236484493963,6.0,4.0,...,0.0,0.0,0.0,0.5,0.3333333333333333,30.0,3.0,211203.0,hypotonic_treatment,0.0
88308,2.0,65.5500317076839,6.285178371689806,1.1085628321040195,0.4636611014094705,2.970810086174385,1.5732536239772907,0.09273472720836921,6.0,11.0,...,0.0,0.0,0.0,0.5,0.9166666666666666,30.0,3.0,211203.0,hypotonic_treatment,0.0


In [47]:
# save the PC / GMM-cluster dataframe
# archive_name is the name of the CSV *inside* the zip archive, so it is a bare file name;
# passing the full output path there would embed the local directory tree in the archive
compression_opts = dict(method='zip',
                        archive_name='df_new_incorporated_data_pcs_gmm_clusters_including_shock.csv')

df_new_incorporated_data_pcs_gmm_clusters.to_csv(unique_user_saved_outputs+'/dataframes/df_new_incorporated_data_pcs_gmm_clusters_including_shock.zip', index=False,
                                                 compression=compression_opts)

In [48]:
df_new_incorporated_data_pcs_gmm_clusters

Unnamed: 0,PC-0,PC-1,gmm_predictions
0,10.331699,-6.965018,0.0
1,10.538036,-6.055586,0.0
2,10.313709,-4.689303,0.0
3,9.514337,-6.369698,0.0
4,11.067962,-5.938112,0.0
...,...,...,...
88305,-6.771860,1.528631,3.0
88306,-7.208034,2.253944,3.0
88307,-7.299619,2.564839,3.0
88308,-7.938938,0.411854,3.0


In [49]:
# read the just-saved merged-features archive back from disk
# NOTE(review): df_merged_features_temp is not used again in the visible part of this notebook —
# confirm whether this is only a round-trip check.
df_merged_features_temp = pd.read_csv(unique_user_saved_outputs+'/dataframes/df_new_incorporated_data_merged_features_including_shock.zip')

# upload hotspot predictions from previous data, then merge and save with newly incorporated predictions

In [50]:
# previously saved predictions first, followed by the single-peak flags computed above for the new data
combined_ccp_predictions = [*ccp_predictions, *current_param_outputs]
merged_ccp_predictions = np.array(combined_ccp_predictions)
np.save(unique_user_saved_outputs+'/dataframes/merged_ccp_predictions_including_shock', merged_ccp_predictions)

In [51]:
len(merged_ccp_predictions)

19759

In [52]:
len(merged_ccp_predictions)

19759

# merge all previous valid tracks with new valid tracks

In [53]:
# previously saved valid tracks first, followed by the newly loaded valid tracks
all_merged_valid_tracks = np.concatenate((merged_all_valid_tracks, np.array(list(merged_all_valid_tracks_3_color))))

In [54]:
all_merged_valid_tracks.shape

(242674,)

In [55]:
# split the merged tracks into number_of_track_splits chunks so each can be saved as its own file
split_valid_tracks = np.array_split(np.array(list(all_merged_valid_tracks)),number_of_track_splits)

In [56]:
# save each track array chunk
for i in range(len(split_valid_tracks)):

    np.save(unique_user_saved_outputs+"/dataframes/all_experiments_merged_all_valid_tracks_including_shock"+str(i), split_valid_tracks[i])