# This notebook prepares the raw data for plotting.
## First we load the data
## Then we process the data
## Finally we output the processed data as a .csv that can be uploaded to Github.

In [4]:
# Import libraries
import flika_JSON_IO as flikaIO
import numpy as np
import pandas as pd

from pathlib import Path

In [5]:
# Configure display
pd.set_option('display.max_colwidth', 75)

<h2> Step 1: Load the data </h2>

In [6]:
# Generate list of filepaths and filenames for tdTomato mNSPCs
tdT_mNSPC_directory = '/home/vivek/Documents/Python Programs/Piezo1_Datasets/Gabby_RIP_Talk_2021_Datasets/Analyzed_Tracks_tdt_mNSPC/AL_40_42_43_44_JSONs/'
tdT_mNSPC_path_object = Path(tdT_mNSPC_directory).glob("*.json")
tdT_mNSPC_JSON_filepaths = sorted([x for x in tdT_mNSPC_path_object if x.is_file()])
tdT_mNSPC_JSON_filenames = [x.stem for x in tdT_mNSPC_JSON_filepaths]
# Generate data frame of trajectories
tdT_mNSPC_trajs = flikaIO.json_to_pandas(tdT_mNSPC_JSON_filepaths, tdT_mNSPC_JSON_filenames, 200)
tdT_mNSPC_trajs.Frame = tdT_mNSPC_trajs.Frame.astype('int')

ValueError: need at least one array to concatenate

In [3]:
# Generate list of filepaths and filenames for halo tag NSC
ht_NSC_directory = '/home/vivek/Documents/Python Programs/Piezo1_Datasets/Gabby_RIP_Talk_2021_Datasets/Analyzed_Tracks_ht_NSC/'
ht_NSC_path_object = Path(ht_NSC_directory).glob("*.json")
ht_NSC_JSON_filepaths = sorted([x for x in ht_NSC_path_object if x.is_file()])
ht_NSC_JSON_filenames = [x.stem for x in ht_NSC_JSON_filepaths]
# Generate data frame of trajectories
ht_NSC_trajs = flikaIO.json_to_pandas(ht_NSC_JSON_filepaths, ht_NSC_JSON_filenames, 200)

<h2> Step 2: Process the data </h2>

In [74]:
# Length of trajectories
def traj_length(df):
    lengths = []
    exp_names = []
    trackIDs = []
    for eachID in df.ID.unique():
        length = len(df.loc[df['ID'] == eachID])
        lengths.append(length)
        exp_name = str(df.loc[df['ID'] == eachID]['Exp_Name'].iloc[0])
        exp_names.append(exp_name)
        trackIDs.append(eachID)
    df = pd.DataFrame({'Exp_Name':exp_names, 'ID':trackIDs, 'Length':lengths})
    return df

In [75]:
tdT_MNSPC_trajLengths = traj_length(tdT_mNSPC_trajs)
tdT_MNSPC_trajLengths = tdT_MNSPC_trajLengths.loc[tdT_MNSPC_trajLengths['Length'] <= 580]
tdT_MNSPC_trajLengths.insert(1, 'Tag', 'tdTomato')
tdT_MNSPC_trajLengths.insert(1, 'Cell_Type', 'mNSPC')

ht_NSC_trajLengths = traj_length(ht_NSC_trajs)
ht_NSC_trajLengths = ht_NSC_trajLengths.loc[ht_NSC_trajLengths['Length'] <= 1180]
ht_NSC_trajLengths.insert(1, 'Tag', 'Halo')
ht_NSC_trajLengths.insert(1, 'Cell_Type', 'NSC')

In [76]:
tdT_MNSPC_trajLengths

Unnamed: 0,Exp_Name,Cell_Type,Tag,ID,Length
2,AL_40_2020-02-27-TIRFM_Diff_tdt-mNSPCs_A_1,mNSPC,tdTomato,2,339
6,AL_40_2020-02-27-TIRFM_Diff_tdt-mNSPCs_A_1,mNSPC,tdTomato,6,334
9,AL_40_2020-02-27-TIRFM_Diff_tdt-mNSPCs_A_1,mNSPC,tdTomato,9,382
10,AL_40_2020-02-27-TIRFM_Diff_tdt-mNSPCs_A_1,mNSPC,tdTomato,10,214
12,AL_40_2020-02-27-TIRFM_Diff_tdt-mNSPCs_A_1,mNSPC,tdTomato,12,450
...,...,...,...,...,...
655,AL_44_2020-03-03-TIRFM_Diff_tdt-mNSPCs_C_15,mNSPC,tdTomato,655,338
656,AL_44_2020-03-03-TIRFM_Diff_tdt-mNSPCs_C_2,mNSPC,tdTomato,656,256
657,AL_44_2020-03-03-TIRFM_Diff_tdt-mNSPCs_C_2,mNSPC,tdTomato,657,332
660,AL_44_2020-03-03-TIRFM_Diff_tdt-mNSPCs_C_3,mNSPC,tdTomato,660,247


<h2> Step 3: Output the processed data </h2>

In [77]:
# Combine data into a single dataframe
combined_df = pd.concat([tdT_MNSPC_trajLengths, ht_NSC_trajLengths])

In [78]:
# Output the data
output_dir = '/home/vivek/Documents/Python Programs/Gabby-RIP-Talk-2021/'
output_filename = 'Gabby_RIP_Talk_2021_data_tdT_v_ht_trajLengths.csv'
combined_df.to_csv((Path(output_dir) / output_filename), sep=',', index=False)