# This notebook prepares the raw data for plotting.
## First we load the data
## Then we process the data
## Finally we output the processed data as .csv files that can be uploaded to GitHub.

In [1]:
# Import libraries
import flika_JSON_IO as flikaIO
import numpy as np
import pandas as pd

from pathlib import Path

In [2]:
# Display configuration: limit the rendered width of text columns to 75 characters
pd.options.display.max_colwidth = 75

<h2> Step 1: Load the data </h2>

In [3]:
# Collect the analyzed tdTomato mNSPC JSON files (sorted) together with their extension-less names
tdT_mNSPC_directory = '/home/vivek/Documents/Python Programs/Piezo1_Datasets/Gabby_RIP_Talk_2021_Datasets/Analyzed_tdT_mNSPC/'
tdT_mNSPC_path_object = Path(tdT_mNSPC_directory).glob("*.json")
tdT_mNSPC_JSON_filepaths = sorted(filter(Path.is_file, tdT_mNSPC_path_object))
tdT_mNSPC_JSON_filenames = [json_path.stem for json_path in tdT_mNSPC_JSON_filepaths]
# Load all files into a single data frame of trajectories
tdT_mNSPC_trajs = flikaIO.json_to_pandas(tdT_mNSPC_JSON_filepaths, tdT_mNSPC_JSON_filenames, 200)
# NOTE(review): Frame is cast to int for the tdTomato datasets only, not the halo tag ones - confirm this is intended
tdT_mNSPC_trajs['Frame'] = tdT_mNSPC_trajs['Frame'].astype('int')

In [4]:
# Collect the analyzed halo tag NSC JSON files (sorted) together with their extension-less names
ht_NSC_directory = '/home/vivek/Documents/Python Programs/Piezo1_Datasets/Gabby_RIP_Talk_2021_Datasets/Analyzed_ht_NSC/'
ht_NSC_path_object = Path(ht_NSC_directory).glob("*.json")
ht_NSC_JSON_filepaths = sorted(filter(Path.is_file, ht_NSC_path_object))
ht_NSC_JSON_filenames = [json_path.stem for json_path in ht_NSC_JSON_filepaths]
# Load all files into a single data frame of trajectories
ht_NSC_trajs = flikaIO.json_to_pandas(ht_NSC_JSON_filepaths, ht_NSC_JSON_filenames, 200)

In [5]:
# Collect the analyzed tdTomato Kera JSON files (sorted) together with their extension-less names
tdT_Kera_directory = '/home/vivek/Documents/Python Programs/Piezo1_Datasets/Gabby_RIP_Talk_2021_Datasets/Analyzed_tdT_Kera/'
tdT_Kera_path_object = Path(tdT_Kera_directory).glob("*.json")
tdT_Kera_JSON_filepaths = sorted(filter(Path.is_file, tdT_Kera_path_object))
tdT_Kera_JSON_filenames = [json_path.stem for json_path in tdT_Kera_JSON_filepaths]
# Load all files into a single data frame of trajectories
tdT_Kera_trajs = flikaIO.json_to_pandas(tdT_Kera_JSON_filepaths, tdT_Kera_JSON_filenames, 200)
# Store the Frame column as integers
tdT_Kera_trajs['Frame'] = tdT_Kera_trajs['Frame'].astype('int')

In [6]:
# Collect the analyzed halo tag Kera JSON files (sorted) together with their extension-less names
ht_Kera_directory = '/home/vivek/Documents/Python Programs/Piezo1_Datasets/Gabby_RIP_Talk_2021_Datasets/Analyzed_ht_Kera/'
ht_Kera_path_object = Path(ht_Kera_directory).glob("*.json")
ht_Kera_JSON_filepaths = sorted(filter(Path.is_file, ht_Kera_path_object))
ht_Kera_JSON_filenames = [json_path.stem for json_path in ht_Kera_JSON_filepaths]
# Load all files into a single data frame of trajectories
ht_Kera_trajs = flikaIO.json_to_pandas(ht_Kera_JSON_filepaths, ht_Kera_JSON_filenames, 200)

<h2> Step 2: Process the data </h2>

In [11]:
# Length of trajectories
def traj_length(df):
    """Summarise every trajectory in ``df`` by its number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``ID`` column (trajectory identifier) and an
        ``Exp_Name`` column. Any other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per unique ``ID``, in order of first appearance, with columns:

        * ``Exp_Name`` - ``str()`` of the ``Exp_Name`` value on the first row
          of that trajectory
        * ``ID`` - the trajectory identifier
        * ``Length`` - number of rows in ``df`` carrying that ``ID``

    Notes
    -----
    Rows whose ``ID`` is missing are skipped. The frame is scanned a fixed
    number of times regardless of how many trajectories it holds, rather than
    being re-filtered once per ``ID``.
    """
    # A missing ID can never match an equality filter, so such rows cannot
    # belong to any trajectory; drop them up front.
    tracked = df.dropna(subset=['ID'])
    # Row count per ID, computed in one pass.
    rows_per_id = tracked['ID'].value_counts()
    # First row of each trajectory; this also fixes the output order to the
    # order in which IDs first appear.
    first_rows = tracked.drop_duplicates(subset='ID', keep='first')
    return pd.DataFrame({
        'Exp_Name': [str(name) for name in first_rows['Exp_Name']],
        'ID': first_rows['ID'].to_numpy(),
        'Length': first_rows['ID'].map(rows_per_id).to_numpy(),
    })

In [12]:
# Per-trajectory lengths for tdTomato mNSPCs: keep lengths of at most 580 rows,
# then label the group; final column order is Exp_Name, Cell_Type, Tag, ID, Length
tdT_mNSPC_trajLengths = (
    traj_length(tdT_mNSPC_trajs)
    .loc[lambda lengths: lengths['Length'] <= 580]
    .assign(Cell_Type='mNSPC', Tag='tdTomato')
    [['Exp_Name', 'Cell_Type', 'Tag', 'ID', 'Length']]
)

In [13]:
# Per-trajectory lengths for halo tag NSCs: keep lengths of at most 1180 rows,
# then label the group; final column order is Exp_Name, Cell_Type, Tag, ID, Length
ht_NSC_trajLengths = (
    traj_length(ht_NSC_trajs)
    .loc[lambda lengths: lengths['Length'] <= 1180]
    .assign(Cell_Type='NSC', Tag='Halo')
    [['Exp_Name', 'Cell_Type', 'Tag', 'ID', 'Length']]
)

In [14]:
# Per-trajectory lengths for tdTomato Kera cells: keep lengths of at most 580 rows,
# then label the group; final column order is Exp_Name, Cell_Type, Tag, ID, Length
# NOTE(review): this group is labelled 'mKera' while the halo tag Kera group uses 'Kera' - confirm the difference is intended
tdT_Kera_trajLengths = (
    traj_length(tdT_Kera_trajs)
    .loc[lambda lengths: lengths['Length'] <= 580]
    .assign(Cell_Type='mKera', Tag='tdTomato')
    [['Exp_Name', 'Cell_Type', 'Tag', 'ID', 'Length']]
)

In [24]:
# Compute halo tag Kera trajectory lengths in batches of sorted IDs
# (presumably to keep each traj_length call small - confirm).
ID_list = sorted(ht_Kera_trajs.ID.unique())
chunk_size = 10000
dfs = {}
# Derive the number of batches from the number of IDs so that no trajectory is
# silently dropped when there are more than 10 * 10000 of them, and no empty
# placeholder batches are fed to pd.concat when there are fewer.
# max(..., 1) still yields one (empty) batch for an empty input so that
# pd.concat always receives at least one frame.
for i, start in enumerate(range(0, max(len(ID_list), 1), chunk_size)):
    temp_df = ht_Kera_trajs.loc[ht_Kera_trajs['ID'].isin(ID_list[start:start + chunk_size])]
    dfs[i] = traj_length(temp_df)
# Stitch the batches back together with a fresh 0..n-1 index
ht_Kera_trajLengths = pd.concat(dfs, axis=0).reset_index(drop=True)
# Keep trajectories of at most 1180 rows, then label the group
ht_Kera_trajLengths = ht_Kera_trajLengths.loc[ht_Kera_trajLengths['Length'] <= 1180]
ht_Kera_trajLengths.insert(1, 'Tag', 'Halo')
ht_Kera_trajLengths.insert(1, 'Cell_Type', 'Kera')

<h2> Step 3: Output the processed data </h2>

In [26]:
# Stack the tdTomato and halo tag tables row-wise, one combined frame per cell family
combined_stem_cell_df = pd.concat(
    objs=[tdT_mNSPC_trajLengths, ht_NSC_trajLengths],
    axis=0,
)
combined_kera_cell_df = pd.concat(
    objs=[tdT_Kera_trajLengths, ht_Kera_trajLengths],
    axis=0,
)

In [27]:
# Write each combined table to its own comma-separated file, without the row index
output_dir = '/home/vivek/Documents/Python Programs/Gabby-RIP-Talk-2021/'
stem_cell_output_filename = 'Gabby_RIP_Talk_2021_StemCell_data_tdT_v_ht_trajLengths.csv'
kera_cell_output_filename = 'Gabby_RIP_Talk_2021_KeraCell_data_tdT_v_ht_trajLengths.csv'
combined_stem_cell_df.to_csv(Path(output_dir).joinpath(stem_cell_output_filename), sep=',', index=False)
combined_kera_cell_df.to_csv(Path(output_dir).joinpath(kera_cell_output_filename), sep=',', index=False)

In [29]:
# Number of distinct trajectory IDs kept for tdTomato mNSPCs
tdT_mNSPC_trajLengths['ID'].nunique(dropna=False)

489

In [30]:
# Number of distinct trajectory IDs kept for halo tag NSCs
ht_NSC_trajLengths['ID'].nunique(dropna=False)

19708

In [31]:
# Number of distinct trajectory IDs kept for tdTomato Kera cells
tdT_Kera_trajLengths['ID'].nunique(dropna=False)

8213

In [32]:
# Number of distinct trajectory IDs kept for halo tag Kera cells
ht_Kera_trajLengths['ID'].nunique(dropna=False)

88759