In [19]:
# "Imaris2ALAn" Data Collation Code
# Code written by Joe Glichowski at the University of Rochester Summer 2021
# This code takes the Statistics folders exported by the Imaris nuclear segmentation tool and generates one .csv file per folder containing the nuclear spatial coordinates and nucleus volumes.

In [20]:
# This cell imports all necessary Python packages
import os                                   
import numpy as np                                     
from scipy import optimize, stats                      
import pandas as pd
import time

In [21]:
# This cell specifies the directories containing
# 1. The input: the Imaris Statistics folders, and
# 2. The output: the collated .csv files

# Path of the input directory (the folder holding the Imaris "*_Statistics" folders)
path = "/Users/tarafinegan/Desktop/imaris/"

# Name of the output directory
directory = "ALAninput"

# Location on your computer in which the output directory will be created
parent_dir = "/Users/tarafinegan/Desktop/"

# Create the output directory `directory` inside `parent_dir`.
# exist_ok=True keeps this cell re-runnable: a plain os.mkdir raises
# FileExistsError as soon as the directory exists from a previous run.
outpath = os.path.join(parent_dir, directory)
os.makedirs(outpath, exist_ok=True)
output_path = outpath

# Names of every entry in the input directory. The listing is unfiltered and
# in arbitrary order; filtering and sorting happen in subfolder_name_lists.
folder = os.listdir(path)

In [22]:
# This cell displays the names of all entries found in the input directory so the user can check what will be processed.
# Note: the listing is unfiltered, so non-data entries such as '.DS_Store' also appear here.
folder

['2020-07-15_07.56.05_100k_16hr_a2_Statistics',
 '2020-07-20_08.04.03_100K_8hr_a5_Statistics',
 '2020-07-15_08.47.01_200k_16hr_b4_Statistics',
 '2020-07-15_08.02.35_100k_16hr_a7_Statistics',
 '2020-07-20_08.09.15_100K_8hr_a9_Statistics',
 '2020-07-15_08.56.16_200k_16hr_b10_Statistics',
 '2020-07-15_08.41.36_200k_16hr_a16_Statistics',
 '2020-07-20_08.05.47_100K_8hr_a6_Statistics',
 '2020-07-15_08.34.25_200k_16hr_a10_Statistics',
 '2020-07-15_09.19.22_100k_16hr_b12_Statistics',
 '2020-07-20_08.01.55_100K_8hr_a4_Statistics',
 '2020-07-15_09.12.42_100k_16hr_b6_Statistics',
 '2020-07-15_08.28.47_200k_16hr_a7_Statistics',
 '.DS_Store',
 '2020-07-15_08.24.53_200k_16hr_a5_Statistics',
 '2020-07-15_08.14.58_100k_16hr_a14_Statistics',
 '2020-07-15_08.57.45_200k_16hr_b11_Statistics',
 '2020-07-15_08.22.46_200k_16hr_a4_Statistics',
 '2020-07-15_08.36.45_200k_16hr_a12_Statistics',
 '2020-07-15_09.08.09_100k_16hr_b3_Statistics',
 '2020-07-20_08.06.44_100K_8hr_a7_Statistics',
 '2020-07-15_08.29.43_20

In [23]:
# This cell defines the function that will extract the relevant information from the Imaris statistics file and output these into a .csv that can be input into ALAn.

def _find_single_file(file_names, suffix, subfolder):
    """Return the one name in `file_names` that ends with `suffix`.

    Raises FileNotFoundError when nothing matches and ValueError when several
    names match, so the caller never builds a path from an empty or a
    concatenated file name. `subfolder` is only used in the error messages.
    """
    matches = sorted(name for name in file_names if name.endswith(suffix))
    if not matches:
        raise FileNotFoundError(
            "No file ending in '{}' found in {}".format(suffix, subfolder)
        )
    if len(matches) > 1:
        raise ValueError(
            "Expected one file ending in '{}' in {}, found {}: {}".format(
                suffix, subfolder, len(matches), matches
            )
        )
    return matches[0]


def subfolder_name_lists(folder_address, list_of_folders, output_path):
    """Collate the Position and Volume statistics of every sub-folder into one .csv each.

    For each sub-folder, the file ending in "Position.csv" and the file ending
    in "Volume.csv" are read (skipping their first three lines), the
    bookkeeping columns are dropped from the position table, and the
    "Nucleus Volume" column is appended to it. The result is written to
    ``<output_path>/<label>.csv``.

    Parameters
    ----------
    folder_address : str
        Directory that contains the sub-folders.
    list_of_folders : list of str
        Entry names inside `folder_address`. The list is not modified.
        '.DS_Store' and any entry that is not a directory are skipped.
    output_path : str
        Existing directory in which the collated .csv files are written.

    Returns
    -------
    (merged_dfs, labels) : (list of pandas.DataFrame, list of str)
        The collated tables and their labels, both ordered by sorted
        sub-folder name. A label is the first four '_'-separated parts of the
        sub-folder name joined without a separator.

    Raises
    ------
    FileNotFoundError
        If a sub-folder has no Position.csv or no Volume.csv file.
    ValueError
        If a sub-folder has more than one Position.csv or Volume.csv file.
    KeyError
        If an expected column is missing from one of the csv files.
    """
    # sorted() builds a new list, so the caller's list keeps its order and content.
    # Only real directories are kept: stray files (e.g. '.DS_Store', notes) would
    # otherwise make os.listdir() fail further down.
    subfolder_names = [
        name
        for name in sorted(list_of_folders)
        if name != '.DS_Store' and os.path.isdir(os.path.join(folder_address, name))
    ]

    unwanted_columns = ['Unit', "Category", "Collection", "Time", "CellID", "ID", "Unnamed: 9"]

    labels = []
    merged_dfs = []
    for name in subfolder_names:
        subfolder = os.path.join(folder_address, name)
        file_names = os.listdir(subfolder)

        # Position table, stripped of the columns that are not needed downstream.
        position_file = _find_single_file(file_names, "Position.csv", subfolder)
        position_df = pd.read_csv(
            os.path.join(subfolder, position_file), encoding="utf-8", skiprows=3
        ).drop(unwanted_columns, axis=1)

        # Only the "Nucleus Volume" column of the volume table is kept.
        volume_file = _find_single_file(file_names, "Volume.csv", subfolder)
        volume_df = pd.read_csv(
            os.path.join(subfolder, volume_file), encoding="utf-8", skiprows=3
        )

        # Column-wise concat pairs rows by index, i.e. by row order in the two files.
        merged_dfs.append(pd.concat([position_df, volume_df["Nucleus Volume"]], axis=1))

        # Label = first four '_'-separated parts of the folder name, concatenated.
        # Slicing tolerates names with fewer than four parts.
        labels.append("".join(name.split('_')[:4]))

    # Write only after every input has been read, so a bad sub-folder does not
    # leave a partially populated output directory behind.
    for label, merged_df in zip(labels, merged_dfs):
        merged_df.to_csv(os.path.join(output_path, label + '.csv'), index=False)

    return merged_dfs, labels

In [24]:
# This cell runs the function on your files and prints the elapsed wall-clock time in seconds.
# Note: the names are swapped relative to the usual tic/toc convention:
# `toc` holds the start time and `tic` holds the end time, so `tic-toc` is the elapsed time.

toc = time.time()
subfolder_name_lists(path, folder, output_path)
tic = time.time()
print(tic-toc)

0.3273320198059082
