# Create metadata dataframe

- Get full paths to raw and denoised images
- Extract metadata from paths

Generate the output path for the dataset and save the metadata as a pandas DataFrame to `fileList.csv`.

In [1]:
import glob
import os
import pandas as pd
from convert_paths import correct_path, correct_save_path

## Input parameters

In [2]:
#### input parameters ######
# Path flavour handed to correct_path here and to correct_save_path in the
# metadata functions further down.
path_type="mac"  # could be "server", "mac" or "wsl"

# Directory holding the raw *.nd2 images of one experiment. Exactly one
# raw_input_path assignment should be active at a time.
# NOTE(review): the commented-out alternatives below omit the path_type
# argument that the active call passes - add it when switching datasets.
#raw_input_path = correct_path('/mnt/external.data/MeisterLab/Kalyan/TF_strains/wPM1353_bet1_mSG/2025_25_02_wPM1353/2025_25_02_wPM1353_HS')
#raw_input_path = correct_path('/mnt/external.data/MeisterLab/Kalyan/TF_strains/wPM1353_bet1_mSG/2025_04_03_wPM1353/2025_04_03_wPM1353_HS')
#raw_input_path = correct_path('/mnt/external.data/MeisterLab/Kalyan/TF_strains/wPM1353_bet1_mSG/2025_10_05_wPM1353/2025_10_05_wPM1353_HS')
raw_input_path = correct_path('/mnt/external.data/MeisterLab/jsemple/demo_VIBE/images/wPM1353_bet1_mSG/2025_10_05_wPM1353/2025_10_05_wPM1353_HS', path_type)
# Denoised images are looked up in a fixed sub-folder of the raw image directory.
denoised_input_path = os.path.join(raw_input_path,'n2v_denoise/denoised')


#output_path = '/Volumes/external.data/MeisterLab/jsemple/lhinder/segmentation_Dario/DPY27/'
# Root directory under which the per-dataset result folder is created.
output_path_base = correct_path('/mnt/external.data/MeisterLab/jsemple/demo_VIBE/results', path_type)


## Dataset specific import functions and metadata generators

In [3]:
def kalyan_metadata(raw_input_path, raw_filepaths, output_path_base, denoised_input_path):
    """Build the metadata table for a Kalyan-style dataset and save it as fileList.csv.

    Dataset-level metadata is parsed from the last three path components of
    ``raw_input_path``:

    * third-from-last component, split on ``_``: the first field is the
      strain, the remaining fields joined with ``-`` are the protein;
    * second-from-last component, split on ``_``: the first three fields
      joined with ``-`` are the date.

    Per-file metadata is parsed from each file name (extension removed),
    split on ``_``: the second-to-last field is the treatment and the last
    field is the worm id.

    Side effects: creates ``<output_path_base>/<date>_<protein>_<strain>`` if
    needed and writes ``fileList.csv`` there. The path columns of the saved
    copy are passed through ``correct_save_path`` using the module-level
    ``path_type``; the returned dataframe keeps the paths as given.

    Args:
        raw_input_path: Directory containing the raw images.
        raw_filepaths: Full paths of the raw image files, one row each.
        output_path_base: Root directory for results.
        denoised_input_path: Directory containing the denoised images, named
            ``<filename>_green_n2v.tif``.

    Returns:
        Tuple ``(df, output_path)`` of the metadata dataframe and the
        directory ``fileList.csv`` was written to.
    """
    # extract identifying directories from raw_input_path
    path_parts = os.path.normpath(raw_input_path).split(os.sep)[-3:]
    strain, *protein_fields = path_parts[0].split('_')
    protein = '-'.join(protein_fields)
    date = '-'.join(path_parts[1].split('_')[:3])

    output_path = os.path.join(output_path_base, '_'.join([date, protein, strain]))
    # exist_ok=True already covers the "directory is there" case.
    os.makedirs(output_path, exist_ok=True)

    # splitext removes whatever extension is present instead of assuming it
    # is exactly four characters long.
    filenames = [os.path.splitext(os.path.basename(filepath))[0] for filepath in raw_filepaths]

    df = pd.DataFrame()
    df['filename'] = filenames
    df['date'] = date
    df['protein'] = protein
    df['strain'] = strain
    df['treatment'] = [filename.split('_')[-2] for filename in filenames]
    df['worm_id'] = [filename.split('_')[-1] for filename in filenames]
    df['id'] = df['protein'] + '_' + df['date'] + '_' + df['filename']
    df['raw_filepath'] = raw_filepaths
    df['denoised_filepath'] = [
        os.path.join(denoised_input_path, filename + '_green_n2v.tif')
        for filename in filenames
    ]

    # Only the saved copy gets its path columns converted; the in-memory
    # dataframe keeps paths valid for the current machine.
    df_to_save = correct_save_path(df.copy(), path_type, col_names=['raw_filepath', 'denoised_filepath'])
    df_to_save.to_csv(os.path.join(output_path, 'fileList.csv'), index=False)
    return df, output_path




def dario_metadata(raw_input_path, raw_filepaths, output_path_base, denoised_input_path):
    """Build the metadata table for a Dario-style dataset and save it as fileList.csv.

    Per-file metadata is parsed from the directories above each raw file:

    * parent directory, split on ``_``: fields 0, 1 and 2 are the date,
      stage and experiment;
    * grandparent directory: strain;
    * great-grandparent directory: protein.

    The output directory is ``output_path_base`` followed by the last three
    path components of ``raw_input_path`` (presumably protein/strain/date,
    going by the original variable name - confirm against the dataset
    layout).

    Side effects: creates the output directory if needed and writes
    ``fileList.csv`` there. The path columns of the saved copy are passed
    through ``correct_save_path`` using the module-level ``path_type``; the
    returned dataframe keeps the paths as given.

    Args:
        raw_input_path: Directory containing the raw images.
        raw_filepaths: Full paths of the raw image files, one row each.
        output_path_base: Root directory for results.
        denoised_input_path: Directory containing the denoised images, named
            ``<filename>_n2v.tif``.

    Returns:
        Tuple ``(df, output_path)`` of the metadata dataframe and the
        directory ``fileList.csv`` was written to.
    """
    # extract identifying directories from raw_input_path
    dataset_dirs = os.path.normpath(raw_input_path).split(os.sep)[-3:]
    output_path = os.path.join(output_path_base, *dataset_dirs)
    # to_csv does not create missing directories, so make sure the nested
    # output folder exists before writing.
    os.makedirs(output_path, exist_ok=True)

    # Split every path once and reuse the components below.
    split_paths = [os.path.normpath(filepath).split(os.sep) for filepath in raw_filepaths]
    filenames = [os.path.splitext(parts[-1])[0] for parts in split_paths]
    # Parent directory name encodes <date>_<stage>_<experiment>.
    experiment_fields = [parts[-2].split('_') for parts in split_paths]

    df = pd.DataFrame()
    df['filename'] = filenames
    df['date'] = [fields[0] for fields in experiment_fields]
    df['stage'] = [fields[1] for fields in experiment_fields]
    df['experiment'] = [fields[2] for fields in experiment_fields]
    df['strain'] = [parts[-3] for parts in split_paths]
    df['protein'] = [parts[-4] for parts in split_paths]
    df['id'] = df['protein'] + '_' + df['stage'] + '_' + df['experiment'] + '_' + df['filename']
    df['raw_filepath'] = raw_filepaths
    df['denoised_filepath'] = [
        os.path.join(denoised_input_path, filename + '_n2v.tif')
        for filename in filenames
    ]

    # Only the saved copy gets its path columns converted; the in-memory
    # dataframe keeps paths valid for the current machine.
    df_to_save = correct_save_path(df.copy(), path_type, col_names=['raw_filepath', 'denoised_filepath'])
    df_to_save.to_csv(os.path.join(output_path, 'fileList.csv'), index=False)
    return df, output_path

## Get paths and make dataframe

In [4]:
# Make sure the results root exists; exist_ok makes this a no-op when it does.
os.makedirs(output_path_base, exist_ok=True)

raw_file_name_pattern = "/*.nd2"
# Escape the directory part so glob metacharacters ("[", "*", "?") in the
# path are matched literally. The pattern has no "**", so the search only
# looks at files directly inside raw_input_path.
raw_filepaths = sorted(glob.glob(glob.escape(raw_input_path) + raw_file_name_pattern))
# Skip acquisitions flagged as bad via a "_bad" suffix on the file name.
raw_filepaths = [filepath for filepath in raw_filepaths if not filepath.endswith('_bad.nd2')]

print(f"Found {len(raw_filepaths)} *.nd2 files.")


# Build the metadata table, write fileList.csv and show where it was saved.
df, output_path = kalyan_metadata(raw_input_path, raw_filepaths, output_path_base, denoised_input_path)
output_path

Found 2 *.nd2 files.


'/Volumes/external.data/MeisterLab/jsemple/demo_VIBE/results/2025-10-05_bet1-mSG_wPM1353'

In [5]:
# Preview the first rows of the metadata table built in the previous cell.
df.head()

Unnamed: 0,filename,date,protein,strain,treatment,worm_id,id,raw_filepath,denoised_filepath
0,2025_10_05_wPM1353_HS_001,2025-10-05,bet1-mSG,wPM1353,HS,1,bet1-mSG_2025-10-05_2025_10_05_wPM1353_HS_001,/Volumes/external.data/MeisterLab/jsemple/demo...,/Volumes/external.data/MeisterLab/jsemple/demo...
1,2025_10_05_wPM1353_nHS_001,2025-10-05,bet1-mSG,wPM1353,nHS,1,bet1-mSG_2025-10-05_2025_10_05_wPM1353_nHS_001,/Volumes/external.data/MeisterLab/jsemple/demo...,/Volumes/external.data/MeisterLab/jsemple/demo...
