## Merge all the datasets into one table

The code below reads every tab-separated `.csv` file in `./merged/`, applies the anisotropy correction to z measurements, assigns a clone type to each row, and merges the per-file tables into a single DataFrame. 

It also calculates major and minor axes, elongation, and other metrics, and saves the combined data to a CSV file.



In [None]:
import pandas as pd
import os 
import numpy as np 
import glob 
from tqdm import tqdm

# Print library versions so the run can be reproduced
print(f"Pandas version: {pd.__version__}")
print(f"Numpy version: {np.__version__}")


# Folder holding the per-sample measurement tables
local_pc_path = "./merged/"

# Collect every *.csv file in that folder (they are read as tab-separated
# text further down). glob returns paths in arbitrary, OS-dependent order,
# so sort them to make the processing order reproducible.
files = sorted(glob.glob(local_pc_path + "*.csv"))

# Create a directory to save final_graphs; exist_ok replaces the separate
# existence check and is safe when the folder is already there.
os.makedirs("./final_graphs", exist_ok=True)

def apply_anisotropy(df, all_px_size, fname, column_name):
    """Scale one column of ``df`` by the z/x anisotropy of a sample.

    The anisotropy is computed for the downsampled image (series 2 of the
    czi, downsampled in z, y, x by 1, 3, 3): each full-resolution pixel size
    is multiplied by its downsampling factor and the first (z) size is
    divided by the third (x) size.

    Args:
        df: DataFrame holding ``column_name``. It is modified in place.
        all_px_size: Mapping of sample id -> full-resolution pixel sizes,
            indexed as (z, y, x).
        fname: Sample id, or a file name / path that starts with the sample
            id (everything from the first '.' or '_' of the base name on is
            ignored).
        column_name: Name of the column to multiply by the anisotropy.

    Returns:
        The same ``df`` object, with ``column_name`` rescaled.

    Raises:
        KeyError: If the sample id is not a key of ``all_px_size`` or
            ``column_name`` is not a column of ``df``.
    """
    # Use the argument that was passed in rather than a module-level loop
    # variable, so the function also works outside the processing loop.
    sample_id = os.path.basename(str(fname)).split('.')[0].split("_")[0]
    try:
        orig_pixel_sizes = all_px_size[sample_id]
    except KeyError:
        raise KeyError(f"No pixel size registered for sample {sample_id!r}") from None

    # image used is from series 2 of czi which is downsampled in z,y,x by 1,3,3
    downsample_series2 = (1, 3, 3)
    pixel_sizes = tuple(
        size * factor for size, factor in zip(orig_pixel_sizes, downsample_series2)
    )

    # z size relative to x size in the downsampled image
    anisotropy = pixel_sizes[0] / pixel_sizes[2]
    df[column_name] = df[column_name] * anisotropy
    return df

def calc_clone_type(row, clone_proportion_thresh=0):
    """Build the clone-type string for one row.

    Each of the columns '1', '10', '11', '100', '101', '110', '111' is
    compared with ``clone_proportion_thresh``; every column whose value is
    strictly greater contributes its label ('1' to '7', in that order).

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by the
            seven column names above.
        clone_proportion_thresh: Value a column must exceed to be counted.

    Returns:
        The contributing labels joined with '_', or '' when no column
        exceeds the threshold.
    """
    # (column name in the table, label used in the result), in output order
    column_labels = (
        ('1', '1'),
        ('10', '2'),
        ('11', '3'),
        ('100', '4'),
        ('101', '5'),
        ('110', '6'),
        ('111', '7'),
    )
    present = []
    for column, label in column_labels:
        if row[column] > clone_proportion_thresh:
            present.append(label)
    return "_".join(present)

#Per-sample metadata, keyed by the sample id (the leading part of each input file name)
#Pixel sizes of the full-resolution images.
#Tuple order is (z, y, x): apply_anisotropy pairs these with downsampling factors listed as z,y,x
#and divides index 0 by index 2.
#NOTE(review): units are not stated here (presumably microns) - confirm
all_px_size = {
    '883': (2.8846, 1.1434, 1.1434),
    '934': (2.9169, 1.2196, 1.2196),
    '935': (2.8845, 1.2196, 1.2196),
    '1066': (2.9169, 1.2196, 1.2196),
    '1064': (2.9169, 1.2196, 1.2196),
    '1067':(2.9169, 1.2196, 1.2196),
    '1069':(2.9169, 1.2196, 1.2196),
    '1070':(2.9169, 1.2196, 1.2196),
    '1381':(2.9169, 1.2196, 1.2196),
}

#Total Volume
#czi file was pyramidal, series 2 was used for measurements
#series2 is downscaled by 3 in x and y, so volume in micron needs to be multiplied by 9 to get the actual volume
#Values below are the measurements as taken on series 2, i.e. before the x9 correction
total_volume = {
    '883': 8766163219,
    '934': 4944596934,
    '935': 20385745016,
    '1066': 7342453626.988,
    '1064': 3084277530.199,
    '1067':3724566738.016,
    '1069':2147206338,
    '1070':4717521108,
    '1381':8833829122.655,
}
#multiply volume by 9 to get actual tissue volume (rebinds total_volume to the corrected values)
total_volume = {k: v*9 for k,v in total_volume.items()}

#Vessel volume and surface area calculated from the segmented vessels
#NOTE(review): unlike total_volume, these values are not multiplied by 9 -
#confirm they do not need the same downsampling correction
vessel_volume = {
    '883': 2591659977.133,
    '934': 2413540309.371,
    '935': 5691226009.958,
    '1066': 1252447949.720,
    '1064': 852975560.819,
    '1067':1267274988.222,
    '1069':977273231.264,
    '1070':579797002.655,
    '1381':8793217979.968,
}

#Vessel surface area per sample, used as-is (no scaling applied here)
vessel_surface_area = {
    '883': 318029422.065,
    '934': 244531157.649,
    '935': 968212256.941,
    '1066': 185432718.407,
    '1064': 151017041.746,
    '1067': 127043494.744,
    '1069': 75306775.307,
    '1070': 98136863.522,
    '1381':430553447.482,
}

# One processed DataFrame per input file; they are concatenated afterwards
merged_df = []

data_list = []

# Sample id -> treatment label written to the 'treatment' column
treatment_by_sample = {
    **dict.fromkeys(['883', '934', '935', '1064', '1066'], 'IV'),
    **dict.fromkeys(['1067', '1069', '1070', '1381'], 'MFP'),
}

# Surface-area measurements that can be NaN. A NaN here means that the met
# does not touch a vessel, so it is replaced by zero rather than dropped.
touch_columns = [
    'total_touching_surface_area/2',
    'MetNum_that_touches',
    'number_of_touchPoints',
    'total_touching_volume',
]

#get files with extension csv and loop through them
for file in tqdm(files):
    # Sample id = base name up to the first '.' or '_' (parsed once and
    # used for every lookup below)
    fname = os.path.basename(file).split('.')[0].split("_")[0]
    print(f"Processing {fname}")

    # Fail before doing any work if the sample has no metadata
    if fname not in treatment_by_sample:
        raise Exception(f"File {fname} not found")
    treatment = treatment_by_sample[fname]

    # Files carry a .csv extension but are tab-separated
    df = pd.read_csv(file, sep="\t")
    # Skip the first two rows of the table
    # NOTE(review): presumably non-measurement rows (e.g. units) - confirm
    # against the export format
    df = df.iloc[2:]

    # Remove leading/trailing spaces from column names before any column is
    # addressed by name. rename() returns a new frame, so the assignments
    # below do not write into a slice of the table that was read.
    df = df.rename(columns=lambda x: x.strip())

    df[touch_columns] = df[touch_columns].fillna(0)

    # Any remaining NaN makes the row unusable
    df = df.dropna(how='any')
    df = df.astype(float)

    df = apply_anisotropy(df, all_px_size, fname, 'centroid_z')

    #Keep only rows with volume corrected > 9000
    # .copy() gives an independent frame so the column assignments below do
    # not trigger pandas' SettingWithCopy warning
    df = df[df['Volume corrected'] > 9000].copy()
    df['id'] = fname

    #Create a column with clone_type
    df['clone_type'] = df.apply(calc_clone_type, axis=1)

    #remove rows with clone_type = ''
    df = df[df['clone_type'] != ''].copy()

    df['treatment'] = treatment
    df['total_lung_volume'] = total_volume[fname]
    df['vessel_volume'] = vessel_volume[fname]
    df['vessel_surface_area'] = vessel_surface_area[fname]

    #Calculate major and minor axes
    # Bounding-box extent along each axis, computed column-wise
    major_minor_axis_df = pd.DataFrame({
        'x_axis': df['Box.X.Max'] - df['Box.X.Min'],
        'y_axis': df['Box.Y.Max'] - df['Box.Y.Min'],
        'z_axis': df['Box.Z.Max'] - df['Box.Z.Min'],
    })
    #apply anisotropy correction in z axis
    major_minor_axis_df = apply_anisotropy(major_minor_axis_df, all_px_size, fname, 'z_axis')
    # Smallest / largest of the three extents per row
    df['minor_axis'] = major_minor_axis_df.min(axis=1)
    df['major_axis'] = major_minor_axis_df.max(axis=1)
    df['elongation'] = df['major_axis'] / df['minor_axis']
    merged_df.append(df)

Pandas version: 2.2.1
Numpy version: 1.23.4


  0%|          | 0/9 [00:00<?, ?it/s]

Processing 1064


 22%|██▏       | 2/9 [00:00<00:01,  5.11it/s]

Processing 1066
Processing 1067


 44%|████▍     | 4/9 [00:00<00:00,  6.72it/s]

Processing 1069
Processing 1070


 56%|█████▌    | 5/9 [00:00<00:00,  4.95it/s]

Processing 1381
Processing 883


 78%|███████▊  | 7/9 [00:01<00:00,  5.33it/s]

Processing 934
Processing 935


100%|██████████| 9/9 [00:01<00:00,  5.37it/s]


In [3]:
# Stack the per-file tables into one long table
merged_all = pd.concat(merged_df, axis=0)
print(merged_all['treatment'].unique())
# Sanity check: number of rows per (treatment, id), with ids as columns.
# A NaN cell means there are no rows for that treatment/id combination.
rows_per_sample = merged_all.groupby(['treatment', 'id']).size()
rows_per_sample.unstack()

['IV' 'MFP']


id,1064,1066,1067,1069,1070,1381,883,934,935
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
IV,2896.0,2356.0,,,,,2781.0,405.0,5356.0
MFP,,,1380.0,1426.0,3406.0,1146.0,,,


In [4]:
# Delete column Volume as we use Volume corrected.
# errors='ignore' keeps this cell re-runnable after the column is gone.
merged_all = merged_all.drop(columns=['Volume'], errors='ignore')
# clone_type joins its labels with '_', so an underscore means more than one
# label is present -> polyclonal; otherwise monoclonal.
# regex=False makes the underscore an explicit literal match.
is_polyclonal = merged_all['clone_type'].str.contains('_', regex=False)
merged_all['met_type_cat'] = np.where(is_polyclonal, 'polyclonal', 'monoclonal')
# Write the combined table without the (non-unique) row index
merged_all.to_csv("./MFP_IV_combined_raw_data.csv", index=False)