In [None]:
import pandas as pd
import requests
import os
import numpy as np
from io import StringIO
import dask.array as da
import skimage.io as io


In [None]:
def load_labeled_features(path,drop_non_labelled=True, potential_features=('rods','clumped','planktonic','filaments','positive','negative','intermediate')):
    '''
    Download the annotation table of one dataset and flag which rows carry a label.

    We assume that the server is locally hosted as described in the settings, but the user can
    also write SQL code to read from the database directly.

    Parameters
    ----------
    path : str
        Dataset path sent to the server as the ``path`` query parameter. Its second-to-last
        '/'-separated component is stored in the ``dataset`` column.
    drop_non_labelled : bool
        If True, rows in which none of the feature columns is set are removed.
    potential_features : iterable of str
        Candidate label columns; only those present in the downloaded table are used.

    Returns
    -------
    pandas.DataFrame
        The table without trash rows, with the added columns ``dataset`` and ``labeled``
        and a fresh 0..n-1 index.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    '''
    # let requests build the query string so special characters in `path` are escaped
    response = requests.get("http://localhost:8080/download_csv_by_path", params={"path": path})
    # fail loudly instead of trying to parse an error page as CSV
    response.raise_for_status()
    df = pd.read_csv(StringIO(response.text),index_col=0)

    if 'trash' in df.columns:
        # boolean mask instead of index labels: stays correct even if the index has duplicates
        df = df.loc[~(df['trash'] == True)].copy() # drop all trash

    df['dataset']=path.split('/')[-2]

    features=[f for f in potential_features if f in df.columns]

    # a row counts as labelled as soon as any of the available feature columns is set;
    # computed row-wise so it also works for a single row and for repeated (chip, label) pairs
    df['labeled']=df[features].any(axis=1)

    if drop_non_labelled:
        df = df[df['labeled']] # drop all non labelled
    return df.reset_index(drop=True)

def to8bits(image, imin=None, imax=None): #min 400 max 450-600
    """
    Convert an image to 8 bits by linearly mapping [imin, imax] onto [0, 255].

    Values outside [imin, imax] are clipped first. The scaled values are truncated
    (not rounded) when cast to uint8.

    params:
    -------
    image: np.ndarray
    imin: lower bound of the intensity window; defaults to image.min()
    imax: upper bound of the intensity window; defaults to image.max()

    returns:
    --------
    np.ndarray of type 'uint8' with the same shape as image. If imin equals imax
    (e.g. a constant image) an all-zero array is returned.
    """
    if imin is None:
        imin = image.min()
    if imax is None:
        imax = image.max()

    # An empty intensity window would divide by zero below and produce NaNs
    if imax == imin:
        return np.zeros(np.shape(image), dtype=np.uint8)

    # Clip the image to the specified range
    image_clipped = np.clip(image, imin, imax)

    # Rescale the image to the range 0-255
    image_8bit = ((image_clipped - imin) / (imax - imin) * 255).astype(np.uint8)

    return image_8bit


def bf_fluo_2rgb_black(fluo, bf=None):
    """
    Creates an rgb stack with the image in the green channel; the red and blue
    channels are filled with ones.

    params:
    -------
    fluo: 2D np.ndarray (expected type 'uint8') or None; used for the green channel
    bf: optional 2D np.ndarray (expected type 'uint8'); used for the green channel
        only when fluo is None

    returns:
    --------
    np.ndarray of type 'uint8' with the three channels stacked along the last axis

    raises:
    -------
    ValueError if both fluo and bf are None
    """
    green = fluo if fluo is not None else bf
    if green is None:
        raise ValueError("either fluo or bf must be provided")
    # derive the constant channels from the image that is actually used
    ones = np.ones_like(green)
    return np.dstack((ones, green, ones)).astype("uint8")

def create_crops(df,mounting_folder,folder_out, imin=400, imax=600):
    """
    Save one 8-bit TIFF crop per (chip, label) pair of a single dataset.

    For the given path we create crops of quadratic size for end point measurements.
    This assumes masked data in an aligned zarr file, hence the crop size is fixed.

    params:
    -------
    df: table of one dataset with the columns path_short, path, chip and label;
        only the first unique value of path_short and path is used
    mounting_folder: folder where the image data is mounted
    folder_out: folder the crops are written to (created if missing)
    imin, imax: intensity window passed to to8bits

    Files are named '<path_short>_Crop_<chip>_<label>.tiff'. Nothing is done for an
    empty table.
    """
    if df.empty:
        # nothing to crop; avoids an IndexError on unique()[0] below
        return

    prefix=df.path_short.unique()[0]
    folder_in=df.path.unique()[0]
    os.makedirs(folder_out, exist_ok=True)

    #load data
    # leading separators are stripped so that os.path.join never discards mounting_folder
    zarr_path = os.path.join(mounting_folder, str(folder_in).lstrip('/'), 'BF_TRITC_aligned.crops.zarr/')
    bf_tritc = da.from_zarr(zarr_path) # droplet,chips, channel (bf,fluo), image coordinates
    for c in df.chip.unique():# loop over chips
        labels = df.loc[df.chip==c, 'label'].unique()

        for i in labels: # loop over ids
            # NOTE(review): labels look 1-based while the droplet axis is 0-based, hence i-1 - confirm
            tritc_image=bf_tritc[i-1,c,1,:].compute()
            out_file = os.path.join(folder_out, f'{prefix}_Crop_{c}_{i}.tiff')
            io.imsave(out_file,to8bits(image=tritc_image,imin=imin,imax=imax))


# Create training data table and crops from database 

This script assumes that the web server is running at http://localhost:8080/, but one could also
access the database directly.
For faster training, we create crops from the zarr files and save them directly. The crops are converted to 8 bit because the PyTorch ResNet requires 8-bit images.


In [None]:
# Paths of all provided datasets, grouped by antibiotic.
# NOTE: the strings are used verbatim as paths, so their spelling must not be changed.

# Cipro
Cipro_paths = [
    'Cirpofloxacin/20230131-ecoli-cipro-1/day2',
    'Cirpofloxacin/20230131-ecoli-cipro-2/day2',
    'Cirpofloxacin/20220531-MIC-e.coli-cipro/2ndexp/day2',
    'Cirpofloxacin/20220531-MIC-e.coli-cipro/1stexp/day2',
]

# Genta
Genta_paths = [
    'Gentamicin/20230110-e.coli-genta/day2',
    'Gentamicin/20230110-e.coli-genta-2/day2',
    'Gentamicin/20221101-ecoli-genta1/day2',
    'Gentamicin/20221101-ecoli-genta2/day2',
]

# Tetra
Tetra_paths = [
    'Tetracycline/20230404-ecoli-Tetracycline/set2/day2',
    'Tetracycline/20230404-ecoli-Tetracycline/set1/day2',
    'Tetracycline/20230315-ecoli/set-2/day2',
    'Tetracycline/20230315-ecoli/set-1/day2',
]

# CHP
CHP_paths = [
    'Chloramphenicol/20230313-ecoli-chp-2/day2',
    'Chloramphenicol/20230313-ecoli-chp-1/day2',
    'Chloramphenicol/20230221-ecoli-chp-1/day2',
    'Chloramphenicol/20230111-ecoli-chp-2/day2',
    'Chloramphenicol/20230111-ecoli-chp/day2',
    'Chloramphenicol/20221122-ecoli-chp/day2',
    'Chloramphenicol/20221031-ecoli-chp2/day2',
    'Chloramphenicol/20221031-ecoli-chp1/day2',
    'Chloramphenicol/20221013-ecoli-chp/day2',
    'Chloramphenicol/20221012-ecoli-chp/day2',
    'Chloramphenicol/20220628-MIC-e.coli-chp-LB-2/day2',
    'Chloramphenicol/20220628-MIC-e.coli-chp-LB-1/day2',
    'Chloramphenicol/20220602-MIC-e.coli-chp-LB/day2',
    'Chloramphenicol/20220524-MIC-e.coli-chp-LB/day2',
]

# AMP
AMP_paths = [
    'Ampicillin/20220614-MIC-e.coli-amp-LB-2/day2',
    'Ampicillin/20220614-MIC-e.coli-amp-LB-1/day2',
]

# every dataset, in the order Cipro, Genta, Tetra, CHP, AMP
all_paths = [*Cipro_paths, *Genta_paths, *Tetra_paths, *CHP_paths, *AMP_paths]

In [None]:
# Collect the information needed for the table and for crop creation, one table per dataset.
# If only training data is to be cropped, set drop_non_labelled=True below.

df_list=[]
for p in all_paths:
    # loads all data, drops trash and, only if requested, all non-labelled rows
    dataset_df = load_labeled_features(p, drop_non_labelled=False)

    parts = p.split('/')
    experiment = parts[-2]
    # Some Cipro and Tetra datasets are inconsistently organised and sit in a sub-folder
    # such as 1stexp / 2ndexp; for those the parent folder is prepended to the short name.
    if experiment in ('1stexp', '2ndexp', 'set-1', 'set-2', 'set1', 'set2'):
        short_name = parts[-3] + '_' + experiment
    else:
        short_name = experiment

    dataset_df['path_short'] = short_name
    dataset_df['path'] = p
    df_list.append(dataset_df)

In [20]:
# Stack the per-dataset tables; reset_index keeps each table's own row number in an
# 'index' column, and every missing value left after concatenation is set to 0.
df = pd.concat(df_list).reset_index().fillna(0)

In [21]:
# inspect the combined table
df

Unnamed: 0,index,chip,label,junk,coalescent,empty,clumped,positive,planktonic,big-droplet,...,n_cells_1,n_cells_2,concentration,bad alignment,trash,dataset,labeled,path_short,path,intermediate
0,0,0,2,False,False,False,0.0,0.0,0.0,False,...,2.0,106.0,0.0,False,False,20230131-ecoli-cipro-1,False,20230131-ecoli-cipro-1,Cirpofloxacin/20230131-ecoli-cipro-1/day2,0.0
1,1,0,3,False,False,False,0.0,0.0,0.0,False,...,3.0,93.0,0.0,False,False,20230131-ecoli-cipro-1,False,20230131-ecoli-cipro-1,Cirpofloxacin/20230131-ecoli-cipro-1/day2,0.0
2,2,0,4,False,False,False,0.0,0.0,0.0,False,...,0.0,105.0,0.0,False,False,20230131-ecoli-cipro-1,False,20230131-ecoli-cipro-1,Cirpofloxacin/20230131-ecoli-cipro-1/day2,0.0
3,3,0,5,False,False,False,0.0,0.0,0.0,False,...,1.0,105.0,0.0,False,False,20230131-ecoli-cipro-1,False,20230131-ecoli-cipro-1,Cirpofloxacin/20230131-ecoli-cipro-1/day2,0.0
4,4,0,6,False,False,False,0.0,0.0,0.0,False,...,1.0,109.0,0.0,False,False,20230131-ecoli-cipro-1,False,20230131-ecoli-cipro-1,Cirpofloxacin/20230131-ecoli-cipro-1/day2,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76905,2699,5,486,False,False,False,0.0,0.0,0.0,False,...,1.0,2.0,7.0,False,False,20220614-MIC-e.coli-amp-LB-1,False,20220614-MIC-e.coli-amp-LB-1,Ampicillin/20220614-MIC-e.coli-amp-LB-1/day2,0.0
76906,2700,5,487,False,False,False,0.0,0.0,0.0,False,...,0.0,0.0,7.0,False,False,20220614-MIC-e.coli-amp-LB-1,False,20220614-MIC-e.coli-amp-LB-1,Ampicillin/20220614-MIC-e.coli-amp-LB-1/day2,0.0
76907,2701,5,488,False,False,False,0.0,0.0,0.0,False,...,0.0,0.0,7.0,False,False,20220614-MIC-e.coli-amp-LB-1,False,20220614-MIC-e.coli-amp-LB-1,Ampicillin/20220614-MIC-e.coli-amp-LB-1/day2,0.0
76908,2702,5,489,False,False,False,0.0,0.0,0.0,False,...,0.0,0.0,7.0,False,False,20220614-MIC-e.coli-amp-LB-1,False,20220614-MIC-e.coli-amp-LB-1,Ampicillin/20220614-MIC-e.coli-amp-LB-1/day2,0.0


In [None]:
# make sure all trash and (if wanted) all non-labelled rows are dropped
df['trash'].unique(), df['labeled'].unique()

In [None]:
# for this study every intermediate row is counted as positive
df.loc[df['intermediate'].eq(1), 'positive'] = 1

In [None]:
# sanity check: rows flagged both positive and negative - the result should be empty
df[df['positive'].eq(True) & df['negative'].eq(True)]

In [None]:
# dataset paths that actually occur in the combined table
paths_for_crops = df['path'].unique()

In [None]:
folder_out='Crops/' # folder where the crops are stored: relative to the current path
mounting_folder='/home/your_username/' # where the image data is mounted
# Iterate over the paths that actually occur in df (same order as all_paths): a dataset
# without any remaining rows would hand an empty table to create_crops.
for p in paths_for_crops:
    print(p)
    create_crops(df[df.path==p],mounting_folder,folder_out)
    print(f'{p} done')

In [None]:
# we add the crop names into the table

# Built in one vectorised pass instead of one full-table mask per (path, chip, label).
# path_short is constant within a path, so the row-wise value equals the per-path prefix;
# the pattern matches the file names written by create_crops.
df['filename'] = (
    df['path_short'].astype(str) + '_Crop_'
    + df['chip'].astype(str) + '_'
    + df['label'].astype(str) + '.tiff'
)

In [None]:
# drop the non-labelled rows (if not already dropped) so that only training information is saved
df.drop(index=df.index[df['labeled'].eq(False)], inplace=True)

In [None]:
# keep only the file name and the label columns; the labels are stored as int8
label_columns = ['positive','planktonic','clumped','rods','filaments']
df_to_save = df[['filename', *label_columns]].copy()
df_to_save[label_columns] = df_to_save[label_columns].astype('int8')

In [None]:
#df_to_save.to_csv('../tables/LabelingSetAll.csv') # table with training information; on purpose, the index is not reset after selecting only part of the data