In [1]:
import numpy as np
import glob 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from functools import reduce
from itertools import product

In [2]:
# Directory that holds the per-experiment predicted-label tables.
Folder = 'PredictedLabelsTables/'

# Every CSV in that directory whose name ends in "_trained.csv".
paths = glob.glob(Folder + '*_trained.csv')

In [4]:

def calc_cooc_cond_prob(data):
    """
    Conditional co-occurrence probabilities of three binary flags.

    Rows of `data` are filtered to those whose first column equals 1; the
    remaining three columns (referred to as C, R, F) are then tallied over
    all 8 possible 0/1 combinations.

    Parameters
    ----------
    data : 2-D numpy array
        Column 0 is the conditioning flag, columns 1..3 are C, R and F.

    Returns
    -------
    dict
        Keyed by the chmod-like code ``C*4 + R*2 + F``. Each value is a dict
        with ``'combination'`` (the 0/1 tuple), ``'count'``, ``'probability'``
        (count divided by the number of retained rows) and ``'label'``
        (e.g. ``'C¬RF'``). When no row passes the filter, ``'count'`` and
        ``'probability'`` are NaN but all 8 entries are still present.
    """
    def _label(bits):
        # A set bit prints the letter, an unset bit prints its negation.
        c_part = 'C' if bits[0] else '¬C'
        r_part = 'R' if bits[1] else '¬R'
        f_part = 'F' if bits[2] else '¬F'
        return f"{c_part}{r_part}{f_part}"

    # Keep rows where the first column is 1, then drop that column.
    selected = data[data[:, 0] == 1][:, 1:]
    total = len(selected)

    result = {}
    for bits in product([0, 1], repeat=3):
        code = 4 * bits[0] + 2 * bits[1] + bits[2]

        if total > 0:
            hits = np.sum(np.all(selected == bits, axis=1))
            share = hits / total
        else:
            # Nothing to condition on: keep the structure, mark values as NaN.
            hits = np.nan
            share = np.nan

        result[code] = {
            'combination': bits,
            'count': hits,
            'probability': share,
            'label': _label(bits),
        }

    return result

In [5]:
def generate_cooccurance_dataframe(path,save_folder):
    """
    Build the per-chip co-occurrence table for one predicted-labels CSV and save it.

    The CSV at `path` is read (first column as index), the cell-count and
    class columns are cast to int, and for every distinct value of the
    ``chip`` column `calc_cooc_cond_prob` is evaluated on the
    ``positive, clumped, rods, filaments`` columns. The resulting entries are
    flattened to one row per (chip, combination) and written to
    ``{save_folder}cooccurance_info_{name}.csv``, where ``name`` is the file
    name of `path` up to (not including) ``_trained``.

    Parameters
    ----------
    path : str
        Path to a ``*_trained.csv`` table. It must contain the columns
        ``chip``, ``positive``, ``rods``, ``filaments``, ``planktonic``,
        ``clumped``, ``n_cells_1`` and ``n_cells_2``.
    save_folder : str
        Output directory prefix; it is concatenated as-is, so it should end
        with a path separator.

    Returns
    -------
    None
        The table is written to disk.
    """
    data = pd.read_csv(path,index_col=0)
    data['n_cells_1']=data.n_cells_1.astype(int)
    data['n_cells_2']=data.n_cells_2.astype(int)

    classes=['rods','filaments','planktonic','clumped']
    for c in classes:
        data[c]=data[c].astype(int)

    # Column order matters: calc_cooc_cond_prob conditions on column 0 and
    # reads the remaining three as C, R, F.
    cond_columns=['positive','clumped','rods','filaments']

    records = []
    # Iterate over the groups directly instead of groupby.apply returning
    # dicts; groups are visited in the same sorted-key order.
    for chip, chip_rows in data.groupby('chip'):
        result_dict = calc_cooc_cond_prob(chip_rows[cond_columns].to_numpy())
        for idx, entry in result_dict.items():
            records.append({
                'chip': chip,
                'code': idx,
                'C': entry['combination'][0],
                'R': entry['combination'][1],
                'F': entry['combination'][2],
                'label': entry['label'],
                'count': entry['count'],
                'probability': entry['probability']
            })

    out_df = pd.DataFrame.from_records(records)
    # Derive the name from the `path` argument (not from a global loop
    # variable) and take the last path component regardless of nesting depth.
    Filename=os.path.basename(path).split('_trained')[0]
    out_df.to_csv(f'{save_folder}cooccurance_info_{Filename}.csv')


In [12]:
# Write one co-occurrence table per input CSV into the same folder as the inputs.
save_folder='PredictedLabelsTables/'
# NOTE(review): keep the loop variable named `p` unless
# generate_cooccurance_dataframe is known to build its output file name from
# its `path` argument -- as originally written it reads the global `p`.
for p in paths:
    generate_cooccurance_dataframe(p,save_folder)

save done
save done
save done
save done
save done
save done
save done
save done
save done
save done
save done
save done
save done
save done
save done
save done
save done
save done
save done
save done
save done
save done
save done
save done
save done
save done
save done
save done
save done
save done


#  Unify tables

In [6]:
def unionise_datasets(data, dates, date_index):
    """
    Stack several DataFrames into one, tagging each row with its origin.

    Parameters
    ----------
    data : sequence of DataFrame
        Tables to concatenate; the inputs are not modified.
    dates : sequence
        ``dates[i]`` is stored in the ``date`` column for rows of ``data[i]``.
    date_index : sequence
        ``date_index[i]`` is stored in the ``date_index`` column for rows of
        ``data[i]``.

    Returns
    -------
    DataFrame
        Row-wise concatenation with a fresh 0..n-1 index and three extra
        columns: ``dataset`` (position of the source table in `data`),
        ``date`` and ``date_index``.
    """
    tagged = [
        # assign() works on a copy, so the caller's frames stay untouched.
        frame.assign(
            dataset=position,
            date=dates[position],
            date_index=date_index[position],
        )
        for position, frame in enumerate(data)
    ]
    return pd.concat(tagged, ignore_index=True)

In [None]:
# ---- Gentamicin ----
# Per-file metadata, in the same order as the files below, to keep track of
# the experiments.
dates = ['20221101', '20221101', '20230110', '20230110']
date_index = [1, 2, 1, 2]

data = [
    pd.read_csv(f'{Folder}{name}')
    for name in [
        'cooccurance_info_20221101-ecoli-genta1.csv',
        'cooccurance_info_20221101-ecoli-genta2.csv',
        'cooccurance_info_20230110-e.coli-genta.csv',
        'cooccurance_info_20230110-e.coli-genta-2.csv',
    ]
]

df = unionise_datasets(data, dates, date_index)
df.to_csv('../tables/probability_tables/Gentamicin_cooccurance.csv')



In [None]:
# ---- Tetracycline ----
# Per-file metadata, in the same order as the files below.
dates = ['20230315', '20230315', '20230404', '20230404']
date_index = [1, 2, 1, 2]

data = [
    pd.read_csv(f'{Folder}{name}')
    for name in [
        'cooccurance_info_20230315-ecoli_set-1.csv',
        'cooccurance_info_20230315-ecoli_set-2.csv',
        'cooccurance_info_20230404-ecoli-Tetracycline_set1.csv',
        'cooccurance_info_20230404-ecoli-Tetracycline_set2.csv',
    ]
]

df = unionise_datasets(data, dates, date_index)
df.to_csv('../tables/probability_tables/Tetracycline_cooccurance.csv')

In [None]:
# ---- Ciprofloxacin ----
# Per-file metadata, in the same order as the files below.
dates = ['20220531', '20220531', '20230131', '20230131']
date_index = [1, 2, 1, 2]

data = [
    pd.read_csv(f'{Folder}{name}')
    for name in [
        'cooccurance_info_20220531-MIC-e.coli-cipro-1.csv',
        'cooccurance_info_20220531-MIC-e.coli-cipro-2.csv',
        'cooccurance_info_20230131-ecoli-cipro-1stexp.csv',
        'cooccurance_info_20230131-ecoli-cipro-2ndexp.csv',
    ]
]

df = unionise_datasets(data, dates, date_index)
df.to_csv('../tables/probability_tables/Ciprofloxacin_cooccurance.csv')



In [None]:
# ---- Ampicillin ----
# Per-file metadata, in the same order as the files below.
dates = ['20220614', '20220614']
date_index = [1, 2]

data = [
    pd.read_csv(f'{Folder}{name}')
    for name in [
        'cooccurance_info_20220614-MIC-e.coli-amp-LB-1.csv',
        'cooccurance_info_20220614-MIC-e.coli-amp-LB-2.csv',
    ]
]

df = unionise_datasets(data, dates, date_index)
# The output file name is kept exactly as it was ("Ampicilin").
df.to_csv('../tables/probability_tables/Ampicilin_cooccurance.csv')

In [None]:
# ---- Chloramphenicol (CHP) ----
# Per-file metadata, in the same order as the files below.
dates = [
    '20220524', '20220602', '20220628', '20220628',
    '20221012', '20221013', '20221031', '20221031', '20221122',
    '20230111', '20230111', '20230221', '20230313', '20230313',
]
date_index = [
    1, 1, 1, 2,
    1, 1, 1, 2, 1,
    1, 2, 1, 1, 2,
]

data = [
    pd.read_csv(f'{Folder}{name}')
    for name in [
        'cooccurance_info_20220524-MIC-e.coli-chp-LB.csv',
        'cooccurance_info_20220602-MIC-e.coli-chp-LB.csv',
        'cooccurance_info_20220628-MIC-e.coli-chp-LB-1.csv',
        'cooccurance_info_20220628-MIC-e.coli-chp-LB-2.csv',
        'cooccurance_info_20221012-ecoli-chp.csv',
        'cooccurance_info_20221013-ecoli-chp.csv',
        'cooccurance_info_20221031-ecoli-chp1.csv',
        # 20221031 chp2: a lot of empty wells become positive (more than 50%),
        # so the data is not trustworthy for 0ug and 2ug; the rest are all
        # dead, so we can't say much.
        'cooccurance_info_20221031-ecoli-chp2.csv',
        'cooccurance_info_20221122-ecoli-chp.csv',
        'cooccurance_info_20230111-ecoli-chp.csv',
        'cooccurance_info_20230111-ecoli-chp-2.csv',
        'cooccurance_info_20230221-ecoli-chp-1.csv',
        'cooccurance_info_20230313-ecoli-chp-1.csv',
        'cooccurance_info_20230313-ecoli-chp-2.csv',
    ]
]

df = unionise_datasets(data, dates, date_index)
df.to_csv('../tables/probability_tables/Chloramphenicol_cooccurance.csv')