In [61]:
import pandas as pd
import os
import numpy as np

In [62]:
excel_table =  'funprof_geomosaic_kos.csv'
gthered_table = './counts/raw_counts_intersect_bp.tsv'

lista = [
"CR18_ER180415_F",
"CR18_ER180415_S",
"CR18_LE180416_F",
"CR18_LE180416_S",
"CR18_LW180405_F",
"CR18_LW180405_S",
"CR18_XF180416_F",
"CR18_XF180416_S"
]

In [63]:
table = pd.read_csv(excel_table, sep = ',')
print(table.head())

   Element        Relevance         cycle  Pathway  \
0  Arsenic  Biogeochemistry       Arsenic     AsOx   
1  Arsenic  Biogeochemistry       Arsenic     AsOx   
2  Arsenic  Biogeochemistry       Arsenic    AsRed   
3  Arsenic  Biogeochemistry       Arsenic    AsRed   
4   Carbon  Biogeochemistry  Fermentation  Acetate   

                                        pathway name  gene  \
0                                 Arsenite oxidation  aoxA   
1                                 Arsenite oxidation  aoxB   
2  Dissimilatory arsenic reduction (arsRBC or ars...  arsC   
3  Dissimilatory arsenic reduction (arsRBC or ars...  arsC   
4                                Mixed acid: acetate  poxB   

                                           gene_name key_gene energyRole  \
0  arsenite oxidase small subunit [EC:1.20.2.1 1....      yes          D   
1  arsenite oxidase large subunit [EC:1.20.2.1 1....      yes          D   
2                                 arsenate reductase      yes          A  

In [64]:
import math
def redox_metaboli_index(donors_list:list,acceptors_list: list):
    num_donors = len(donors_list)
    num_acceptors = len(acceptors_list)
    
    donors_counts = np.array([num_donors])
    acceptors_counts = np.array([num_acceptors])
    # Compute the thoreatical maximum number of pairs 
    index2 = np.log(donors_counts) + np.log(acceptors_counts)
    # or equivalently    
    index = math.log(num_donors) + math.log(num_acceptors)
    #index2 = math.log(num_donors * num_acceptors)
    
    return index, index2



In [65]:
    
def get_KOS(working_dir:str,s:str):
    
    pckg = 'funprofiler'
    path_to_dir = os.path.join(working_dir,s,pckg)
    if os.path.exists(path_to_dir):
        table = os.path.join(path_to_dir,'prefetch_out.csv')
        results = pd.read_csv(table, sep = ',')

        ko_list = results["match_name"].str.split(':').str[1].unique().tolist()
        subset_ = results[["intersect_bp", "match_name"]]
        n_kos = len(ko_list)
    
    return n_kos,ko_list,subset_


In [66]:
def match_kos(unique_ko_list:list,spreadsheet:str):

    master_table = pd.read_csv(spreadsheet,sep = ',')
    # 1. Filter Sample KOs in our master table
    detected_mask = master_table['KO'].isin(unique_ko_list)
    filtered_df = master_table[detected_mask]
    # 2. Split into Donors (D) and Acceptors (A)
    donors = set(filtered_df[filtered_df['energyRole'] == 'D']['KO'])
    acceptors = set(filtered_df[filtered_df['energyRole'] == 'A']['KO'])

    return acceptors, donors

In [67]:
dictio = {}
all_A = []
all_D = []

raw_reads = pd.read_csv(gthered_table, sep = '\t')
for s in lista:

    sample_data = raw_reads[raw_reads[s] != 0][['ko_id', s]]
    kos = sample_data['ko_id'].unique()

    # #n,unique,raw_data = get_KOS(working_dir=working_dir,s=sample)
    print(f'Sample {s} has : {len(kos)} kos')
    acceptors, donors = match_kos(unique_ko_list=kos,spreadsheet=excel_table)
    all_D.append(donors)
    all_A.append(acceptors)
    
    print(f'Acceptors: {len(acceptors)}')
    print(f'Donors: {len(donors)}')
    index,i2 = redox_metaboli_index(donors,acceptors)
    print("Index:", index, '\n')

    dictio[s] = {'A': acceptors, 'D' : donors}

Sample CR18_ER180415_F has : 5676 kos
Acceptors: 52
Donors: 86
Index: 8.405591014834934 

Sample CR18_ER180415_S has : 5158 kos


Acceptors: 49
Donors: 80
Index: 8.273846932784508 

Sample CR18_LE180416_F has : 5949 kos
Acceptors: 57
Donors: 78
Index: 8.399760094524142 

Sample CR18_LE180416_S has : 4793 kos
Acceptors: 46
Donors: 74
Index: 8.132706489693266 

Sample CR18_LW180405_F has : 2241 kos
Acceptors: 36
Donors: 51
Index: 7.515344571180435 

Sample CR18_LW180405_S has : 4591 kos
Acceptors: 53
Donors: 86
Index: 8.42463920980563 

Sample CR18_XF180416_F has : 5748 kos
Acceptors: 57
Donors: 78
Index: 8.399760094524142 

Sample CR18_XF180416_S has : 4922 kos
Acceptors: 46
Donors: 76
Index: 8.159374736775426 



In [68]:
print(dictio)
#INtersection
common_A = set(all_A[0]).intersection(*all_A[1:])
common_D = set(all_D[0]).intersection(*all_D[1:])
print(common_A)
print(common_D)
# ALL
all_A = set().union(*all_A)
all_D = set().union(*all_D)
print(all_A)
print(all_D)

{'CR18_ER180415_F': {'A': {'K00404', 'K00374', 'K11180', 'K04561', 'K11181', 'K00537', 'K14028', 'K02567', 'K00405', 'K00425', 'K02275', 'K02276', 'K07306', 'K02299', 'K03385', 'K00194', 'K00363', 'K00407', 'K15862', 'K00370', 'K03741', 'K02568', 'K12528', 'K00410', 'K00413', 'K00371', 'K04015', 'K02297', 'K00411', 'K02274', 'K23995', 'K00467', 'K02298', 'K10946', 'K00426', 'K12527', 'K10944', 'K00158', 'K00198', 'K02827', 'K14029', 'K00362', 'K00412', 'K00368', 'K02257', 'K17877', 'K00406', 'K00395', 'K15408', 'K00376', 'K15864', 'K00394'}, 'D': {'K00443', 'K15022', 'K06282', 'K18007', 'K00024', 'K22622', 'K00125', 'K00122', 'K00371', 'K14091', 'K04072', 'K03390', 'K00202', 'K17229', 'K00132', 'K00158', 'K10944', 'K00245', 'K00241', 'K00235', 'K08356', 'K00200', 'K14028', 'K00242', 'K00124', 'K17227', 'K08264', 'K22516', 'K14090', 'K00030', 'K00467', 'K03389', 'K00234', 'K17993', 'K10946', 'K22515', 'K17218', 'K08355', 'K00164', 'K14127', 'K00123', 'K01678', 'K10535', 'K00031', 'K0407

In [69]:
lista = common_A.to_list()
for item in common_A:
    print(item)



AttributeError: 'set' object has no attribute 'to_list'

In [None]:
#ACCEPTORS
dsr = ["K11180","K11181","K27187"] #dsrA,B,gamma
DMSORed = ['K00184', 'K00185', 'K07306', 'K07307', 'K07308', 'K16964']
#DONors
hyd = ['K17993', 'K17994', 'K17995', 'K17996'] #Sulfhydrogenase, (sulfide)n -> (sulfide)n-1 #NOT fo


In [None]:

#item = "K11180"
for item in hyd:
    if item in all_D:
        print(item)

K17993
K17994
K17995
K17996
