In [9]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm



# Prepare BridgIT+ input based on BridgIT results

The input for BridgIT+ is generated using [BridgIT](https://www.pnas.org/doi/10.1073/pnas.1818877116). The standard BridgIT output has a similarity score for each predicted EC number. The reactive site is identified using BNICE.ch reaction rules, the fingerprint is generated, and the EC number and score are predicted as described in the original BridgIT publication. Alternatively, any set of EC numbers per reaction can be used as input.

This script helps to translate the BridgIT output as provided by the [LCSB database website](https://lcsb-databases.epfl.ch/pathways/Bridgit).

To access BridgIT, please fill in the [request form](https://lcsb-databases.epfl.ch/pathways/database_requests).


## 1. Use BridgIT for your orphan reactions according to the instructions on the website
BridgIT provides predictions for any orphan reaction that is balanced and can be reconstructed using a BNICE reaction rule
![BridgIT interface](figures_for_jupyter/BridgITinterface.png)

## 2. Make sure you got the BridgIT results
If BridgIT was able to find the results for your reaction, your BridgIT results ( Tanimoto_Atom_reactions_\*.txt file) should look similar to the following: 
![BridgIT results example](figures_for_jupyter/tanimotoResultsExample.png)

## 3. Decide on your parameters
Now that you have your BridgIT predictions file, you can decide whether you would prefer to change the default parameters:
1. Tanimoto similarity threshold (see BridgIT publication for more detail). Here the default is 0.9.
2. Whether to select only the maximum level with promiscuous EC class predictions, or all levels, for each reaction. The default is only the maximum level with promiscuous predictions. Selecting all levels increases the calculation time of BridgIT+ during the sequence request and alignment steps.

The decision depends on the BridgIT results. If the top-ranked results have high scores (>0.9) and many alternatives are proposed, we suggest keeping the default parameters. If BridgIT's best predictions have low Tanimoto similarity scores or few alternatives are proposed, we suggest lowering the Tanimoto similarity threshold accordingly and creating input for each individual level (MAX_LVL_ONLY = False).

In [2]:
# TST - Tanimoto similarity threshold
# It is compared (>=) against the per-level similarity columns of levels 1-7
# when the candidate ECs of a given fingerprint level are collected.
TST = 0.9 # Default value for TST is 0.9

# MAX_LVL_ONLY - Boolean indicating if only the maximum level with promiscuity is selected or all the levels
# The file for the selected level is always written; when this flag is False,
# one line per level (0-7) is additionally written to the *_all_levels.txt file.
MAX_LVL_ONLY = False

## 4. Create the project folder structure for your orphan reaction inside the projects folder.
Put the name of the folder as the project name in the cell below, then execute the following cell.

In [1]:
# Name of the project folder inside projects/; it is also embedded in the
# file name of the generated BridgIT+ input.
projectname = "all_known_reactions_generate_profiles"

In [2]:
# Create the project folder together with its two sub-folders.
# os.makedirs also creates the parent 'projects' directory when it is missing
# (os.mkdir would raise FileNotFoundError), and exist_ok=True keeps the cell
# safe to re-run on an existing project.
for subfolder in ('BridgIT_results', 'input'):
    os.makedirs(os.path.join('projects', projectname, subfolder), exist_ok=True)

NameError: name 'os' is not defined

This is the path where you should transfer the BridgIT results that you downloaded from the website

In [3]:
# Folder holding the downloaded Tanimoto_Atom_reactions_* files.
# The trailing slash is required: file names are appended to this string
# by plain concatenation when the results are read.
path_to_bridgit_results = 'projects/'+projectname+'/BridgIT_results/'

This is where your BridgIT+ input will be written after the execution.

In [4]:
# Path of the BridgIT+ input file written by the cells below.
# (The original concatenation '/input/'+'/input_profiles_' produced a doubled
# slash in the path; the file location itself is unchanged.)
bridgit_plus_input = 'projects/'+projectname+'/input/input_profiles_'+projectname+'.txt'

## 5. Transfer the Tanimoto_Atom_reactions_* files of your BridgIT results into the BridgIT_results folder (manual step)

## 6. Execute the cells below to create BridgIT+ input file

In [7]:
def _clean_ec_list(ec_values):
    """Flatten a pandas Series of raw EC annotations into a list of complete EC numbers."""
    # Missing annotations (NaN) cannot be joined as strings, so drop them first.
    # Only the first 10000 unique annotations are considered.
    unique_values = ec_values.dropna().unique()[0:10000]
    # One annotation may hold several comma-separated ECs and ';' separators.
    flattened = ','.join(map(str, unique_values)).replace(';', '')
    # Discard empty tokens, partial EC classes (ending with '-') and the
    # non-enzymatic markers.
    return [ec for ec in flattened.split(',')
            if ec and not ec.endswith('-') and ec not in ('SPONTANEOUS', 'NonEnzymatic')]


def extract_BridgIT_candidates(rxn, L=0):
    """
    Convert the table of BridgIT results into BridgIT+ input according to the defined BridgIT fingerprint level
    params:
    rxn: reaction ID (* of the Tanimoto_Atom_reactions_*.txt), as a string
    L: BridgIT fingerprint level, 0 to 7
    return:
    returns a two-element list [candidate ECs, reference ECs]; both are sorted
    lists without duplicates. Candidates are the ECs of rows whose level-0
    similarity equals 1, whose similarity on every level 1..L is >= TST and
    whose overall BridgIT score is >= 0.3. References are the ECs of rows
    whose overall BridgIT score equals 1.
    raises:
    ValueError if L is not one of 0..7
    """
    # Validate the level before touching any file, so that an invalid level is
    # reported even when no result file matches the reaction.
    if L not in range(8):
        raise ValueError('Error, L not valid')
    level = int(L)

    prefix = 'Tanimoto_Atom_reactions_'
    min_bridgit_score = 0.3
    ECs_all = []
    reference = []

    # The iteration over the files of the directory is necessary in case
    # one reaction has results for several tautomer options
    for file in os.listdir(path_to_bridgit_results):
        if not file.startswith(prefix):
            continue
        # Match the reaction ID exactly (optionally followed by a 't...' tautomer
        # suffix). A bare startswith(prefix + rxn) would also pick up the files of
        # other reactions whose ID merely begins with rxn (e.g. '1' and '10').
        stem = file.replace(prefix, '').replace('.txt', '')
        if stem != rxn and stem.split('t')[0] != rxn:
            continue

        # Read the data from the BridgIT results as pandas dataframe
        df = pd.read_csv(path_to_bridgit_results+file, sep='\t+|/', engine="python")

        # Two header layouts are supported; the marker column name is spelled
        # exactly as it is tested for in the result files.
        if 'simialr_reaction_ID' in df.columns:
            level_cols = ['L%d' % i for i in range(8)]
            Similar_rxn, Bridgit_Score = 'Enzymatic_rule', 'BridgIT_Scores'
        else:
            level_cols = ['TL%d' % i for i in range(8)]
            Similar_rxn, Bridgit_Score = 'ECA', 'Tanimoto_FBI_Scores'

        # The level-0 header may carry a leading space
        df = df.rename(columns={' '+level_cols[0]: level_cols[0]})

        # Collect the promiscuous ECs per level according to the threshold
        selected = (df[level_cols[0]] == 1) & (df[Bridgit_Score] >= min_bridgit_score)
        for column in level_cols[1:level+1]:
            selected = selected & (df[column] >= TST)
        ECs_all.extend(_clean_ec_list(df.loc[selected, Similar_rxn]))

        # Collect reference ECs - ECs of reactions exactly matched to the input reaction on all levels
        exact_match = df[Bridgit_Score] == 1
        reference.extend(_clean_ec_list(df.loc[exact_match, Similar_rxn]))

    # Sorting makes the generated input files reproducible between runs
    return [sorted(set(ECs_all)), sorted(set(reference))]

The function to choose reference level from 3D array of level-ECs-threshold ![3D array](figures_for_jupyter/3DarrayOfCandidateEcs.png)

In [8]:
def choose_Candidate_Ecs_level(candidate_ECs):
    """
    Pick the fingerprint level whose candidates are used as BridgIT+ input.
    params:
    candidate_ECs: sequence indexed by level (0 to 7); each entry holds the
                   candidate ECs at index 0 and the reference ECs at index 1
    return:
    the highest level from 7 down to 1 whose candidate ECs differ (as a set)
    from its reference ECs; 7 when no such level exists. Level 0 is not examined.
    """
    top_level = 7
    # Walk from the most specific level downwards and stop at the first one
    # that offers something beyond the reference ECs
    for level in range(top_level, 0, -1):
        promiscuous = set(candidate_ECs[level][0])
        known = set(candidate_ECs[level][1])
        if promiscuous != known:
            return level
    # In case none of the levels have promiscuous predictions, return the max level
    return top_level

In [9]:
# Collect the list of the reactions to be transformed into BridgIT+ input
# split('t')[0] is necessary in case tautomers are considered
# Therefore, be sure that your reaction name does not contain 't', otherwise, remove split('t')[0]
list_rxns = sorted({
    file.replace('Tanimoto_Atom_reactions_','').replace('.txt','').split('t')[0]
    for file in os.listdir(path_to_bridgit_results)
    if file.startswith('Tanimoto_Atom_reactions_')
})

# NOTE: a leftover exit() used to sit here; it terminated the kernel, so the
# input files below were never written on a fresh "Run All".

# os.path.splitext only strips the final extension, so a dot elsewhere in the
# path (e.g. in the project name) does not truncate the file name
all_levels_path = os.path.splitext(bridgit_plus_input)[0]+'_all_levels.txt'

with open(bridgit_plus_input, mode='w') as res_selected, open(all_levels_path, 'w') as res_all:
    for rxn in tqdm(list_rxns):
        # Extract the candidates per level for each BridgIT fingerprint level
        candidate_ECs = [extract_BridgIT_candidates(rxn,lvl) for lvl in range(8)]

        # One output line per (reaction, level): ID, reference ECs, candidate ECs
        profile_lines = [
            str(rxn)+'L'+str(level)+'\t,'+','.join(candidate_ECs[level][1])+'\t,'
            +','.join(candidate_ECs[level][0])+'\n'
            for level in range(8)
        ]

        # Identify the maximum level with promiscuity
        lvl = choose_Candidate_Ecs_level(candidate_ECs)
        res_selected.write(profile_lines[lvl])
        if not MAX_LVL_ONLY:
            res_all.writelines(profile_lines)


100%|██████████| 6916/6916 [4:24:03<00:00,  1.20s/it]   
ERROR:root:Invalid alias: The name clear can't be aliased because it is another magic command.
ERROR:root:Invalid alias: The name more can't be aliased because it is another magic command.
ERROR:root:Invalid alias: The name less can't be aliased because it is another magic command.
ERROR:root:Invalid alias: The name man can't be aliased because it is another magic command.


In [10]:
# separate empty results from ready results
# A line is "empty" when both its reference-EC field and its candidate-EC field
# contain nothing but commas. Empty lines go to *_empty.txt; all other lines
# are kept and finally replace the original input file.
base_path = os.path.splitext(bridgit_plus_input)[0]
empty_path = base_path+'_empty.txt'
processable_path = base_path+'_processable.txt'

with open(bridgit_plus_input) as f, \
     open(empty_path,'w') as empty,\
     open(processable_path,'w') as processable:
    for line in f:
        if not line.strip():
            # skip blank lines instead of failing on the field lookup
            continue
        fields = line.strip().split('\t')
        has_no_ecs = len(fields) >= 3 and all(field.replace(',','') == '' for field in fields[1:3])
        if has_no_ecs:
            empty.write(line)
        else:
            processable.write(line)

# os.replace overwrites the destination like `mv`, but works on every platform,
# is unaffected by spaces in the path and raises on failure instead of
# returning a status code that is easy to overlook
os.replace(processable_path, bridgit_plus_input)

0