In [11]:
import pandas as pd
import os
from collections import defaultdict
import numpy as np
import glob

## Read in and process KEGG reaction information for each osmolyte

In [2]:
# Load the osmolyte reaction table. ORTHOLOGY holds ';'-separated KO identifiers
# and is turned into a list per row; rows without any KO are dropped.
df = pd.read_csv('../input/Osmolyte-Reactions.tsv', sep='\t')
df.ORTHOLOGY = df.ORTHOLOGY.str.split(';')
# Explicit copy: the column assignments below must write to an independent
# frame, not to a slice of the frame returned by read_csv.
df = df[df['ORTHOLOGY'].notna()].copy()

# Pathway label (NAMED_GROUP:PATHWAY) and pathway-step label (NAMED_GROUP:PATHWAY:STEP)
df['COMBINED_ORTHO'] = df[['NAMED_GROUP', 'PATHWAY']].astype(str).agg(':'.join, axis=1)
df['COMBINED_ORTHO_STEP'] = df[['NAMED_GROUP', 'PATHWAY', 'STEP']].astype(str).agg(':'.join, axis=1)

# Highest STEP value seen within each pathway label
df['max_step'] = df.groupby('COMBINED_ORTHO')['STEP'].transform('max')
# Concatenate the KO lists of all rows sharing a pathway-step label. The string
# alias 'sum' is used instead of the builtin `sum`: pandas deprecated passing
# the builtin, and calling it directly on lists would fail (0 + list).
df['ALLKO'] = df.groupby('COMBINED_ORTHO_STEP')['ORTHOLOGY'].transform('sum')

#Subset the DF to just columns that we will be using
subset_df = df[['COMPOUND_NAME','COMPOUND','BROAD', 'NAMED_GROUP', 'COMBINED_ORTHO','COMBINED_ORTHO_STEP', 'ALLKO', 'max_step']]
# De-duplicate on the label columns only (ALLKO holds lists, which are unhashable)
subset_df = subset_df.drop_duplicates(
    subset=['COMPOUND_NAME','COMPOUND','BROAD', 'NAMED_GROUP', 'COMBINED_ORTHO','COMBINED_ORTHO_STEP']
)

#Index by KOs: one row per (KO, pathway step) pair, with the KO as the index
ko_indexed = subset_df.explode('ALLKO').set_index('ALLKO')

## Read in and process genome information from all MAG runs

In [3]:
def _load_ko_table(path):
    """Read one KO table (genomes as rows) and reduce its index to genome names."""
    table = pd.read_csv(path, index_col=0)
    # Index entries are file paths: keep the basename and remove only a trailing
    # '.csv'. (str.strip('.csv') would strip any of the characters '.', 'c',
    # 's', 'v' from BOTH ends, e.g. 'vibrio_sp.csv' -> 'ibrio_sp'.)
    table.index = (
        table.index.str.split('/').str[-1].str.replace(r'\.csv$', '', regex=True)
    )
    return table


def read_in_genomes(suffix, dirpath):
    """Combine every KO table in ``dirpath`` whose file name ends with ``suffix``.

    The table ``'GABA' + suffix`` seeds the combined frame; each matching file
    then contributes the KO columns that are not already present. Tables are
    joined on the genome index with an inner join, so only genomes present in
    every table are kept.

    Parameters
    ----------
    suffix : str
        File-name suffix selecting the tables to read (e.g. ``'_bact.csv'``).
    dirpath : str
        Directory containing the tables.

    Returns
    -------
    (dict, pandas.DataFrame)
        ``ko_dict`` maps each genome name to the list of KO columns whose value
        equals 1 (genomes with no such column are omitted), and ``full_ko_df``
        is the combined genome x KO table.

    Raises
    ------
    FileNotFoundError
        If ``'GABA' + suffix`` does not exist in ``dirpath``.
    """
    full_ko_df = _load_ko_table(os.path.join(dirpath, 'GABA' + suffix))
    # Sorted so that the column order of the result does not depend on the
    # arbitrary order in which glob lists the directory.
    for file in sorted(glob.glob(os.path.join(dirpath, '*' + suffix))):
        in_test = _load_ko_table(file)
        cols_to_use = in_test.columns.difference(full_ko_df.columns)
        full_ko_df = full_ko_df.merge(in_test[cols_to_use], left_index=True, right_index=True)

    # Convert dataframe to dictionary: genome -> list of KOs flagged with 1.
    # An explicit boolean mask is used rather than relying on stack() dropping NaN.
    present = full_ko_df.eq(1)
    hits_by_genome = defaultdict(list)
    for genome, row_mask in zip(present.index, present.to_numpy()):
        if row_mask.any():
            hits_by_genome[genome].extend(present.columns[row_mask])

    ko_dict = dict(hits_by_genome)
    return(ko_dict, full_ko_df)

In [4]:
# Both genome collections are read from the same directory of KO tables; only
# the file-name suffix differs.
ko_table_dir = '../kofamscan/ko_tables/'
bact_ko_dict, bact_ko_df = read_in_genomes('_bact.csv', ko_table_dir)
mmetsp_ko_dict, mmetsp_ko_df = read_in_genomes('_mmetsp.csv', ko_table_dir)


In [5]:
def _pathway_complete(pathway, pathway_rows):
    """Return True when every step 1..max_step of ``pathway`` occurs in ``pathway_rows``."""
    n_steps = int(pathway_rows.max_step.max())
    observed_steps = set(pathway_rows.COMBINED_ORTHO_STEP)
    return all(pathway + ':' + str(step) in observed_steps for step in range(1, n_steps + 1))


def _broad_group_present(group_rows):
    """Return 1/0 for one organism and one broad group, given its matched KO rows."""
    # No matched KO belongs to this broad group -> absent.
    if group_rows.empty:
        return 0
    # A matched KO from a pathway that only requires one step is sufficient.
    if (group_rows.max_step == 1).any():
        return 1
    # Otherwise at least one multi-step pathway must have all of its steps matched.
    return int(any(
        _pathway_complete(pathway, pathway_rows)
        for pathway, pathway_rows in group_rows.groupby('COMBINED_ORTHO')
    ))


def determine_membership(ko_indexed_grouped, ko_dict):
    """Score TRANSPORT / BREAKDOWN / SYNTHESIS presence per organism and metabolite.

    Parameters
    ----------
    ko_indexed_grouped : pandas.DataFrame
        KO-indexed table with the columns COMPOUND_NAME, BROAD, COMBINED_ORTHO,
        COMBINED_ORTHO_STEP and max_step.
    ko_dict : dict
        Maps an organism name to the list of KOs found in that organism.

    Returns
    -------
    dict
        Maps each COMPOUND_NAME to a DataFrame indexed by organism with one
        0/1 column per broad group.

    Notes
    -----
    A broad group scores 1 when the organism has a KO from a single-step
    pathway, or when at least one multi-step pathway has every step matched.
    Previously each pathway overwrote the score of the previous one, so a
    complete pathway could be reset to 0 by a later incomplete pathway.

    The name of each metabolite is printed as it is processed.
    """
    #Define the broad groups that we are considering
    broadgroups = ['TRANSPORT', 'BREAKDOWN', 'SYNTHESIS']
    out_dict = {}
    for metabolite, ko_indexed in ko_indexed_grouped.groupby('COMPOUND_NAME'):
        print(metabolite)
        metabolite_kos = set(ko_indexed.index)
        # organism -> [score per broad group]; the DataFrame is built once at the end
        scores = {}
        for key, kos in ko_dict.items():
            # Rows of the reaction table whose KO is present in this organism
            matched = list(metabolite_kos.intersection(kos))
            ko_subset = ko_indexed.loc[matched]
            scores[key] = [
                _broad_group_present(ko_subset[ko_subset.BROAD == B])
                for B in broadgroups
            ]
        out_dict[metabolite] = pd.DataFrame.from_dict(scores, orient='index', columns=broadgroups)

    return(out_dict)


In [6]:
# Score each osmolyte's TRANSPORT/BREAKDOWN/SYNTHESIS groups for every genome in
# bact_ko_dict; the metabolite names printed below come from determine_membership.
bact_synthesis_dict = determine_membership(ko_indexed,bact_ko_dict)

DMSP
Ectoine
GABA
Glutamate
Glutamine
Glycerol
Glycine betaine
Hydroxyectoine
Mannitol
Proline
Sarcosine
Sorbitol
Sucrose
TMAO
Taurine
Trehalose


In [7]:
# Same scoring as above, applied to the genomes in mmetsp_ko_dict; the
# metabolite names printed below come from determine_membership.
mmetsp_synthesis_dict = determine_membership(ko_indexed,mmetsp_ko_dict)

DMSP
Ectoine
GABA
Glutamate
Glutamine
Glycerol
Glycine betaine
Hydroxyectoine
Mannitol
Proline
Sarcosine
Sorbitol
Sucrose
TMAO
Taurine
Trehalose


In [8]:
# Write one table per metabolite into a directory per genome collection.
# The files are tab-separated even though they carry a .csv extension.
for out_dir, synthesis_dict in (('mmetsp', mmetsp_synthesis_dict), ('bact', bact_synthesis_dict)):
    # Create the output directory first: to_csv does not create missing
    # directories, so a fresh checkout would otherwise fail here.
    os.makedirs(out_dir, exist_ok=True)
    for key, table in synthesis_dict.items():
        table.to_csv(os.path.join(out_dir, key + '_' + out_dir + '_predicted_synthesis.csv'), sep='\t')

In [9]:
# Place the per-metabolite tables side by side; passing the dict directly makes
# its keys (metabolite names) the outer level of the resulting column index.
bact_synthesis_combined_df = pd.concat(bact_synthesis_dict, axis=1)
mmetsp_synthesis_combined_df = pd.concat(mmetsp_synthesis_dict, axis=1)

In [10]:
# Save each combined table as a tab-separated file in the working directory.
for combined_df, out_path in (
    (bact_synthesis_combined_df, 'predicted-bacterial-osmolyte-synthesis.tsv'),
    (mmetsp_synthesis_combined_df, 'predicted-mmetsp-osmolyte-synthesis.tsv'),
):
    combined_df.to_csv(out_path, sep='\t')