# Processing the transporter data

In [14]:
import pandas as pd, os, numpy as np

In [15]:
def filter_low_abundance(df, f=0.5):
    """Keep only the rows whose relative abundance reaches ``f`` percent.

    Rows are summed per label of the first index level, an optional "SUM" row
    is left out of the calculation, and every column is converted to percent
    of its column total. A label is kept when its percentage is at least ``f``
    in at least one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table; rows are selected by the first level of its index.
    f : float
        Minimum percentage (0-100) a label has to reach in some column.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` for the retained labels, followed by the "SUM" row
        when ``df`` has one.
    """
    df_sum = df.groupby(level=0).sum().drop("SUM", errors="ignore")
    percent = df_sum.div(df_sum.sum()) * 100
    keep = list(percent.index[percent.max(axis=1) >= f])
    # "SUM" is optional (the drop above tolerates its absence); requesting a
    # label that is not in the index from .loc raises KeyError.
    if "SUM" in df.index.get_level_values(0):
        keep.append("SUM")
    return df.loc[keep]

In [16]:
def normalize(df):
    """Convert summed values to percent of each column total and write them into ``df``.

    The frame is summed per label of the first index level, every column is
    divided by its column total and multiplied by 100, and the result is
    assigned to all columns of ``df`` from position 3 onwards. ``df`` itself is
    modified and is also returned.

    NOTE(review): presumably the first three columns hold annotations and the
    remaining ones per-sample abundances - confirm against the callers.
    """
    n = df.groupby(level=0).sum()
    n = n.div(n.sum())*100
    # NOTE(review): ``n`` is built from every column of ``df`` and has one row
    # per distinct level-0 label; verify that it lines up with the
    # ``df.iloc[:, 3:]`` target for the tables this is used on.
    df.iloc[:,3:] = n
    return df

In [17]:
def tpm(df,rl=100):
    """Convert per-gene counts to transcripts per million (TPM).

    For every sample column the rate ``rl * count / gene_length`` is computed
    per gene and then scaled so that each column sums to one million.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per gene with a ``gene_length`` column; every other column is
        treated as a sample.
    rl : numeric
        Read length used as the numerator factor of the per-gene rate.

    Returns
    -------
    pandas.DataFrame
        TPM values with the index of ``df`` and one column per sample
        (``gene_length`` is not included).
    """
    lengths = df.gene_length
    # Pick sample columns by name so the result does not depend on
    # ``gene_length`` being the first column of the table.
    samples = [col for col in df.columns if col != "gene_length"]

    # Row-wise division by gene length (no per-sample copy of the lengths needed)
    t = rl * df.loc[:, samples].divide(lengths, axis=0)
    T = t.sum()
    TPM = t.multiply(1e6).divide(T)

    return TPM

## Load the metaomic data

### Taxonomy

In [18]:
def make_replace_dict(series, old_string, new_string):
    """Map each distinct value of ``series`` to itself with ``old_string`` replaced by ``new_string``."""
    return {
        value: value.replace(old_string, new_string)
        for value in series.unique()
    }

In [19]:
def reformat_taxtable(tax):
    """Clean up a taxonomy table with one column per rank.

    Columns are processed from left to right. For the column at position ``i``:

    1. values matching ``.+_<i>`` get the ``_<i>`` tag removed,
    2. values matching ``.+ (<previous rank name>)`` get that parenthesised
       tag removed (skipped for the first column, which has no previous rank),
    3. ``"unclassified "`` is replaced by ``"Unclassified."``,
    4. empty cells become ``"Unclassified"`` in the first column and
       ``"Unclassified.<parent value>"`` in later columns (the parent value is
       copied unchanged when it already starts with ``"Unclassified"``).

    A summary line is printed for every rank.

    Parameters
    ----------
    tax : pandas.DataFrame
        Table of string values, missing entries as NaN, columns ordered from
        the highest to the lowest rank.

    Returns
    -------
    pandas.DataFrame
        A reformatted copy; ``tax`` itself is not modified.
    """
    def _substitute(values, old, new):
        # Plain (non-regex) substring replacement applied to every element
        return values.map(lambda name: name.replace(old, new))

    def _inherit(parent):
        # Label for an empty cell derived from the value one rank up
        if parent[0:12] == "Unclassified":
            return parent
        return "Unclassified.{}".format(parent)

    # Fill empty entries
    tax_reformatted = tax.fillna("")
    ranks = list(tax_reformatted.columns)
    for i, rank in enumerate(ranks):
        # Remove trailing numbers
        tag = "_{}".format(i)
        mask = tax_reformatted[rank].str.match(".+{}".format(tag))
        trailing = int(mask.sum())
        if trailing:
            tax_reformatted.loc[mask, rank] = _substitute(tax_reformatted.loc[mask, rank], tag, "")

        # Remove ranks in parenthesis. Only done when a previous rank exists:
        # indexing the columns with i-1 at i == 0 silently wraps around to the
        # last rank instead of raising IndexError.
        parentheses = 0
        if i > 0:
            prev_rank = ranks[i-1]
            mask = tax_reformatted[rank].str.match(r".+ \({}\)".format(prev_rank))
            parentheses = int(mask.sum())
            if parentheses:
                tax_reformatted.loc[mask, rank] = _substitute(
                    tax_reformatted.loc[mask, rank], " ({})".format(prev_rank), "")

        # Replace "unclassified " with "Unclassified."
        mask = tax_reformatted[rank].str.contains("unclassified .+")
        unclassifieds = int(mask.sum())
        if unclassifieds:
            tax_reformatted.loc[mask, rank] = _substitute(
                tax_reformatted.loc[mask, rank], "unclassified ", "Unclassified.")

        # Replace empty cells with "Unclassified.<lca>"
        mask = tax_reformatted[rank] == ""
        lcas = int(mask.sum())
        if lcas:
            if i == 0:
                # Handle unclassified at superkingdom rank
                tax_reformatted.loc[mask, rank] = "Unclassified"
            else:
                # The parent column was completed in the previous iteration
                parent_series = tax_reformatted.loc[mask, ranks[i-1]]
                tax_reformatted.loc[mask, rank] = parent_series.map(_inherit)
        print("Processed rank {}: {} trailing numbers removed, {} parentheses replaced, {} unclassifieds replaced, {} empty entries filled".format(rank, trailing, parentheses, unclassifieds, lcas))
    return tax_reformatted

In [20]:
# Per-gene taxonomy table: one column per rank, gene ids as index
tax = pd.read_csv(
    "data/barm_files/annotations/taxonomy_per_gene.tsv.gz",
    sep="\t",
    header=0,
    index_col=0,
)
tax.head()

Unnamed: 0_level_0,superkingdom,phylum,class,order,family,genus,species
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
k99_240438_6,Bacteria,,,,,,
k99_165505_3,Bacteria,Verrucomicrobia,,,,,
k99_143289_2,Bacteria,,,,,,
k99_310166_6,Eukaryota,Chlorophyta,Mamiellophyceae,Mamiellales,,,
k99_284995_17,Eukaryota,Chlorophyta,,,,,


Edit the taxonomic data to:

1. Remove trailing '_n' tags
2. Remove '(class)' tags for e.g. Actinobacteria
3. Add 'Unclassified.<taxname>' to lower unclassified ranks

In [21]:
# Build the reformatted taxonomy once and cache it on disk; later runs read the cached file
tax_reformat_file = "data/barm_files/annotations/taxonomy_per_gene_reformat.tsv.gz"
if not os.path.exists(tax_reformat_file):
    tax_reformatted = reformat_taxtable(tax)
    tax_reformatted.to_csv(tax_reformat_file, sep="\t", compression='gzip')
else:
    tax_reformatted = pd.read_table(tax_reformat_file, index_col=0)

### TIGRFAM annotations

In [22]:
# Only the first two columns of the annotation file are read and named gene_id / fam
tigrfams = pd.read_csv(
    "data/barm_files/annotations/all.TIGRFAM.standardized.tsv.gz",
    sep="\t",
    header=0,
    names=["gene_id","fam"],
    usecols=[0,1],
)
tigrfams.head(10)

Unnamed: 0,gene_id,fam
0,k99_10000020_1,TIGR00214
1,k99_10000020_2,TIGR00510
2,k99_10000077_5,TIGR01473
3,k99_1000008_1,TIGR00200
4,k99_10000154_2,TIGR00049
5,k99_10000155_3,TIGR01904
6,k99_1000015_1,TIGR01941
7,k99_10000270_10,TIGR01063
8,k99_10000270_12,TIGR00181
9,k99_10000270_15,TIGR00696


### Transporter info

Specify Uniprot version to use

In [23]:
# UniProt release tag; inserted into the file name of the transporter-cluster table that is downloaded next
uniprot_ver = "2017_12"

In [24]:
# Two-column table without header: transporter cluster id and protein family id
transdef_url = "https://raw.githubusercontent.com/johnne/transporters/master/results/transport-clusters.{}.tab".format(uniprot_ver)
transdef = pd.read_csv(transdef_url, sep="\t", header=None, names=["transporter","fam"])
n_transporters = len(transdef.transporter.unique())
n_fam_rows = len(transdef.fam)
print("{} transporters, {} protein families".format(n_transporters, n_fam_rows))

1076 transporters, 1403 protein families


Limit to TIGRFAM definitions.

In [25]:
# Keep only rows whose family id contains "TIGR"
is_tigrfam = transdef.fam.str.contains("TIGR")
transdef = transdef.loc[is_tigrfam]
n_remaining = len(transdef.transporter.unique())
print("{} remaining transporters, {} TIGRFAMs".format(n_remaining, len(transdef.fam)))

406 remaining transporters, 458 TIGRFAMs


## Merge transporter definitions with annotations and abundance

### Merge annotations

Merge gene TIGRFAM annotations with transporter definitions.

In [26]:
# Inner join on the family id: keeps genes whose TIGRFAM belongs to a transporter cluster
gene_trans = tigrfams.merge(transdef, on="fam")
n_orfs = len(gene_trans.gene_id.unique())
n_trans = len(gene_trans.transporter.unique())
n_fams = len(gene_trans.fam.unique())
print(" {} open reading frames, {} transporters, {} TIGRFAMs".format(n_orfs, n_trans, n_fams))

 66029 open reading frames, 275 transporters, 314 TIGRFAMs


In [27]:
# Fixed seed so the displayed rows are the same on every run of the notebook
gene_trans.sample(10, random_state=42)

Unnamed: 0,gene_id,fam,transporter
50920,k99_18181643_2,TIGR01256,T72
3169,k99_11203365_12,TIGR04183,T328
38613,k99_9232285_2,TIGR01730,T2
17353,k99_7268451_23,TIGR04183,T328
58306,k99_13349335_5,TIGR01945,T15
31568,k99_9790064_2,TIGR01352,T59
17867,k99_8344141_2,TIGR04183,T328
13097,k99_34183219_1,TIGR04183,T328
35499,k99_19658758_8,TIGR01730,T2
21880,k99_11782779_4,TIGR00879,T12


Add taxonomy.

In [28]:
# Attach the taxonomy columns by matching gene_id against the index of the taxonomy table
gene_trans_tax = gene_trans.merge(tax_reformatted, left_on="gene_id", right_index=True)
gene_trans_tax.head()

Unnamed: 0,gene_id,fam,transporter,superkingdom,phylum,class,order,family,genus,species
0,k99_10000306_14,TIGR00797,T85,Bacteria,Actinobacteria,Unclassified.Actinobacteria,Unclassified.Actinobacteria,Unclassified.Actinobacteria,Unclassified.Actinobacteria,Unclassified.Actinobacteria
1,k99_10027128_5,TIGR00797,T85,Bacteria,Unclassified.Bacteria,Unclassified.Bacteria,Unclassified.Bacteria,Unclassified.Bacteria,Unclassified.Bacteria,Unclassified.Bacteria
2,k99_1008623_2,TIGR00797,T85,Bacteria,Bacteroidetes,Unclassified.Bacteroidetes,Unclassified.Bacteroidetes,Unclassified.Bacteroidetes,Unclassified.Bacteroidetes,Unclassified.Bacteroidetes
3,k99_10095352_6,TIGR00797,T85,Bacteria,Proteobacteria,Unclassified.Proteobacteria,Unclassified.Proteobacteria,Unclassified.Proteobacteria,Unclassified.Proteobacteria,Unclassified.Proteobacteria
4,k99_10130254_3,TIGR00797,T85,Eukaryota,Unclassified.Eukaryota,Unclassified.Eukaryota,Unclassified.Eukaryota,Unclassified.Eukaryota,Unclassified.Eukaryota,Unclassified.Eukaryota


In [30]:
# Index by gene id so the abundance tables can be joined on the index.
# Reassigning (instead of inplace=True) and checking for the column first
# keeps the cell re-runnable: a second in-place call raised KeyError.
if "gene_id" in gene_trans_tax.columns:
    gene_trans_tax = gene_trans_tax.set_index("gene_id")

### Merge abundances

#### Metagenomes

The metagenomic time-series has some dubious samples that may have been mis-labeled.

In [31]:
# Labels of the dubious metagenomic samples; columns with these names are dropped from the abundance tables
dubious = ["120507","120521","120910","121123"]

Read abundance tables for metagenomic samples

In [32]:
# Columns removed from both tables when present: gene length and the dubious samples
mg_unwanted = ["gene_length"] + dubious

mg_cov = (pd.read_table("data/mg/all_genes.tpm.tsv.gz", index_col=0)
            .drop(mg_unwanted, axis=1, errors="ignore"))
mg_raw = (pd.read_table("data/mg/all_genes.raw_counts.tsv.gz", index_col=0)
            .drop(mg_unwanted, axis=1, errors="ignore"))

Merge with taxonomic annotations for all genes

In [46]:
# Join taxonomy and abundances on the shared index
mg_taxcov = tax_reformatted.merge(mg_cov, left_index=True, right_index=True)
mg_taxraw = tax_reformatted.merge(mg_raw, left_index=True, right_index=True)
# Existing output files are left untouched
for _table, _path in [(mg_taxcov, "data/mg/all_genes.tpm.taxonomy.tsv.gz"),
                      (mg_taxraw, "data/mg/all_genes.raw_counts.taxonomy.tsv.gz")]:
    if not os.path.exists(_path):
        _table.to_csv(_path, sep="\t", compression="gzip")

Merge with transporters/taxonomy table.

In [45]:
# Join the transporter/taxonomy annotations and the abundances on the shared index
mg_transcov = gene_trans_tax.merge(mg_cov, left_index=True, right_index=True)
mg_transraw = gene_trans_tax.merge(mg_raw, left_index=True, right_index=True)
# Existing output files are left untouched
for _table, _path in [(mg_transcov, "results/mg/all_transporters.tpm.taxonomy.tsv.gz"),
                      (mg_transraw, "results/mg/all_transporters.raw_counts.taxonomy.tsv.gz")]:
    if not os.path.exists(_path):
        _table.to_csv(_path, sep="\t", compression="gzip")

Store total raw counts per sample.

In [35]:
# Column totals over the rows whose index label matches "^k.+"
mg_gene_rows = mg_raw.index.str.match("^k.+")
mg_raw_tot = pd.DataFrame(mg_raw.loc[mg_gene_rows].sum(), columns=["total_counts"])
mg_raw_tot.to_csv("data/mg/all_genes.total_counts.tsv", sep="\t")

#### Metatranscriptomes

The metatranscriptomic time-series needs to have the sample_ids renamed to sample dates.

In [36]:
# Maps metatranscriptome sample ids to the labels that replace them as column names
# (described as sample dates in the text above)
mt_sample_names = {"P1456_101":"120516", "P1456_102":"120613", "P1456_103":"120712", 
                   "P1456_104":"120813", "P1456_105":"120927", "P1456_106":"121024", 
                   "P1456_107":"121220", "P1456_108":"130123", "P1456_109":"130226", 
                   "P1456_110":"130403", "P1456_111":"130416", "P1456_112":"130422", 
                   "P3764_101":"130507", "P3764_102":"130605", "P3764_103":"130705", 
                   "P3764_104":"130815", "P3764_105":"130905", "P3764_106":"131003", 
                   "P3764_112":"140408", "P3764_113":"140506", "P3764_114":"140604", 
                   "P3764_115":"140709", "P3764_116":"140820", "P3764_117":"140916", 
                   "P3764_118":"141013"}

In [37]:
# Tab-separated abundance tables with a header row; the first column becomes the index
mt_cov = pd.read_csv("data/mt/all_genes.tpm.tsv.gz", sep="\t", header=0, index_col=0)
mt_raw = pd.read_csv("data/mt/all_genes.raw_counts.tsv.gz", sep="\t", header=0, index_col=0)

In [39]:
# Remove the gene length column (when present) and relabel the sample columns
mt_cov = mt_cov.drop("gene_length", axis=1, errors="ignore").rename(columns=mt_sample_names)
mt_raw = mt_raw.drop("gene_length", axis=1, errors="ignore").rename(columns=mt_sample_names)

Merge with taxonomic annotations for all genes

In [47]:
# Join taxonomy and abundances on the shared index
mt_taxcov = tax_reformatted.merge(mt_cov, left_index=True, right_index=True)
mt_taxraw = tax_reformatted.merge(mt_raw, left_index=True, right_index=True)
# Existing output files are left untouched
for _table, _path in [(mt_taxcov, "data/mt/all_genes.tpm.taxonomy.tsv.gz"),
                      (mt_taxraw, "data/mt/all_genes.raw_counts.taxonomy.tsv.gz")]:
    if not os.path.exists(_path):
        _table.to_csv(_path, sep="\t", compression="gzip")

Merge with transporters/taxonomy table.

In [48]:
# Join the transporter/taxonomy annotations and the abundances on the shared index
mt_transcov = gene_trans_tax.merge(mt_cov, left_index=True, right_index=True)
mt_transraw = gene_trans_tax.merge(mt_raw, left_index=True, right_index=True)
# Existing output files are left untouched
for _table, _path in [(mt_transcov, "results/mt/all_transporters.tpm.taxonomy.tsv.gz"),
                      (mt_transraw, "results/mt/all_transporters.raw_counts.taxonomy.tsv.gz")]:
    if not os.path.exists(_path):
        _table.to_csv(_path, sep="\t", compression="gzip")

Store total raw counts per sample.

In [42]:
# Column totals over the rows whose index label matches "^k.+"
mt_gene_rows = mt_raw.index.str.match("^k.+")
mt_raw_tot = pd.DataFrame(mt_raw.loc[mt_gene_rows].sum(), columns=["total_counts"])
mt_raw_tot.to_csv("data/mt/all_genes.total_counts.tsv", sep="\t")

## Calculate total transporter abundance

Transporter abundances are calculated from the normalized TPM values. The DESeq2 package, however, requires raw counts, so for that purpose raw counts are summed for one representative protein family per transporter cluster.

In [43]:
def get_representatives(df):
    '''Pick one representative protein family for every transporter.

    Values are summed per (fam, transporter) pair and averaged across the
    numeric columns; for each transporter the family with the highest average
    is chosen.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with "fam" and "transporter" columns plus numeric value columns.
        Non-numeric columns are ignored.

    Returns
    -------
    dict
        Transporter id -> family id of its representative.
    '''
    # numeric_only=True keeps string columns out of the row-wise mean, which
    # otherwise fails on pandas versions that no longer drop them silently.
    fam_means = (df.groupby(["fam","transporter"])
                   .sum(numeric_only=True)
                   .mean(axis=1)
                   .reset_index())
    # The unnamed mean column is labelled 0 after reset_index
    fam_means = fam_means.sort_values(0, ascending=False)
    reps = {}
    for fam, transporter in zip(fam_means["fam"], fam_means["transporter"]):
        # Rows are in descending order, so the first family seen wins
        reps.setdefault(transporter, fam)
    return reps

Sum to protein family.

In [49]:
# Sum abundances per protein family; numeric_only=True keeps the string (taxonomy)
# columns out of the sum on pandas versions that no longer drop them automatically
mg_fam_sum = mg_transcov.groupby(["fam","transporter"]).sum(numeric_only=True).reset_index()
# Get representative families for each transporter cluster (for use with DESeq2)
mg_reps = get_representatives(mg_fam_sum)
# One row per transporter with its representative family in column "fam"
mg_reps = pd.DataFrame(data=mg_reps,index=["fam"]).T

In [50]:
# Sum abundances per protein family; numeric_only=True keeps the string (taxonomy)
# columns out of the sum on pandas versions that no longer drop them automatically
mt_fam_sum = mt_transcov.groupby(["fam","transporter"]).sum(numeric_only=True).reset_index()
# Get representative families for each transporter cluster (for use with DESeq2)
mt_reps = get_representatives(mt_fam_sum)
# One row per transporter with its representative family in column "fam"
mt_reps = pd.DataFrame(data=mt_reps,index=["fam"]).T

Group by transporter and calculate means.

In [51]:
# Mean over the families of each transporter; numeric_only=True skips the string
# "fam" column, which makes mean() fail on pandas versions that no longer drop it
mg_trans = mg_fam_sum.groupby("transporter").mean(numeric_only=True)
# Percent of the column total for every sample
mg_trans_percent = mg_trans.div(mg_trans.sum())*100
mg_trans.to_csv("results/mg/all_trans.tpm.tsv", sep="\t")
mg_trans_percent.to_csv("results/mg/all_trans.tpm.percent.tsv", sep="\t")

In [52]:
# Mean over the families of each transporter; numeric_only=True skips the string
# "fam" column, which makes mean() fail on pandas versions that no longer drop it
mt_trans = mt_fam_sum.groupby("transporter").mean(numeric_only=True)
# Percent of the column total for every sample
mt_trans_percent = mt_trans.div(mt_trans.sum())*100
mt_trans.to_csv("results/mt/all_trans.tpm.tsv", sep="\t")
mt_trans_percent.to_csv("results/mt/all_trans.tpm.percent.tsv", sep="\t")

Calculate transporter maximum (in % of total transporters) across all samples.

In [53]:
# Row-wise maximum: the highest percentage each transporter reaches in any column
mg_trans_percent_max = mg_trans_percent.max(axis=1)
mt_trans_percent_max = mt_trans_percent.max(axis=1)

Report how many transporters reach the 0.5% maximum-abundance threshold used for filtering.

In [54]:
# The threshold is inclusive (>=), so the message states it that way
print("{} transporters with max% >= 0.5 in the mg-samples".format(len(mg_trans_percent_max.loc[mg_trans_percent_max>=0.5])))

81 transporters with max% > 0.5 in the mg-samples


In [55]:
# The threshold is inclusive (>=), so the message states it that way
print("{} transporters with max% >= 0.5 in the mt-samples".format(len(mt_trans_percent_max.loc[mt_trans_percent_max>=0.5])))

84 transporters with max% > 0.5 in the mt-samples


Write raw counts for representative protein families.

In [56]:
# Raw counts of the genes that belong to a representative family
mg_reps_raw = pd.merge(mg_reps, mg_transraw, on="fam")
# numeric_only=True keeps the string (family/taxonomy) columns out of the sum on
# pandas versions that no longer drop them automatically
mg_reps_raw_sum = mg_reps_raw.groupby("transporter").sum(numeric_only=True)
mg_reps_raw_sum.to_csv("results/mg/rep_trans.raw_counts.tsv", sep="\t")

In [57]:
# Raw counts of the genes that belong to a representative family
mt_reps_raw = pd.merge(mt_reps, mt_transraw, on="fam")
# numeric_only=True keeps the string (family/taxonomy) columns out of the sum on
# pandas versions that no longer drop them automatically
mt_reps_raw_sum = mt_reps_raw.groupby("transporter").sum(numeric_only=True)
mt_reps_raw_sum.to_csv("results/mt/rep_trans.raw_counts.tsv", sep="\t")

### Calculate transporter abundances for bacteria

Metagenome

In [58]:
# Get genes classified as bacteria but not cyanobacteria
mg_transcov_bac = mg_transcov.loc[(mg_transcov.superkingdom=="Bacteria")&(mg_transcov.phylum!="Cyanobacteria")]
# Calculate sum of protein families 
mg_transcov_bac_fam = mg_transcov_bac.groupby(["fam","transporter"]).sum().reset_index()
# Calculate mean of transporters
mg_trans_bac = mg_transcov_bac_fam.groupby("transporter").mean()
mg_trans_bac.to_csv("results/mg/bac_trans.tpm.tsv", sep="\t")

Metatranscriptome

In [59]:
# Get genes classified as bacteria but not cyanobacteria
mt_transcov_bac = mt_transcov.loc[(mt_transcov.superkingdom=="Bacteria")&(mt_transcov.phylum!="Cyanobacteria")]
# Calculate sum of protein families 
mt_transcov_bac_fam = mt_transcov_bac.groupby(["fam","transporter"]).sum().reset_index()
# Calculate mean of transporters
mt_trans_bac = mt_transcov_bac_fam.groupby("transporter").mean()
mt_trans_bac.to_csv("results/mt/bac_trans.tpm.tsv", sep="\t")

## Selected transporters

A subset of 58 transporters were selected for this study, based on abundances in the dataset (>=0.5% max in at least one sample) and their putative substrates. They were classified manually using TIGRFAM roles and Gene Ontology mappings. 

The curated table is stored under **results/selected_transporters_classified.tab**

In [60]:
# Curated classification table; the first column (transporter id) becomes the index
transinfo = pd.read_csv("results/selected_transporters_classified.tab", sep="\t", index_col=0)
transinfo.head()

Unnamed: 0_level_0,substrate_category,type,name,abbreviation
transporter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
T1068,AA peptide,2a,cyclic peptide transporter,AA-PEP
T534,AA peptide,2a,lao: LAO/AO transport,AA-PEP
T52,AA peptide,2a,livcs: branched-chain amino acid transport,AA-PEP
T37,AA peptide,2a,potA: polyamine ABC transporter,AA-PEP
T42,AA peptide,3a,proV: glycine betaine/L-proline,AA-PEP


Limit the transporter definitions to the selected transporters.

In [61]:
# Keep the definitions whose transporter id appears in the curated table
is_selected = transdef.transporter.isin(transinfo.index)
transdef_select = transdef.loc[is_selected]
n_selected = len(transdef_select.transporter.unique())
n_selected_fams = len(transdef_select.fam.unique())
print("{} transporters remaining, comprising {} TIGRFAMS".format(n_selected, n_selected_fams))

57 transporters remaining, comprising 85 TIGRFAMS


In [62]:
# Show the curated annotation next to the families of each selected transporter
transinfo.merge(transdef_select, left_index=True, right_on="transporter")

Unnamed: 0,substrate_category,type,name,abbreviation,transporter,fam
1384,AA peptide,2a,cyclic peptide transporter,AA-PEP,T1068,TIGR01194
850,AA peptide,2a,lao: LAO/AO transport,AA-PEP,T534,TIGR00750
251,AA peptide,2a,livcs: branched-chain amino acid transport,AA-PEP,T52,TIGR00796
204,AA peptide,2a,potA: polyamine ABC transporter,AA-PEP,T37,TIGR01187
206,AA peptide,2a,potA: polyamine ABC transporter,AA-PEP,T37,TIGR03258
222,AA peptide,3a,proV: glycine betaine/L-proline,AA-PEP,T42,TIGR01186
184,AA peptide,2a,small oligopeptide transporter,AA-PEP,T32,TIGR00727
185,AA peptide,2a,small oligopeptide transporter,AA-PEP,T32,TIGR00728
54,AA peptide,2a,sodium/proline symporter,AA-PEP,T7,TIGR02711
56,AA peptide,2a,sodium/proline symporter,AA-PEP,T7,TIGR00813


Add substrate categories to the dataframes.

In [330]:
# Mean abundances of transporters for selected transporters
mg_trans_select = pd.merge(transinfo.loc[transdef_select.transporter.unique()],mg_trans,left_index=True,right_index=True)
mg_trans_select.to_csv("results/mg/select_trans.tpm.tsv", sep="\t")
# Mean abundances of transporters for bacteria and selected transporters
mg_trans_bac_select = pd.merge(transinfo.loc[transdef_select.transporter.unique()],mg_trans_bac,left_index=True,right_index=True)
mg_trans_select.to_csv("results/mg/bac_select_trans.tpm.tsv", sep="\t")
# TPM values per gene for genes matching selected transporters
mg_transcov_select = pd.merge(transinfo.loc[transdef_select.transporter.unique()],mg_transcov,left_index=True,right_on="transporter")
mg_transcov_select.to_csv("results/mg/select_trans_genes.tpm.tsv", sep="\t")
# TPM values per gene for bacterial genes matching selected transporters
mg_transcov_bac_select = pd.merge(transinfo.loc[transdef_select.transporter.unique()],mg_transcov_bac,left_index=True,right_on="transporter")
mg_transcov_bac_select.to_csv("results/mg/bac_select_trans_genes.tpm.tsv", sep="\t")

Metatranscriptomes

In [331]:
# Mean abundances of transporters for selected transporters
mt_trans_select = pd.merge(transinfo.loc[transdef_select.transporter.unique()],mt_trans,left_index=True,right_index=True)
mt_trans_select.to_csv("results/mt/select_trans.tpm.tsv", sep="\t")
# Mean abundances of transporters for bacteria and selected transporters
mt_trans_bac_select = pd.merge(transinfo.loc[transdef_select.transporter.unique()],mt_trans_bac,left_index=True,right_index=True)
mt_trans_select.to_csv("results/mt/bac_select_trans.tpm.tsv", sep="\t")
# TPM values per gene for genes matching selected transporters
mt_transcov_select = pd.merge(transinfo.loc[transdef_select.transporter.unique()],mt_transcov,left_index=True,right_on="transporter")
mt_transcov_select.to_csv("results/mt/select_trans_genes.tpm.tsv", sep="\t")
# TPM values per gene for bacterial genes matching selected transporters
mt_transcov_bac_select = pd.merge(transinfo.loc[transdef_select.transporter.unique()],mt_transcov_bac,left_index=True,right_on="transporter")
mt_transcov_bac_select.to_csv("results/mt/bac_select_trans_genes.tpm.tsv", sep="\t")

#### Transporter type and substrate summary

Generate count summary across transporter type and substrate category.

In [113]:
# Group by and count type and substrate category
type_counts = transinfo.groupby(["type","substrate_category"]).count().reset_index().iloc[:,[0,1,2]]
# Calculate total type sum
SUM = transinfo.groupby("type").count().iloc[:,0]
# Calculate total substrate category sum
colsum = transinfo.groupby("substrate_category").count().iloc[:,0]
colsum.name = "SUM"
colsum = pd.DataFrame(colsum).T
colsum = colsum.assign(SUM=SUM.sum())
# Pivot count table
type_counts.columns = ["type","substrate_category","counts"]
type_counts = pd.pivot_table(type_counts, index=["type"], columns=["substrate_category"])
type_counts.fillna("0", inplace=True)
type_counts = type_counts["counts"]
# Add row sums
type_counts = type_counts.assign(SUM=SUM)
# Add col sums
type_counts = pd.concat([type_counts,colsum])
# Convert to integer
type_counts = type_counts.astype(int)
type_counts.to_csv("results/transporter_type_table.tsv", sep="\t")
type_counts

substrate_category,AA peptide,Anions,CO3,Cations,N-FIX,NH3 Nitrate,Other,Rare nutr,Urea,carbohydrate,nucleoside,phosphate,phosphonate,SUM
1a,0,0,0,2,0,1,0,0,0,0,0,0,0,3
1b,0,0,0,1,0,0,0,1,0,0,0,0,0,2
2a,7,2,1,9,0,1,1,0,0,9,3,0,0,33
2c,0,0,0,1,0,0,0,0,0,0,0,0,0,1
3a,1,2,0,3,0,2,1,0,2,0,0,1,3,15
3d,0,0,0,0,1,0,0,0,0,0,0,0,0,1
4a,0,0,0,0,0,0,0,0,0,1,0,0,0,1
4b,0,0,0,0,0,0,0,0,0,0,1,0,0,1
9a,0,0,0,1,0,0,0,0,0,0,0,0,0,1
SUM,8,4,1,17,1,4,2,1,2,10,4,1,3,58
