# AMPSphere v.2022-03

This notebook is part of the set of notebooks used to analyze the AMPSphere data and write the manuscript:

__AMPSphere: Global survey of prokaryotic antimicrobial peptides shaping microbiomes__

### The diversity of c_AMPs is directly linked to the diversity of microbes

We show that normalized measures of diversity of c_AMPs and microbes are highly correlated.

First, we calculate the diversity of c_AMPs per sample:

In [1]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter
from scipy.stats import spearmanr
from skbio.diversity import alpha_diversity

In [2]:
# define the Shannon entropy formula
def shannon(df):
    """Return the Shannon entropy (natural logarithm) of the counts in column ``0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose column labelled ``0`` holds one count per category.

    Returns
    -------
    float
        ``-sum(p * ln(p))`` with ``p = count / total``. Categories with a zero
        count contribute nothing; ``0.0`` is returned when there is no
        positive count at all.
    """
    counts = np.asarray(df[0], dtype=float)
    # Drop zero counts: 0 * log(0) evaluates to NaN and would poison the sum.
    counts = counts[counts > 0]
    if counts.size == 0:
        return 0.0
    freqs = counts / counts.sum()
    return float(-(freqs * np.log(freqs)).sum())

In [3]:
# c_AMP gene table; later cells use its 'amp', 'sample', 'is_metagenomic'
# and 'general_envo_name' columns
gmsc = pd.read_table('data/gmsc_amp_genes_envohr_source.tsv.gz')

# per-sample assembly statistics: keep only the sample identifier and the
# total assembled length, renamed to the column names used further below
sta = (
    pd.read_table('data/samples-min500k-assembly-prodigal-stats.tsv.gz')
    .loc[:, ['sample_accession', 'assembly_total_length']]
    .rename(columns={'sample_accession': 'sample',
                     'assembly_total_length': 'assembled_bp'})
)

In [4]:
# Shannon H diversity of c_AMPs per sample
# 1) count the rows of each (amp, sample) pair; the count lands in column 0
amp_counts = (
    gmsc.groupby(['amp', 'sample'])
    .agg('size')
    .reset_index()
    .sort_values(by='sample')
)

# 2) apply the entropy function to the counts of every sample
d = (
    amp_counts.groupby('sample')
    .apply(shannon)
    .reset_index()
    .rename(columns={0: 'shannon_H_amp'})
)

In [5]:
# AMP counts per sample
# redundant: every row of the gene table is counted
d2 = gmsc.groupby('sample').size().reset_index(name='redundant_amps')

# non-redundant (nr): each distinct (sample, amp) pair is counted once
d3 = (
    gmsc[['sample', 'amp']]
    .drop_duplicates()
    .groupby('sample')
    .size()
    .reset_index(name='nr_amps')
)

In [6]:
# attach the redundant and non-redundant AMP counts to the diversity table
d = d.merge(d2, on='sample').merge(d3, on='sample')

In [7]:
# add the assembled length of each sample
d = d.merge(sta, on='sample')

# unique (sample, is_metagenomic, general_envo_name) combinations
d4 = gmsc[['sample', 'is_metagenomic', 'general_envo_name']].drop_duplicates()

# attach the sample annotation and keep only rows flagged as metagenomic
d = d.merge(d4, on='sample')
d = d.loc[d['is_metagenomic'] == True]

# final per-sample c_AMP table, in presentation order
amp_columns = ['sample',
               'general_envo_name',
               'assembled_bp',
               'redundant_amps',
               'nr_amps',
               'shannon_H_amp']
amps = d[amp_columns]

amps

Unnamed: 0,sample,general_envo_name,assembled_bp,redundant_amps,nr_amps,shannon_H_amp
0,100822046-x-0-x-ST,human gut,110512142,62,60,4.082415
1,103092734-palatinetonsils3,human mouth,197407958,98,97,4.570822
2,103092734-subgingivalplaque3,human mouth,136837353,73,72,4.271469
3,103092734-supragingivalplaque3,human mouth,106787798,50,50,3.912023
4,1062629-24-0-0,human gut,180877636,114,114,4.736198
...,...,...,...,...,...,...
61388,tigress_HD.S14-x-392-x-OR,human saliva,227034094,92,91,4.506720
61389,tigress_HD.S14-x-392-x-ST,human gut,170456936,70,69,4.228691
61390,tigress_HD.S14-x-60-x-ST,human gut,176591235,86,85,4.438228
61391,tigress_HD.S14-x-7-x-OR,human saliva,73749208,20,20,2.995732


Secondly, we calculate the diversity of microbial species per sample:

* As this table is much larger, it is better to compute its diversity using the C++ optimized option

In [8]:
# load the mOTUs count table and index its rows by the first (unnamed)
# column, which is renamed to 'sample'
motus = (
    pd.read_table('data/freeze.v2.motusv2_5.mg3.insertcount.tsv.xz',
                  sep='\t',
                  header='infer')
    .rename(columns={'Unnamed: 0': 'sample'})
    .set_index('sample')
)

In [12]:
# eliminate every sample whose index label occurs more than once
# (all occurrences are removed, not just the repeats)
is_repeated = motus.index.duplicated(keep=False)
res = motus.index[is_repeated].unique().tolist()
motus = motus.loc[~is_repeated]

In [9]:
# cleaning all zero rows and columns
motus = motus.loc[:, (motus != 0).any(axis=0)]  # drop all-zero columns
motus = motus.loc[(motus != 0).any(axis=1)]  # drop all-zero rows
df = np.asarray(motus)
# alpha_diversity() computes one value per ROW of `df` and expects one id per
# row. The rows of `motus` are indexed by 'sample', so the ids must come from
# the index; the column labels would mislabel the results (or fail on a
# length mismatch) and break the later merge on 'sample'.
# NOTE(review): assumes the input file really has one sample per row, as the
# 'sample' index name implies - confirm against the data file.
ids = motus.index.tolist()

In [10]:
# calculating diversity
# The c_AMP Shannon index is computed with the natural logarithm (np.log), so
# the base is passed explicitly here to keep both Shannon columns in the same
# unit regardless of the scikit-bio default.
shannonH = alpha_diversity('shannon', df, ids, base=np.e)

# 'osd' yields one (observed OTUs, singletons, doubletons) triple per sample
x = alpha_diversity('osd', df, ids)
otus = [observed for (observed, _, _) in x]
sing = [singles for (_, singles, _) in x]
doub = [doubles for (_, _, doubles) in x]

ValueError: Only 1-D and 2-D array-like objects can be provided as input. Provided object has 3 dimensions.

In [None]:
# Stack the per-sample results (one input sequence per output column).
# dtype=object keeps the counts and diversity values as numbers; without it
# NumPy coerces every entry to a string because `ids` holds text.
df = np.array([ids,
               otus,
               sing,
               doub,
               shannonH],
              dtype=object)

# transpose so that each row describes one sample
df = df.T

In [None]:
# per-sample mOTUs diversity summary
motus = pd.DataFrame(df,
                     columns=['sample',
                              'OTUs',
                              'singletons',
                              'doubletons',
                              'shannon_H'])

# `df` is a mixed array of ids and metrics, so the metric columns do not come
# out with a numeric dtype. Convert them explicitly, otherwise the rank
# correlations computed later would compare text instead of numbers.
metric_columns = ['OTUs', 'singletons', 'doubletons', 'shannon_H']
motus[metric_columns] = motus[metric_columns].apply(pd.to_numeric)

motus

Then, using both diversity matrices, we compute the correlation tests:

In [None]:
# join the c_AMP and mOTUs tables per sample and derive the AMP density,
# i.e. redundant AMPs per 1e9 assembled base pairs
div = (
    amps.merge(motus, on='sample')
    .assign(amp_density=lambda t: t.redundant_amps * 1e9 / t.assembled_bp)
)

In [None]:
# Plot a random subset of the samples. The seed makes the figure reproducible
# and the size is capped so tables with fewer than 5000 rows do not raise.
subset = div.sample(n=min(5000, len(div)), random_state=42)

grid = sns.jointplot(data=subset,
                     x='shannon_H',
                     y='shannon_H_amp',
                     s=3)

# Label the joint axes through the grid instead of relying on whichever axes
# pyplot currently considers active.
grid.set_axis_labels('Species diversity - Shannon H',
                     'AMPs diversity - Shannon H');

In [None]:
# rank correlation: c_AMP Shannon diversity vs. species Shannon diversity
spearmanr(div['shannon_H_amp'], div['shannon_H'])
## SpearmanrResult(correlation=0.4679178079882997, pvalue=0.0)

In [None]:
# rank correlation: observed OTUs vs. redundant AMP count
spearmanr(div['OTUs'], div['redundant_amps'])
## SpearmanrResult(correlation=0.6287666912682447, pvalue=0.0)

In [None]:
# rank correlation: species Shannon diversity vs. non-redundant AMP count
spearmanr(div['shannon_H'], div['nr_amps'])
## SpearmanrResult(correlation=0.46741093627128366, pvalue=0.0)

In [None]:
# rank correlation: species Shannon diversity vs. redundant AMP count
spearmanr(div['shannon_H'], div['redundant_amps'])
## SpearmanrResult(correlation=0.4662465067984268, pvalue=0.0)

In [None]:
# rank correlation: species Shannon diversity vs. AMP density
spearmanr(div['shannon_H'], div['amp_density'])
## SpearmanrResult(correlation=0.07109297647828566, pvalue=1.7047089362702838e-67)

In [None]:
# rank correlation: observed OTUs vs. AMP density
spearmanr(div['OTUs'], div['amp_density'])
## SpearmanrResult(correlation=0.1859735046363122, pvalue=0.0)

In [None]:
# rank correlation: singleton OTUs vs. AMP density
spearmanr(div['singletons'], div['amp_density'])
## SpearmanrResult(correlation=0.12678272864738688, pvalue=1.4630738253235345e-211)

In [None]:
# rank correlation: doubleton OTUs vs. AMP density
spearmanr(div['doubletons'], div['amp_density'])
## SpearmanrResult(correlation=0.08989888051788306, pvalue=5.980766621843995e-107)

In [None]:
# rank correlation: assembled base pairs vs. c_AMP Shannon diversity
spearmanr(div['assembled_bp'], div['shannon_H_amp'])
## SpearmanrResult(correlation=0.9390838989154104, pvalue=0.0)

In [None]:
# rank correlation: assembled base pairs vs. AMP density
spearmanr(div['assembled_bp'], div['amp_density'])
## SpearmanrResult(correlation=-0.11880114166275109, pvalue=7.80750789022028e-186)

In [None]:
# rank correlation: c_AMP Shannon diversity vs. AMP density
spearmanr(div['shannon_H_amp'], div['amp_density'])
## SpearmanrResult(correlation=0.16354565455715206, pvalue=0.0)

In [None]:
# give the species Shannon column an explicit name and put the columns of
# the final table in presentation order
final_columns = ['sample', 'general_envo_name', 'assembled_bp',
                 'OTUs', 'singletons', 'doubletons',
                 'redundant_amps', 'nr_amps', 'amp_density',
                 'shannon_H_amp', 'shannon_H_mOTUs']

div = div.rename(columns={'shannon_H': 'shannon_H_mOTUs'})[final_columns]

div