# AMPSphere v.2022-03

This notebook is part of the set of notebooks used to analyze the AMPSphere data and write the manuscript:

__AMPSphere: Global survey of prokaryotic antimicrobial peptides shaping microbiomes__

### Summarizing AMPSphere origins and results in supplementary tables

Here, we summarize the metagenomes, genomes, and other information needed to rebuild AMPSphere, along with the supplementary tables reporting the results.

In [1]:
# loading libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr

In [2]:
# Lookup table from a fine-grained habitat label (the values found in the
# `general_envo_name` column) to one of eight coarse environment categories:
# 'human gut', 'other human', 'non-human mammal gut', 'other animal',
# 'soil/plant', 'aquatic', 'anthropogenic' and 'other'.
# Labels that are not listed here are assigned to 'other' where this table is
# applied (see the `.get(g, 'other')` call in the data-loading cell).
# NOTE(review): 'beatle gut' looks like a misspelling of 'beetle gut' -- confirm
# against the labels actually present in the data before changing the key.
higher_level = {'sediment' : 'other',
        'bird gut' : 'other animal',
        'cat gut' : 'non-human mammal gut',
        'insect associated' : 'other animal',
        'human urogenital tract' : 'other human',
        'dog gut' : 'non-human mammal gut',
        'fermented food' : 'anthropogenic',
        'groundwater' : 'aquatic',
        'coral associated' : 'other animal',
        'rat gut' : 'non-human mammal gut',
        'human associated' : 'other human',
        'cattle gut' : 'non-human mammal gut',
        'deer gut' : 'non-human mammal gut',
        'mouse gut' : 'non-human mammal gut',
        'river associated' : 'aquatic',
        'primate gut' : 'non-human mammal gut',
        'human respiratory tract' : 'other human',
        'cattle rumen' : 'other animal',
        'human saliva' : 'other human',
        'activated sludge' : 'anthropogenic',
        'lake associated' : 'aquatic',
        'wastewater' : 'anthropogenic',
        'chicken gut' : 'other animal',
        'air' : 'other',
        'human mouth' : 'other human',
        'plant associated' : 'soil/plant',
        'water associated' : 'aquatic',
        'pig gut' : 'non-human mammal gut',
        'human skin' : 'other human',
        'marine' : 'aquatic',
        'soil' : 'soil/plant',
        'built environment' : 'anthropogenic',
        'human gut' : 'human gut',
        'anthropogenic': 'anthropogenic',
        'bear gut' : 'non-human mammal gut',
        'bee gut': 'other animal',
        'bat gut': 'non-human mammal gut',
        'dog associated': 'other animal',
        'cattle associated': 'other animal',
        'crustacean associated': 'other animal',
        'insect gut': 'other animal',
        'goat gut': 'non-human mammal gut', 
        'rodent gut': 'non-human mammal gut',
        'fisher gut': 'non-human mammal gut',
        'human digestive tract': 'other human',
        'coyote gut': 'non-human mammal gut',
        'planarian associated': 'other animal',
        'sponge associated': 'other animal',
        'goat rumen': 'other animal',
        'crustacean gut': 'other animal',
        'annelidae associated': 'other animal',
        'bird skin': 'other animal',
        'beatle gut': 'other animal',
        'termite gut': 'other animal', 
        'fish gut': 'other animal',
        'mollusc associated': 'other animal',
        'ship worm associated': 'other animal',
        'rabbit gut': 'non-human mammal gut',
        'tunicate associated': 'other animal',
        'mussel associated': 'other animal',
        'horse gut': 'non-human mammal gut',
        'wasp gut': 'other animal',
        'guinea pig gut': 'non-human mammal gut'}


# Flags every high-level environment category as host-associated (True)
# or not host-associated (False).
is_host_associated = {
    environment: environment in ('human gut', 'other human',
                                 'non-human mammal gut', 'other animal')
    for environment in ('human gut', 'soil/plant', 'aquatic', 'anthropogenic',
                        'other human', 'non-human mammal gut', 'other animal',
                        'other')
}


# Host common names that are plants; hosts in this set are collapsed into a
# single 'plant' group when AMPs are counted per host.
# Fix: a comma was missing between 'lodgepole pine' and 'burclover', so Python
# concatenated the two adjacent literals into the bogus entry
# 'lodgepole pineburclover' and neither real name was a member of the set.
# NOTE(review): per-host counts printed by an earlier run may change slightly
# once the notebook is re-executed with this fix -- confirm after re-running.
plants = {
    'lettuce', 'monocots', 'cowpea',
    'mosses', 'pitcher plant', 'maize',
    'thale cress', 'siratro', 'grapevine',
    'Norway spruce', 'black cottonwood',
    'soy', 'french bean', 'silvergrass',
    'sorghum', 'bread wheat', 'sunflower',
    'carrot', 'lodgepole pine', 'burclover',
    'cottongrass', 'switchgrass', 'eudicots',
    'agave', 'barrelclover', 'alfalfa',
    'red fir',
}

In [3]:
def listing(x):
    """Return the elements of ``x`` joined into one comma-separated string.

    ``x`` must provide ``.tolist()`` (e.g. a pandas Series) and its elements
    must be strings; the separator is a comma followed by a space.
    """
    separator = ', '
    items = x.tolist()
    return separator.join(items)

In [4]:
# load data
# load data
data = pd.read_table('data/gmsc_amp_genes_envohr_source.tsv.gz')

# Keep only the rows flagged as metagenomic. The explicit .copy() turns the
# filtered result into an independent frame, so the column assignments below
# do not write into a slice of the original (SettingWithCopyWarning).
data = data[data.is_metagenomic == True].copy()

# Coarse environment category; habitats missing from `higher_level` fall
# back to 'other'.
data['higher'] = data['general_envo_name'].map(lambda g: higher_level.get(g, 'other'))

# Host-association flag of the coarse category ('NA' if the category is unknown).
data['host_associated'] = data['higher'].map(lambda g: is_host_associated.get(g, 'NA'))

In [5]:
# Number of distinct AMPs seen in host-associated vs. non-host-associated
# environments (an AMP present in both is counted once in each group).
nf = (
    data[['amp', 'host_associated']]
    .drop_duplicates()
    .groupby('host_associated')
    .size()
)
print(nf)

host_associated
False    595345
True     285393
dtype: int64


In [6]:
# Number of distinct AMPs in the soil/plant and aquatic categories.
nf = (
    data.loc[data.higher.isin(['soil/plant', 'aquatic']), ['amp', 'higher']]
    .drop_duplicates()
    .groupby('higher')
    .size()
)
print(nf)

higher
aquatic       213861
soil/plant    315628
dtype: int64


In [7]:
# Number of distinct AMPs in the anthropogenic category.
nf = (
    data.loc[data.higher == 'anthropogenic', ['amp', 'higher']]
    .drop_duplicates()
    .groupby('higher')
    .size()
)
print(nf)

higher
anthropogenic    85988
dtype: int64


In [8]:
# Number of distinct AMPs whose environment falls in the 'other' category.
nf = len(set(data.loc[data.higher == 'other', 'amp']))
print(f'Other environments: {nf}')

Other environments: 15609


In [9]:
# Sample metadata, with the accession column renamed so that it can be joined
# to the AMP gene table on 'sample'.
metadata = pd.read_table('data/metadata.tsv.gz').rename(
    columns={'sample_accession': 'sample'}
)

# Attach the host common name of each sample to the AMP gene rows.
nf = metadata[['sample', 'host_common_name']].merge(data, on='sample')

# Distinct AMPs per host, smallest first.
(
    nf[['host_common_name', 'amp']]
    .drop_duplicates()
    .groupby('host_common_name')
    .size()
    .sort_values()
)

host_common_name
guinea pig                1
rabbit                    1
white-tailed deer         2
cottongrass               2
soy                       4
                      ...  
maize                 31335
switchgrass           40001
pig                   50247
cattle                61859
human                127314
Length: 94, dtype: int64

In [10]:
# Collapse every plant host into a single 'plant' label; other hosts keep
# their own common name.
nf['nenvo'] = nf.host_common_name.map(
    lambda host: 'plant' if host in plants else host
)

# Distinct AMPs per (grouped) host, smallest first.
(
    nf[['nenvo', 'amp']]
    .drop_duplicates()
    .groupby('nenvo')
    .size()
    .sort_values()
)

nenvo
rabbit                    1
guinea pig                1
white-tailed deer         2
fisher                    4
wasp                      5
                      ...  
chicken               18259
pig                   50247
cattle                61859
plant                103023
human                127314
Length: 70, dtype: int64

In [11]:
# Number of distinct samples per high-level environment.
samples = (
    data[['sample', 'higher']]
    .drop_duplicates()
    .groupby('higher')
    .size()
)

In [12]:
# Comma-separated list of the habitats that make up each high-level environment.
habitats = (
    data[['higher', 'general_envo_name']]
    .drop_duplicates()
    .groupby('higher')['general_envo_name']
    .apply(listing)
)

In [13]:
# Redundant count: every row of the gene table per high-level environment,
# without any de-duplication.
redamps = data.groupby('higher').size()

In [14]:
# Non-redundant count: distinct AMPs per high-level environment.
nramps = (
    data[['higher', 'amp']]
    .drop_duplicates()
    .groupby('higher')
    .size()
)

In [15]:
# AMP -> level III cluster assignment, reduced to the two columns used here.
fams = (
    pd.read_table('data/SPHERE_v.2022-03.levels_assessment.tsv.gz')
    .rename(columns={'AMP accession': 'amp',
                     'SPHERE_fam level III': 'family'})
    .loc[:, ['amp', 'family']]
)

# Annotate every gene row with the cluster of its AMP.
data = data.merge(fams, on='amp')

# From here on `fams` holds only the identifiers of clusters with at least
# 8 member AMPs.
fams = fams.groupby('family').size()
fams = fams.loc[fams >= 8].index

In [16]:
# One row per (environment, cluster) pair.
data = data.loc[:, ['higher', 'family']].drop_duplicates()

# Clusters per environment: all of them, and only those listed in `fams`.
famps = data.groupby('higher').size()
famp_l = data.loc[data.family.isin(fams)].groupby('higher').size()


Below is the supplementary table that reports, for each high-level habitat, the number of samples, the number of redundant and non-redundant AMPs, and the number of AMP clusters and families found in it.

In [17]:
# Assemble the per-environment summary table.
# The columns are named explicitly through `keys=` instead of relying on the
# integer labels (0, 1, 2, ...) that pd.concat assigns to unnamed Series and
# renaming them afterwards: that mapping breaks silently as soon as any of
# the input Series carries a name.
df = pd.concat(
    [habitats, samples, redamps, nramps, famps, famp_l],
    axis=1,
    keys=['habitats',
          'samples',
          'redundant AMPs',
          'non-redundant AMPs',
          'AMP clusters',
          'AMP families'],
)

# Turn the environment index into a regular, explicitly named column.
df = df.rename_axis('high level environment').reset_index()

df

Unnamed: 0,high level environment,habitats,samples,redundant AMPs,non-redundant AMPs,AMP clusters,AMP families
0,anthropogenic,"wastewater, built environment, activated sludg...",6129,304382,85988,63824,3913
1,aquatic,"water associated, groundwater, river associate...",5961,762508,213861,147828,5116
2,human gut,human gut,32640,2167411,111946,63641,3304
3,non-human mammal gut,"pig gut, dog gut, cattle gut, mouse gut, cat g...",3535,629929,104559,68364,3339
4,other,"air, mock community, algae associated, mine, i...",1300,39127,15609,13435,1306
5,other animal,"chicken gut, bird gut, cattle rumen, bee gut, ...",2356,348985,98615,73723,3275
6,other human,"human skin, human urogenital tract, human mout...",4052,96290,20137,13196,1112
7,soil/plant,"soil, plant associated",5420,1078992,315628,197565,5552


### Information about the samples used in AMPSphere

In [18]:
# load data again
# Reload the inputs: the AMP gene table (unfiltered this time) and the
# per-sample assembly / gene-prediction statistics. `data` is rebound here and
# no longer refers to the gene table used above.
amps = pd.read_table('data/gmsc_amp_genes_envohr_source.tsv.gz')
data = pd.read_table('data/samples-min500k-assembly-prodigal-stats.tsv.gz')

In [19]:
# filter columns
# keep only the columns needed to count AMPs per sample
namps = amps.loc[:, ['amp', 'sample', 'general_envo_name']]

In [20]:
# eliminate redundancy
# Drop repeated (amp, sample, habitat) rows, then count what is left per
# sample; the result has the columns 'sample_accession' and 'amps'.
namps = (
    namps
    .drop_duplicates()
    .groupby('sample')
    .size()
    .reset_index(name='amps')
    .rename(columns={'sample': 'sample_accession'})
)

In [21]:
# merge splitted data
# Combine the two parts of the sample table:
#   a -- samples that have an AMP count (inner join with `namps`)
#   b -- samples that do not appear in `namps` at all
a = pd.merge(data, namps, on='sample_accession')
b = data.loc[~data['sample_accession'].isin(namps['sample_accession'])]

# Samples with AMPs come first, followed by those without; the latter have no
# 'amps' value after the concatenation, so it is filled with 0.
data = pd.concat([a, b])
data['amps'] = data['amps'].fillna(0)

# AMP density: AMPs per million assembled base pairs.
data['amps_per_assembled_Mbp'] = data['amps'] * 1_000_000 / data['assembly_total_length']

In [22]:
# more data...
# Attach the reduced sample metadata; the default inner join keeps only the
# samples present in both tables.
envo = pd.read_table('data/reduced_metadata.tsv.gz')
data = data.merge(envo, on='sample_accession')

In [23]:
# supp table S1
# supp table S1: per-sample statistics with human-readable column headers
sup1 = data[['sample_accession', 'general_envo_name',
             'inserts_raw', 'assembly_total_length',
             'assembly_N50', 'prodigal_total_orfs',
             'smORFs', 'amps']].rename(columns={
    'sample_accession': 'sample',
    'general_envo_name': 'habitat',
    'inserts_raw': 'raw inserts',
    'assembly_total_length': 'assembled bp',
    'assembly_N50': 'N50',
    'prodigal_total_orfs': 'ORFs+smORFs',
    'smORFs': 'smORFs',
    'amps': 'non-redundant AMPs',
})

sup1

Unnamed: 0,sample,habitat,raw inserts,assembled bp,N50,ORFs+smORFs,smORFs,non-redundant AMPs
0,Karasov_2018_arabidopsis_NextMet1,plant associated,12374844,6714448,700,10983,2254,2.0
1,Karasov_2018_arabidopsis_NextMet124,plant associated,17119714,29141743,684,53733,10438,9.0
2,Karasov_2018_arabidopsis_NextMet15,plant associated,13384712,3583114,1325,3660,1046,2.0
3,Karasov_2018_arabidopsis_NextMet25,plant associated,15623952,5459772,736,7794,1998,5.0
4,Karasov_2018_arabidopsis_NextMet50,plant associated,14905649,4297904,982,5138,1296,4.0
...,...,...,...,...,...,...,...,...
63405,SAMEA6080433,human mouth,5013002,10785166,648,21077,6318,0.0
63406,mgs635492,soil,25418882,6843991,369,17917,3077,0.0
63407,mgs635439,soil,4516768,758472,273,2606,243,0.0
63408,mgs635565,soil,1781932,41679,256,145,9,0.0
