# AMPSphere v.2022-03

This notebook is part of the set of notebooks used to analyze the data in AMPSphere and write the manuscript:

__AMPSphere: Global survey of prokaryotic antimicrobial peptides shaping microbiomes__

### Analysis of abundance and diversity of c_AMPs

c_AMPs are distributed as gene variants through many species. Here, we will test:
    
    I. Are there multi-habitat c_AMPs?
    II. Are multi-habitat c_AMPs clonal?

In [1]:
import lzma
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from Bio import SeqIO
from tqdm import tqdm
from scipy.stats import norm
from scipy.stats import shapiro
from scipy.stats import pearsonr, spearmanr

In [2]:
# Lookup table from a habitat name to the coarser, high-level habitat group
# it is assigned to in this analysis. Keys are matched literally, so their
# spelling must agree with the habitat names found in the input table.
# NOTE(review): 'cattle rumen' / 'goat rumen' map to 'other animal' while
# 'cattle gut' / 'goat gut' map to 'non-human mammal gut' - confirm intended.
# NOTE(review): 'beatle gut' is presumably spelled this way in the source
# data; verify before "correcting" it to 'beetle gut'.
higher_level = {'sediment' : 'other',
        'bird gut' : 'other animal',
        'cat gut' : 'non-human mammal gut',
        'insect associated' : 'other animal',
        'human urogenital tract' : 'other human',
        'dog gut' : 'non-human mammal gut',
        'fermented food' : 'anthropogenic',
        'groundwater' : 'aquatic',
        'coral associated' : 'other animal',
        'rat gut' : 'non-human mammal gut',
        'human associated' : 'other human',
        'cattle gut' : 'non-human mammal gut',
        'deer gut' : 'non-human mammal gut',
        'mouse gut' : 'non-human mammal gut',
        'river associated' : 'aquatic',
        'primate gut' : 'non-human mammal gut',
        'human respiratory tract' : 'other human',
        'cattle rumen' : 'other animal',
        'human saliva' : 'other human',
        'activated sludge' : 'anthropogenic',
        'lake associated' : 'aquatic',
        'wastewater' : 'anthropogenic',
        'chicken gut' : 'other animal',
        'air' : 'other',
        'human mouth' : 'other human',
        'plant associated' : 'soil/plant',
        'water associated' : 'aquatic',
        'pig gut' : 'non-human mammal gut',
        'human skin' : 'other human',
        'marine' : 'aquatic',
        'soil' : 'soil/plant',
        'built environment' : 'anthropogenic',
        'human gut' : 'human gut',
        'anthropogenic': 'anthropogenic',
        'bear gut' : 'non-human mammal gut',
        'bee gut': 'other animal',
        'bat gut': 'non-human mammal gut',
        'dog associated': 'other animal',
        'cattle associated': 'other animal',
        'crustacean associated': 'other animal',
        'insect gut': 'other animal',
        'goat gut': 'non-human mammal gut', 
        'rodent gut': 'non-human mammal gut',
        'fisher gut': 'non-human mammal gut',
        'human digestive tract': 'other human',
        'coyote gut': 'non-human mammal gut',
        'planarian associated': 'other animal',
        'sponge associated': 'other animal',
        'goat rumen': 'other animal',
        'crustacean gut': 'other animal',
        'annelidae associated': 'other animal',
        'bird skin': 'other animal',
        'beatle gut': 'other animal',
        'termite gut': 'other animal', 
        'fish gut': 'other animal',
        'mollusc associated': 'other animal',
        'ship worm associated': 'other animal',
        'rabbit gut': 'non-human mammal gut',
        'tunicate associated': 'other animal',
        'mussel associated': 'other animal',
        'horse gut': 'non-human mammal gut',
        'wasp gut': 'other animal',
        'guinea pig gut': 'non-human mammal gut'}

In [3]:
# read the tab-delimited input table (compression is inferred from the file suffix)
data = pd.read_csv(
    '../data_folder/gmsc_amp_genes_envohr_source.tsv.gz',
    sep='\t',
)

In [4]:
# eliminate genes/amps without environment (from ProGenomes)
# The explicit .copy() detaches the filtered frame from the original, so the
# column assignment below does not write into a slice of another frame
# (which raises pandas' SettingWithCopyWarning).
data = data[data.general_envo_name.notna()].copy()

# attribute high-level habitat to genes/amps; habitat names that are not
# listed in `higher_level` fall back to 'other'
data['high'] = data.general_envo_name.map(lambda x: higher_level.get(x, 'other'))

In [5]:
# testing multihabitat AMPs
def get_multihabitat(df, level=None):
    '''
    Select the multi-habitat AMPs, i.e. AMPs observed in more than one
    distinct habitat (at least 2) at the requested annotation level.

    :inputs:
    df - data frame containing at least the column `amp` and the habitat
         column selected by `level`
    level - which habitat column to use:
            None or 'high' -> column `high` (high-level habitat)
            'low'          -> column `general_envo_name`
            any other value is used directly as the column name

    :outputs:
    pandas Index with the identifiers of the multi-habitat c_AMPs
    (use len() on it to get their number)
    '''
    if level is None:
        level = 'high'
    elif level == 'low':
        level = 'general_envo_name'

    # one row per distinct (AMP, habitat) pair, so that counting the rows
    # per AMP gives the number of different habitats it was seen in
    pairs = df[['amp', level]].drop_duplicates()
    habitats_per_amp = pairs['amp'].value_counts()

    return habitats_per_amp[habitats_per_amp > 1].index

In [6]:
# multi-habitat AMPs at both annotation resolutions:
# l0 -> general habitats ('low'), l -> high-level habitats ('high')
l0 = get_multihabitat(data, 'low')
l = get_multihabitat(data, 'high')

print(f'Multi-habitat AMPs: {len(l0)}')
print(f'Multi-high-level-habitat: {len(l)}')

Multi-habitat AMPs: 173955
Multi-high-level-habitat: 93280


In [7]:
# Write the identifiers of the multi-habitat AMPs to disk, one per line, for
# each habitat resolution. get_multihabitat() performs the same
# drop_duplicates / value_counts / "> 1" selection, so it is reused here
# instead of repeating that logic inline for every level.
for level, path in (('high', 'multihabitat_highlevel.txt'),
                    ('low', 'multihabitat_generalenvo.txt')):
    k = get_multihabitat(data, level)
    with open(path, 'wt') as out:
        out.writelines(f'{i}\n' for i in k)

### Permutation test

We shuffle the high-level habitat annotation of the samples and then calculate the number of multi-habitat c_AMPs. This operation is repeated 100 times, and we then calculate the average and standard deviation of the resulting distribution of random results. Using the Shapiro-Wilk test, we check whether the random distribution is normal and, if it is, we calculate the Z-score of the result obtained for AMPSphere. The Z-score is then converted into a p-value to support our conclusions.

In [8]:
# testing significance
def shuffle_test(df, level=None, rng=None):
    '''
    Build a copy of `df` in which the habitat labels are permuted among
    the samples.

    Each sample keeps a single label (all of its rows receive the same
    one), and the collection of per-sample labels is preserved; only the
    assignment of labels to samples is randomized.

    :inputs:
    df - data frame containing at least the column `sample` and the
         habitat column selected by `level`
    level - which habitat column to shuffle:
            None or 'high' -> column `high`
            'low'          -> column `general_envo_name`
            any other value is used directly as the column name
    rng - optional object with an in-place `shuffle(list)` method
          (e.g. `random.Random(seed)`) for reproducible permutations;
          when None the module-level `random.shuffle` is used

    :outputs:
    new data frame with the habitat column replaced by the shuffled
    labels; `df` itself is not modified
    '''
    if level is None:
        level = 'high'
    elif level == 'low':
        level = 'general_envo_name'

    # one label per sample; if a sample occurs with several labels,
    # the label of its last row is the one kept by to_dict()
    habitat = df.set_index('sample')[level].to_dict()

    values = list(habitat.values())
    shuffler = random if rng is None else rng
    shuffler.shuffle(values)

    # re-assign the permuted labels to the samples
    shuffled = dict(zip(habitat, values))

    altdf = df.copy()
    altdf[level] = altdf['sample'].map(lambda x: shuffled.get(x))

    return altdf

In [9]:
#test high level habitats
# null distribution: number of multi-habitat AMPs obtained in each of
# 100 independent shuffles of the high-level habitat labels
test = [
    len(get_multihabitat(shuffle_test(data, 'high'), 'high'))
    for _ in tqdm(range(100))
]

print(test)

100%|█████████████████████████████████████████| 100/100 [24:49<00:00, 14.90s/it]

[676499, 674359, 671791, 675954, 672110, 678459, 666194, 677173, 681451, 668163, 676543, 676404, 671474, 680156, 671258, 675615, 674566, 663149, 677396, 670835, 669737, 684001, 681781, 675558, 674138, 678656, 679422, 678795, 678456, 676070, 678397, 677408, 671222, 682134, 673758, 675463, 678823, 678107, 678368, 676448, 678596, 678717, 673127, 673508, 673042, 672716, 679893, 680610, 678004, 670864, 673128, 680270, 677619, 676338, 682716, 681902, 672168, 679255, 677654, 674282, 673671, 678805, 679388, 684151, 681639, 682485, 679829, 683357, 668880, 673273, 668500, 669705, 674121, 677341, 677975, 678486, 677395, 682010, 680005, 674129, 670167, 682404, 681194, 679373, 675932, 683006, 678519, 673109, 673000, 672372, 670497, 677189, 674095, 682130, 673589, 680277, 683166, 680153, 679954, 672998]





In [10]:
# Shapiro-Wilk normality check of the shuffled counts (threshold 0.05)
_, p = shapiro(test)
res = 'not-normal' if p < 0.05 else 'normal'

print(f'The Shapiro-Wilks test returned a p = {p}',
      f'This means that the distribution is {res}',
      sep='\n')

# mean and standard deviation (np.std default, ddof=0) of the shuffled counts
avg = np.mean(test)
std = np.std(test)

print(f'Average number of random multi-habitat AMPs - {avg}, with std = {std}')

# Z-score of the observed count against the shuffled counts;
# norm.sf(|Z|) is the area of a single tail of the standard normal
Z = (len(l) - avg) / std
pz = norm.sf(abs(Z))

print(f'The number of multi-habitat AMPs in AMPSphere was {len(l)}',
      f'It was {Z} * std of the random distribution',
      f'This gives us a p-value of {pz}',
      sep='\n')

The Shapiro-Wilks test returned a p = 0.07779254764318466
This means that the distribution is normal
Average number of random multi-habitat AMPs - 676489.69, with std = 4281.78946165035
The number of multi-habitat AMPs in AMPSphere was 93280
It was -136.20699831775724 * std of the random distribution
This gives us a p-value of 0.0


In [17]:
#test habitats
# null distribution: number of multi-habitat AMPs obtained in each of
# 100 independent shuffles of the general habitat labels
test_low = [
    len(get_multihabitat(shuffle_test(data, 'low'), 'low'))
    for _ in tqdm(range(100))
]

print(test_low)

100%|█████████████████████████████████████████| 100/100 [33:36<00:00, 20.17s/it]

[680721, 688236, 679559, 683629, 685726, 685943, 691053, 684848, 676976, 682659, 686377, 694536, 677249, 688504, 682197, 688182, 689975, 681738, 683273, 685580, 680806, 685663, 678305, 682005, 681553, 686502, 681663, 679341, 683992, 680794, 686928, 684681, 676158, 688298, 686537, 683496, 681364, 686519, 691960, 683040, 691137, 686302, 688414, 680230, 686522, 685561, 687949, 681958, 682350, 685254, 695749, 679555, 689155, 682772, 683380, 686392, 684415, 685559, 680185, 689560, 686277, 683090, 690766, 692583, 691754, 689310, 679803, 687318, 688867, 683431, 690797, 686723, 683623, 689810, 688709, 695854, 681363, 686459, 688499, 687855, 687084, 676710, 688208, 686753, 686153, 690184, 684196, 681704, 686332, 687330, 685333, 690687, 677227, 690631, 677255, 681722, 687192, 690237, 695529, 685394]





In [18]:
# Shapiro-Wilk normality check of the shuffled counts (threshold 0.05)
_, p = shapiro(test_low)
res = 'not-normal' if p < 0.05 else 'normal'

print(f'The Shapiro-Wilks test returned a p = {p}',
      f'This means that the distribution is {res}',
      sep='\n')

# mean and standard deviation (np.std default, ddof=0) of the shuffled counts
avg = np.mean(test_low)
std = np.std(test_low)

print(f'Average number of random multi-habitat AMPs - {avg}, with std = {std}')

# Z-score of the observed count against the shuffled counts;
# norm.sf(|Z|) is the area of a single tail of the standard normal
Z = (len(l0) - avg) / std
pz = norm.sf(abs(Z))

print(f'The number of multi-habitat AMPs in AMPSphere was {len(l0)}',
      f'It was {Z} * std of the random distribution',
      f'This gives us a p-value of {pz}',
      sep='\n')

The Shapiro-Wilks test returned a p = 0.4874470829963684
This means that the distribution is normal
Average number of random multi-habitat AMPs - 685477.17, with std = 4369.610044511981
The number of multi-habitat AMPs in AMPSphere was 173955
It was -117.06357427534 * std of the random distribution
This gives us a p-value of 0.0
