### Studying prevalence and rarity

c_AMPs were checked for rarity by counting the number of samples in which each one was detected, within each high-level habitat group. We then plot the results and compute the percentage of c_AMPs that are present in 10 or fewer samples (about 0.02% of AMPSphere samples); these are considered rare c_AMPs.

In [None]:
# Step 1: for every (habitat, c_AMP) pair, count the rows of `data`;
# each row is treated as one detection.
tf = (
    data.groupby(['high', 'amp'])
    .size()
    .reset_index(name='number of detections')
)
# Step 2: histogram of those counts, i.e. how many c_AMPs share a given
# detection count inside each habitat.
tf = (
    tf.groupby(['high', 'number of detections'])
    .size()
    .reset_index(name='number of c_AMPs')
)

In [None]:
# Detection-count histogram pooled over every habitat: count the rows of
# `data` per c_AMP, then count how many c_AMPs share each detection count.
_all = data.groupby('amp').size().reset_index(name='number of detections')
_all = (
    _all.groupby('number of detections')
    .size()
    .reset_index(name='number of c_AMPs')
)
# Label the pooled rows so they can sit next to the per-habitat rows in `tf`.
_all['high'] = 'all'

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat aligns columns by name and produces the same stacked table.
tf = pd.concat([tf, _all], ignore_index=True)

In [None]:
# Adding prevalence: express each detection count as a percentage of the
# number of samples available in the corresponding habitat.
metadata = pd.read_table('data/reduced_metadata.tsv.gz')
metadata['high'] = metadata['general_envo_name'].map(higher_level.get)
# Samples per high-level habitat (unmapped habitats are dropped by value_counts).
metadata = metadata['high'].value_counts()
# Denominator for the pooled 'all' rows.
# NOTE(review): 63410 is presumably the total number of AMPSphere samples - confirm.
metadata.loc['all'] = 63410

tf['total'] = tf['high'].map(lambda habitat: metadata.loc[habitat])
tf['prevalence'] = tf['number of detections'] * 100 / tf['total']
tf

In [None]:
fig, axarr = plt.subplot_mosaic([['a', 'b']])

# Both panels plot the same y variable with the same habitat colouring;
# only the x variable differs.
shared = dict(data=tf, y='number of c_AMPs', hue='high', s=5)
# Dashed guide lines drawn at x=10 and y=1000 on both panels.
dashed = dict(color='black', linestyle='--')

# Panel a: number of c_AMPs against raw detection counts, log-log axes.
left = axarr['a']
sns.scatterplot(ax=left, x='number of detections', **shared)
left.set_yscale('log')
left.set_xscale('log')
left.axvline(x=10.0, **dashed)
left.axhline(y=1000, **dashed)

# Panel b: number of c_AMPs against prevalence; the x axis stays linear and
# the y label/ticks are removed.
right = axarr['b']
sns.scatterplot(ax=right, x='prevalence', **shared)
right.set_yscale('log')
right.set_xlabel('prevalence %')
right.set_ylabel(None)
right.set_yticks([])
right.axvline(x=10.0, **dashed)
right.axhline(y=1000, **dashed)

# Replace panel a's legend with an empty one; panel b keeps its legend.
left.legend('')

In [None]:
# Show the proportion of c_AMPs with 10 or fewer detections in each habitat.
#
# Habitats are visited in order of first appearance in `tf`. Iterating a
# set of strings instead would give a row order that can change between
# interpreter sessions because of string hash randomisation.
rows = []
for habitat in tf['high'].unique():
    subset = tf[tf['high'] == habitat]
    # Total c_AMPs in the habitat, and those detected at most 10 times.
    total_amps = subset['number of c_AMPs'].sum()
    rare_amps = subset.loc[subset['number of detections'] <= 10,
                           'number of c_AMPs'].sum()
    rows.append((habitat, rare_amps * 100 / total_amps, total_amps))

prop = pd.DataFrame(rows, columns=['habitat', 'proportion', 'c_AMPs'])
prop

In [None]:
# Rank correlation between the percentage of rare c_AMPs and the total
# number of c_AMPs across habitats.
spearmanr(prop['proportion'], prop['c_AMPs'])

In [None]:
# For each habitat, test how well 1 / (number of c_AMPs) is explained by the
# number of detections, on linear and on log10-log10 scales.
#
# groupby(sort=False) yields habitats in order of first appearance, so the
# printed report has a stable order (iterating a set of strings does not),
# and each habitat's rows are selected once instead of three times.
for h, per_habitat in tf.groupby('high', sort=False):

    print(f'Habitat: {h}')

    detections = per_habitat['number of detections']
    k1 = 1 / per_habitat['number of c_AMPs']

    r, p = pearsonr(detections, k1)

    print(f'Linear relationship, Pearson r: {r} and p: {p}')

    r, p = pearsonr(np.log10(detections), np.log10(k1))

    print(f'Log-log linear relationship, Pearson r: {r} and p: {p}\n')
    

In [None]:
import json

# Lookup table loaded from JSON; it is applied to the `gmsc` column further
# down the notebook. The context manager closes the file handle, which the
# bare json.load(open(...)) form never did.
with open('data/ampsphere_gene_variant.json', encoding='utf-8') as handle:
    genevariant = json.load(handle)

# Sample -> habitat table, with the accession column renamed to match the
# 'sample' key used for merging below.
metadata = pd.read_table('data/reduced_metadata.tsv.gz')
metadata = metadata[['sample_accession', 'general_envo_name']]
metadata = metadata.rename({'sample_accession': 'sample'}, axis=1)

# load data: AMP -> level III family assignment
# (tab separator and inferred header are the read_table defaults).
amp_fam = pd.read_table('data/SPHERE_v.2022-03.levels_assessment.tsv.gz')
amp_fam = amp_fam[['AMP accession', 'SPHERE_fam level III']]
amp_fam = amp_fam.rename({'AMP accession': 'amp',
                          'SPHERE_fam level III': 'family'},
                         axis=1)

# species level AMPs, annotated with habitat and family (inner joins, so
# rows without a matching sample or AMP are dropped).
df = pd.read_table('data/complete_amps_associated_taxonomy.tsv.gz')
df = df[df.level == 'species']
df = df.merge(on='sample', right=metadata)
df = df.merge(on='amp', right=amp_fam)

In [None]:
# Attach the high-level habitat and the gene-variant label to every row;
# keys missing from either lookup become None.
df['high'] = df['general_envo_name'].map(higher_level.get)
df['gene_var'] = df['gmsc'].map(genevariant.get)
df

In [None]:
# Which sources carry AMP10.000_436, and in how many rows each?
df.loc[df['amp'].eq('AMP10.000_436'), 'source'].value_counts()

# Exploratory queries kept for reference (disabled):
#df[df.source.str.contains('Shimwellia')].groupby(['amp', 'high']).agg('size').reset_index().rename({0: 'samples'}, axis=1)
# d = pd.read_table('data/gmsc_amp_genes_envohr_source.tsv.gz')
# d = d[d.is_metagenomic == False]
# d = d.fillna('NA')
# d[d.source.str.contains('Shimwellia')]

In [None]:
def _distinct_per_source(column):
    """Count the distinct (source, column) pairs of `df` for every source."""
    pairs = df[['source', column]].drop_duplicates()
    return pairs['source'].value_counts()


# Per-source tallies. The f-names are kept as in the original cell; note
# that f4 (gene variants) precedes f3 (families) in the final table.
f0 = _distinct_per_source('sample')    # samples
f1 = _distinct_per_source('amp')       # amps
f2 = df['source'].value_counts()       # genes: every row counts
f4 = _distinct_per_source('gene_var')  # gene_var
f3 = _distinct_per_source('family')    # families

count_columns = ['sample', 'amp', 'gene', 'gene_var', 'family']
newdf = pd.concat([f0, f1, f2, f4, f3], axis=1)
newdf.columns = count_columns
newdf = newdf.sort_values(by=count_columns)
newdf

In [None]:
# Keep only sources seen in more than 634 samples.
# NOTE(review): 634 is presumably ~1% of the 63,410 samples - confirm.
altdf = newdf.loc[newdf['sample'] > 634]
print(altdf.columns)
sns.scatterplot(data=altdf, x='gene_var', y='family')

In [None]:
# Gene variants per family and per AMP, for every source.
newdf['var_per_fam'] = newdf['gene_var'] / newdf['family']
newdf['var_per_amp'] = newdf['gene_var'] / newdf['amp']

# Pairwise rank correlations between the per-source counts, restricted to
# the sources kept in `altdf`. The pair order fixes the print order.
count_pairs = [
    ('amp', 'gene'),
    ('amp', 'gene_var'),
    ('gene', 'gene_var'),
    ('amp', 'family'),
    ('gene', 'family'),
    ('gene_var', 'family'),
]
for first, second in count_pairs:
    print(spearmanr(altdf[first], altdf[second]))

In [None]:
# Number of (source, amp) groups per source: distinct
# (source, gene_var, amp) rows are first collapsed to one row per
# (source, amp), and those rows are then counted for each source.
amps_per_sps = (
    df[['source', 'gene_var', 'amp']]
    .drop_duplicates()
    .groupby(['source', 'amp'])
    .size()
    .reset_index()
    .groupby('source')
    .size()
)
amps_per_sps

In [None]:
# Restrict to habitats whose name contains 'human' and summarise each
# (habitat, source) pair.
is_human = df['general_envo_name'].str.contains('human')
group_keys = ['general_envo_name', 'source']

# Distinct (amp, gene_var) combinations per habitat and source.
itf = (
    df.loc[is_human, ['amp', 'gene_var', 'general_envo_name', 'source']]
    .drop_duplicates()
    .groupby(group_keys)
    .size()
    .reset_index(name='gene_variants')
)

# Distinct families per habitat and source.
ftf = (
    df.loc[is_human, ['family', 'general_envo_name', 'source']]
    .drop_duplicates()
    .groupby(group_keys)
    .size()
    .reset_index(name='families')
)

itf = itf.merge(ftf, on=group_keys)

# Gene variants per family, one value per (habitat, source).
itf['uni_per_fam'] = itf['gene_variants'] / itf['families']

# Density of that ratio per human habitat, evaluated on the range 0-5.
sns.kdeplot(
    data=itf,
    x='uni_per_fam',
    hue='general_envo_name',
    clip=(0, 5),
    fill=True,
    alpha=0.15,
)