## Python notebook for PIGEON vOTUs
- What vOTUs are at multiple sites?
- Sum up the abundances of vOTUs originally from PIGEON
- See in what sites they occur (what vegetation group, halophyte vs non-halophyte)

In [None]:
#imports
import pandas as pd

In [None]:
# Load the presence/absence coverage table covering all samples (2019 included).
df = pd.read_csv('../data/coverage_table_01.csv', sep=',')

# Row-wise total over every column other than the contig name.
df['sum'] = df.drop(columns='Contig').sum(axis='columns')

# Distribution of those totals, e.g. how many vOTUs occur in only 1 sample.
df.loc[:, 'sum'].value_counts()

In [1]:
# The helper total is no longer needed; delete the column in place.
del df['sum']

# Number of rows (vOTUs) in the table.
len(df)

In [59]:
# Calculate in how many samples of a site each vOTU is present, i.e. sum the
# presence/absence columns belonging to that site.
def calc_sum_time(df, cat):
    """Add a ``<cat>_sum`` column holding the row-wise sum of matching columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to extend. It is modified in place.
    cat : str
        Pattern searched for in the column names (interpreted as a regular
        expression by ``str.contains``). Every column whose name contains it
        is included in the sum.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``<cat>_sum`` column.
    """
    target = cat + '_sum'
    # ``contains`` already matches anywhere in the name, so no trailing '.*'
    # is needed; na=False keeps the mask boolean for non-string labels.
    mask = df.columns.str.contains(cat, na=False)
    # The output column's own name contains ``cat`` too; leave it out so that
    # calling this again (e.g. re-running a notebook cell) does not add the
    # previous total on top of the sample columns.
    mask = mask & (df.columns != target)
    df[target] = df.loc[:, mask].sum(axis=1)
    return df

In [None]:
# Convert dataframe from sums to ones and zeros again (for presence/absence at a site)
def one_zero(df):
    """Return a copy of ``df`` in which every value greater than 0 becomes 1.

    The ``Contig`` column is kept as-is and placed first; all other columns
    are thresholded. Values that are not greater than 0 are left unchanged.
    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``Contig`` column plus numeric columns.

    Returns
    -------
    pandas.DataFrame
        New frame: ``Contig`` followed by the thresholded columns.

    Raises
    ------
    KeyError
        If ``df`` has no ``Contig`` column.
    """
    # Work on a separate frame so the caller's table keeps its Contig column
    # and its original counts.
    counts = df.drop(columns='Contig')
    # Replace every positive entry by 1.
    binary = counts.mask(counts > 0, 1)
    # Put the contig names back in front (joined on the shared row index).
    return df[['Contig']].join(binary)

In [61]:
# Per-location totals: for each site code, add up every column whose name
# contains that code and store the result as '<site>_sum'.
for site in ('BMLB', 'BMLF', 'BMLS', 'DP', 'GPB', 'GPS', 'HC'):
    df[site + '_sum'] = df.filter(like=site).sum(axis=1)


In [63]:
# Turn every count (including the per-site '_sum' columns) into 1/0 presence.
df = one_zero(df)

# Number of sites each vOTU was found at: add up the per-site presence flags.
df["habsum"] = df.filter(like="_sum").sum(axis=1)

# Print how many vOTUs were found at 1, 2, 3, ... sites. This is not the last
# expression of the cell, so it must be printed explicitly to be shown.
print(df['habsum'].value_counts())

# Write the table to csv for plotting in R.
df.to_csv('habsums_votus.csv')