In [None]:
"""PROGRAM TO COMBINE METAGENOMES FROM ALL SAMPLE SITES INTO A SINGLE DATABASE AS A 
LIST OF LISTS.  EACH TAXON HAS A ROW: [ TAXON, SUM OVER ALL SITES, AVERAGE FREQ ] 
ONLY TAXA WITH AVERAGE FREQUENCIES > 10^-5 ARE INCLUDED IN THE OUTPUT FILE.
"""
# Number of leading rows of each metagenome that are scanned when the set of
# taxon names is collected further down in this script.
numprocess = 300
import math
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager as fp
import random
from google.colab import drive
# Mount Google Drive at /content/gdrive (Colab only); presumably the input
# CSV paths listed below live under this mount point -- confirm.
drive.mount('/content/gdrive')

# Which group of taxa to plot.  Valid keys:
#   fb: fecal bacteria; sg: sewage genera; af: alternate families;
#   nep: non-enteric pathogens; tt: top-ten taxa
search = 'fb'

# 1. THE OUTFILE NAME.
outname = '...path.../Joined MG Files/Genus.Frqs.All.MGs.Joined.csv'

# 2. PASTE IN THE NAMES OF ALL THE INPUT METAGENOME FILES (one per site):

genus_files = ['...path.../Genus.Site1.Loop1.TRANSPOSED.csv',
         '...path.../Joined MG Files/Genus.Site2.Loop3.TRANSPOSED.csv',
         '...path.../Genus.Site3.Loop2.TRANSPOSED.csv',
         '...path.../Genus.Site4.Loop1.TRANSPOSED.csv',
         '...path.../Genus.Site5.Loop3.TRANSPOSED.csv',
         '...path.../Genus.Site6.Loop2.TRANSPOSED.csv']

family_files = ['...path.../Family.Site1.Loop3.TRANSPOSED.csv',
         '...path.../Family.Site2.Loop1.TRANSPOSED.csv',
         '...path.../Family.Site3.Loop1.TRANSPOSED.csv',
         '...path.../Family.Site4.Loop2.TRANSPOSED.csv',
         '...path.../Family.Site5.Loop1.TRANSPOSED.csv',
         '...path.../Family.Site6.Loop4.TRANSPOSED.csv']

# Each search key maps to (input files, target taxa, plot title).
# Target names are lower case because they are compared against the
# lower-cased taxon names when the bars are selected for plotting.
# NOTE(review): 'desulfabacillum' and 'planktomyces' look like misspellings
# of Desulfatibacillum / Planctomyces and may never match a taxon in the
# data -- confirm against the input files before changing them.
search_options = {
  'fb': (genus_files,
         ['escherichia', 'enterococcus'],
         'Traditional Fecal Genera'),
  'sg': (genus_files,
         ['acinetobacter', 'arcobacter'],
         'Sewage-Associated Genera'),
  'nep': (genus_files,
          ['bacillus', 'pseudomonas', 'leptospira', 'mycobacterium', 'shewanella', 'vibrio'],
          'Non-Enteric Pathogens'),
  'af': (family_files,
         ['bacteroidaceae', 'clostridiaceae', 'lachnospiraceae', 'porphyromonadaceae',
          'prevotellaceae', 'rikenellaceae', 'ruminococcaceae'],
         'Alternative Fecal Families'),
  'tt': (genus_files,
         ['gammaproteobacteria', 'deltaproteobacteria', 'beggiatoa', 'desulfobacterium',
          'nitrosopumilus', 'bacteroides', 'desulfabacillum', 'planktomyces', 'geobacter',
          'desulfococcus'],
         'Top Ten Taxa'),
}

# Fail immediately on an unknown key instead of hitting a NameError on
# `files` / `targets` / `title` much later in the script.
if search not in search_options:
  raise ValueError('Unknown search key ' + repr(search) + '; expected one of '
                   + ', '.join(sorted(search_options)))

files, targets, title = search_options[search]

# Read every input file completely; stringmetas[i] holds the raw text of
# files[i], in the same order as `files`.
nummetas = len(files)
stringmetas = []
for path in files:
  # The with-statement closes the file when the block exits.
  with open(path, 'r') as handle:
    stringmetas.append(handle.read())
print('Number of metagenomes =', len(stringmetas))
print()

# Strip unwanted text from each raw metagenome and convert it into a list of
# rows [taxon, count, 0.0].  The count is parsed from the second
# comma-separated field; the third slot is a placeholder that is filled with
# the taxon's relative frequency later in the script.
cleanmetas = []
numprint = 4   # number of preview lines echoed per site
for x in range(nummetas):
  datastring = (stringmetas[x]
                .replace('unclassified (derived from ', '')
                .replace(')', '')
                .replace('Candidatus ', ''))
  # Keep every non-blank line.  Blindly popping the last element would drop a
  # real data row when the file has no trailing newline, and would leave
  # blank lines in place when it ends with more than one.
  metalines = [line for line in datastring.split('\n') if line.strip()]
  print('Site', str(x+1) + '.  The first', numprint, 'of', len(metalines), 'total lines.')
  # Slicing tolerates files with fewer than `numprint` lines.
  for line in metalines[:numprint]:
    print(line)
  print()
  integermeta = []
  for line in metalines:
    row = line.split(',')
    integermeta.append([row[0], int(row[1]), 0.0])
  cleanmetas.append(integermeta)

# totalslist[i] is the sum of the counts of every row in metagenome i.
totalslist = [sum(entry[1] for entry in cleanmetas[site])
              for site in range(nummetas)]

# Add each row's share of its own metagenome's total (count / total) to the
# row's third slot, turning the placeholder into a relative frequency.
for site in range(nummetas):
  site_total = totalslist[site]
  for entry in cleanmetas[site]:
    entry[2] += entry[1] / site_total

# Collect the distinct taxon names found in the first `numprocess` rows of
# each metagenome.  `taxonset` is used for fast membership tests and
# `taxonlist` keeps the names in first-seen order, so the order is the same
# on every run.
taxonset = set()
taxonlist = []
for meta in cleanmetas:
  # Slicing (rather than indexing 0..numprocess-1) tolerates metagenomes
  # that contain fewer than `numprocess` rows.
  for entry in meta[:numprocess]:
    if entry[0] not in taxonset:
      taxonset.add(entry[0])
      taxonlist.append(entry[0])

# One 0.0 placeholder per input file (per-site frequency columns).
zeros = [0.0] * len(files)
 
# Build the combined table `cvcm`, one row per taxon in `taxonlist`:
#   [taxon, sum of its frequencies over all sites, freq at site 1, ..., freq at site N]
# sorted by the summed frequency, largest first.

# One {taxon: frequency} lookup per site, built in a single pass, so each
# taxon is found directly instead of rescanning every metagenome.
# If cleaning produced the same taxon name more than once within a site, the
# frequencies are added together so that the per-site columns agree with the
# summed column.
sitefreqs = []
for y in range(nummetas):
  lookup = {}
  for entry in cleanmetas[y]:
    lookup[entry[0]] = lookup.get(entry[0], 0.0) + entry[2]
  sitefreqs.append(lookup)

cvcm = []
for taxon in taxonlist:
  # A taxon absent from a site contributes a frequency of 0.0 there.
  persite = [lookup.get(taxon, 0.0) for lookup in sitefreqs]
  cvcm.append([taxon, sum(persite, 0.0)] + persite)

# Rows with equal sums keep their `taxonlist` order (the sort is stable).
cvcm.sort(key=lambda row: row[1], reverse=True)
# Stacked bar chart of the target taxa: one bar per site, one coloured
# segment per taxon whose lower-cased name appears in `targets`.

fig, ax = plt.subplots(figsize=(3,3), dpi=300 )
# One label per input metagenome (S1, S2, ...), so the x axis always matches
# the number of per-site columns in `cvcm`.
labels = ['S' + str(site + 1) for site in range(nummetas)]
colors = [ 'r', 'b', 'c',  'm', '#7cd700','#400000', '#8f6d5f', '#ffde00', '#81FF33', '#ff0087']

# Per-site frequencies (columns 2 onward) and display names of the targets,
# in `cvcm` order.
barlines, plotname = [], []
for row in cvcm:
  if row[0].lower() in targets:
    barlines.append(row[2:])
    plotname.append(row[0].capitalize())

width = 0.5       # the width of the bars: can also be len(x) sequence
# Running baseline: each taxon's segment is drawn on top of the sum of all
# segments drawn before it.
bottomsum = [0] * len(labels)
for x, (name, heights) in enumerate(zip(plotname, barlines)):
  # Cycle through the palette so more than len(colors) taxa cannot overflow it.
  ax.bar(labels, heights, width, color=colors[x % len(colors)],
         bottom=bottomsum, label=name)
  bottomsum = [base + height for base, height in zip(bottomsum, heights)]

ax.set_ylabel('Frequencies', fontsize =14)
ax.set_xlabel('Site', fontsize=14)
ax.set_title(title, fontsize = 14)
# Place the legend outside the axes, to the right of the plot.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()