In [1]:
from Bio import SeqIO
import glob
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import matplotlib.patches as mpatches
from matplotlib.legend_handler import HandlerPatch
from matplotlib.lines import Line2D
import numpy as np
from numpy import average, median
from scipy.stats import mannwhitneyu as mwu
from statsmodels.stats.multitest import multipletests
from scipy.spatial.distance import euclidean
from scipy import stats
from os import path
import hashlib
from Bio import SeqIO
from scipy.stats import norm
from scipy.optimize import curve_fit
from scipy import asarray as ar,exp
from scipy import optimize
from Bio.KEGG import REST
from Bio.KEGG import Enzyme
from collections import Counter
from itertools import chain

%matplotlib inline

## Prepare NCBI metadata tables for BioSample and SRA archiving
#### Make BioSample metadata table

In [9]:
# Column renaming applied to the working metadata table: each key is an
# existing column name, each value the name it gets in the BioSample table.
ncbi_columns = {
    'sample_title': 'sample_title',
    '*organism': '*organism',
    'SP': 'host',
    'type': 'isolation_source',
    'Date': '*collection_date',
    'geo_loc_name': '*geo_loc_name',
    'lat_lon': '*lat_lon',
    'description': 'description',
    'replicate': 'replicate',
}

# A sample "has data" when at least one file under concat/ starts with its ID;
# the ID is the part of the file name before the first underscore.
samples_with_data = {
    file_path.split('/')[-1].split('_')[0]
    for file_path in glob.glob('concat/AS*')
}

# Keep only the samples with data, rename the columns and label the index.
biosample_metadata = (
    metadata.loc[metadata.index.isin(samples_with_data)]
    .rename(columns=ncbi_columns)
    .rename_axis('*sample_name')
)

def get_geoloc(string):
    """Translate a one-letter stream code into a geo_loc_name value.

    The returned text is 'Israel:' followed by the name registered for
    ``string`` in the lookup table below. A code that is not in the table
    raises ``KeyError``.
    """
    stream_names = {
        'H': 'Hermon Stream',
        'S': 'Snir Stream',
        'J': 'Jordan River, North of Sea of Galilee',
        'T': 'Tel Saharonim',
        'A': 'Asi Stream',
    }
    locality = stream_names[string]
    return 'Israel:{}'.format(locality)

# Location comes from the one-letter code stored in the Stream column.
biosample_metadata['*geo_loc_name'] = biosample_metadata['Stream'].apply(get_geoloc)
biosample_metadata['*organism'] = 'Bacteria'

# Whole-cell substitutions for the host column. None of the replacement
# values is itself a key, so one mapping gives the same result as applying
# the substitutions one after another.
host_substitutions = {
    'Water': '',
    'Tilapia zillii': 'Coptodon zillii',
    'OH': 'Oreochromis',
    'IH': 'Cichlidae',
}
biosample_metadata['host'] = biosample_metadata['host'].replace(host_substitutions)

# Spell out the swab isolation source.
biosample_metadata['isolation_source'] = (
    biosample_metadata['isolation_source'].replace({'Swab': 'Fish skin swab'})
)

def get_description(series):
    """Build the free-text description of one sample.

    Parameters
    ----------
    series : mapping-like (for example one DataFrame row)
        Must provide the keys 'host', 'isolation_source' and '*geo_loc_name'.

    Returns
    -------
    str
        "Metagenome sample of <host> <isolation_source> collected from
        <geo_loc_name>". A host or isolation source that is empty (or only
        whitespace) is left out, so a sample without a host does not get a
        double space in the middle of the sentence.
    """
    subject_parts = (str(series['host']), str(series['isolation_source']))
    # Drop blank parts before joining; joining them would leave two
    # consecutive spaces in the text.
    subject = ' '.join(part for part in subject_parts if part.strip())
    return "Metagenome sample of %s collected from %s" % (subject, series['*geo_loc_name'])

# The same text is used as description and as sample title, so build it once.
sample_descriptions = biosample_metadata.apply(get_description, axis=1)
biosample_metadata['description'] = sample_descriptions
biosample_metadata['sample_title'] = sample_descriptions


# NOTE(review): these sets are not used by the numbering below; they are kept
# as notebook-level names in case later cells read them - TODO confirm.
streams = set(biosample_metadata['*geo_loc_name'])
hosts = set(biosample_metadata['host'])
sites = set(biosample_metadata['Sampling site'])

# Replicate labels are strings written one row at a time. Make sure the column
# exists and has object dtype first: writing a string into a float column
# (e.g. one that is entirely NaN) is deprecated in recent pandas. Rows that
# are never labelled keep a float NaN.
if 'replicate' not in biosample_metadata.columns:
    biosample_metadata['replicate'] = float('nan')
biosample_metadata['replicate'] = biosample_metadata['replicate'].astype(object)

# Number the non-water samples 1, 2, 3, ... in table order within each
# (location, sampling site, host) combination. Counters are created on first
# use instead of being pre-built for every possible combination.
replicate_counters = {}
for ind, row in biosample_metadata.iterrows():
    if row['isolation_source'] == 'Water':
        # Water samples do not get a replicate label.
        continue
    group = (row['*geo_loc_name'], row['Sampling site'], row['host'])
    replicate_counters[group] = replicate_counters.get(group, 0) + 1
    biosample_metadata.at[ind, 'replicate'] = 'biological replicate ' + str(replicate_counters[group])

# Keep the renamed BioSample columns plus the sampling site, then write the
# table as a tab-separated file.
biosample_metadata = biosample_metadata[list(ncbi_columns.values()) + ['Sampling site']]
biosample_metadata.to_csv('ncbi_biosample_metadata.tsv', sep='\t')

#### Make SRA metadata table

In [12]:
# Column order of the SRA metadata table (the sample name is the index).
SRA_columns = ['library_ID', 'title', 'library_strategy', 'library_source',
               'library_selection', 'library_layout', 'platform', 'instrument_model',
               'design_description', 'filetype', 'filename', 'filename2']

# The design description is the same for every library.
description = 'Single barcode, two-step PCR, V3-V4 fragment'

# One row per sample, collected in a dict (a later file for the same sample
# replaces the earlier row) and turned into a DataFrame once at the end
# instead of enlarging the frame row by row.
sra_rows = {}

# glob returns paths in arbitrary order; sorting makes the output reproducible.
for R1 in sorted(glob.glob('concat/AS*_R1_*.fastq')):
    r1_basename = R1.split('/')[-1]
    smpl = r1_basename.split('_')[0]

    # The library title is the sample description with the assay name
    # substituted for 'Metagenome'.
    title = biosample_metadata.at[smpl, 'description'].replace('Metagenome', '16S rRNA V3V4 metabarcoding')

    # Samples without a replicate label hold a missing value; only append the
    # label when there is one, so the title never ends with a dangling ', '.
    replicate = biosample_metadata.at[smpl, 'replicate']
    if pd.isna(replicate):
        replicate = ''
    if replicate:
        title = title + ', ' + replicate

    # File names are listed with a '.gz' suffix appended to the local name
    # (presumably the files are gzipped before upload - TODO confirm).
    r1 = r1_basename + '.gz'
    r2 = r1.replace('_R1_', '_R2_')

    sra_rows[smpl] = [smpl, title, 'AMPLICON', 'GENOMIC', 'PCR',
                      'paired', 'ILLUMINA', 'Illumina MiSeq',
                      description, 'fastq', r1, r2]

sra_metadata = pd.DataFrame(list(sra_rows.values()),
                            index=pd.Index(list(sra_rows), name='sample_name'),
                            columns=SRA_columns)
sra_metadata.to_csv('ncbi_sra_metadata.tsv', sep='\t')