### Make BioSample table

In [13]:
import pandas as pd
import glob

!mkdir biosample_sra

biosample = pd.DataFrame(columns=[
    'replicate','description','*geo_loc_name',
    '*collection_date','sample_title','*organism','environmental medium',
    '*lat_lon','depth','elevation',
    'broad-scale environmental context','local-scale environmental context',
    'annual and seasonal precipitation','annual and seasonal temperature',
    'geodiversity','niche','hillslope'])

biosample.index.name = '*sample_name'

geo_loc_name = 'Israel:Ofakim,SayeretShakedPark'
collection_date = '3-Feb-2020'
organism = 'soil metagenome'
environmental_medium = 'Soil'
lat_lon = '31.27 N 34.65 E'
depth = '10 cm'
elevation = '187 m.a.s.l.'
broad_scale_environmental_context = 'Semi-arid shrubland'
local_scale_environmental_context = "Long-term ecological research station"
annual_and_seasonal_precipitation = "250 mm (165 mm since 2010)"
annual_and_seasonal_temperature = "26 and 12°C in the summer and winter, respectively"

metadata = pd.read_csv('16SrRNA_metadata.tsv',sep='\t',index_col=0)

samples = [i.split('/')[-1].split('_')[0] for i in glob.glob('raw_data/16SrRNA/*.fastq.gz')]
samples += [i.split('/')[-1].split('_')[0].replace('ITS-','') for i in glob.glob('raw_data/ITS1/ITS-*.fastq.gz')]
samples = set(samples)

replicate = 'biological replicate %i'
description = 'Soil metagenome sample from Israel:Ofakim,SayeretShakedPark LTER, from %s on the %s %s hillslope'

for niche in set(metadata.Niche):
    for geodiversity in set(metadata.Geodiversity):
        geo ='homogeneous'
        if geodiversity == 'Heterogenous':
            geo = 'heterogeneous'
        n = 'under the phylosphere of Noaea mucronata'
        if niche == 'Intershrub':
            n = 'an inter-shrub space'
        for slope in set(metadata.TubePartA_slope_or_plant):
            if slope.startswith('L') or slope.startswith('D'):
                continue
        
            rep_count = 1

            strata = metadata.loc[((metadata.Niche==niche) & (metadata.Geodiversity==geodiversity) &  (metadata.TubePartA_slope_or_plant==slope))]

            for ind, row in strata.iterrows():
                if ind not in samples:
                    continue
                
                biosample.loc[ind] = [
                    replicate%rep_count,
                    description%(n, slope, geo),
                    geo_loc_name,
                    collection_date,
                    description%(n, slope, geo),
                    organism,
                    environmental_medium,
                    lat_lon,
                    depth,
                    elevation,
                    broad_scale_environmental_context,
                    local_scale_environmental_context,
                    annual_and_seasonal_precipitation,
                    annual_and_seasonal_temperature,
                    geo,
                    n,
                    slope

                ]
                rep_count += 1
            
for smpl in samples:
    if smpl not in biosample.index:
        print(smpl)
        
biosample.to_csv('biosample_sra/BioSample_metadata.tsv',sep='\t')
biosample
# dead vs live samples are excluded

mkdir: cannot create directory ‘biosample_sra’: File exists
AS3075
AS3061
AS3072
AS3062
AS3067
AS3070
AS3074
AS3064
AS3063
AS3073
AS3066
AS3060
AS3071
AS3065


Unnamed: 0_level_0,replicate,description,*geo_loc_name,*collection_date,sample_title,*organism,environmental medium,*lat_lon,depth,elevation,broad-scale environmental context,local-scale environmental context,annual and seasonal precipitation,annual and seasonal temperature,geodiversity,niche,hillslope
*sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
AS3045,biological replicate 1,"Soil metagenome sample from Israel:Ofakim,Saye...","Israel:Ofakim,SayeretShakedPark",3-Feb-2020,"Soil metagenome sample from Israel:Ofakim,Saye...",soil metagenome,Soil,31.27 N 34.65 E,10 cm,187 m.a.s.l.,Semi-arid shrubland,Long-term ecological research station,250 mm (165 mm since 2010),"26 and 12°C in the summer and winter, respecti...",heterogeneous,an inter-shrub space,H2
AS3046,biological replicate 2,"Soil metagenome sample from Israel:Ofakim,Saye...","Israel:Ofakim,SayeretShakedPark",3-Feb-2020,"Soil metagenome sample from Israel:Ofakim,Saye...",soil metagenome,Soil,31.27 N 34.65 E,10 cm,187 m.a.s.l.,Semi-arid shrubland,Long-term ecological research station,250 mm (165 mm since 2010),"26 and 12°C in the summer and winter, respecti...",heterogeneous,an inter-shrub space,H2
AS3047,biological replicate 3,"Soil metagenome sample from Israel:Ofakim,Saye...","Israel:Ofakim,SayeretShakedPark",3-Feb-2020,"Soil metagenome sample from Israel:Ofakim,Saye...",soil metagenome,Soil,31.27 N 34.65 E,10 cm,187 m.a.s.l.,Semi-arid shrubland,Long-term ecological research station,250 mm (165 mm since 2010),"26 and 12°C in the summer and winter, respecti...",heterogeneous,an inter-shrub space,H2
AS3048,biological replicate 4,"Soil metagenome sample from Israel:Ofakim,Saye...","Israel:Ofakim,SayeretShakedPark",3-Feb-2020,"Soil metagenome sample from Israel:Ofakim,Saye...",soil metagenome,Soil,31.27 N 34.65 E,10 cm,187 m.a.s.l.,Semi-arid shrubland,Long-term ecological research station,250 mm (165 mm since 2010),"26 and 12°C in the summer and winter, respecti...",heterogeneous,an inter-shrub space,H2
AS3049,biological replicate 5,"Soil metagenome sample from Israel:Ofakim,Saye...","Israel:Ofakim,SayeretShakedPark",3-Feb-2020,"Soil metagenome sample from Israel:Ofakim,Saye...",soil metagenome,Soil,31.27 N 34.65 E,10 cm,187 m.a.s.l.,Semi-arid shrubland,Long-term ecological research station,250 mm (165 mm since 2010),"26 and 12°C in the summer and winter, respecti...",heterogeneous,an inter-shrub space,H2
AS3055,biological replicate 1,"Soil metagenome sample from Israel:Ofakim,Saye...","Israel:Ofakim,SayeretShakedPark",3-Feb-2020,"Soil metagenome sample from Israel:Ofakim,Saye...",soil metagenome,Soil,31.27 N 34.65 E,10 cm,187 m.a.s.l.,Semi-arid shrubland,Long-term ecological research station,250 mm (165 mm since 2010),"26 and 12°C in the summer and winter, respecti...",heterogeneous,an inter-shrub space,H3
AS3056,biological replicate 2,"Soil metagenome sample from Israel:Ofakim,Saye...","Israel:Ofakim,SayeretShakedPark",3-Feb-2020,"Soil metagenome sample from Israel:Ofakim,Saye...",soil metagenome,Soil,31.27 N 34.65 E,10 cm,187 m.a.s.l.,Semi-arid shrubland,Long-term ecological research station,250 mm (165 mm since 2010),"26 and 12°C in the summer and winter, respecti...",heterogeneous,an inter-shrub space,H3
AS3057,biological replicate 3,"Soil metagenome sample from Israel:Ofakim,Saye...","Israel:Ofakim,SayeretShakedPark",3-Feb-2020,"Soil metagenome sample from Israel:Ofakim,Saye...",soil metagenome,Soil,31.27 N 34.65 E,10 cm,187 m.a.s.l.,Semi-arid shrubland,Long-term ecological research station,250 mm (165 mm since 2010),"26 and 12°C in the summer and winter, respecti...",heterogeneous,an inter-shrub space,H3
AS3058,biological replicate 4,"Soil metagenome sample from Israel:Ofakim,Saye...","Israel:Ofakim,SayeretShakedPark",3-Feb-2020,"Soil metagenome sample from Israel:Ofakim,Saye...",soil metagenome,Soil,31.27 N 34.65 E,10 cm,187 m.a.s.l.,Semi-arid shrubland,Long-term ecological research station,250 mm (165 mm since 2010),"26 and 12°C in the summer and winter, respecti...",heterogeneous,an inter-shrub space,H3
AS3059,biological replicate 5,"Soil metagenome sample from Israel:Ofakim,Saye...","Israel:Ofakim,SayeretShakedPark",3-Feb-2020,"Soil metagenome sample from Israel:Ofakim,Saye...",soil metagenome,Soil,31.27 N 34.65 E,10 cm,187 m.a.s.l.,Semi-arid shrubland,Long-term ecological research station,250 mm (165 mm since 2010),"26 and 12°C in the summer and winter, respecti...",heterogeneous,an inter-shrub space,H3


In [3]:
import pandas as pd

sample_to_biosample = {}

biosamples = pd.read_csv('biosample_sra/attributes.tsv',sep='\t')
for ind, row in biosamples.iterrows():
    sample_to_biosample[row['sample_name']] = row['accession']

In [7]:
import glob

sra = pd.DataFrame(columns = [
    'biosample_accession','title','library_strategy',
    'library_source','library_selection','library_layout',
    'platform','instrument_model','design_description',
    'filetype','filename','filename2']
)

sra.index.name = 'library_ID'

for smpl in sample_to_biosample:
    biosample = sample_to_biosample[smpl]
    library_ID = 'V4_%s' % smpl
    title = biosamples.loc[biosamples.sample_name==smpl]['description'].tolist()[0].replace('metagenome sample','16S-rRNA metabaecoding sequences')+' BioSample %s' % biosample
    library_strategy = 'AMPLICON'
    library_source = 'GENOMIC'
    library_selection = 'PCR'
    library_layout = 'paired'
    platform = 'ILLUMINA'
    instrument_model = 'Illumina iSeq 100'
    design_description = 'Single barcode, two-step PCR, V4 fragment'
    filetype = 'fastq'
    R1 = list(glob.glob('raw_data/16SrRNA/%s_*_*_R1_*.fastq.gz'%smpl))[0].split('/')[-1]
    R2 = list(glob.glob('raw_data/16SrRNA/%s_*_*_R2_*.fastq.gz'%smpl))[0].split('/')[-1]
    sra.loc[library_ID] = [biosample,title,library_strategy,library_source,
                           library_selection,library_layout,platform,
                           instrument_model,design_description,filetype,R1,R2]
    


for smpl in sample_to_biosample:
    biosample = sample_to_biosample[smpl]
    library_ID = 'ITS1_%s' % smpl
    title = biosamples.loc[biosamples.sample_name==smpl]['description'].tolist()[0].replace('metagenome sample','ITS1 metabaecoding sequences')+' BioSample %s' % biosample
    library_strategy = 'AMPLICON'
    library_source = 'GENOMIC'
    library_selection = 'PCR'
    library_layout = 'paired'
    platform = 'ILLUMINA'
    instrument_model = 'Illumina iSeq 100'
    design_description = 'Single barcode, two-step PCR, ITS1 fragment'
    filetype = 'fastq'
    R1 = list(glob.glob('raw_data/ITS1/ITS-%s_*_*_R1_*.fastq.gz'%smpl))[0].split('/')[-1]
    R2 = list(glob.glob('raw_data/ITS1/ITS-%s_*_*_R2_*.fastq.gz'%smpl))[0].split('/')[-1]
    sra.loc[library_ID] = [biosample,title,library_strategy,library_source,
                           library_selection,library_layout,platform,
                           instrument_model,design_description,filetype,R1,R2]
    
    
sra.to_csv('biosample_sra/sra_metadata.tsv',sep='\t')

### Comments on file upload
1. put all the files in a single directory (do not zip or tar)

2. cd to this directory 

3. `ftp -p -i`

4. once inside ftp:

`ftp> open ftp-private.ncbi.nlm.nih.gov`  
`user from submission page`  
`password from submission page`  
`ftp> cd directory-from-submission-page`  
`ftp> mkdir new_dir` 
`ftp> cd new_dir`  
`ftp> mput *.fastq.gz`
