In [8]:
import glob
import pandas as pd

# Create BioSample metadata table

In [28]:
# Build the BioSample attribute table: one row per infected sample that has
# raw sequencing data on disk. Columns prefixed with '*' are written exactly
# as named here (presumably the mandatory fields of the submission template).
columns = [
    'host','description',
    '*geo_loc_name','*collection_date','time_point','sample_title',
    '*organism','isolation_source','*lat_lon','source_material_id','replicate'
]

# Complete month lookup. The previous table had empty strings for Apr and
# Jun-Nov, which produced malformed dates such as '12--2019'.
MONTH_ABBREVIATIONS = ['Jan','Feb','Mar','Apr','May','Jun',
                       'Jul','Aug','Sep','Oct','Nov','Dec']

# Per-niche text fragments; the shared tail is appended once below.
DESCRIPTION_BY_NICHE = {
    'Root': 'Metagenome sample of Meloidogyne incognita infected eggplant root',
    'Gall': 'Metagenome sample of gall from a Meloidogyne incognita infected eggplant root',
    'Soil': 'Metagenome sample of rhizosphere soil from a Meloidogyne incognita infected eggplant',
    'J2': 'Metagenome sample of Meloidogyne incognita second stage juveniles from the rhizoplane of eggplant',
}
DESCRIPTION_TAIL = ', plant %s, collected from Israel:Hatzeva on %s'

ISOLATION_SOURCE_BY_NICHE = {
    'Root': 'RKN infected eggplant root',
    'Gall': 'RKN eggplant gall',
    'Soil': 'RKN infected eggplant rhizosphere soil',
    'J2': 'Meloidogyne incognita J2 from eggplant rhizoplane'
}

# Samples sequenced as a second biological replicate.
SECOND_REPLICATE_SAMPLES = ('AS1181S', 'AS1105', 'AS1106')


def format_collection_date(date):
    """Convert a 'D/M/YY' date string into 'D-Mon-20YY'.

    Raises ValueError when the month is not in 1..12 (previously month 0
    silently became 'Dec' through negative indexing).
    NOTE(review): assumes two-digit years in the 2000s -- confirm against
    metadata.tsv.
    """
    day, month, year = date.split('/')
    month_number = int(month)
    if not 1 <= month_number <= 12:
        raise ValueError('Invalid month in collection date: %r' % date)
    return "%s-%s-20%s" % (day, MONTH_ABBREVIATIONS[month_number - 1], year)


# Sample name is the file-name part before the first underscore.
samples_with_data = set([i.split('/')[-1].split('_')[0] for i in glob.glob('raw_data/AS*')])

m = pd.read_csv('metadata.tsv',sep='\t',index_col=0)
m = m.loc[m.Treatment=='Infected']

# Collect rows first and build the frame once; iterate in sorted order so the
# output file is identical from run to run.
row_names = []
rows = []
for sample_name in sorted(samples_with_data):
    if sample_name not in m.index:
        continue
    niche = m.at[sample_name,'Niche']
    plant_id = m.at[sample_name,'Location'].split('_')[0]
    date = m.at[sample_name,'Date']
    # An unknown niche raises KeyError here, as it did before.
    description = (DESCRIPTION_BY_NICHE[niche] + DESCRIPTION_TAIL) % (plant_id, date)

    row_names.append(sample_name)
    rows.append({
        # Soil samples have no host organism.
        'host': '' if niche == 'Soil' else 'Solanum melongena',
        'description': description,
        '*geo_loc_name': 'Israel:Hatzeva',
        '*collection_date': format_collection_date(date),
        'time_point': m.at[sample_name,'TimePoint'],
        'sample_title': description,
        '*organism': 'Bacteria',
        'isolation_source': ISOLATION_SOURCE_BY_NICHE[niche],
        '*lat_lon': '30.7783 N 35.2396 E',
        'source_material_id': plant_id,
        'replicate': ('biological replicate 2'
                      if sample_name in SECOND_REPLICATE_SAMPLES
                      else 'biological replicate 1'),
    })

biosample_metadata = pd.DataFrame(
    rows, columns=columns, index=pd.Index(row_names, name='*sample_name')
)

In [29]:
# Save the BioSample table as a tab-separated file; the index is written as
# the first column.
biosample_metadata.to_csv(path_or_buf='biosample_metadata.tsv', sep='\t')

# Create SRA metadata table

In [39]:
# Build the SRA run table: one row per infected sample with raw data on disk.
# Reads `biosample_metadata`, so the BioSample cell must have been run first.
columns = [
    'library_ID','title','library_strategy',
    'library_source','library_selection','library_layout',
    'platform','instrument_model','design_description',
    'filetype','filename','filename2'
]

# Library attributes shared by every run.
LIBRARY_FIELDS = {
    'library_strategy': 'AMPLICON',
    'library_source': 'GENOMIC',
    'library_selection': 'PCR',
    'library_layout': 'paired',
    'platform': 'ILLUMINA',
    'instrument_model': 'Illumina MiSeq',
    'design_description': 'Single barcode, two-step PCR, V3-V4 fragment',
    'filetype': 'fastq',
}

# Leading text of every BioSample description; it is dropped when the
# description is reused inside the run title (replaces the magic slice [21:]).
DESCRIPTION_PREFIX = 'Metagenome sample of '
TITLE_PREFIX = 'Bacterial 16S rRNA metabarcode sequences of '


def find_fastq_name(sample_name, read_tag):
    """Return the base name of the sample's fastq file for read_tag ('R1'/'R2').

    Matches are sorted so the choice is deterministic when several files
    match. Raises IOError with a descriptive message when no file matches
    (previously a bare IndexError).
    """
    pattern = 'raw_data/%s_*_%s_*.fastq.gz' % (sample_name, read_tag)
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise IOError('No fastq file matching %s' % pattern)
    return matches[0].split('/')[-1]


# Sample name is the file-name part before the first underscore.
samples_with_data = set([i.split('/')[-1].split('_')[0] for i in glob.glob('raw_data/AS*')])

m = pd.read_csv('metadata.tsv',sep='\t',index_col=0)
m = m.loc[m.Treatment=='Infected']

# Collect rows first and build the frame once; iterate in sorted order so the
# output file is identical from run to run.
row_names = []
rows = []
for sample_name in sorted(samples_with_data):
    if sample_name not in m.index:
        continue
    sample_description = biosample_metadata.at[sample_name,'description']
    if sample_description.startswith(DESCRIPTION_PREFIX):
        sample_description = sample_description[len(DESCRIPTION_PREFIX):]

    row = {
        'library_ID': sample_name,
        'title': TITLE_PREFIX + sample_description + '; ' + biosample_metadata.at[sample_name,'replicate'],
        'filename': find_fastq_name(sample_name, 'R1'),
        'filename2': find_fastq_name(sample_name, 'R2'),
    }
    row.update(LIBRARY_FIELDS)
    row_names.append(sample_name)
    rows.append(row)

sra_metadata = pd.DataFrame(
    rows, columns=columns, index=pd.Index(row_names, name='sample_name')
)
    


In [40]:
# Save the SRA run table as a tab-separated file; the index is written as
# the first column.
sra_metadata.to_csv(path_or_buf='sra_metadata.tsv', sep='\t')