## Import provided tables with metadata, curate size fractions specific to Tara oceans data and generate Assembly lists

In [6]:
import pandas as pd
import argparse

In [31]:
# # import tables which list metagenomic and metatranscriptomic reads
# metag = pd.read_csv("ENA_tables/PRJEB4352_metaG_wenv.txt", sep = '\t')
# metat = pd.read_csv("ENA_tables/PRJEB6609_metaT_wenv.txt", sep = '\t')

In [32]:
# def main():
#     p = argparse.ArgumentParser()
#     p.add_argument('projectfile')
#     p.add_argument('--column',default='fastq_ftp', required=False)
#     p.add_argument('--delimiter', default=';', required=False)
#     args = p.parse_args() 
#     #read in file and split the ftp column
#     df = pd.read_table(args.projectfile) 
#     df[['c1', 'c2']]=df[args.column].str.split(args.delimiter, expand=True)
#     #test if PE read by looking for fastq2 file
#     pe_ind = df.c2.notnull()    
#     se_ind = df.c2.isnull()
#     basename = args.projectfile.split('.')[0]
#     PE_out = basename + '_PE.txt'
#     SE_out = basename + '_SE.txt'
#     PE_READS = df.loc[pe_ind].drop(['c1', 'c2'], axis=1)
#     SE_READS = df.loc[se_ind].drop(['c1', 'c2'], axis=1)  
#     PE_READS.to_csv(PE_out, sep='\t') 
#     SE_READS.to_csv(SE_out, sep='\t')

In [7]:
# import tables which list metagenomic and metatranscriptomic reads and take a subset

# subset from the metagenomic data:
metag_tmp = pd.read_csv("ENA_tables/PRJEB4352_metaG_wenv_PE.txt", sep = '\t')
metag_tmp_subset_1 = metag_tmp[metag_tmp['OS region'] == "[IO] Indian Ocean (MRGID:1904)"].sample(n=10)
metag_tmp_subset_2 = metag_tmp[metag_tmp['OS region'] == "[MS] Mediterranean Sea (MRGID:1905)"].sample(n=10)
metag_cat_subset = pd.concat([metag_tmp_subset_1, metag_tmp_subset_2])
metag_cat_subset.to_csv('ENA_tables/PRJEB4352_metaG_wenv_PE-TEST.txt', sep = '\t')

# repeat for metatranscriptomic data
metat_tmp = pd.read_csv("ENA_tables/PRJEB6609_metaT_wenv_PE.txt", sep = '\t')
metat_tmp_subset_1 = metat_tmp[metat_tmp['OS region'] == "[IO] Indian Ocean (MRGID:1904)"].sample(n=10)
metat_tmp_subset_2 = metat_tmp[metat_tmp['OS region'] == "[MS] Mediterranean Sea (MRGID:1905)"].sample(n=10)
metat_cat_subset = pd.concat([metat_tmp_subset_1, metag_tmp_subset_2])
metat_cat_subset.to_csv('ENA_tables/PRJEB6609_metaT_wenv_PE-TEST.txt', sep = '\t')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [8]:
# Import newly generated test datasets to run through below pipeline
metag = pd.read_csv("ENA_tables/PRJEB4352_metaG_wenv_PE-TEST.txt", sep = '\t')
metat = pd.read_csv("ENA_tables/PRJEB6609_metaT_wenv_PE-TEST.txt", sep = '\t')

In [9]:
# Generate new columns called size frac
metag['size_frac']= metag['Fraction lower [µm]'].map(str) + '-' + metag['Fraction upper [µm]'].map(str)
metat['size_frac']= metat['Fraction lower [µm]'].map(str) + '-' + metat['Fraction upper [µm]'].map(str)

### Tidy Size fraction data splitting 

In [11]:
# Original data -- group by size fractions in 4 distinct groups
pd.DataFrame(metag.groupby('size_frac').count()['study_accession'])

Unnamed: 0_level_0,study_accession
size_frac,Unnamed: 1_level_1
0.8-20.00,1
0.8-5.00,6
0.8->0.80,1
180.0-2000.00,5
20.0-180.00,3
5.0-20.00,4


In [12]:
#drop 0.22 sample
metag = metag[metag.size_frac != '0.22-3.00']
metag = metag.replace('0.8->0.80', '0.8-5.00')
metag = metag.replace('0.8-20.00', '0.8-5.00')
metag = metag.replace('0.8-3.00', '0.8-5.00')
metag = metag.replace('3.0->3.00', '0.8-5.00')
metag = metag.replace('3.0->5.00', '0.8-5.00')
metag = metag.replace('180.0-2000.00', '180-2000.00')
metag = metag.replace('20.0-180.00', '20-180.00')
metag = metag.replace('5.0-20.00', '5-20.00')

#Check final groupings
pd.DataFrame(metag.groupby('size_frac').count()['study_accession'])

Unnamed: 0_level_0,study_accession
size_frac,Unnamed: 1_level_1
0.8-5.00,8
180-2000.00,5
20-180.00,3
5-20.00,4


In [13]:
#drop 0.22 sample
metat = metat.replace('0.8->0.80', '0.8-5.00')
metat = metat.replace('0.8-20.00', '0.8-5.00')
metat = metat.replace('0.8-3.00', '0.8-5.00')
metat = metat.replace('3.0->3.00', '0.8-5.00')
metat = metat.replace('3.0->5.00', '0.8-5.00')
metat = metat.replace('180.0-2000.00', '180-2000.00')
metat = metat.replace('20.0-180.00', '20-180.00')
metat = metat.replace('5.0-20.00', '5-20.00')


pd.DataFrame(metat.groupby('size_frac').count()['study_accession'])

Unnamed: 0_level_0,study_accession
size_frac,Unnamed: 1_level_1
0.8-5.00,9
180-2000.00,3
20-180.00,4
5-20.00,4


## Check Ocean Region and Depth and clean

In [14]:
metag['OS'] = metag['OS region'].str.split(' ').str[0].str.strip('[').str.strip(']')
metag['EF'] = metag['Env feature'].str.split(' ').str[0].str.strip('[').str.strip(']')
metag['BG'] = metag['BG province'].str.split(' ').str[0].str.strip('[').str.strip(']')


In [15]:
metat['OS'] = metat['OS region'].str.split(' ').str[0].str.strip('[').str.strip(']')
metat['EF'] = metat['Env feature'].str.split(' ').str[0].str.strip('[').str.strip(']')
metat['BG'] = metat['BG province'].str.split(' ').str[0].str.strip('[').str.strip(']')


In [16]:
subset = ['study_accession', 'run_accession', 'Station', 'size_frac', 'Env feature', 
        'Sample material', 'Depth, nominal', 'OS region', 'BG province',
        'Latitude', 'Longitude', 'OS', 'EF', 'BG']

metag_subset = metag[subset]
metat_subset = metat[subset]

## Function to split into groups (for assembly)

In [17]:
def split_assembly_groups(df, OS_list):
    df['ERR_list']= df.groupby(['OS', 'EF', 'size_frac'])['run_accession'].transform(lambda x: ', '.join(x))
    df_combo = df[['OS', 'EF', 'size_frac','ERR_list']].drop_duplicates()
    df = df.drop('ERR_list', axis =1)
    for O in OS_list:
        df_combo = df_combo[df_combo.OS != O]
        tmp = df[df.OS == O]
        tmp['ERR_list']=tmp.groupby(['OS','BG', 'EF', 'size_frac'])['run_accession'].transform(lambda x: ', '.join(x))
        tmp_combo =tmp[['OS', 'BG', 'EF', 'size_frac','ERR_list']].drop_duplicates()
    final_combo = pd.concat([df_combo, tmp_combo])  
    final_combo = final_combo.fillna('all')
    final_combo['Assembly_group'] = final_combo.OS + '-' + final_combo.BG + '-' + final_combo.EF + '-' +final_combo.size_frac
    final_combo['Sub_region'] = final_combo.OS + '-' + final_combo.BG
    final_combo['Depth_sizefrac'] = final_combo.EF + '-' +final_combo.size_frac
    final_combo['ERR_count']= final_combo['ERR_list'].str.split(', ').str.len()
    final_combo = final_combo[['Sub_region', 'Depth_sizefrac', 'ERR_count', 'ERR_list', 'Assembly_group']]
    final_combo = final_combo.set_index('Sub_region')
    return(final_combo)

In [18]:
OS_list = ['SPO']
metag_out = split_assembly_groups(metag, OS_list)
metat_out = split_assembly_groups(metat, OS_list)

metag_out.to_csv('SampleList_ForAssembly_metaG_python-TEST.txt', sep = '\t')
metat_out.to_csv('SampleList_ForAssembly_metaT_python-TEST.txt', sep = '\t')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # Remove the CWD from sys.path while we load stuff.


In [19]:
metag_out.sort_values(['ERR_count'])

Unnamed: 0_level_0,Depth_sizefrac,ERR_count,ERR_list,Assembly_group
Sub_region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
IO-all,DCM-20-180.00,1,ERR1726832,IO-all-DCM-20-180.00
IO-all,MES-0.8-5.00,1,ERR599225,IO-all-MES-0.8-5.00
IO-all,DCM-5-20.00,1,ERR1700904,IO-all-DCM-5-20.00
IO-all,SRF-180-2000.00,1,ERR1726674,IO-all-SRF-180-2000.00
MS-all,SRF-180-2000.00,1,ERR1726810,MS-all-SRF-180-2000.00
MS-all,DCM-5-20.00,1,ERR1726715,MS-all-DCM-5-20.00
MS-all,DCM-20-180.00,1,ERR538177,MS-all-DCM-20-180.00
MS-all,SRF-20-180.00,1,ERR1726660,MS-all-SRF-20-180.00
MS-all,SRF-5-20.00,2,"ERR1726722, ERR538190",MS-all-SRF-5-20.00
MS-all,SRF-0.8-5.00,2,"ERR1726597, ERR868456",MS-all-SRF-0.8-5.00
