In [1]:
import os
import pandas as pd
import numpy as np
from ecg import jgi_ko_edit as jgi

#######################################################

# Folder holding both inputs and outputs; it must contain the metadata file below.
folder_name = 'example'
# File name of the tab-separated JGI/IMG metadata export inside `folder_name`.
metadata_file = 'imgm_metadata.txt'
# Which annotation to fetch: 'ecs' (Enzyme Count) or 'kos' (KO Count).
data_needed ='kos'
# Minimum frequency at which intermediate CSVs are written; file writing takes time.
# NOTE(review): presumably counted in samples -- confirm against the jgi scraper.
output_frequency = 50
# Row subset to keep, passed to pd.DataFrame.query (see its docs for the syntax).
query = 'Ecosystem=="Environmental"'
# Analysis type to keep: 'Metatranscriptome' or 'Metagenome'.
# Matched as a substring of the 'GOLD Analysis Project Type' column.
analysis_type = 'Metatranscriptome'

In [2]:
def available_samples(metadata_file:str, folder_name:str, data_needed:str):
    """Load a JGI/IMG metadata table and keep the samples usable for scraping.

    The tab-separated file ``folder_name/metadata_file`` is read with its first
    column as the index. Columns whose name contains ``'Unnamed'`` are dropped,
    and every column name is truncated at the first ``'*'`` and stripped.
    ``'Add Date'`` is converted to datetimes on a best-effort basis.

    Parameters
    ----------
    metadata_file : str
        File name of the metadata export inside ``folder_name``.
    folder_name : str
        Folder that contains the metadata file.
    data_needed : str
        ``'ecs'`` to filter on ``Enzyme Count`` or ``'kos'`` to filter on
        ``KO Count``.

    Returns
    -------
    pandas.DataFrame
        Rows where both the requested count column and ``Gene Count`` are
        greater than zero, with columns that are entirely NaN removed.

    Raises
    ------
    ValueError
        If the metadata file does not exist, if ``data_needed`` does not map
        to a column present in the table, or if ``Gene Count`` is missing.
    """
    data = {'ecs': '`Enzyme Count`', 'kos': '`KO Count`'}
    path = folder_name+'/'+metadata_file
    if not os.path.exists(path):
        raise ValueError('Please ensure folder exists & contains JGI metadata file.')
    meta_df = pd.read_csv(path, sep='\t', header=0, index_col=0)
    meta_df = meta_df.drop(columns=[i for i in meta_df.columns if 'Unnamed' in i])
    # Header cells look like "<name> * <qualifier>"; keep only the name part.
    meta_df.columns = [i.split('*')[0].strip() for i in meta_df.columns]
    try:
        meta_df['Add Date'] = pd.to_datetime(meta_df['Add Date'])
    except Exception:
        # Best effort: a missing or unparseable date column is left untouched.
        # (Unlike a bare ``except:``, Ctrl-C / SystemExit still propagate.)
        pass
    # Unknown ``data_needed`` maps to ' ', which is never a column name, so it
    # falls through to the same error as a missing count column.
    count_col = data.get(data_needed, ' ').strip('`')
    if count_col not in meta_df.columns:
        raise ValueError(data.get(data_needed, 'Attribute count')+' not in columns. Please re-download. \nCount columns present: '+
                         ', '.join(meta_df.columns[meta_df.columns.str.contains('Count')])+
                         '\nMTs:\thttps://img.jgi.doe.gov/cgi-bin/m/main.cgi?section=TaxonList&page=taxonListAlpha2&domain=Metatranscriptome'+
                         '\nMGs:\thttps://img.jgi.doe.gov/cgi-bin/m/main.cgi?section=TaxonList&page=taxonListAlpha2&domain=*Microbiome')
    if 'Gene Count' not in meta_df.columns:
        # Without this guard the query below fails with an opaque pandas
        # UndefinedVariableError instead of a readable message.
        raise ValueError('`Gene Count` not in columns. Please re-download. \nCount columns present: '+
                         ', '.join(meta_df.columns[meta_df.columns.str.contains('Count')]))
    meta_df = meta_df.query(data[data_needed]+'>0 & `Gene Count`>0').dropna(axis=1, how='all')
    return(meta_df)

def write_data_urls(meta_df, data_needed, folder_name):
    """Build one IMG "MetaDetail" URL per row of ``meta_df`` and save them to CSV.

    Parameters
    ----------
    meta_df : pandas.DataFrame
        Must contain ``'IMG Genome ID'``, ``'Gene Count'`` and the count column
        matching ``data_needed`` (``'KO Count'`` or ``'Enzyme Count'``).
    data_needed : str
        ``'kos'`` or ``'ecs'``; selects both the page name and the count column.
    folder_name : str
        Folder in which ``<data_needed>_urls.csv`` is written.

    Returns
    -------
    pandas.Series
        The URLs, indexed like ``meta_df``.
    """
    page_names = {'kos':'ko', 'ecs':'enzymes'}
    count_columns = {'kos': 'KO Count', 'ecs': 'Enzyme Count'}

    # Stringify each numeric column once so the pieces can be concatenated.
    genome_ids = meta_df['IMG Genome ID'].astype(str)
    page_name = page_names[data_needed]
    gene_counts = meta_df['Gene Count'].astype(str)
    feature_counts = meta_df[count_columns[data_needed]].astype(str)

    urls = (
        'https://img.jgi.doe.gov/cgi-bin/m/main.cgi?section=MetaDetail&taxon_oid='
        + genome_ids
        + '&page='
        + page_name
        + '&data_type=assembled&total_genome_gene_count='
        + gene_counts
        + '&total_gene_count='
        + feature_counts
    )

    out_path = folder_name+'/'+data_needed+'_urls.csv'
    urls.to_csv(out_path, encoding='utf-8')
    return urls

# Load every sample that has the requested counts, then narrow to the
# configured analysis type, the '*Microbiome' domain and the user query.
metadata = available_samples(metadata_file, folder_name, data_needed=data_needed)
# na=False: a row with no project type counts as "no match". Without it the
# mask contains NaN and pandas raises when it is used for boolean indexing.
my_metadata = metadata[metadata['GOLD Analysis Project Type'].str.contains(analysis_type, na=False)]
my_metadata = my_metadata[my_metadata['Domain']=='*Microbiome']
my_metadata = my_metadata.query(query).dropna(axis=1, how='all')
# NOTE(review): the ETA multiplies the sample count by 12 -- presumably seconds
# per sample; confirm against the scraper's actual pacing.
print(len(my_metadata), 'samples, ETA:', np.round((12*len(my_metadata)/60)/60,2), 'hours')

6 samples, ETA: 0.02 hours


In [3]:
# Write the per-sample URLs to <folder_name>/<data_needed>_urls.csv and hand
# them to the scraper as an {index: url} dict.
# NOTE(review): _scrape_urls_unsafe_alacarte is a private method of
# ecg.jgi_ko_edit.Jgi; its behaviour is not visible in this notebook.
J = jgi.Jgi()
data_urls = write_data_urls(my_metadata, data_needed, folder_name).to_dict()
counts, status = J._scrape_urls_unsafe_alacarte(
    path=folder_name,
    domain='*Microbiome',
    data_urls=data_urls,
    data_needed=data_needed,
    output_frequency=output_frequency,
)

Done! 6
