## **Portal Scraping**

Link each project's taxon object ID (`taxon_oid`) to its JGI portal name. The portal names are required to request data downloads through JGI's Globus download API.

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time 
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import concurrent.futures
from concurrent.futures import ALL_COMPLETED, as_completed

In [2]:
# Load the organism table exported from JGI (tab-separated orgs.txt) and
# pull out its taxon_oid column for the scraping step below
organisms = pd.read_csv('orgs.txt', delimiter='\t')
taxon_ids = organisms.loc[:, 'taxon_oid']
taxon_ids

0        3300026111
1        3300064906
2        3300000615
3        3300003789
4        3300038838
            ...    
20944    3300042066
20945    3300002188
20946    3300011176
20947    3300002795
20948    3300033098
Name: taxon_oid, Length: 20949, dtype: int64

In [3]:
def scrape_portals(id_, timeout=60):
    """
    Scrapes the portal name from the JGI website given a taxon id.

    Loads the JGI portal search page for ``id_`` in the module-level
    ``driver`` and waits for the portal link to appear in the page.

    Parameters
    ----------
    id_ :
        Taxon id; interpolated into the search URL's ``query`` parameter.
    timeout : int or float, optional
        Maximum number of seconds to wait for the portal link
        (default 60, the value that was previously hard-coded).

    Returns
    -------
    list
        ``[id_, portal_name]``. ``portal_name`` is the part of the link's
        href that follows ``'portal/'``; if the link does not show up within
        ``timeout`` or its href cannot be parsed, the fallback
        ``'IMG_<id_>'`` is returned instead.
    """

    driver.get('https://genome.jgi.doe.gov/portal/?core=genome&query={}'.format(id_))
    wait = WebDriverWait(driver, timeout)

    try:
        # The attribute selector is now properly terminated; the original
        # string was missing the closing `"]`.
        element = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'a.system_icon_link[ng-click="onClick($event,$index)"]')
        ))
        href = element.get_attribute('href')
        portal_name = href.split('portal/')[1]
    except Exception:
        # Best-effort fallback (timeout, missing href, unexpected href shape).
        # `Exception` rather than a bare `except` so that Ctrl-C /
        # KeyboardInterrupt can still stop a long scrape.
        portal_name = 'IMG_{}'.format(id_)

    return [id_, portal_name]

driver = webdriver.Chrome()
portals = []

# Scrape the portal name for each taxon id.
# The try/finally guarantees the browser is closed even when the loop is
# interrupted or fails part-way; rows collected so far stay in `portals`.
try:
    for id_ in taxon_ids:
        try:
            portals.append(scrape_portals(id_))
        except Exception:
            # Keep going on per-id failures, but mark the row so it can be
            # retried later. `Exception` (not a bare `except`) lets
            # KeyboardInterrupt abort the run.
            portals.append([id_, 'not appended'])
finally:
    driver.quit()

In [6]:
# Tabulate the scraped [taxon id, portal name] pairs and save them as a
# tab-separated file without the row index
df = pd.DataFrame(data=portals, columns=['taxon_oid', 'portal_name'])
df.to_csv(path_or_buf='portals.csv', index=False, sep='\t')

## **Globus Download**

Format the curl command to request a data download from Globus's API.

In [1]:
import pandas as pd

In [2]:
# Reload the tab-separated taxon id / portal name table and preview it
df = pd.read_csv(filepath_or_buffer='portals.csv', delimiter='\t')
df.head(n=5)

Unnamed: 0,taxon_oid,portal_name,downloaded
0,3300026111,MayberrySE_Catta_2_FD,no
1,3300064906,IMG_3300064906,no
2,3300000615,TroBogEpilimnion_34_FD,no
3,3300003789,CryBogCBE28Sep08_FD,no
4,3300038838,IMG_3300038838,no


In [6]:
# Build the comma-separated portal list for the curl command.
# Only rows from position 15000 onward are included in this request.
portals = ','.join(df['portal_name'].iloc[15000:])

# Print the curl command to copy into a shell. The filePattern regex sits in
# a raw string so that `\.` is emitted literally without triggering Python's
# invalid-escape-sequence warning; the printed text is unchanged.
print(
    "curl 'https://genome.jgi.doe.gov/portal/ext-api/downloads/globus/request' -b cookies"
    f" --data-urlencode 'portal={portals}'"
    " --data-urlencode 'globusName=0009-0001-5451-1549@orcid.org'"
    r" --data-urlencode 'filePattern=Table_8_-_.*\.taxonomic_composition\.txt'"
    " --data-urlencode 'sendMail=true'"
)
# If the curl command argument is too long, request the portals in chunks of
# 5000 by narrowing the slice above, e.g. .iloc[0:5000], .iloc[5000:10000], ...

curl 'https://genome.jgi.doe.gov/portal/ext-api/downloads/globus/request' -b cookies --data-urlencode 'portal=IMG_3300054095,C_130625_EF_Meta_FD,IMG_3300006480,IMG_3300039681,COGITO_mtgs_1004_3_FD,IMG_3300039090,RI_4A1_MMG_10021_FD,20190225_38_FD,June2015DPH_20_5_FD,SKW24metagenome_FD,IMG_3300040791,2017072412m_MG_FD,IMG_2170459009,IMG_3300023284,OCT_B_ViralDNA_FD,IMG_3300054037,RifCSP19_2_FD,tim00Cox00000000_FD,AntAcemetage1502_FD,EasRivERMLT142_2_FD,Watercolumn_Mata_2_FD,Na_oxic_3_MG_FD,IMG_3300020233,SI_123_September_2_FD,SS_ome_FD,CryBogmetaGMA0M_FD,NewNP1_37_FD,48SNCmetaG_FD,IMG_3300001190,S53hydmetagenome_5_FD,E20190026metaG_FD,LHCtaG_FD,IMG_3300061674,pH1ome_7_FD,TroBogEpilimnion_5_FD,IMG_3300064905,Wag42014_10_16_FD,GWRWS3_20_30_FD,IMG_3300017672,IMG_3300028818,NitcycUWRG09A212_FD,BogFormECP14_OM2_FD,Colrivmet1548A02_FD,FlocCalmetaG_FD,COGITO_mtgs_1103_3_FD,T60D4metaG_FD,CA13metaG_FD,IMG_3300010213,IMG_3300021383,IMG_3300013134,TOOO20compDNA1_6_FD,Lox_Sample_45_FD,OWC_soil_day1

In [None]:
# NOTE(review): this is a shell command, not Python - run it from a terminal
# (or prefix it with `!` in the notebook). It posts a request for two example
# portals, reading cookies from a local file named `cookies`.
# NOTE(review): it targets .../downloads/bulk/request with the field `portals=`,
# while the command generated above targets .../downloads/globus/request with
# `portal=` - confirm against the JGI API docs which combination is intended.
curl 'https://genome.jgi.doe.gov/portal/ext-api/downloads/bulk/request' -b cookies --data-urlencode 'portals=MayberrySE_Catta_2_FD,IMG_3300064906' --data-urlencode 'filePattern=Table_8_-_.*\.taxonomic_composition\.txt' --data-urlencode 'sendMail=true' --data-urlencode 'globusName=0009-0001-5451-1549@orcid.org'