In [1]:
from _pathlib import Path
import requests
from bs4 import BeautifulSoup


In [2]:
# Fetch the experiment listing from the EBI SCXA JSON endpoint.
scxa_url = 'https://www.ebi.ac.uk/gxa/sc/json/experiments'
# Without a timeout, `requests.get` can block indefinitely on a stalled connection.
response = requests.get(scxa_url, timeout=60)
# Fail loudly on an HTTP error instead of trying to decode an error page as JSON.
response.raise_for_status()
# The payload keeps the list of experiment records under the 'experiments' key.
projects = response.json()['experiments']

In [3]:
def project_url(accession: str):
    """Build the URL of the t-SNE results page for an experiment accession."""
    url_template = 'https://www.ebi.ac.uk/gxa/sc/experiments/{}/results/tsne'
    return url_template.format(accession)

In [4]:
# Not actually needed: "assays" is a synonym for cells, and each experiment
# record already carries a `numberOfAssays` value.
def get_cell_count(accession: str) -> int:
    """Scrape the cell count of one experiment from its results page.

    Fetches the page at ``project_url(accession)``, finds the element with id
    ``goto-experiment`` and reads the text of the fourth node that follows it.
    That text is split at its colon; the part after the colon, with commas
    removed, is returned as an int.
    """
    page = requests.get(project_url(accession))
    document = BeautifulSoup(page.content, 'html.parser')
    anchor = document.find(id='goto-experiment')
    # Materialise the sibling iterator so it can be indexed.
    following_nodes = list(anchor.next_siblings)
    label_and_count = following_nodes[3].text
    # The unpacking requires exactly one ':' in the text.
    _, raw_count = label_and_count.split(':')
    digits = raw_count.strip().replace(',', '')
    return int(digits)

In [5]:
def cell_count(projects_):
    """Return the sum of the ``numberOfAssays`` values of the given projects."""
    total = 0
    for project in projects_:
        total += project['numberOfAssays']
    return total

In [6]:
# Total `numberOfAssays` (i.e. cells) across every experiment returned by the endpoint.
cell_count(projects)

1028590

In [7]:
# Experiments whose `experimentProjects` value contains 'Human Cell Atlas'.
hca_projects = [
    project
    for project in projects
    if 'Human Cell Atlas' in project['experimentProjects']
]
cell_count(hca_projects)

177678

In [8]:
# Select GEO experiments by the 'E-GEOD-' accession prefix. This is the exact
# prefix that `geo_accession` asserts on; a bare 'GEO' substring test could
# admit accessions that would later fail that assertion.
geo_projects = [p for p in projects if p['experimentAccession'].startswith('E-GEOD-')]
cell_count(geo_projects)

198047

In [9]:
def geo_accession(project):
    """Translate a project's SCXA accession into a GSE accession.

    The ``experimentAccession`` of ``project`` must start with ``E-GEOD-``
    (checked with an assertion); that prefix is swapped for ``GSE``.
    """
    geod_prefix = 'E-GEOD-'
    accession = project['experimentAccession']
    assert accession.startswith(geod_prefix)
    series_number = accession[len(geod_prefix):]
    return 'GSE' + series_number

In [10]:
# Annotate each GEO project in place with its GSE accession and collect those
# accessions in project order.
geo_accessions = []
for geo in geo_projects:
    gse = geo_accession(geo)
    geo['geo_accession'] = gse
    geo_accessions.append(gse)

In [11]:
# GSE accessions treated as already loaded. The provenance of this list is not
# shown in the notebook — TODO record where it was exported from.
loaded_geo_accessions = {
'GSE100618', 'GSE102580', 'GSE102596', 'GSE103275', 'GSE103354', 'GSE106273', 'GSE106540', 'GSE107585', 'GSE107618',
'GSE107746', 'GSE107909', 'GSE108041', 'GSE108291', 'GSE109488', 'GSE109822', 'GSE109979', 'GSE110154', 'GSE110499',
'GSE111586', 'GSE111727', 'GSE113197', 'GSE114374', 'GSE114396', 'GSE114557', 'GSE114802', 'GSE115469', 'GSE116237',
'GSE116470', 'GSE117089', 'GSE117498', 'GSE118127', 'GSE124472', 'GSE124494', 'GSE126836', 'GSE127969', 'GSE128639',
'GSE129798', 'GSE130430', 'GSE130473', 'GSE130606', 'GSE130636', 'GSE131181', 'GSE131685', 'GSE131736', 'GSE132040',
'GSE132044', 'GSE132566', 'GSE132802', 'GSE134881', 'GSE135889', 'GSE36552', 'GSE44183', 'GSE57872', 'GSE67835',
'GSE70580', 'GSE73727', 'GSE75140', 'GSE75367', 'GSE75478', 'GSE75659', 'GSE75688', 'GSE76312', 'GSE76381',
'GSE81383', 'GSE81547', 'GSE81608', 'GSE81904', 'GSE81905', 'GSE83139', 'GSE84133', 'GSE84147', 'GSE84465',
'GSE86146', 'GSE86469', 'GSE86473', 'GSE89232', 'GSE89322', 'GSE90806', 'GSE92280', 'GSE93374', 'GSE93593',
'GSE94820', 'GSE96583', 'GSE97104', 'GSE99795',
}

# GEO projects whose GSE accession is absent from the loaded set, and their cell total.
new_geo_projects = [geo for geo in geo_projects if geo['geo_accession'] not in loaded_geo_accessions]
cell_count(new_geo_projects)

161359

In [12]:
# GEO projects whose GSE accession is also in the already-loaded set.
loaded_shared_geo_projects = [geo for geo in geo_projects if geo['geo_accession'] in loaded_geo_accessions]
# Accessions of those shared projects only. This previously iterated over all of
# `geo_projects`, which yielded every GEO accession rather than the shared ones.
loaded_shared_geo_accessions = [geo['geo_accession'] for geo in loaded_shared_geo_projects]
cell_count(loaded_shared_geo_projects)

36688

In [13]:
# Projects that are neither among the GEO projects nor among the HCA projects.
unique_projects = [
    project
    for project in projects
    if not (project in geo_projects or project in hca_projects)
]
cell_count(unique_projects)

659386

In [14]:
# Every project that is not one of the GEO projects.
non_geo_projects = list(filter(lambda project: project not in geo_projects, projects))
cell_count(non_geo_projects)

830543

In [15]:
# Every project except the GEO projects whose accession is in the loaded set.
new_projects = [
    candidate
    for candidate in projects
    if candidate not in loaded_shared_geo_projects
]
cell_count(new_projects)

991902

In [16]:
# Loaded accessions that do not occur among `loaded_shared_geo_accessions`.
# Despite the name, the resulting set holds accession strings, not project records.
already_loaded_geo_projects = loaded_geo_accessions.difference(loaded_shared_geo_accessions)

In [17]:
from download_scxa import download_project, list_projects
# NOTE(review): this rebinds `projects`, which earlier held the 'experiments'
# list from the SCXA JSON endpoint. Earlier cells re-run after this one will
# see this list instead — confirm both have the same shape.
projects = list_projects()
accessions = [p['experimentAccession'] for p in projects]
# Try the download on the first listed accession only.
test_accession = accessions[0]
download_project(test_accession, Path('projects'))

INFO:root:Linking projects/E-CURD-10 to c062eb06-2c1d-5052-922a-5cf872155120
INFO:download_scxa:Skipping download of file `projects/c062eb06-2c1d-5052-922a-5cf872155120/scxa/experiment-metadata.zip`
INFO:download_scxa:Skipping download of file `projects/c062eb06-2c1d-5052-922a-5cf872155120/scxa/experiment-design`
INFO:download_scxa:Skipping download of file `projects/c062eb06-2c1d-5052-922a-5cf872155120/scxa/cluster`
INFO:download_scxa:Skipping download of file `projects/c062eb06-2c1d-5052-922a-5cf872155120/scxa/marker-genes.zip`
INFO:download_scxa:Skipping download of file `projects/c062eb06-2c1d-5052-922a-5cf872155120/scxa/normalised.zip`
INFO:download_scxa:Skipping download of file `projects/c062eb06-2c1d-5052-922a-5cf872155120/scxa/quantification-raw.zip`


In [18]:
import extract
# Run the extract module's entry point. The logged output suggests it expands
# the downloaded zip archives under projects/ — TODO confirm against extract.py.
extract.main()

INFO:root:Expansion of projects/E-CURD-10/scxa/experiment-metadata.zip already complete
INFO:root:Expansion of projects/E-CURD-10/scxa/normalised.zip already complete
INFO:root:Expansion of projects/E-CURD-10/scxa/quantification-raw.zip already complete
INFO:root:Expansion of projects/E-CURD-10/scxa/marker-genes.zip already complete


In [21]:
# NOTE(review): `parse_mage_tab` and `file_by_suffix` are imported but not used
# in the visible cells — confirm whether they are still needed.
from generate_metadata_scxa import parse_mage_tab, file_by_suffix, generate_metadata
# Directory of the test project: the 'projects' directory joined with the test accession.
test_project = Path('projects') / test_accession
generate_metadata(test_project)


"projects/E-CURD-10/bundle/project_0.json" successfully written.


In [21]:
# Set of the `experimentAccession` values of the HCA projects.
hca_accessions = set(project['experimentAccession'] for project in hca_projects)