# EMPIAR Data Curation

This is the first step in creating cryo-EM datasets from EMPIAR. Its purpose is to find all EMPIAR image sets and to collect all useful metadata from the EMPIAR dataset entries and their corresponding resolved EMDB entries.

In [4]:
# Add the parent of the current working directory to sys.path
# (presumably so the CryoCRAB package below is importable from this notebook).
import os
import sys

project_root = os.path.dirname(os.path.abspath('.'))
if project_root not in sys.path:
    sys.path.append(project_root)
print(f"{project_root = }")

# Kept after the sys.path tweak on purpose: the import relies on it.
from CryoCRAB import *
import logging

# Root logger at DEBUG; the "pymongo" logger is raised to WARNING.
logging.getLogger().setLevel(logging.DEBUG)
logging.getLogger("pymongo").setLevel(logging.WARNING)

project_root = '/home/vrlab/code/Cryo/CryoCRAB-Scripts'


In [None]:
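# Optional: uncomment and set a real path. os.environ values must be str, so assigning None raises TypeError.
# os.environ["CRYOCRAB_PROJECT_SAVE_DIR"] = "/path/to/save_dir"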

## Get EMPIAR IDs

In [2]:
from CryoCRAB.pipeline.empiar_data_curation.step0_empiar_ids import (
    get_empiar_ids,
    load_empiar_ids,
    save_empiar_ids,
)

# Crawl the EMPIAR IDs via FTP.
empiar_ids = get_empiar_ids()
# Save the EMPIAR IDs to a .csv file.
save_empiar_ids()
# Reload the EMPIAR IDs from that .csv file and preview the first five.
empiar_ids = load_empiar_ids()
print(f"{empiar_ids[:5] = }")

/home/vrlab/code/Cryo/CryoCRAB-Scripts
empiar_ids[:5] = ['EMPIAR-10002', 'EMPIAR-10003', 'EMPIAR-10004', 'EMPIAR-10005', 'EMPIAR-10006']


## Get EMPIAR Storage Structures

In [3]:
from CryoCRAB.pipeline.empiar_data_curation.step1_empiar_structure import (
    load_empiar_structure,
    save_empiar_structure,
    save_empiar_structures,
)

# Use the first EMPIAR ID as a quick single-entry test.
empiar_id = load_empiar_ids()[0]
print(f"{empiar_id = }")
# Crawl this entry's storage structure from EMPIAR via FTP.
save_empiar_structure(empiar_id)
# Load the storage structure back from the saved .json file.
empiar_structure = load_empiar_structure(empiar_id)
print(f"{empiar_structure = }")
# Crawl the storage structure of every EMPIAR entry via FTP.
#! WARNING: this takes a long time; uncomment it only if you really need it.
# save_empiar_structures()

empiar_id = 'EMPIAR-10002'
empiar_structure = ['10002.xml', {'data': ['100_movie_gc.mrcs', '101_movie_gc.mrcs', '102_movie_gc.mrcs', '103_movie_gc.mrcs', '104_movie_gc.mrcs', '105_movie_gc.mrcs', '106_movie_gc.mrcs', '107_movie_gc.mrcs', '108_movie_gc.mrcs', '109_movie_gc.mrcs', '110_movie_gc.mrcs', '111_movie_gc.mrcs', '112_movie_gc.mrcs', '113_movie_gc.mrcs', '114_movie_gc.mrcs', '115_movie_gc.mrcs', '117_movie_gc.mrcs', '118_movie_gc.mrcs', '119_movie_gc.mrcs', '120_movie_gc.mrcs', '121_movie_gc.mrcs', '122_movie_gc.mrcs', '123_movie_gc.mrcs', '124_movie_gc.mrcs', '125_movie_gc.mrcs', '126_movie_gc.mrcs', '127_movie_gc.mrcs', '128_movie_gc.mrcs', '129_movie_gc.mrcs', '130_movie_gc.mrcs', '131_movie_gc.mrcs', '132_movie_gc.mrcs', '133_movie_gc.mrcs', '134_movie_gc.mrcs', '135_movie_gc.mrcs', '136_movie_gc.mrcs', '137_movie_gc.mrcs', '139_movie_gc.mrcs', '140_movie_gc.mrcs', '141_movie_gc.mrcs', '142_movie_gc.mrcs', '143_movie_gc.mrcs', '144_movie_gc.mrcs', '145_movie_gc.mrcs', '146_m

## Save EMPIAR Paths to CSV Files

In [4]:
from CryoCRAB.pipeline.empiar_data_curation.step2_empiar_path_csv import (
    load_empiar_path_csv,
    save_empiar_path_csv,
    save_empiar_path_csvs,
)

# Use the first EMPIAR ID as a quick single-entry test.
empiar_id = load_empiar_ids()[0]
print(f"{empiar_id = }")
# Load this entry's storage structure and write it out as a .csv file.
save_empiar_path_csv(empiar_id)
# Read the path table back from that .csv file.
empiar_path_csv = load_empiar_path_csv(empiar_id)
print(empiar_path_csv)
# Do the same for every entry: load each storage structure and save it as a .csv file.
# save_empiar_path_csvs()

empiar_id = 'EMPIAR-10002'
                                     ftp_absolute_path           relative_path
0    ftp.ebi.ac.uk/empiar/world_availability/EMPIAR...               10002.xml
1    ftp.ebi.ac.uk/empiar/world_availability/EMPIAR...  data/100_movie_gc.mrcs
2    ftp.ebi.ac.uk/empiar/world_availability/EMPIAR...  data/101_movie_gc.mrcs
3    ftp.ebi.ac.uk/empiar/world_availability/EMPIAR...  data/102_movie_gc.mrcs
4    ftp.ebi.ac.uk/empiar/world_availability/EMPIAR...  data/103_movie_gc.mrcs
..                                                 ...                     ...
256  ftp.ebi.ac.uk/empiar/world_availability/EMPIAR...   data/95_movie_gc.mrcs
257  ftp.ebi.ac.uk/empiar/world_availability/EMPIAR...   data/96_movie_gc.mrcs
258  ftp.ebi.ac.uk/empiar/world_availability/EMPIAR...   data/97_movie_gc.mrcs
259  ftp.ebi.ac.uk/empiar/world_availability/EMPIAR...   data/98_movie_gc.mrcs
260  ftp.ebi.ac.uk/empiar/world_availability/EMPIAR...   data/99_movie_gc.mrcs

[261 rows x 2 columns]


## Get EMPIAR and EMDB Entries

In [5]:
from CryoCRAB.pipeline.empiar_data_curation.step3_crawl_empiar_emdb_entries import (
    get_empiar_emdb_pair_list,
    save_empiar_emdb_entries,
)

# Use the first EMPIAR ID as a quick single-entry test.
empiar_id = load_empiar_ids()[0]
print(f"{empiar_id = }")
# Crawl the EMPIAR entry and its EMDB entries for this ID.
save_empiar_emdb_entries(empiar_id)
# Parse the EMPIAR/EMDB pairing for this ID.
empiar_emdb_pair = get_empiar_emdb_pair_list(empiar_id)
print(empiar_emdb_pair)
# Crawl the EMPIAR and EMDB entries for every ID.
#! WARNING: this takes a long time; uncomment it only if you really need it.
# save_empiar_emdb_entries()

empiar_id = 'EMPIAR-10002'


Crawling empiar entries: 100%|██████████| 1/1 [00:00<00:00, 1180.83it/s]
Crawling emdb entries: 100%|██████████| 1/1 [00:00<00:00, 1587.55it/s]

[{'empiar': {'name': 'EMPIAR-10002', 'json_path': PosixPath('/home/vrlab/code/Cryo/CryoCRAB/Data/empiar-emdb-entries/empiar_entry/EMPIAR-10002.json')}, 'emdb': [{'name': 'EMD-2275', 'json_path': PosixPath('/home/vrlab/code/Cryo/CryoCRAB/Data/empiar-emdb-entries/emdb_entry/EMPIAR-10002/EMD-2275.json')}]}]





# EMPIAR Data Curation Complete Pipeline

In [None]:
# Run all four curation steps over every EMPIAR entry, one after another.
# The trailing comments give each step's output location, using EMPIAR-10002
# as the example entry.
# NOTE(review): later steps presumably read the files written by earlier ones,
# so keep this call order — confirm against the step modules.
#! WARNING: the single-entry cells above note that the full structure and
#! entry crawls take a long time.
from CryoCRAB.pipeline.empiar_data_curation import *
save_empiar_ids() # -> empiar-paths/empiar_ids.csv
save_empiar_structures() # -> empiar-paths/empiar_structure/10002.json
save_empiar_path_csvs() # -> empiar-paths/empiar_path_csv/10002.csv
save_empiar_emdb_entries() 
# EMPIAR entry: -> empiar-emdb-entries/empiar_entry/EMPIAR-10002.json
# EMDB entry: -> empiar-emdb-entries/emdb_entry/EMPIAR-10002/EMD-2275.json