In [1]:
import sys
# Make the directory two levels up importable (presumably where the
# `model_building` package lives -- confirm), without adding it twice on re-runs.
if sys.path.count("../..") == 0:
    sys.path.insert(0, "../..")

In [2]:
import concurrent.futures
import threading
import xmltodict
import pandas as pd

from getpass import getpass
from tqdm import tqdm

from model_building.get_reaction_annotations import get_session

# Excel workbook holding the expression data (one sheet per dataset)
DATA_FILE = "../raw/omics/DSS3_expression_data_extracted.xlsx"
# Organism identifier interpolated into the BioCyc web-service URLs below
RPOM_ORGID = "GCF_000011965"

# Read all three sheets in a single pass over the workbook: passing a list as
# sheet_name returns a dict of DataFrames keyed by sheet name, instead of
# re-opening and re-parsing the file once per sheet.
_sheets = pd.read_excel(DATA_FILE, sheet_name=["rna-rel", "rna-abs", "prot"])
rna_rel = _sheets["rna-rel"]
rna_abs = _sheets["rna-abs"]
prot = _sheets["prot"]

In [3]:
# Show the column names of the `prot` sheet (the SPO accession column is used for the lookups below)
prot.columns

Index(['Rank', 'gene_callers_id', 'DSS3_ac_mean_abund', 'DSS3_glc_mean_abund',
       'DSS3_late_mean_abund', 'DSS3_early_mean_abund',
       'DSS3_ac_v_DSS3_glc_FOLD_CHANGE_prot',
       'DSS3_ac_v_DSS3_late_FOLD_CHANGE_prot',
       'DSS3_ac_v_DSS3_early_FOLD_CHANGE_prot',
       'DSS3_glc_v_DSS3_late_FOLD_CHANGE_prot',
       'DSS3_glc_v_DSS3_early_FOLD_CHANGE_prot',
       'DSS3_late_v_DSS3_early_FOLD_CHANGE_prot', 'ANOVA p-value', 'BH_crit',
       'BH_valid', 'DSS3_ac_v_DSS3_glc_Tukey_HSD_adjusted_p',
       'DSS3_ac_v_DSS3_late_Tukey_HSD_adjusted_p',
       'DSS3_ac_v_DSS3_early_Tukey_HSD_adjusted_p',
       'DSS3_glc_v_DSS3_late_Tukey_HSD_adjusted_p',
       'DSS3_glc_v_DSS3_early_Tukey_HSD_adjusted_p',
       'DSS3_late_v_DSS3_early_Tukey_HSD_adjusted_p', 'SPO_ID (ACCESSION)',
       'KOfam (ACCESSION)', 'KEGG_Module', 'COG20_FUNCTION'],
      dtype='object')

In [10]:
# Get the BioCyc frame IDs for the genes, using their SPO accessions

# Per-thread storage: each worker thread keeps its own session on this object (see get_frame_id)
thread_local = threading.local()

# Credentials handed to get_session by the workers; getpass keeps the password from being echoed
u = input("> Username: ")
p = getpass("> Password: ")

# Worker function to get the BioCyc frame ID for a gene
def get_frame_id(accession):
    """Look up the BioCyc gene frame ID(s) matching an accession.

    Sends a ``name-search`` request (class ``Genes``, XML format) for organism
    ``RPOM_ORGID``. The session is created with ``get_session(u, p)`` the first
    time a given thread calls this function and is then reused by that thread.

    Parameters
    ----------
    accession
        Value to search for; interpolated as-is into the ``object=`` query parameter.

    Returns
    -------
    tuple
        ``(n_results, frame_ids)`` where ``n_results`` is the ``num_results``
        value of the response metadata as an ``int`` and ``frame_ids`` is the
        list of ``@frameid`` attributes of the returned ``Gene`` elements.
        ``frame_ids`` is ``None`` when ``n_results`` is 0 or the response has
        no ``Gene`` element.
    """
    if not hasattr(thread_local, "session"):
        thread_local.session = get_session(u, p)

    # https://websvc.biocyc.org/[ORGID]/name-search?object=[NAME]&class=[CLASS]&fmt=[json|xml]
    r = thread_local.session.get(f"https://websvc.biocyc.org/{RPOM_ORGID}/name-search?object={accession}&class=Genes&fmt=xml")
    d = xmltodict.parse(r.text)

    # xmltodict returns element text as strings ('0', '1', ...), so convert
    # before comparing against 0 -- otherwise the comparison is never true.
    n_results = int(d["ptools-xml"]["metadata"]["num_results"])

    if n_results == 0 or "Gene" not in d["ptools-xml"]:
        return n_results, None

    # A single match is parsed as a dict, several matches as a list of dicts
    results = d["ptools-xml"]["Gene"]
    if isinstance(results, dict):
        results = [results]

    frame_ids = [result["@frameid"] for result in results]
    return n_results, frame_ids
    

# Keep track of accessions that have already been visited, to avoid redundant requests across sheets.
# Maps accession -> (n_results, frame_ids) as returned by get_frame_id.
try:
    seen  # Hack to retain the value of seen if this cell terminates early. Allows re-running the cell without losing progress.
          # (Resets if the cell completes successfully.)
except NameError:
    seen = {}

try:
    for sheet in [rna_rel,
                  rna_abs,
                  prot]:

        sheet_accessions = list(sheet["SPO_ID (ACCESSION)"])

        # Accessions still to be looked up. dict.fromkeys drops repeats within the sheet
        # (keeping first-seen order), so each accession is requested at most once.
        accessions = [accession for accession in dict.fromkeys(sheet_accessions) if accession not in seen]

        # Get the BioCyc frame IDs for the genes
        with tqdm(total=len(accessions)) as pbar:
            with concurrent.futures.ThreadPoolExecutor() as executor:
                futures = {
                    executor.submit(get_frame_id, accession): accession
                    for accession in accessions}
                for future in concurrent.futures.as_completed(futures):
                    accession = futures[future]
                    # result() re-raises any exception from the worker; everything cached
                    # before that point stays in `seen` for the next run of this cell.
                    seen[accession] = future.result()
                    pbar.update(1)

        # Transform the results into columns,
        # using cached values for accessions that have already been visited
        n_matches = [int(seen[accession][0]) for accession in sheet_accessions]
        frame_ids = [seen[accession][1] for accession in sheet_accessions]

        # Check that all matches are either 0 or 1, and report the offenders otherwise
        unexpected = sorted({str(accession)
                             for accession, matches in zip(sheet_accessions, n_matches)
                             if matches not in {0, 1}})
        assert not unexpected, f"Accessions without exactly 0 or 1 matches: {unexpected}"

        # Add the columns to the sheet (only the first frame ID is kept; None when there is no match)
        sheet["n_matches"] = n_matches
        sheet["frame_id"] = [frame_id[0] if frame_id is not None else None for frame_id in frame_ids]
finally:
    # Never leave the credentials in the kernel namespace, even when the cell fails:
    # they are prompted for again at the top of this cell on every run.
    del u
    del p

# Only reached when every sheet was processed: reset the cache
del seen

100%|██████████| 3/3 [00:01<00:00,  2.87it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


In [13]:
import os

# Write each sheet to its own CSV file (without the index), creating the
# output directory first if it does not exist yet.
os.makedirs("../clean/omics", exist_ok=True)
for _frame, _destination in [
    (rna_rel, "../clean/omics/rna-rel.csv"),
    (rna_abs, "../clean/omics/rna-abs.csv"),
    (prot, "../clean/omics/prot.csv"),
]:
    _frame.to_csv(_destination, index=False)

In [32]:
# Get the reactions associated with the genes

# Fresh per-thread storage: each worker thread keeps its own session on this object (see get_reactions)
thread_local = threading.local()

# Credentials handed to get_session by the workers; getpass keeps the password from being echoed
u = input("> Username: ")
p = getpass("> Password: ")

# Worker function to get the reactions of a given gene
def get_reactions(frameid):
    """Fetch the BioCyc reactions associated with a gene.

    Sends a ``reactions-of-gene`` request (``detail=full``) to the ``apixml``
    service for ``{RPOM_ORGID}:{frameid}``. The session is created with
    ``get_session(u, p)`` the first time a given thread calls this function and
    is then reused by that thread.

    Parameters
    ----------
    frameid
        Gene frame ID; interpolated as-is into the ``id=`` query parameter.

    Returns
    -------
    tuple
        ``(n_results, reaction_ids)`` where ``n_results`` is the ``num_results``
        value of the response metadata as an ``int`` and ``reaction_ids`` is the
        list of ``@frameid`` attributes of the returned ``Reaction`` elements.
        ``reaction_ids`` is ``None`` when ``n_results`` is 0 or the response has
        no ``Reaction`` element.

    Raises
    ------
    Whatever ``raise_for_status()`` raises for an unsuccessful HTTP response.
    """
    if not hasattr(thread_local, "session"):
        thread_local.session = get_session(u, p)

    # https://websvc.biocyc.org/apixml?fn=reactions-of-gene&id={ORGID}:{FRAME-ID}&detail=full
    r = thread_local.session.get(f"https://websvc.biocyc.org/apixml?fn=reactions-of-gene&id={RPOM_ORGID}:{frameid}&detail=full")
    r.raise_for_status()
    d = xmltodict.parse(r.text)

    # xmltodict returns element text as strings ('0', '1', ...), so convert
    # before comparing against 0 -- otherwise the comparison is never true.
    n_results = int(d["ptools-xml"]["metadata"]["num_results"])

    if n_results == 0 or "Reaction" not in d["ptools-xml"]:
        return n_results, None

    # A single reaction is parsed as a dict, several reactions as a list of dicts
    results = d["ptools-xml"]["Reaction"]
    if isinstance(results, dict):
        results = [results]

    reaction_ids = [result["@frameid"] for result in results]
    return n_results, reaction_ids
    

# Keep track of gene frameids that have already been visited, to avoid redundant requests across sheets.
# Maps frameid -> (n_results, reaction_ids) as returned by get_reactions.
try:
    genes_seen  # Hack to retain the value of genes_seen if this cell terminates early. Allows re-running the cell without losing progress.
                # (Resets if the cell completes successfully.)
except NameError:
    genes_seen = {}

try:
    for sheet in [rna_rel,
                  rna_abs,
                  prot]:

        sheet_frameids = list(sheet["frame_id"])

        # Frame IDs still to be looked up. dict.fromkeys drops repeats within the sheet
        # (keeping first-seen order), so each gene is requested at most once.
        # pd.notna treats both None and NaN as "no frame ID for this row".
        frameids = [frameid for frameid in dict.fromkeys(sheet_frameids)
                    if pd.notna(frameid) and frameid not in genes_seen]

        # Get the BioCyc reactions for the genes
        with tqdm(total=len(frameids)) as pbar:
            with concurrent.futures.ThreadPoolExecutor() as executor:
                futures = {
                    executor.submit(get_reactions, frameid): frameid
                    for frameid in frameids}
                for future in concurrent.futures.as_completed(futures):
                    frameid = futures[future]
                    # result() re-raises any exception from the worker; everything cached
                    # before that point stays in `genes_seen` for the next run of this cell.
                    genes_seen[frameid] = future.result()
                    pbar.update(1)

        # Transform the results into columns,
        # using cached values for frame IDs that have already been visited
        n_reactions = [int(genes_seen[frameid][0]) if pd.notna(frameid) else None for frameid in sheet_frameids]
        reaction_ids = [genes_seen[frameid][1] if pd.notna(frameid) else None for frameid in sheet_frameids]

        # Add the columns to the sheet (reaction IDs are stored as one comma-separated string per gene)
        sheet["n_reactions"] = n_reactions
        sheet["reaction_ids"] = [", ".join(ids) if ids is not None else None for ids in reaction_ids]
finally:
    # Never leave the credentials in the kernel namespace, even when the cell fails:
    # they are prompted for again at the top of this cell on every run.
    del u
    del p

# Only reached when every sheet was processed: reset the cache
del genes_seen

0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


In [33]:
import os

# Re-export the sheets so the CSV files also carry the reaction columns added above.
os.makedirs("../clean/omics", exist_ok=True)
_csv_outputs = {
    "../clean/omics/rna-rel.csv": rna_rel,
    "../clean/omics/rna-abs.csv": rna_abs,
    "../clean/omics/prot.csv": prot,
}
for _csv_path, _frame in _csv_outputs.items():
    _frame.to_csv(_csv_path, index=False)

In [26]:
# Ad-hoc inspection of the raw reactions-of-gene response for one gene (SPO_RS19175).
# NOTE(review): `s` is not defined in any cell shown here -- presumably a session left over
# from an earlier or deleted cell; this cell will fail on a fresh kernel. Confirm or remove.
r = s.get(f"https://websvc.biocyc.org/apixml?fn=reactions-of-gene&id={RPOM_ORGID}:SPO_RS19175&detail=full")
d = xmltodict.parse(r.text)
d

{'ptools-xml': {'@ptools-version': '28.0',
  '@xml:base': 'http://BioCyc.org/apixml?fn=reactions-of-gene%26id=GCF_000011965:SPO_RS19175%26detail=FULL',
  'metadata': {'url': 'http://BioCyc.org/',
   'service_name': 'apixml',
   'query': 'fn=reactions-of-gene&id=GCF_000011965:SPO_RS19175&detail=FULL',
   'num_results': '0'}}}

In [24]:
# NOTE(review): debugging leftover -- presumably shows the `frameid` loop variable left behind
# by the reactions cell; it is not defined on a fresh kernel until that cell has run.
frameid

'SPO_RS20335'

In [23]:
# Display the `rna-rel` sheet, including the annotation columns added by the cells above
rna_rel

Unnamed: 0,Rank,gene_callers_id,DSS3_ac_mean_abund,DSS3_glc_mean_abund,DSS3_late_mean_abund,DSS3_early_mean_abund,DSS3_ac_v_DSS3_glc_FOLD_CHANGE_rna,DSS3_ac_v_DSS3_late_FOLD_CHANGE_rna,DSS3_ac_v_DSS3_early_FOLD_CHANGE_rna,DSS3_glc_v_DSS3_late_FOLD_CHANGE_rna,...,DSS3_glc_v_DSS3_early_Tukey_HSD_adjusted_p,DSS3_late_v_DSS3_early_Tukey_HSD_adjusted_p,SPO_ID (ACCESSION),KOfam (ACCESSION),KEGG_Module,COG20_FUNCTION,n_matches,frame_id,n_reactions,reaction_ids
0,1,3780,0.067258,0.003105,0.003516,0.003365,4.436896,4.258451,4.322648,-0.178445,...,,,SPO3778,K02032,,ABC-type dipeptide/oligopeptide/nickel transpo...,1,SPO_RS19175,1,SPO_RS19175
1,2,3526,0.027681,0.195701,0.046760,0.025019,-2.825592,-0.755299,0.142076,2.070294,...,,,SPO3527,,,"Nucleotide-binding universal stress protein, ...",1,SPO_RS17870,1,SPO_RS17870
2,3,1113,0.010721,0.000607,0.000425,0.000980,4.143826,4.659230,3.456013,0.515404,...,,,SPO1112,K21395,,"TRAP-type C4-dicarboxylate transport system, p...",1,SPO_RS05645,1,SPO_RS05645
3,4,368,0.071601,0.003375,0.002749,0.037882,4.409182,4.703722,0.918856,0.294541,...,,,SPO0368,K14447,Ethylmalonyl pathway,"Methylmalonyl-CoA mutase, N-terminal domain/su...",1,SPO_RS01865,1,SPO_RS01865
4,5,323,0.130794,0.019961,0.019783,0.055113,2.715865,2.726692,1.247346,0.010827,...,,,SPO0325,K00023,"Fatty acid biosynthesis, elongation","NAD(P)-dependent dehydrogenase, short-chain al...",1,SPO_RS01655,1,SPO_RS01655
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4337,4338,2324,0.026165,0.026287,0.027611,0.026666,-0.014609,-0.094591,-0.040659,-0.079982,...,,,SPO2322,K03210,,Protein translocase subunit YajC (YajC),1,SPO_RS11770,1,SPO_RS11770
4338,4339,1665,0.040543,0.039794,0.038928,0.038554,0.024609,0.066833,0.084243,0.042225,...,,,SPO1664,K02838,,Ribosome recycling factor (Frr) (PDB:1DD5),1,SPO_RS08455,1,SPO_RS08455
4339,4340,1479,0.005247,0.005091,0.005149,0.005138,0.044644,0.025552,0.027654,-0.019093,...,,,SPO1476,K01703,"Isoleucine biosynthesis, pyruvate => 2-oxobuta...",Homoaconitase/3-isopropylmalate dehydratase la...,0,,0,
4340,4341,2870,0.012148,0.012356,0.012301,0.012292,-0.024253,-0.016251,-0.013494,0.008002,...,,,SPO2874,K02228,"Cobalamin biosynthesis, aerobic, uroporphyrino...",Precorrin-2 methylase (CobF) (PDB:2QBU),1,SPO_RS14590,1,SPO_RS14590
