## Imports

In [3]:
import requests
import time
import pandas as pd
import xml.etree.ElementTree as ET
from tqdm import tqdm
import os
import json
from dotenv import load_dotenv

## Functions

In [None]:
# Let python-dotenv populate the process environment before any variable is read.
load_dotenv()

# Key sent as `api_key` on every NCBI E-utilities request below (None when the variable is unset).
API_KEY = os.environ.get('API_KEY')

# CSV file used both for periodic checkpoints and for the final table.
SAVE_PATH = r'../data/interim/ast_dataset_abg.csv'

# Function to get BioSamples IDs
def get_biosamples(n_results):
    """
    Gets BioSamples IDs, filtering with 'antibiogram[filter]'.

    Parameters:
        n_results (int): Maximum number of IDs to request (sent as `retmax`).

    Returns:
        list: The IDs found under `esearchresult.idlist`; empty when the
        response carries none.

    Raises:
        requests.RequestException: On connection failure, timeout or an HTTP error status.
        ValueError: If the response body is not valid JSON.
    """
    # Passing the query through `params` lets requests URL-encode the values and
    # omit `api_key` entirely when API_KEY is None, instead of sending the
    # literal text "None".
    response = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        params={
            "db": "biosample",
            "term": "antibiogram[filter]",
            "retmode": "json",
            "retmax": n_results,
            "api_key": API_KEY,
        },
        timeout=120,  # same limit get_ast_data uses; avoids hanging forever
    )
    # Fail loudly on HTTP errors rather than reporting "Found 0 BioSamples".
    response.raise_for_status()
    data = response.json()
    biosample_ids = data.get("esearchresult", {}).get("idlist", [])
    print(f"Found {len(biosample_ids)} BioSamples with AST data.")
    return biosample_ids

# Function to get AST data
def get_ast_data(biosample_id):
    """
    Gets AST data using a specific BioSample from NCBI.

    Parameters:
        biosample_id (str | int): BioSample UID passed to esummary.

    Returns:
        dict | None: The `result[<id>]` entry of the esummary response with an
        extra "BioSample ID" key holding `biosample_id`, or None when the entry
        is missing/empty or the request or parsing fails.
    """
    # Keys of a decoded JSON object are always strings, so look the entry up by str.
    uid = str(biosample_id)
    try:
        # The request lives inside the try so that a single timeout or
        # connection error skips this sample instead of aborting the whole run.
        response = requests.get(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi",
            params={
                "db": "biosample",
                "id": uid,
                "retmode": "json",
                "api_key": API_KEY,  # dropped by requests when None
            },
            timeout=120,
        )
        response.raise_for_status()
        data = response.json()
        biosample_data = data.get("result", {}).get(uid)
        if not biosample_data:
            return None
        biosample_data["BioSample ID"] = biosample_id  # Ensure BioSample ID is always included
        return biosample_data
    except (requests.RequestException, ValueError, AttributeError, TypeError) as e:
        # RequestException: network/HTTP failure; ValueError: body is not JSON;
        # AttributeError/TypeError: payload does not have the expected dict shape.
        print(f"Error processing BioSample {biosample_id}: {e}")
        return None

# Function to load existing data
def load_existing_data():
    """Returns the CSV stored at SAVE_PATH as a DataFrame, or an empty DataFrame when no such file exists."""
    if not os.path.exists(SAVE_PATH):
        # Nothing saved yet: start from an empty table.
        return pd.DataFrame()
    return pd.read_csv(SAVE_PATH)

# Batch processing function
def _save_checkpoint(existing_df, new_rows):
    """Appends new_rows to existing_df, de-duplicates on 'BioSample ID', writes SAVE_PATH and returns the combined frame."""
    combined_df = pd.concat([existing_df, pd.DataFrame(new_rows)], ignore_index=True)
    if "BioSample ID" in combined_df.columns:
        combined_df = combined_df.drop_duplicates(subset=["BioSample ID"])
    combined_df.to_csv(SAVE_PATH, index=False)
    return combined_df


def batch_request(biosample_ids, fetch_function, batch_size=10, save_every=1000):
    """
    Processes BioSample IDs in batches and saves periodically.

    Parameters:
        biosample_ids (list): IDs to fetch.
        fetch_function (callable): Called once per ID; falsy results are discarded.
        batch_size (int): Number of IDs handled between two pauses.
        save_every (int): Write a checkpoint to SAVE_PATH once at least this
            many new records are pending.

    Returns:
        pd.DataFrame: Previously saved rows plus the newly fetched ones. When
        nothing new was fetched this is the previously saved data unchanged.
    """
    existing_df = load_existing_data()
    # IDs loaded back from CSV may have been parsed as integers while the IDs
    # passed in are typically strings; compare everything as str so that a
    # resumed run really skips what is already on disk.
    if "BioSample ID" in existing_df.columns:
        processed_ids = set(existing_df["BioSample ID"].astype(str))
    else:
        processed_ids = set()
    pending = []

    with tqdm(total=len(biosample_ids), desc="Processing BioSamples", unit="batch") as pbar:
        for i in range(0, len(biosample_ids), batch_size):
            chunk = biosample_ids[i:i + batch_size]
            batch = [bid for bid in chunk if str(bid) not in processed_ids]
            if not batch:
                # Advance by the slice size (not the filtered size, which is 0)
                # so the bar still reaches its total on resumed runs.
                pbar.update(len(chunk))
                continue
            batch_results = [fetch_function(bid) for bid in batch]
            pending.extend(res for res in batch_results if res)  # Remove None values
            pbar.update(len(chunk))

            if len(pending) >= save_every:
                existing_df = _save_checkpoint(existing_df, pending)
                pending = []
            time.sleep(0.3)

    if pending:
        existing_df = _save_checkpoint(existing_df, pending)
    print(f"Processing completed. Data saved in '{SAVE_PATH}'.")
    # existing_df always exists, unlike a frame created only when rows are pending.
    return existing_df

# Function to parse BioSample XML
def parse_biosample_to_columns(xml_string):
    """
    Parses XML and extracts structured data.

    Parameters:
        xml_string (str): XML document whose root element carries the
            `accession` attribute.

    Returns:
        dict: "BioSample ID" (the root's `accession`, "" when absent), one
        entry per descriptive field ("" when the element is missing or empty),
        and "antibiogram", a dict mapping normalised antibiotic names to the
        text of the second cell of each antibiogram row.

    Raises:
        xml.etree.ElementTree.ParseError: If xml_string is not well-formed XML.
    """
    root = ET.fromstring(xml_string)
    data = {"BioSample ID": root.attrib.get("accession", "")}
    fields = {
        "Sample Name": ".//Id[@db_label='Sample name']",
        "Organism": ".//Organism/OrganismName",
        "Collection Date": ".//Attribute[@harmonized_name='collection_date']",
        "Geographic Location": ".//Attribute[@harmonized_name='geo_loc_name']",
        "Isolation Source": ".//Attribute[@harmonized_name='isolation_source']",
    }
    for key, path in fields.items():
        element = root.find(path)
        # An element without content has text None; report it as "" exactly
        # like a missing element.
        data[key] = (element.text or "") if element is not None else ""

    antibiogram = {}
    for row in root.findall(".//Table[@class='Antibiogram.1.0']/Body/Row"):
        cells = row.findall("Cell")
        if len(cells) < 2:
            continue
        # `.text` is None for an empty <Cell/>, so guard before calling strip().
        antibiotic = (cells[0].text or "").strip().lower().replace(" ", "_")
        if not antibiotic:
            continue  # a row without an antibiotic name cannot be keyed
        antibiogram[antibiotic] = (cells[1].text or "").strip()
    data["antibiogram"] = antibiogram
    return data

# Function to expand sampledata
def expand_sampledata(df):
    """
    Expands 'sampledata' column into structured columns.

    Parameters:
        df (pd.DataFrame): Table that may contain a 'sampledata' column of XML strings.

    Returns:
        pd.DataFrame: `df` itself when it has no 'sampledata' column; otherwise
        `df` without 'sampledata' plus the parsed columns placed side by side.
        Rows whose 'sampledata' is missing get NaN in the parsed columns.
    """
    if "sampledata" not in df.columns:
        return df
    samples = df["sampledata"].dropna()
    parsed_rows = [parse_biosample_to_columns(xml) for xml in samples]
    # Reuse the index labels of the rows that were actually parsed. concat with
    # axis=1 aligns on the index, so a fresh 0..n-1 index would attach parsed
    # values to the wrong rows whenever NaNs were dropped or df's index is not
    # a plain 0..n-1 range.
    df_expanded = pd.DataFrame(parsed_rows, index=samples.index)
    return pd.concat([df.drop(columns=["sampledata"]), df_expanded], axis=1)

# Main processing function
def process_biosamples(n_results):
    """
    Runs the whole extraction: searches for IDs, fetches each record in batches,
    then expands the 'sampledata' column.

    Parameters:
        n_results (int): Forwarded to get_biosamples.

    Returns:
        pd.DataFrame: Output of expand_sampledata applied to the fetched records.
    """
    ids = get_biosamples(n_results)
    fetched = batch_request(ids, get_ast_data)
    return expand_sampledata(pd.DataFrame(fetched))

## Data Extraction

In [None]:
# Run the full extraction, asking the search step for up to 40000 BioSample IDs.
df = process_biosamples(40000)
# Preview the first rows of the resulting table.
df.head()

Found 37894 BioSamples with AST data.


Processing BioSamples: 100%|██████████| 37894/37894 [3:58:04<00:00,  2.65batch/s]


Processing completed. Data saved in '/mnt/drive/MyDrive/Colab Notebooks/data/Flemming ML/ast_dataset_abg.csv'.


Unnamed: 0,uid,title,accession,date,publicationdate,modificationdate,organization,taxonomy,organism,sourcesample,...,package,sortkey,BioSample ID,BioSample ID.1,Sample Name,Organism,Collection Date,Geographic Location,Isolation Source,antibiogram
0,46923997,Pathogen: clinical or host-associated sample f...,SAMN46923997,2025/02/20,2025/02/20,2025/02/20,Rhode Island Department of Health State Health...,485,Neisseria gonorrhoeae,BioSample:SAMN46923997,...,Pathogen: clinical or host-associated; version...,20250220,46923997,SAMN46923997,RISHL25NGS004,Neisseria gonorrhoeae,2025-01-31,USA:RI,Rectal,"{'ciprofloxacin': 'intermediate', 'penicillin'..."
1,46841726,Pathogen: clinical or host-associated sample f...,SAMN46841726,2025/02/14,2025/02/14,2025/02/14,Instituto de Pesquisa Pele Pequeno Principe,573,Klebsiella pneumoniae,BioSample:SAMN46841726,...,Pathogen: clinical or host-associated; version...,20250214,46841726,SAMN46841726,Kp181,Klebsiella pneumoniae,2023-09-04,Brazil: Curitiba,blood,"{'cefotaxime': 'resistant', 'ceftazidime': 're..."
2,46841725,Pathogen: clinical or host-associated sample f...,SAMN46841725,2025/02/14,2025/02/14,2025/02/14,Instituto de Pesquisa Pele Pequeno Principe,573,Klebsiella pneumoniae,BioSample:SAMN46841725,...,Pathogen: clinical or host-associated; version...,20250214,46841725,SAMN46841725,K174,Klebsiella pneumoniae,2023-08-09,Brazil: Curitiba,blood,"{'cefotaxime': 'resistant', 'ceftazidime': 're..."
3,46841724,Pathogen: clinical or host-associated sample f...,SAMN46841724,2025/02/14,2025/02/14,2025/02/14,Instituto de Pesquisa Pele Pequeno Principe,573,Klebsiella pneumoniae,BioSample:SAMN46841724,...,Pathogen: clinical or host-associated; version...,20250214,46841724,SAMN46841724,K159,Klebsiella pneumoniae,2022-08-18,Brazil: Curitiba,blood,"{'cefotaxime': 'resistant', 'ceftazidime': 're..."
4,46841723,Pathogen: clinical or host-associated sample f...,SAMN46841723,2025/02/14,2025/02/14,2025/02/14,Instituto de Pesquisa Pele Pequeno Principe,573,Klebsiella pneumoniae,BioSample:SAMN46841723,...,Pathogen: clinical or host-associated; version...,20250214,46841723,SAMN46841723,K158,Klebsiella pneumoniae,2022-08-09,Brazil: Curitiba,ascitic fluid,"{'cefotaxime': 'resistant', 'ceftazidime': 're..."


## Data Loading

In [None]:
# Reload the saved AST table; "BioSample ID.1" is read as str because it is the
# key later matched against the genetic dataset's "BioSample" column.
ast_df = pd.read_csv(r'../data/interim/ast_dataset_abg.csv', dtype={"BioSample ID.1": str})
ast_df.head()

Unnamed: 0.1,Unnamed: 0,uid,title,accession,date,publicationdate,modificationdate,organization,taxonomy,organism,...,package,sortkey,BioSample ID,BioSample ID.1,Sample Name,Organism,Collection Date,Geographic Location,Isolation Source,antibiogram
0,0,46923997,Pathogen: clinical or host-associated sample f...,SAMN46923997,2025/02/20,2025/02/20,2025/02/20,Rhode Island Department of Health State Health...,485,Neisseria gonorrhoeae,...,Pathogen: clinical or host-associated; version...,20250220,46923997,SAMN46923997,RISHL25NGS004,Neisseria gonorrhoeae,2025-01-31,USA:RI,Rectal,"{'ciprofloxacin': 'intermediate', 'penicillin'..."
1,1,46841726,Pathogen: clinical or host-associated sample f...,SAMN46841726,2025/02/14,2025/02/14,2025/02/14,Instituto de Pesquisa Pele Pequeno Principe,573,Klebsiella pneumoniae,...,Pathogen: clinical or host-associated; version...,20250214,46841726,SAMN46841726,Kp181,Klebsiella pneumoniae,2023-09-04,Brazil: Curitiba,blood,"{'cefotaxime': 'resistant', 'ceftazidime': 're..."
2,2,46841725,Pathogen: clinical or host-associated sample f...,SAMN46841725,2025/02/14,2025/02/14,2025/02/14,Instituto de Pesquisa Pele Pequeno Principe,573,Klebsiella pneumoniae,...,Pathogen: clinical or host-associated; version...,20250214,46841725,SAMN46841725,K174,Klebsiella pneumoniae,2023-08-09,Brazil: Curitiba,blood,"{'cefotaxime': 'resistant', 'ceftazidime': 're..."
3,3,46841724,Pathogen: clinical or host-associated sample f...,SAMN46841724,2025/02/14,2025/02/14,2025/02/14,Instituto de Pesquisa Pele Pequeno Principe,573,Klebsiella pneumoniae,...,Pathogen: clinical or host-associated; version...,20250214,46841724,SAMN46841724,K159,Klebsiella pneumoniae,2022-08-18,Brazil: Curitiba,blood,"{'cefotaxime': 'resistant', 'ceftazidime': 're..."
4,4,46841723,Pathogen: clinical or host-associated sample f...,SAMN46841723,2025/02/14,2025/02/14,2025/02/14,Instituto de Pesquisa Pele Pequeno Principe,573,Klebsiella pneumoniae,...,Pathogen: clinical or host-associated; version...,20250214,46841723,SAMN46841723,K158,Klebsiella pneumoniae,2022-08-09,Brazil: Curitiba,ascitic fluid,"{'cefotaxime': 'resistant', 'ceftazidime': 're..."


In [None]:
# Load the genetic dataset; "BioSample" is read as str so it matches the
# string key on the AST side of the merge.
gm_df = pd.read_csv(r'../data/interim/filtered_genetic_dataset.csv', dtype={"BioSample": str})
gm_df.head()

Unnamed: 0.1,Unnamed: 0,#Scientific name,Protein,BioSample,Isolate,Contig,Start,Stop,Strand,Element symbol,Element name,Type,Scope,Subtype,Class,Subclass,Method,% Coverage of reference,% Identity to reference
0,2729,"Salmonella enterica subsp. enterica serovar 4,...",ECH9088530.1,SAMN02640777,PDT000003687.3,AAITMG010000001.1,536274,539927,-,iroC,salmochelin/enterobactin export ABC transporte...,VIRULENCE,plus,VIRULENCE,,,BLASTP,98.85,80.0
1,2730,Salmonella enterica subsp. enterica serovar Ke...,EBM9789732.1,SAMN02640778,PDT000003688.4,AAGEIK010000001.1,306785,307147,-,arsD,arsenite efflux transporter metallochaperone ArsD,STRESS,plus,METAL,ARSENIC,ARSENITE,HMM,99.17,57.5
2,2731,Salmonella enterica subsp. enterica serovar Ke...,EBO3121721.1,SAMN02640780,PDT000003689.4,AAGICK010000001.1,306785,307147,-,arsD,arsenite efflux transporter metallochaperone ArsD,STRESS,plus,METAL,ARSENIC,ARSENITE,HMM,99.17,57.5
3,2732,Salmonella enterica subsp. enterica serovar Hadar,EBV6716087.1,SAMN02640788,PDT000003690.3,AAHFZG010000001.1,348371,349057,-,gtgA,type III secretion system effector protease GtgA,VIRULENCE,plus,VIRULENCE,,,EXACTP,100.0,100.0
4,2733,Salmonella enterica subsp. enterica,ECE0280831.1,SAMN02640789,PDT000003691.3,AAIGQB010000006.1,15091,16206,+,iroB,salmochelin biosynthesis C-glycosyltransferase...,VIRULENCE,plus,VIRULENCE,,,BLASTP,100.0,86.52


## Data Merging

In [None]:
def merge_ast_with_mechanisms(ast_df, mech_df):
    """
    Merges the AST dataset with genetic mechanisms, storing the latter as a list of records within a single column.

    Parameters:
        ast_df (DataFrame): AST Dataset, with Antibiogram information. Must
            contain the "BioSample ID.1" column used as merge key.
        mech_df (DataFrame): Genetic resistance mechanisms Dataset. Must contain
            "BioSample", "Protein", "Strand", "Element symbol", "Type" and "Scope".

    Returns:
        pd.DataFrame: One row per row of `ast_df`, plus a 'genetic_mechanisms'
        column holding a list of dicts (one per mechanism row of that
        BioSample), or NaN when the BioSample has no mechanisms.

    Raises:
        KeyError: If a required column is missing from either frame.
    """
    mech_cols = ["BioSample", "Protein", "Strand", "Element symbol", "Type", "Scope"]
    record_cols = mech_cols[1:]  # the grouping key is not repeated inside each record
    subset = mech_df[mech_cols]

    # Group the mechanism records by BioSample in one pass. Unlike
    # groupby(...).apply(...), this does not depend on the shape pandas gives
    # an empty apply result, so an empty mechanisms frame is handled too.
    grouped = {}
    for key, record in zip(subset["BioSample"], subset[record_cols].to_dict(orient="records")):
        if pd.isna(key):
            continue  # rows without a BioSample cannot be matched to any AST row
        grouped.setdefault(key, []).append(record)

    mech_df_grouped = pd.DataFrame(
        list(grouped.items()), columns=["BioSample", "genetic_mechanisms"]
    ).astype({"BioSample": subset["BioSample"].dtype})  # keep the key's original dtype

    # Each BioSample appears once on the right side, so the left merge never multiplies AST rows
    merged_df = ast_df.merge(mech_df_grouped, left_on="BioSample ID.1", right_on="BioSample", how="left")

    # Drop redundant 'BioSample' column
    merged_df = merged_df.drop(columns=["BioSample"])

    return merged_df

# Example usage: attach each BioSample's genetic mechanisms to the AST rows loaded above.
merged_df = merge_ast_with_mechanisms(ast_df, gm_df)

In [None]:
# Persist the merged table.
# NOTE(review): `index=False` is not passed, so the row index is written as an
# extra unnamed first column — confirm this is intended.
merged_df.to_csv(r'../data/interim/merge_dataset.csv')

In [None]:
"""

Possible columns to drop:

Internal data, not relevant:

- uid, title, accession, BioSample Internal ID, sortkey, package

Date/Geographical data (redundant, not entirely relevant):

- Date, publicationdate, modificationdate, organization, Publication Date, Last Update, Collection Date, Geographic Location, Latitude & Longitude

Somewhat redundant data:

- Host (we're only looking at human cases for now)
- infraspecies (too specific)
- Sample Name (might not help that much)

"""

# Quick look at the 'organism' column.
# NOTE(review): this reads `df` (the extraction result), not `merged_df` — confirm that is intended.
testdf = df["organism"]
testdf.head()

Unnamed: 0,organism
0,Neisseria gonorrhoeae
1,Klebsiella pneumoniae
2,Klebsiella pneumoniae
3,Klebsiella pneumoniae
4,Klebsiella pneumoniae
