In [9]:
import os

import bacdive
import pandas as pd
from tqdm.auto import tqdm

In [10]:
# Input tables, both read from the working directory: the genome table (it is
# joined later on its "Species" column) and a BacDive advanced-search export
# (it provides the "ID" and "species" columns used below).
genomes_csv = "genomes_dataset.csv"
bacdive_csv = "advsearch_bacdive_2025-02-06.csv"

df_genomes = pd.read_csv(genomes_csv)
df_bacdive = pd.read_csv(bacdive_csv)

In [None]:
# Collapse the BacDive export to a single row per species name; for a species
# listed several times, the first row encountered is the one retained.
is_repeat = df_bacdive.duplicated(subset="species")
df_bacdive = df_bacdive[~is_repeat]
df_bacdive

Unnamed: 0,ID,species,designation_header,strain_number_header,is_type_strain_header
0,159652,Abditibacterium utsteinense,R-68213,"DSM 105287, LMG 29911",1
1,219,Abiotrophia defectiva,,"DSM 9849, ATCC 49176, CIP 103242, SC 10, CCUG ...",1
40,175515,Absicoccus intestinalis,CLA-KB-P134,"DSM 114836, JCM 37183",0
41,164352,Absicoccus porci,YH-panp20,"JCM 32769, KCTC 15747, KCTC 1574",1
42,5447,Absiella tortuosa,,"DSM 3987, ATCC 25548, VPI 1084B",1
...,...,...,...,...,...
99255,14289,Zymomonas mobilis,pZM01 to pZM06,"DSM 424, ATCC 10988, NCIB 8938, NRRL B-806, IM...",1
99262,14290,Zymomonas mobilis subsp. francensis,AN0101,"DSM 18599, CIP 108684, LMG 22974",1
99263,14292,Zymomonas mobilis subsp. pomaceae,,"DSM 22645, ATCC 29192, LMG 448, NCIMB 11200, C...",1
99266,18486,unclassified,Hy m25,,0


In [None]:
# Attach the BacDive ID to every genome row by matching species names.
#
# Only the two BacDive columns that are needed take part in the merge. Merging
# the full export would suffix any column name that exists in both tables
# (e.g. an "Unnamed: 0" index column) with "_x"/"_y", and pruning by
# "not one of the genome columns" would then discard both copies, silently
# losing genome data.
bacdive_ids = df_bacdive[["species", "ID"]].rename(
    columns={"species": "Species", "ID": "bacdive_ID"}
)

# how="left" keeps every genome row; genomes whose species has no BacDive
# entry get NaN in "bacdive_ID". validate="many_to_one" raises if a species
# appears more than once on the BacDive side (i.e. the de-duplication cell was
# skipped), which would otherwise multiply genome rows.
df = df_genomes.merge(
    bacdive_ids,
    on="Species",
    how="left",
    validate="many_to_one",
)

In [None]:
# Initialize the Bacdive client with your email and API key.
# The credentials are taken from the BACDIVE_EMAIL / BACDIVE_API_KEY environment
# variables so that real secrets never have to be typed into (and saved with)
# the notebook. The placeholder strings are only a fallback for when the
# variables are unset and must be replaced before the client can authenticate.
client = bacdive.BacdiveClient(
    os.environ.get("BACDIVE_EMAIL", 'your_email@example.com'),
    os.environ.get("BACDIVE_API_KEY", 'your_api_key_here'),
)

In [None]:
# Keywords used to filter the physiological info: this list is passed as
# `filter=` to client.retrieve() in the download cell, presumably restricting
# each returned record to the named field(s) (here only "oxygen tolerance").
# NOTE(review): the name `filter` shadows Python's built-in filter(); it is kept
# because the download cell refers to the list by this name.
filter = ["oxygen tolerance"]

In [None]:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

N = 100  # number of BacDive IDs sent per search request
records = []

# Distinct BacDive IDs as plain ints. The merge cell stores them in the
# "bacdive_ID" column (renamed from "ID"); genomes without a BacDive match
# carry NaN there from the left merge and are skipped. Sorting makes the
# batches identical from run to run.
ids = sorted({int(i) for i in df["bacdive_ID"].dropna()})

# Set up retry mechanism
# NOTE(review): this session is never handed to `client` in this cell, so the
# retry policy presumably does not cover client.search()/client.retrieve().
# Confirm how the client issues its HTTP requests, or drop this set-up.
session = requests.Session()
retry = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["HEAD", "GET", "OPTIONS"]
)
adapter = HTTPAdapter(max_retries=retry)
session.mount("https://", adapter)
session.mount("http://", adapter)

# Query BacDive in batches of N IDs. search() reports how many entries matched;
# retrieve() then yields the matching entries, restricted by `filter`.
for start in tqdm(range(0, len(ids), N)):
    query = {"id": ids[start:start + N]}
    try:
        count = client.search(**query)
        if count > 0:
            records.extend(client.retrieve(filter=filter))
        else:
            print("No records found")
    except requests.exceptions.RequestException as e:
        # Best effort: report the failed batch and carry on with the next one.
        print(f"Request failed: {e}")

In [None]:
# Keep only the records that carry at least one non-empty (truthy) value;
# records whose values are all empty are dropped.
filtered_records = [record for record in records if any(record.values())]

In [None]:
def flatten_dict(d, parent_key='', sep='_'):
    """Flatten a nested dict into a single-level dict.

    Keys of nested dicts are joined to their parent key with ``sep``
    (``{"a": {"b": 1}}`` becomes ``{"a_b": 1}``). Only ``dict`` values are
    descended into; every other value, lists included, is copied unchanged.

    Args:
        d: The (possibly nested) dict to flatten.
        parent_key: Prefix for every key of ``d``; empty at the top level, in
            which case keys are used as they are.
        sep: Separator placed between a parent key and a child key.

    Returns:
        A new flat dict. An empty nested dict contributes no keys.
    """
    flat = {}
    for key, value in d.items():
        path = key if not parent_key else f"{parent_key}{sep}{key}"
        if not isinstance(value, dict):
            flat[path] = value
            continue
        # Nested mapping: recurse with the accumulated key path as prefix.
        flat.update(flatten_dict(value, path, sep=sep))
    return flat

# Map each BacDive ID to its list of entries, with every dict entry flattened
# to a single level via flatten_dict(); non-dict entries are kept as they are.
# If an ID occurs in more than one record, the last occurrence wins.
flattened_records = {
    rec_id: [
        flatten_dict(item) if isinstance(item, dict) else item
        for item in lst
    ]
    for record in filtered_records
    for rec_id, lst in record.items()
}

# To verify, print the first few keys and values
preview = list(flattened_records.items())[:3]
for k, entries in preview:
    print(f"bacdive_ID: {k}")
    print("Records:")
    for entry in entries:
        print(entry)
    print("----------")

bacdive_ID: 131202
Records:
{'oxygen tolerance_@ref': 22841, 'oxygen tolerance_oxygen tolerance': 'microaerophile'}
{'oxygen tolerance': 'microaerophile'}
----------
bacdive_ID: 131201
Records:
{'oxygen tolerance_@ref': 22840, 'oxygen tolerance_oxygen tolerance': 'microaerophile'}
{'oxygen tolerance': 'microaerophile'}
----------
bacdive_ID: 131200
Records:
{'oxygen tolerance_@ref': 22839, 'oxygen tolerance_oxygen tolerance': 'microaerophile'}
{'oxygen tolerance': 'microaerophile'}
----------


In [27]:
# Add an "oxygen_tolerance" column to df, filled from the flattened BacDive
# records, then export the result.
df["oxygen_tolerance"] = None

for bacdive_id, entries in flattened_records.items():
    for entry in entries:
        # An entry exposes the value either directly or under the flattened
        # nested key; the direct key takes precedence when it is present.
        if "oxygen tolerance" in entry:
            tolerance = entry["oxygen tolerance"]
        elif "oxygen tolerance_oxygen tolerance" in entry:
            tolerance = entry["oxygen tolerance_oxygen tolerance"]
        else:
            continue

        # Only plain strings are written. When an ID has several usable
        # entries, each one overwrites the previous, so the last one sticks.
        if isinstance(tolerance, str):
            matches = df["bacdive_ID"] == float(bacdive_id)
            df.loc[matches, "oxygen_tolerance"] = tolerance

df.to_csv("genomes_dataset_with_oxygen_tolerance.csv", index=False)