# Download and preprocessing of the DRIAMS datasets to obtain the files necessary to train the ResMLP model

The raw data from Weis et al. (2022) is available through the Dryad platform at:
[https://datadryad.org/stash/dataset/doi:10.5061/dryad.bzkh1899q](https://datadryad.org/stash/dataset/doi:10.5061/dryad.bzkh1899q)

This notebook requires the original files to be downloaded and extracted first; it then builds the processed long-format label table and the spectra matrices from them.


In [4]:
import os
import json
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np

In [5]:
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides any pandas/numpy warnings raised by the cells
# below (e.g. deprecation notices) — confirm that a blanket filter is intended.
warnings.filterwarnings("ignore")

In [None]:
# Build one long-format table (one row per sample/drug pair) from the per-year
# sample tables of DRIAMS A, B, C and D, harmonise the drug names, and save it.
sample_info = {
    "A": {
        2015: "/fs/pool/pool-miranda/Projects/AMR/ConformalAMR/data/Unpacked_data/DRIAMS-A/id/2015/2015_clean.csv",
        2016: "/fs/pool/pool-miranda/Projects/AMR/ConformalAMR/data/Unpacked_data/DRIAMS-A/id/2016/2016_clean.csv",
        2017: "/fs/pool/pool-miranda/Projects/AMR/ConformalAMR/data/Unpacked_data/DRIAMS-A/id/2017/2017_clean.csv",
        2018: "/fs/pool/pool-miranda/Projects/AMR/ConformalAMR/data/Unpacked_data/DRIAMS-A/id/2018/2018_clean.csv",
    },
    "B": {
        2018: "/fs/pool/pool-miranda/Projects/AMR/ConformalAMR/data/Unpacked_data/DRIAMS-B/id/2018/2018_clean.csv",
    },
    "C": {
        2018: "/fs/pool/pool-miranda/Projects/AMR/ConformalAMR/data/Unpacked_data/DRIAMS-C/id/2018/2018_clean.csv",
    },
    "D": {
        2018: "/fs/pool/pool-miranda/Projects/AMR/ConformalAMR/data/Unpacked_data/DRIAMS-D/id/2018/2018_clean.csv",
    },
}

# Columns removed before reshaping whenever a table contains them.
unused_columns = ["combined_code", "laboratory_species"]
# "R" and "I" are both encoded as 1, "S" as 0; every other value is left as-is
# by the replacement and is then removed by the 0/1 filter below.
response_codes = {"R": 1, "I": 1, "S": 0}

long_sample_info = []

for dataset, years in sample_info.items():
    for year, path in years.items():
        wide_table = pd.read_csv(path).drop(columns=unused_columns, errors="ignore")

        # Wide -> long: every column other than species/code becomes a
        # (drug, response) pair; rows containing any missing value are dropped.
        driams_cur = (
            wide_table.melt(id_vars=["species", "code"])
            .dropna()
            .rename(
                columns={"code": "sample_id", "variable": "drug", "value": "response"}
            )
        )
        driams_cur["response"] = driams_cur["response"].replace(response_codes)

        # Keep only rows whose response is 0 or 1, then tag them with their origin.
        driams_cur = driams_cur[driams_cur["response"].isin([0, 1])]
        driams_cur = driams_cur.assign(dataset=dataset, year=year)

        long_sample_info.append(driams_cur)

driams_long_table = pd.concat(long_sample_info)

# Discard rows whose species label contains "MIX".
has_mix_label = driams_long_table["species"].str.contains("MIX")
driams_long_table = driams_long_table[~has_mix_label]

# Fix drug names
# NOTE(review): "Rifamdin" looks like it could be a misspelling — confirm it
# matches the drug name expected by the downstream files before changing it.
drug_renames = {
    "Cotrimoxazole": "Cotrimoxazol",
    "Benzylpenicillin": "Penicillin",
    "Rifampicin": "Rifamdin",
    "Ticarcillin-Clavulan acid": "Ticarcillin-Clavulanic acid",
    "Minocycline": "Minocin",
    "Polymyxin B": "Polymyxin",
}
driams_long_table["drug"] = driams_long_table["drug"].replace(drug_renames)

# Melted columns that are excluded from the final table.
drugs_to_remove = [
    "Quinolones",
    "Cefalotin-Cefazolin",
    "Gentamicin_high_level",
    "Meropenem_without_meningitis",
    "Meropenem_with_meningitis",
    "Penicillin_with_other_infections",
    "Penicillin_with_meningitis",
    "Penicillin_with_pneumonia",
    "Aminoglycosides",
    "Meropenem_with_pneumonia",
    "Amoxicillin-Clavulanic acid_uncomplicated_HWI",
    "Penicillin_without_endokarditis",
    "Penicillin_with_endokarditis",
    "Cefoxitin_screen",
    "Rifampicin_1mg-l",
    "Vancomycin_GRD",
    "Teicoplanin_GRD",
    "Penicillin_without_meningitis",
    "Ceftazidime-Avibactam",
    "Ceftolozane-Tazobactam",
    "Cefuroxime.1",
    "Strepomycin_high_level",
    "Unnamed: 0.1",
    "Unnamed: 0",
    "Clindamycin_induced",
    "ESBL",
    "MRSA",
    "Benzylpenicillin_others",
    "Benzylpenicillin_with_meningitis",
    "Benzylpenicillin_with_pneumonia",
    "Meropenem-Vaborbactam",
]
is_removed_drug = driams_long_table["drug"].isin(drugs_to_remove)
driams_long_table = driams_long_table[~is_removed_drug]

# Save long table
driams_long_table.to_csv(
    "/fs/pool/pool-miranda/Projects/AMR/ConformalAMR/data/Processed/DRIAMS_combined_long_table_multidrug.csv",
    index=False,
)
driams_long_table

# DRIAMS A

In [None]:
# Build the DRIAMS-A spectra matrix: one row per sample whose binned spectrum
# file is found on disk, visited in sorted sample-ID order.
driams_a_samples = sorted(
    driams_long_table.loc[driams_long_table["dataset"] == "A", "sample_id"].unique()
)

dataset_folder = "../data/Unpacked_data/DRIAMS-A/binned_6000"  # Replace with the path where you downloaded the data if needed
spectra_matrix = []
# Samples without a file are skipped, so the matrix can have fewer rows than
# driams_a_samples. Row i of the matrix belongs to driams_a_loaded_samples[i];
# the skipped IDs are collected in driams_a_missing_samples.
driams_a_loaded_samples = []
driams_a_missing_samples = []
for sample_id in tqdm(driams_a_samples):
    sample_data = None
    # The file may sit in any of the yearly sub-folders; the first match wins.
    for year in ["2015", "2016", "2017", "2018"]:
        sample_path = os.path.join(dataset_folder, year, f"{sample_id}.txt")
        if os.path.exists(sample_path):
            sample_data = pd.read_csv(sample_path, sep=" ", index_col=0)
            break
    if sample_data is None:
        print(f"File for sample {sample_id} not found")
        driams_a_missing_samples.append(sample_id)
        continue
    spectra_matrix.append(sample_data["binned_intensity"].values)
    driams_a_loaded_samples.append(sample_id)

if driams_a_missing_samples:
    # Make the row/ID mismatch explicit instead of leaving it silent.
    print(
        f"WARNING: {len(driams_a_missing_samples)} of {len(driams_a_samples)} "
        "DRIAMS-A samples have no spectrum file; matrix rows follow "
        "driams_a_loaded_samples, not driams_a_samples"
    )
if not spectra_matrix:
    # np.vstack would fail with a generic message; point at the likely cause.
    raise ValueError(
        f"No DRIAMS-A spectra were found under {dataset_folder!r}; "
        "check the dataset_folder path"
    )

spectra_matrix = np.vstack(spectra_matrix)
print(spectra_matrix.shape)
np.save(
    "../data/Processed/Spectra/DRIAMS-A/spectra_binned_6000_all_multidrug.npy",
    spectra_matrix,
)

# DRIAMS B

In [None]:
# Build the DRIAMS-B spectra matrix: one row per sample ID, in sorted order.
is_driams_b = driams_long_table["dataset"] == "B"
driams_b_samples = sorted(driams_long_table.loc[is_driams_b, "sample_id"].unique())

dataset_folder = "../data/Unpacked_data/DRIAMS-B/binned_6000/2018"  # Replace with the path where you downloaded the data if needed
spectra_rows = []
for sample_id in tqdm(driams_b_samples):
    # Each spectrum is a space-separated text file named after its sample ID.
    # No existence check is made here, so a missing file makes read_csv raise.
    spectrum_path = os.path.join(dataset_folder, f"{sample_id}.txt")
    spectrum = pd.read_csv(spectrum_path, sep=" ", index_col=0)
    spectra_rows.append(spectrum["binned_intensity"].values)

# Stack the per-sample intensity vectors row-wise and store the result.
spectra_matrix = np.vstack(spectra_rows)
print(spectra_matrix.shape)
np.save(
    "../data/Processed/Spectra/DRIAMS-B/spectra_binned_6000_all_multidrug.npy",
    spectra_matrix,
)

# DRIAMS C

In [None]:
# Build the DRIAMS-C spectra matrix: one row per sample ID, in sorted order.
is_driams_c = driams_long_table["dataset"] == "C"
driams_c_samples = sorted(driams_long_table.loc[is_driams_c, "sample_id"].unique())

dataset_folder = "../data/Unpacked_data/DRIAMS-C/binned_6000/2018"  # Replace with the path where you downloaded the data if needed
spectra_rows = []
for sample_id in tqdm(driams_c_samples):
    # Each spectrum is a space-separated text file named after its sample ID.
    # No existence check is made here, so a missing file makes read_csv raise.
    spectrum_path = os.path.join(dataset_folder, f"{sample_id}.txt")
    spectrum = pd.read_csv(spectrum_path, sep=" ", index_col=0)
    spectra_rows.append(spectrum["binned_intensity"].values)

# Stack the per-sample intensity vectors row-wise and store the result.
spectra_matrix = np.vstack(spectra_rows)
print(spectra_matrix.shape)
np.save(
    "../data/Processed/Spectra/DRIAMS-C/spectra_binned_6000_all_multidrug.npy",
    spectra_matrix,
)

# DRIAMS D

In [None]:
# Build the DRIAMS-D spectra matrix: one row per sample ID, in sorted order.
is_driams_d = driams_long_table["dataset"] == "D"
driams_d_samples = sorted(driams_long_table.loc[is_driams_d, "sample_id"].unique())

dataset_folder = "../data/Unpacked_data/DRIAMS-D/binned_6000/2018"  # Replace with the path where you downloaded the data if needed
spectra_rows = []
for sample_id in tqdm(driams_d_samples):
    # Each spectrum is a space-separated text file named after its sample ID.
    # No existence check is made here, so a missing file makes read_csv raise.
    spectrum_path = os.path.join(dataset_folder, f"{sample_id}.txt")
    spectrum = pd.read_csv(spectrum_path, sep=" ", index_col=0)
    spectra_rows.append(spectrum["binned_intensity"].values)

# Stack the per-sample intensity vectors row-wise and store the result.
spectra_matrix = np.vstack(spectra_rows)
print(spectra_matrix.shape)
np.save(
    "../data/Processed/Spectra/DRIAMS-D/spectra_binned_6000_all_multidrug.npy",
    spectra_matrix,
)