In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

import sys
sys.path.insert(0,'../')

from data_split.data_utils import DataSplitter

# Load preprocessed DRIAMS data

In [8]:
# Read the combined DRIAMS long table; the displayed columns are
# species, sample_id, drug, response and dataset.
data = pd.read_csv(
    filepath_or_buffer="../processed_data/DRIAMS_combined_long_table.csv",
)
# Bare last expression -> rich (truncated) DataFrame display.
data

Unnamed: 0,species,sample_id,drug,response,dataset
0,Staphylococcus epidermidis,e9adf43d-679b-497c-9849-1fa214838dd3,Meropenem,1,A
1,Staphylococcus epidermidis,e9adf43d-679b-497c-9849-1fa214838dd3,Ciprofloxacin,1,A
2,Staphylococcus epidermidis,e9adf43d-679b-497c-9849-1fa214838dd3,Cefepime,1,A
3,Staphylococcus epidermidis,e9adf43d-679b-497c-9849-1fa214838dd3,Cotrimoxazole,0,A
4,Staphylococcus epidermidis,e9adf43d-679b-497c-9849-1fa214838dd3,Imipenem,1,A
...,...,...,...,...,...
652766,Staphylococcus aureus,08bc8410-51ec-46d7-ac7b-afba9e6ba2cd_3313,Linezolid,0,D
652767,Staphylococcus aureus,08bc8410-51ec-46d7-ac7b-afba9e6ba2cd_3313,Rifampicin,0,D
652768,Staphylococcus aureus,08bc8410-51ec-46d7-ac7b-afba9e6ba2cd_3313,Tetracycline,0,D
652769,Staphylococcus aureus,08bc8410-51ec-46d7-ac7b-afba9e6ba2cd_3313,Tigecycline,0,D


# Examples

In [9]:
# Build the splitter on the full long table, configured with dataset="B".
# NOTE(review): presumably this restricts every split below to DRIAMS-B rows
# (the outputs only show dataset == "B") -- confirm against DataSplitter.
dsplit = DataSplitter(
    data,
    dataset="B",
)

## Selection of data for a specific species-drug combination
This split is used for the comparison against the baseline methods of the previous paper.

In [4]:
# Select the rows for a single species-drug combination.
# `other_df` is the second value returned by the selection; it is not used in this cell.
target_df, other_df = dsplit.baseline_selection(
    drug="Cefepime",
    species="Staphylococcus epidermidis",
)

# Split the selected rows into train and test parts.
train_data, test_data = dsplit.baseline_train_test_split(
    target_df,
    test_size=0.2,
)

# Report the split sizes, one per line.
# NOTE(review): the recorded output (165 / 55) is a 25% test share although
# test_size=0.2 is requested -- confirm how DataSplitter interprets test_size,
# or whether the stored output is stale.
print(len(train_data), len(test_data), sep="\n")
target_df

165
55


Unnamed: 0,species,sample_id,drug,response,dataset
0,Staphylococcus epidermidis,3f8cad92-c7a6-4ccf-988d-592f4c6f3151,Cefepime,1,B
1,Staphylococcus epidermidis,879dba75-4b6c-4b11-a912-72380f768a60,Cefepime,0,B
2,Staphylococcus epidermidis,5fb36b8f-2cc4-40b7-bd54-4d5e03f95465,Cefepime,0,B
3,Staphylococcus epidermidis,26be5509-662e-4562-abd2-d4aae49f78b7,Cefepime,0,B
4,Staphylococcus epidermidis,151e9f43-0491-4c61-ae12-c0bc6d0a033e,Cefepime,0,B
...,...,...,...,...,...
215,Staphylococcus epidermidis,5edc1be1-a932-435e-988f-59eff1fa007e,Cefepime,1,B
216,Staphylococcus epidermidis,008c7f1a-a6ee-46f4-97ac-cbac4eacc5b3,Cefepime,1,B
217,Staphylococcus epidermidis,b806941e-580a-4515-a9d1-043683c347d8,Cefepime,1,B
218,Staphylococcus epidermidis,28cb344a-fa89-47dd-8001-ba71dc07aae1,Cefepime,1,B


## Drug zero-shot split
The test data consists of all the samples for the specified drug.

The remaining data is partitioned into training and validation splits such that every remaining drug and every species is represented in the training data,
while no drug-species pair appears in both splits.

In [5]:
# Hold out every sample of the target drug as the zero-shot test set;
# the second returned frame is what remains for the train/validation split.
target_drug_df, train_val_df = dsplit.drug_zero_shot_split(
    drug="Cefepime",
)
target_drug_df

Unnamed: 0,species,sample_id,drug,response,dataset
0,Klebsiella pneumoniae,ca568529-351a-43af-8cec-7175488f66ea,Cefepime,0,B
1,Proteus mirabilis,0fd4fd1a-8233-400d-bdee-f0c13819dc6b,Cefepime,0,B
2,Staphylococcus aureus,b1fa5e1c-d013-4716-b4b0-6552ad1c50de,Cefepime,0,B
3,Staphylococcus aureus,00aa2ddb-1819-475c-85e6-ed39d6efe6bc,Cefepime,0,B
4,Proteus mirabilis,c81323a2-fabb-428b-a002-1cc7c5b5c0ad,Cefepime,0,B
...,...,...,...,...,...
1684,Escherichia coli,753d577a-d8a0-4929-acaf-39dd8c214196,Cefepime,0,B
1685,Staphylococcus aureus,02cf7dfe-c0c0-46e4-8848-168ddb20a336,Cefepime,0,B
1686,Staphylococcus lugdunensis,80cb2c77-a65a-4ad9-ad6e-f0c076fcd839,Cefepime,0,B
1687,Staphylococcus aureus,68d1147b-1606-46e3-a515-e992f0f84455,Cefepime,0,B


In [6]:
# Partition the non-target-drug data into training and validation splits.
train_data, val_data = dsplit.combination_train_test_split(train_val_df, test_size=0.2)
print(len(train_data))
print(len(val_data))

# Overlap of (drug, species) combinations between the train and val sets.
# The pairs are de-duplicated before merging: merging the raw rows is a
# many-to-many join, so a single shared pair would produce
# n_train_rows * n_val_rows result rows (inflated count, possible memory
# blow-up). With disjoint splits the printed value is 0 either way.
pair_cols = ["drug", "species"]
train_pairs = train_data[pair_cols].drop_duplicates()
val_pairs = val_data[pair_cols].drop_duplicates()
print(len(train_pairs.merge(val_pairs, on=pair_cols)))
train_val_df

24551
6137
0


Unnamed: 0,species,sample_id,drug,response,dataset
0,Klebsiella pneumoniae,ca568529-351a-43af-8cec-7175488f66ea,Amikacin,0,B
1,Klebsiella pneumoniae,ca568529-351a-43af-8cec-7175488f66ea,Ampicillin,1,B
2,Klebsiella pneumoniae,ca568529-351a-43af-8cec-7175488f66ea,Cefoxitin,0,B
3,Klebsiella pneumoniae,ca568529-351a-43af-8cec-7175488f66ea,Ceftazidime,0,B
4,Klebsiella pneumoniae,ca568529-351a-43af-8cec-7175488f66ea,Ceftriaxone,0,B
...,...,...,...,...,...
30683,Staphylococcus aureus,57043b13-3ba8-4f30-83ac-2416c23cec3a,Rifampicin,0,B
30684,Staphylococcus aureus,57043b13-3ba8-4f30-83ac-2416c23cec3a,Teicoplanin,0,B
30685,Staphylococcus aureus,57043b13-3ba8-4f30-83ac-2416c23cec3a,Tetracycline,0,B
30686,Staphylococcus aureus,57043b13-3ba8-4f30-83ac-2416c23cec3a,Tigecycline,0,B
