# Vancomycin sample

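This notebook contains the code to preprocess the Vancomycin dataset: only the non-numerical attributes are kept, and rows containing infrequent attribute values are removed. We will also perform random undersampling (random samples of 100 to 400 instances, in steps of 50, drawn with a fixed seed) to obtain the data for E2.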

This notebook requires the Vancomycin dataset, which can be generated using the code in the following repository: https://github.com/antoniolopezmc/A-methodology-based-on-Trace-based-clustering-for-patient-phenotyping/

In [31]:
import pandas as pd
import os

In [32]:
# Read the preprocessed MIMIC-III dataset; the path is relative to the
# directory the notebook is run from.
df = pd.read_csv(filepath_or_buffer="../data/mimic-iii-preprocessed-db.csv")
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,patient_gender,exitus,admission_type,admission_location,discharge_location,culture_specimen_type_description,culture_microorganism_name,culture_susceptibility,service_when_culture,icu_when_culture,...,culture_mic_<=128,patient_age,days_since_last_admission,days_between_admission_and_first_ICU,days_between_last_vancomycin_treatment_and_culture__ALL_ADMISSIONS,duration_of_last_vancomycin_treatment__ALL_ADMISSIONS,number_of_last_vancomycin_treatments__ALL_ADMISSIONS,duration_in_natural_days_of_last_vancomycin_treatments_in_the_last_180_days__ALL_ADMISSIONS,number_of_last_vancomycin_treatments_in_the_last_180_days__ALL_ADMISSIONS,culture_month
0,M,Y,EMERGENCY,EMERGENCY ROOM ADMIT,DEAD/EXPIRED,BLOOD CULTURE,ENTEROCOCCUS FAECIUM,R,NO SERVICE,NO ICU,...,Y,88,141,0,-1,-1,0,-1,0,1
1,M,Y,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,DEAD/EXPIRED,BLOOD CULTURE,ENTEROCOCCUS FAECIUM,R,SURG,SICU,...,Y,75,223,8,0,5,1,7,1,1
2,F,N,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,BLOOD CULTURE,ENTEROCOCCUS FAECIUM,R,VSURG,SICU,...,Y,55,-1,0,-1,-1,0,-1,0,5
3,F,N,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,BLOOD CULTURE,ENTEROCOCCUS FAECIUM,R,MED,CSRU,...,Y,69,-1,1,-1,-1,0,-1,0,6
4,F,Y,EMERGENCY,EMERGENCY ROOM ADMIT,DEAD/EXPIRED,BLOOD CULTURE,ENTEROCOCCUS FAECIUM,R,NO SERVICE,NO ICU,...,Y,63,-1,0,-1,-1,0,-1,0,12


In [33]:
# Dimensions of the freshly loaded dataset as (rows, columns).
df.shape

(531, 26)

In [34]:
# Identify every int64/float64 column and discard it, so that only the
# non-numerical attributes remain in `df`.
numerical_columns = df.select_dtypes(include=["int64", "float64"]).columns
df = df.drop(numerical_columns, axis="columns")

In [35]:
# Discard the seven `culture_mic_<=N` columns.
# NOTE: with the default `errors="raise"`, running this cell a second time
# raises KeyError because the columns are already gone.
df = df.drop(
    columns=[
        'culture_mic_<=1',
        'culture_mic_<=2',
        'culture_mic_<=4',
        'culture_mic_<=8',
        'culture_mic_<=16',
        'culture_mic_<=32',
        'culture_mic_<=128',
    ]
)
# Show the attributes that are left.
df.columns

Index(['patient_gender', 'exitus', 'admission_type', 'admission_location',
       'discharge_location', 'culture_specimen_type_description',
       'culture_microorganism_name', 'culture_susceptibility',
       'service_when_culture', 'icu_when_culture'],
      dtype='object')

In [36]:
# Total number of distinct values, summed over all remaining columns
# (`nunique()` does not count missing values by default).
n_sel = sum(df[name].nunique() for name in df.columns)
n_sel

67

In [40]:
# Remove every row that contains an infrequent value in any column.
#
# A value is "uncommon" when it occurs fewer than MIN_VALUE_COUNT times in its
# own column. Rows are matched column by column: a label that is rare in one
# column no longer removes rows where the same label appears, legitimately and
# frequently, in a different column.
#
# All counts are taken before any row is removed (single pass), so after the
# filter some values may occur fewer than MIN_VALUE_COUNT times.
MIN_VALUE_COUNT = 10

# Flat list of the rare values of all columns, kept for inspection.
uncommon_values = []
# Boolean mask over the rows of `df`: True when the row must be removed.
has_uncommon_value = pd.Series(False, index=df.index)
for c in df.columns:
    v_c = df[c].value_counts()
    rare_in_column = v_c[v_c < MIN_VALUE_COUNT].index
    uncommon_values.extend(rare_in_column.tolist())
    # Vectorized membership test restricted to this column's own rare values.
    has_uncommon_value |= df[c].isin(rare_in_column)

# Drop rows with uncommon values
remove_index = df.index[has_uncommon_value].tolist()
df = df.drop(index=remove_index)

In [41]:
# Dimensions (rows, columns) after the rows with uncommon values were dropped.
df.shape

(447, 10)

In [44]:
# Sample sizes to generate: 100, 150, ..., 400 (upper bound included).
n_instances = [size for size in range(100, 401, 50)]

In [None]:
# Create the output directory for the sampled CSV files. `exist_ok=True` makes
# the cell safe to re-run and avoids the race between checking for the
# directory and creating it.
os.makedirs("vancomicyn-samples", exist_ok=True)

In [46]:
# Draw one random sample per requested size and write it to its own CSV file.
for i in n_instances:
    # Fixed seed so that each sample size is reproducible.
    df_sample = df.sample(n=i, random_state=42)
    # Report every column whose number of distinct values differs between the
    # full dataset and the sample.
    for col in df_sample.columns:
        n_unique_full = df[col].nunique()
        n_unique_sample = df_sample[col].nunique()
        if n_unique_full == n_unique_sample:
            continue
        print("Different number of unique values", col, "for dataset", i)
        print(n_unique_full)
        print(n_unique_sample)
    output_path = "vancomicyn-samples/mimic-iii-preprocessed-db-sample-" + str(i) + ".csv"
    df_sample.to_csv(output_path, index=False)

100
150
200
250
300
350
400
