by Akinde Kadjo

**The goal** here is to compile the raw data into a concise form. Because the raw data is large (20 GB), it is not included in the GitHub repository; the original data set can be found [here](https://www.kaggle.com/datasets/drscarlat/driams).

# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import os
from glob import glob
from tqdm import tqdm
from tqdm.keras import TqdmCallback
import time

# Functions

In [2]:
# Read every '.txt' spectrum file in a folder and compile them into a single
# DataFrame with one row per file.
def ms_to_pd(folder_path, n_bins=6000):
    """Compile the binned spectra stored in ``folder_path`` into one DataFrame.

    Each ``.txt`` file is read as a space-separated table and only its second
    column is kept. That column becomes one row of the result, and the file
    name without its extension becomes the row's ``id``.

    Parameters
    ----------
    folder_path : str
        Directory containing the per-spectrum ``.txt`` files. Files with any
        other extension are ignored.
    n_bins : int, default 6000
        Number of values expected in every file; also the number of
        ``bin0`` .. ``bin{n_bins-1}`` columns in the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``id, bin0, ..., bin{n_bins-1}`` with a fresh 0..n-1 index.
        Rows are ordered by file name. If the folder holds no ``.txt`` files,
        an empty frame with those columns is returned.

    Raises
    ------
    ValueError
        If a file does not contain exactly ``n_bins`` values.
    """
    col_list = [f'bin{i}' for i in range(n_bins)]

    # os.listdir returns names in arbitrary order: sort for reproducible rows,
    # and filter up front so the progress bar only counts spectrum files.
    txt_files = sorted(name for name in os.listdir(folder_path) if name.endswith('.txt'))
    if not txt_files:
        return pd.DataFrame(columns=['id'] + col_list)

    ids = []
    rows = []
    for filename in tqdm(txt_files):
        file_path = os.path.join(folder_path, filename)
        intensities = pd.read_csv(file_path, sep=' ', usecols=[1]).iloc[:, 0].to_numpy()
        if len(intensities) != n_bins:
            raise ValueError(
                f'{file_path}: expected {n_bins} bins, found {len(intensities)}'
            )
        ids.append(os.path.splitext(filename)[0])
        rows.append(intensities)

    # Build the frame once from a stacked array instead of concatenating one
    # single-row DataFrame per file, then put 'id' first without copying the
    # whole table a second time.
    df = pd.DataFrame(np.vstack(rows), columns=col_list)
    df.insert(0, 'id', ids)
    return df

In [3]:
# function to set data frame columns in a given order
def orderdf(df, new_list):
    """Return a new frame whose columns follow the order given in ``new_list``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    new_list : iterable of column labels
        Desired column order. Labels that are absent from ``df`` produce
        all-NaN columns; columns of ``df`` not listed are dropped. A label
        that appears more than once is used only at its first position.

    Returns
    -------
    pandas.DataFrame
        Frame with the same rows and index as ``df`` and columns in the
        requested order.
    """
    # dict.fromkeys removes repeated labels while keeping first-seen order, so
    # each requested column appears exactly once in the result.
    ordered_cols = list(dict.fromkeys(new_list))
    # reindex keeps df's own index. Assigning columns one by one into an empty
    # frame would align on index instead, turning real columns into NaN when a
    # missing column came first and df's index was not 0..n-1.
    return df.reindex(columns=ordered_cols)

# Mass Spectra

In [16]:
#a quick look at one of the files
# Read a single space-separated spectrum file from the 2015 folder to inspect
# its shape and first rows before compiling the whole folder.
file_path = 'binned_6000/2015/000d2b4a-ca7f-41c6-a9a2-968874ee9ce4.txt'
df = pd.read_csv(file_path, sep=' ')
print(f'df shape {df.shape}')
df.head(2)

df shape (6000, 2)


Unnamed: 0,bin_index,binned_intensity
0,0,0.000288
1,1,0.000433


In [17]:
#converting the 2015 data into pd
# Compiling the raw spectra is slow, so reuse the compressed CSV written by a
# previous run when it exists. Delete the file to force a rebuild.
compiled_2015 = 'compiled_ms2015.csv.gz'
if os.path.exists(compiled_2015):
    # round_trip parsing restores exactly the floats that to_csv wrote;
    # dtype keeps the ids as strings.
    df2015 = pd.read_csv(compiled_2015, compression='gzip',
                         dtype={'id': str}, float_precision='round_trip')
else:
    df2015 = ms_to_pd('binned_6000/2015')
    df2015.to_csv(compiled_2015, index=False, compression='gzip')
print(df2015.shape)
df2015.head(2)

100%|██████████| 3198/3198 [00:32<00:00, 99.45it/s] 


(3198, 6001)


Unnamed: 0,id,bin0,bin1,bin2,bin3,bin4,bin5,bin6,bin7,bin8,...,bin5990,bin5991,bin5992,bin5993,bin5994,bin5995,bin5996,bin5997,bin5998,bin5999
0,000d2b4a-ca7f-41c6-a9a2-968874ee9ce4,0.000288,0.000433,0.000165,0.000366,0.000182,0.000142,0.000268,0.000192,0.000301,...,2.2e-05,1.1e-05,1.2e-05,3.5e-05,6.2e-05,8.2e-05,6.2e-05,7.6e-05,9.3e-05,0.000126
1,0014baec-4eb8-4a78-913a-99cc773b62b7,0.000496,0.00017,0.000286,0.000547,0.000991,0.000563,0.000156,4e-05,0.000451,...,6.7e-05,8.5e-05,4.2e-05,4.8e-05,6.1e-05,8.5e-05,4.4e-05,2.7e-05,1e-05,8e-06


In [18]:
# Compile the 2016 spectra. This is slow, so reuse the compressed CSV written
# by a previous run when it exists. Delete the file to force a rebuild.
compiled_2016 = 'compiled_ms2016.csv.gz'
if os.path.exists(compiled_2016):
    # round_trip parsing restores exactly the floats that to_csv wrote;
    # dtype keeps the ids as strings.
    df2016 = pd.read_csv(compiled_2016, compression='gzip',
                         dtype={'id': str}, float_precision='round_trip')
else:
    df2016 = ms_to_pd('binned_6000/2016')
    df2016.to_csv(compiled_2016, index=False, compression='gzip')
print(df2016.shape)

100%|██████████| 34868/34868 [20:05<00:00, 28.92it/s]


(34868, 6001)


In [3]:
# Compile the 2017 spectra. This is slow, so reuse the compressed CSV written
# by a previous run when it exists. Delete the file to force a rebuild.
compiled_2017 = 'compiled_ms2017.csv.gz'
if os.path.exists(compiled_2017):
    # round_trip parsing restores exactly the floats that to_csv wrote;
    # dtype keeps the ids as strings.
    df2017 = pd.read_csv(compiled_2017, compression='gzip',
                         dtype={'id': str}, float_precision='round_trip')
else:
    df2017 = ms_to_pd('binned_6000/2017')
    df2017.to_csv(compiled_2017, index=False, compression='gzip')
print(df2017.shape)

100%|██████████| 43122/43122 [49:37<00:00, 14.48it/s]  


(43122, 6001)


In [4]:
# Compile the 2018 spectra. This is slow, so reuse the compressed CSV written
# by a previous run when it exists. Delete the file to force a rebuild.
compiled_2018 = 'compiled_ms2018.csv.gz'
if os.path.exists(compiled_2018):
    # round_trip parsing restores exactly the floats that to_csv wrote;
    # dtype keeps the ids as strings.
    df2018 = pd.read_csv(compiled_2018, compression='gzip',
                         dtype={'id': str}, float_precision='round_trip')
else:
    df2018 = ms_to_pd('binned_6000/2018')
    df2018.to_csv(compiled_2018, index=False, compression='gzip')
print(df2018.shape)

100%|██████████| 30069/30069 [26:19<00:00, 19.03it/s]  


(30069, 6001)


In [9]:
#combining all MS data
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each yearly frame keeps its own 0-based labels, so labels repeat and
# label-based lookups such as ms_df.loc[0] return several rows.
ms_df = pd.concat([df2015, df2016, df2017, df2018], ignore_index=True)
ms_df.shape

(111257, 6001)

# Labels

In [5]:
# Load the 2015 label table. low_memory=False makes pandas parse the file in
# one pass instead of in chunks, which avoids mixed-dtype inference per column.
id2015 = pd.read_csv('id/2015/2015_clean.csv', low_memory=False)
print(id2015.shape)
id2015.head(2)

(3198, 81)


Unnamed: 0,code,species,laboratory_species,Piperacillin-Tazobactam,Meropenem,Ciprofloxacin,Cefepime,Cotrimoxazole,Ceftazidime,Amikacin,...,Fosfomycin,Ticarcillin-Clavulan acid,Penicillin_without_endokarditis,Penicillin_with_endokarditis,Doxycycline,Cefoxitin_screen,Isoniazid_.4mg-l,Bacitracin,Vancomycin_GRD,Teicoplanin_GRD
0,74969164-613a-4455-ac8e-5666ee0dfade,MIX!Streptococcus pneumoniae,,,,,,,,,...,,,,,,,,,,
1,e9adf43d-679b-497c-9849-1fa214838dd3,Staphylococcus epidermidis,Staphylococcus epidermidis,R,R,R,R,S,-,-,...,,,-,-,,-,,,,


In [6]:
# Load the 2016-2018 label tables (same folder/file naming pattern per year),
# then report each table's shape in chronological order.
id2016, id2017, id2018 = (
    pd.read_csv(f'id/{year}/{year}_clean.csv', low_memory=False)
    for year in (2016, 2017, 2018)
)
for yearly_labels in (id2016, id2017, id2018):
    print(yearly_labels.shape)

(34868, 90)
(43122, 90)
(30069, 87)


In [7]:
# Gather every column name from the four yearly label tables, repeats included.
all_columns = [
    col
    for frame in (id2015, id2016, id2017, id2018)
    for col in frame.columns
]
print(len(all_columns))
# Unique column names in sorted order.
column_list = sorted(set(all_columns))
print(len(column_list))

348
92


In [8]:
# Display the sorted, de-duplicated column names collected from all years.
column_list

['5-Fluorocytosine',
 'Amikacin',
 'Aminoglycosides',
 'Amoxicillin',
 'Amoxicillin-Clavulanic acid',
 'Amoxicillin-Clavulanic acid_uncomplicated_HWI',
 'Amphotericin B',
 'Ampicillin-Amoxicillin',
 'Ampicillin-Sulbactam',
 'Anidulafungin',
 'Azithromycin',
 'Aztreonam',
 'Bacitracin',
 'Caspofungin',
 'Cefazolin',
 'Cefepime',
 'Cefixime',
 'Cefoxitin_screen',
 'Cefpodoxime',
 'Ceftarolin',
 'Ceftazidime',
 'Ceftazidime-Avibactam',
 'Ceftobiprole',
 'Ceftolozane-Tazobactam',
 'Ceftriaxone',
 'Cefuroxime',
 'Cefuroxime.1',
 'Chloramphenicol',
 'Ciprofloxacin',
 'Clarithromycin',
 'Clindamycin',
 'Colistin',
 'Cotrimoxazole',
 'Daptomycin',
 'Doxycycline',
 'Ertapenem',
 'Erythromycin',
 'Ethambutol_5mg-l',
 'Fluconazole',
 'Fosfomycin',
 'Fosfomycin-Trometamol',
 'Fusidic acid',
 'Gentamicin',
 'Gentamicin_high_level',
 'Imipenem',
 'Isavuconazole',
 'Isoniazid_.1mg-l',
 'Isoniazid_.4mg-l',
 'Itraconazole',
 'Levofloxacin',
 'Linezolid',
 'Meropenem',
 'Meropenem_with_meningitis',
 'Me

In [9]:
#re-arranging the list
# Drop the two 'Unnamed' columns by building a filtered copy. list.remove()
# mutates in place and raises ValueError once the entries are gone, so the
# cell could not be re-run; this form gives the same result on every run.
column_list = [col for col in column_list if col not in ('Unnamed: 0', 'Unnamed: 0.1')]
column_list

['5-Fluorocytosine',
 'Amikacin',
 'Aminoglycosides',
 'Amoxicillin',
 'Amoxicillin-Clavulanic acid',
 'Amoxicillin-Clavulanic acid_uncomplicated_HWI',
 'Amphotericin B',
 'Ampicillin-Amoxicillin',
 'Ampicillin-Sulbactam',
 'Anidulafungin',
 'Azithromycin',
 'Aztreonam',
 'Bacitracin',
 'Caspofungin',
 'Cefazolin',
 'Cefepime',
 'Cefixime',
 'Cefoxitin_screen',
 'Cefpodoxime',
 'Ceftarolin',
 'Ceftazidime',
 'Ceftazidime-Avibactam',
 'Ceftobiprole',
 'Ceftolozane-Tazobactam',
 'Ceftriaxone',
 'Cefuroxime',
 'Cefuroxime.1',
 'Chloramphenicol',
 'Ciprofloxacin',
 'Clarithromycin',
 'Clindamycin',
 'Colistin',
 'Cotrimoxazole',
 'Daptomycin',
 'Doxycycline',
 'Ertapenem',
 'Erythromycin',
 'Ethambutol_5mg-l',
 'Fluconazole',
 'Fosfomycin',
 'Fosfomycin-Trometamol',
 'Fusidic acid',
 'Gentamicin',
 'Gentamicin_high_level',
 'Imipenem',
 'Isavuconazole',
 'Isoniazid_.1mg-l',
 'Isoniazid_.4mg-l',
 'Itraconazole',
 'Levofloxacin',
 'Linezolid',
 'Meropenem',
 'Meropenem_with_meningitis',
 'Me

In [10]:
# Move 'code', 'laboratory_species' and 'species' to the front by name.
# Rotating the last three entries by position only works while these names
# happen to sort last, and rotates the list again on every re-run.
id_columns = ['code', 'laboratory_species', 'species']
column_list = id_columns + [col for col in column_list if col not in id_columns]

In [11]:
# Display the reordered list to check which columns now come first.
column_list

['code',
 'laboratory_species',
 'species',
 '5-Fluorocytosine',
 'Amikacin',
 'Aminoglycosides',
 'Amoxicillin',
 'Amoxicillin-Clavulanic acid',
 'Amoxicillin-Clavulanic acid_uncomplicated_HWI',
 'Amphotericin B',
 'Ampicillin-Amoxicillin',
 'Ampicillin-Sulbactam',
 'Anidulafungin',
 'Azithromycin',
 'Aztreonam',
 'Bacitracin',
 'Caspofungin',
 'Cefazolin',
 'Cefepime',
 'Cefixime',
 'Cefoxitin_screen',
 'Cefpodoxime',
 'Ceftarolin',
 'Ceftazidime',
 'Ceftazidime-Avibactam',
 'Ceftobiprole',
 'Ceftolozane-Tazobactam',
 'Ceftriaxone',
 'Cefuroxime',
 'Cefuroxime.1',
 'Chloramphenicol',
 'Ciprofloxacin',
 'Clarithromycin',
 'Clindamycin',
 'Colistin',
 'Cotrimoxazole',
 'Daptomycin',
 'Doxycycline',
 'Ertapenem',
 'Erythromycin',
 'Ethambutol_5mg-l',
 'Fluconazole',
 'Fosfomycin',
 'Fosfomycin-Trometamol',
 'Fusidic acid',
 'Gentamicin',
 'Gentamicin_high_level',
 'Imipenem',
 'Isavuconazole',
 'Isoniazid_.1mg-l',
 'Isoniazid_.4mg-l',
 'Itraconazole',
 'Levofloxacin',
 'Linezolid',
 'Me

In [12]:
# Give every yearly label table the same column layout (column_list order),
# printing each resulting shape as it is produced.
reordered_frames = []
for yearly_labels in (id2015, id2016, id2017, id2018):
    reordered = orderdf(yearly_labels, column_list)
    print(reordered.shape)
    reordered_frames.append(reordered)
df_2015, df_2016, df_2017, df_2018 = reordered_frames

90it [00:00, 1653.42it/s]


(3198, 90)


90it [00:00, 999.96it/s]


(34868, 90)


90it [00:00, 988.94it/s]


(43122, 90)


90it [00:00, 986.14it/s]

(30069, 90)





In [13]:
# Stack the four reordered yearly label frames into one table;
# ignore_index=True renumbers the rows 0..n-1 instead of keeping per-year labels.
id_df = pd.concat([df_2015,df_2016,df_2017,df_2018], ignore_index=True)
id_df.shape

(111257, 90)

In [14]:
#checking for duplicate
# duplicated() flags rows that repeat an earlier row across all columns;
# the sum is the number of such fully duplicated rows.
id_df.duplicated().sum()

0

In [15]:
# Save the combined label table as a gzip-compressed CSV, without writing the
# row index as a column.
id_df.to_csv('label.csv.gz', index=False, compression='gzip')