#### Preprocess repetition data

DRIAMS-E: USB

DRIAMS-F: KSBL

In [204]:
import os

import numpy as np
import pandas as pd

In [205]:
# Folder that holds the raw repetition-measurement result tables.
PATH_REPETITION_FOLDER = '/links/groups/borgwardt/Projects/maldi_tof_diagnostics/amr_maldi_ml/codeAC/Ec_Sa_Repetition_Measurements'
# Input tables, one per site (DRIAMS-E is read from the USB file, DRIAMS-F from the ksbl file).
FILE_DRIAMS_E = os.path.join(PATH_REPETITION_FOLDER, 'Res_EcSa_USB.csv')
FILE_DRIAMS_F = os.path.join(PATH_REPETITION_FOLDER, 'Res_EcSa_ksbl.csv')

# Destinations of the reshaped per-site tables written by the last cell of this notebook.
OUTPUT_FILE_E = '/links/groups/borgwardt/Data/DRIAMS/DRIAMS-E/id/2019/2019_clean.csv'
OUTPUT_FILE_F = '/links/groups/borgwardt/Data/DRIAMS/DRIAMS-F/id/2019/2019_clean.csv'

In [206]:
# Load both site tables with every column kept as text (dtype=str), so that
# identifiers and labels are not coerced to numeric types by the parser.
df_E, df_F = (
    pd.read_csv(table_path, dtype=str)
    for table_path in (FILE_DRIAMS_E, FILE_DRIAMS_F)
)

In [207]:
# Show the distinct values of the 'AB' column for each site, one array per line.
print(*(frame['AB'].unique() for frame in (df_E, df_F)), sep='\n')

['Cefoxitin' 'Cefepim' 'Cotrimoxazol' 'Ciprofloxacin' 'Fosfomycin'
 'Benzylpenicillin' 'Cefoxitin.Screen' 'Oxacillin' 'Erythromycin'
 'Tetracyclin' 'Tigecycline' 'Induzierbare Clindamycin__Resistenz'
 'Clindamycin' 'Gentamycin' 'Vancomycin' 'Teicoplanin' 'Fusidinsaeure'
 'Rifampicin' 'Linezolid' 'Daptomycin' 'Mupirocin'
 'Piperacillin...Tazobactam' 'Ceftazidim' 'Ceftriaxon' 'Ertapenem'
 'Imipenem' 'Meropenem' 'Amikacin' 'Nitrofurantoin' 'Norfloxacin']
['Piperacillin...Tazobactam' 'Ceftazidim' 'Ceftriaxon' 'Cefoxitin'
 'Cefepim' 'Ertapenem' 'Imipenem' 'Meropenem' 'Amikacin' 'Nitrofurantoin'
 'Norfloxacin' 'Ciprofloxacin' 'Fosfomycin' 'Cotrimoxazol'
 'Benzylpenicillin' 'Cefoxitin.Screen' 'Oxacillin' 'Erythromycin'
 'Tetracyclin' 'Tigecycline' 'Induzierbare Clindamycin__Resistenz'
 'Clindamycin' 'Gentamycin' 'Vancomycin' 'Teicoplanin' 'Fusidinsaeure'
 'Rifampicin' 'Linezolid' 'Daptomycin' 'Mupirocin']


In [208]:
# Discard the 'Unnamed: 0', 'MHK' and 'TGNR' columns from both tables.
# The frames are rebound (no `inplace=True`) in a single unpacking step: if
# either drop raises (e.g. KeyError for a missing column), neither name is
# rebound, so the two tables cannot end up in different states.
df_E, df_F = (
    frame.drop(columns=['Unnamed: 0', 'MHK', 'TGNR'])
    for frame in (df_E, df_F)
)

In [209]:
# Distinct values of the 'ID' column in each table.
df_E_ids, df_F_ids = df_E['ID'].unique(), df_F['ID'].unique()

# IDs that occur in both tables; the list comes from a set, so its order
# carries no meaning.
common_ids = list(set(df_E_ids) & set(df_F_ids))

print(f'{len(df_E_ids)} unique IDs found in df_E_ids.',
      f'\n{len(df_F_ids)} unique IDs found in df_F_ids.')

print(f'{len(common_ids)} common unique IDs found.')

174 unique IDs found in df_E_ids. 
174 unique IDs found in df_F_ids.
174 common unique IDs found.


In [210]:
def _species_ids(frame, species):
    """Return the distinct 'ID' values of the rows whose 'Species' equals `species`."""
    return frame.loc[frame['Species'] == species, 'ID'].unique()


# E. coli: distinct IDs per table and the IDs shared by both tables.
df_E_Ecoli_ids = _species_ids(df_E, 'Escherichia coli')
df_F_Ecoli_ids = _species_ids(df_F, 'Escherichia coli')
common_Ecoli_ids = list(set(df_E_Ecoli_ids) & set(df_F_Ecoli_ids))

print(f'{len(df_E_Ecoli_ids)} unique Ecoli_IDs found in df_E_ids.',
      f'\n{len(df_F_Ecoli_ids)} unique Ecoli_IDs found in df_F_ids.')

print(f'{len(common_Ecoli_ids)} common unique Ecoli_IDs found.')

# Staph. aureus: same summary as above.
df_E_Saureus_ids = _species_ids(df_E, 'Staphylococcus aureus')
df_F_Saureus_ids = _species_ids(df_F, 'Staphylococcus aureus')
common_Saureus_ids = list(set(df_E_Saureus_ids) & set(df_F_Saureus_ids))

print(f'{len(df_E_Saureus_ids)} unique Saureus_IDs found in df_E_ids.',
      f'\n{len(df_F_Saureus_ids)} unique Saureus_IDs found in df_F_ids.')

print(f'{len(common_Saureus_ids)} common unique Saureus_IDs found.')


78 unique Ecoli_IDs found in df_E_ids. 
78 unique Ecoli_IDs found in df_F_ids.
78 common unique Ecoli_IDs found.
96 unique Saureus_IDs found in df_E_ids. 
96 unique Saureus_IDs found in df_F_ids.
96 common unique Saureus_IDs found.


In [211]:
# Move the identifying columns into a four-level row index; 'AB' is placed
# last on purpose, because the following cell pivots that level into columns.
df_E, df_F = (
    frame.set_index(['Species', 'Bruker', 'ID', 'AB'])
    for frame in (df_E, df_F)
)

In [212]:
# Pivot to wide format: keep only the 'Interpretation' values and turn the
# 'AB' index level (the innermost one) into columns, giving one row per
# (Species, Bruker, ID) and one column per antibiotic. Combinations without
# a recorded interpretation become NaN.
df_E, df_F = (
    frame['Interpretation'].unstack(level='AB')
    for frame in (df_E, df_F)
)

In [213]:
# Preview the first rows of both wide tables (plain-text rendering).
print(df_E.head(), df_F.head(), sep='\n')

AB                                                             Amikacin  \
Species          Bruker                               ID                  
Escherichia coli 05ff977e-a0a9-4712-b783-ed061e3d96fb 10072301        S   
                 06d73e8b-a09f-41d3-917a-ca8b0c0af402 10072500        S   
                 0736bb1d-8193-428e-a4aa-d56a8a6b7f74 10072303        S   
                 0c71c22a-f682-44bd-83e3-57b7dce07da7 3152966         S   
                 0cf3c806-85b8-42f8-8a17-8b6a28055b96 10059610        S   

AB                                                             Benzylpenicillin  \
Species          Bruker                               ID                          
Escherichia coli 05ff977e-a0a9-4712-b783-ed061e3d96fb 10072301              NaN   
                 06d73e8b-a09f-41d3-917a-ca8b0c0af402 10072500              NaN   
                 0736bb1d-8193-428e-a4aa-d56a8a6b7f74 10072303              NaN   
                 0c71c22a-f682-44bd-83e3-57b7dce07da7 31529

In [214]:
# Persist each wide table to its configured destination (index levels are
# written as leading columns, which is the to_csv default).
for frame, destination in ((df_E, OUTPUT_FILE_E), (df_F, OUTPUT_FILE_F)):
    frame.to_csv(destination)