In [1]:
# Load packages
import json
import os
from os.path import join
import numpy as np
import re
import pandas as pd

# KSBL


In [2]:
# --- Spectra-related locations -------------------------------------------
# Folder that is walked recursively for the Bruker "info*" JSON files
PATH_to_spectra = '/Users/aline/MALDI-TOF-Machine_learning/01_Rawdata/01_Spectra/Validation_Set/KSBL/encoded/Spectra/'

# Target CSV for the code dictionary (not referenced by the cells shown in this notebook)
PATH_to_dict_output = '/Users/aline/MALDI-TOF-Machine_learning/01_Rawdata/01_Spectra/Validation_Set/KSBL/dicts_code_TGNR.csv'

# --- Resistance report location ------------------------------------------
# Folder holding the resistance export (not referenced by the cells shown in this notebook)
PATH_to_report = '/Users/aline/MALDI-TOF-Machine_learning/01_Rawdata/02_Resistances/Validation_Set/KSBL/'

In [3]:
# Build lookup dictionaries keyed by the Bruker code ('AnalyteUid').
# Step 1: parse every file whose name starts with "info" anywhere below PATH_to_spectra.
return_list_1 = []
runinfo_all_1 = []

for folder, _subfolders, filenames in os.walk(PATH_to_spectra):
    info_paths = [os.path.join(folder, filename) for filename in filenames if filename.startswith("info")]
    for info_path in info_paths:
        with open(info_path) as handle:
            runinfo_all_1.append(json.load(handle))

# Step 2: one single-entry mapping per parsed file, AnalyteUid -> AnalyteId (the sample name),
# then flatten them; when a code occurs more than once, the entry seen last wins.
dicts_1 = [{info['AnalyteUid']: info['AnalyteId']} for info in runinfo_all_1]
dicts_all_1 = {code: sample for entry in dicts_1 for code, sample in entry.items()}

# Step 3: same construction for AnalyteUid -> ProjectName
# (per the original note, ProjectName carries the acquisition date of the spectrum).
dicts_2 = [{info['AnalyteUid']: info['ProjectName']} for info in runinfo_all_1]
dicts_all_2 = {code: project for entry in dicts_2 for code, project in entry.items()}

In [4]:
# Variant of the code -> samplename mapping whose keys carry a '_MALDI1' suffix, so codes stay
# unambiguous when combined with spectra from other devices.
# (This mapping is not referenced again in the cells shown here.)
dicts_all_15 = {"{}{}".format(code, '_MALDI1'): sample for code, sample in dicts_all_1.items()}

# Turn each dictionary into a two-column frame: the keys become the 'Bruker' column,
# the values become the second column.
df_date = pd.DataFrame([dicts_all_2]).T.rename_axis('Bruker').reset_index()
df_date.columns = ['Bruker', 'Projectname']

df_samplename = pd.DataFrame([dicts_all_1]).T.rename_axis('Bruker').reset_index()
df_samplename.columns = ['Bruker', 'Samplename']

# Combine both frames on the Bruker code (default inner join)
df = df_samplename.merge(df_date, on='Bruker')

In [11]:
# Load species identification reports, outputted from the Bruker Database.
# All five exports are read the same way: ';'-separated, no header row.
# NOTE(review): the second report used to be read from 'KSBL-1.csv' a second time, so
# 'KSBL-2.csv' was never loaded and drop_duplicates() below hid the repeated rows.
# Confirm that 'KSBL-2.csv' exists next to the other exports.
report_01, report_02, report_03, report_04, report_05 = (
    pd.read_csv(
        '/Users/aline/MALDI-TOF-Machine_learning/02_Scripts/KSBL/KSBL-{}.csv'.format(report_number),
        sep=';',
        header=None,
    )
    for report_number in range(1, 6)
)

# Stack the reports row-wise. pd.concat replaces the chained DataFrame.append calls
# (DataFrame.append was removed in pandas 2.0); original row labels are kept.
report = pd.concat([report_01, report_02, report_03, report_04, report_05])

# Drop the 8th column (empty, according to the original note)
report = report.drop(report.columns[7], axis=1)

# Name the seven remaining columns and remove fully identical rows
report.columns = ['Bruker', 'Value','A','Organism_best_match', 'Score1', 'Organism(second best match)', 'Score2']
report = report.drop_duplicates()


In [6]:
# Attach samplename and projectname to every report row via the Bruker code.
# Right join: all rows of `report` are kept, even without a matching code in `df`.
report_TGNR = df.merge(report, how='right', on='Bruker')

In [7]:
# Load in AMR profiles
res = pd.read_csv('/Users/aline/MALDI-TOF-Machine_learning/01_Rawdata/02_Resistances/Validation_Set/KSBL/DatenexportResiKSBLJanbisJun2018fuerMALDIStudie.csv')

# Reduce the order number ('Auftrag') to the first run of seven digits it contains;
# values without such a run become NaN. Raw string: '\d' is not a valid escape
# sequence in a normal string literal.
res['Auftrag'] = res['Auftrag'].astype(str)
res['Auftrag'] = res['Auftrag'].str.extract(r'(\d{7})', expand=False).str.strip()

# Split the organism name ('Keim') on single whitespace characters once and reuse the tokens:
# first token -> 'GENUS', second token -> 'SPEZIES_RES'.
res['Keim'] = res['Keim'].str.strip(' ')
keim_tokens = res['Keim'].str.split(r'\s')
res['SPEZIES_RES'] = keim_tokens.str[1]
res['GENUS'] = keim_tokens.str[0]

In [8]:
# Extract species and genus identified by the Bruker Database. Create 'GENUS' column for matching.
report_TGNR['Organism_best_match'] = report_TGNR['Organism_best_match'].str.strip(' ')

# Split the best-match name on single whitespace characters once and reuse the tokens:
# first token -> 'GENUS', second token -> 'SPEZIES_MALDI'.
# Raw string: '\s' is not a valid escape sequence in a normal string literal.
organism_tokens = report_TGNR['Organism_best_match'].str.split(r'\s')
report_TGNR['SPEZIES_MALDI'] = organism_tokens.str[1]
report_TGNR['GENUS'] = organism_tokens.str[0]

# The order number ('Auftrag') is the part of the samplename before the first '-'
report_TGNR['Auftrag'] = report_TGNR['Samplename'].str.split('-').str[0]

In [9]:
# Merge report_TGNR with the resistance table on order number ('Auftrag') and genus.
# Right join: every report_TGNR row is kept, even without a matching resistance profile.
# The keys are passed as a list, the documented form for several join columns.
res_report = pd.merge(res, report_TGNR, on=['Auftrag', 'GENUS'], how='right')

In [10]:
# Persist the merged table as a ';'-separated CSV (the row index is written as well)
res_report.to_csv(
    '/Users/aline/MALDI-TOF-Machine_learning/02_Scripts/KSBL/KSBL_res_report.csv',
    sep=';',
)