<img src="./University_Debrecen_logo.jpg" alt="Drawing" style="width: 200px;"/>

# Import and process blood gases and transcutaneous data (when available) and export them as pickle archives.

#### Author: Dr Gusztav Belteki

### 1. Import the required modules

In [None]:
import os
import sys
import pickle
from collections import defaultdict
from datetime import datetime

import pandas as pd
from pandas import DataFrame

In [None]:
# Report the interpreter and pandas versions used for this run
print(
    f'Python version: {sys.version}',
    f'pandas version: {pd.__version__}',
    sep='\n',
)

### 2. List and set the working directory and the directory to write out data

In [None]:
# Topic of the Notebook which will also be the name of the subfolder containing results
TOPIC = 'analysis_all'

# Path to clinical data and to folder to export results to
PATH = os.path.join(os.sep, 'Users', 'guszti', 'Library', 'Mobile Documents', 'com~apple~CloudDocs',
                    'Documents', 'Research', 'Ventilation')

# Name of the external hard drive
DRIVE = 'Guszti'

# Directory containing clinical and blood gas data
DIR_READ_CLIN = os.path.join(PATH, 'ventilation_draeger_debrecen')

# NOTE(review): DIR_READ_VENT (the directory holding the per-patient recording
# folders) is used by later cells but is not assigned anywhere in this
# notebook. It must be defined here before those cells can run - TODO confirm
# its location.

# Folder to write statistics and reports on the group
DIR_WRITE = os.path.join(os.sep, 'Users', 'guszti', 'ventilation_draeger_debrecen', 'Analyses', TOPIC)
os.makedirs(DIR_WRITE, exist_ok=True)

# Folder on external drive to export graphs and data about individual recordings.
# The volume name comes from DRIVE so that it only has to be changed in one place.
DATA_DUMP = os.path.join(os.sep, 'Volumes', DRIVE, 'data_dump', 'draeger_debrecen', TOPIC)
os.makedirs(DATA_DUMP, exist_ok=True)

In [None]:
# Display the configured directories for a visual check.
# NOTE(review): DIR_READ_VENT is not assigned in any cell of this notebook - confirm where it is defined.
DIR_READ_CLIN, DIR_READ_VENT, DIR_WRITE, DATA_DUMP

### 3. Import processed clinical details
This recording list is produced by the `Clinical_details_processing_debrecen.ipynb` notebook

In [None]:
# Load the clinical details table from the clinical data directory
clinical_details_path = os.path.join(DIR_READ_CLIN, 'clinical_details_patients.csv')
clinical_details = pd.read_csv(clinical_details_path)

In [None]:
# Preview the first rows of the clinical details table
clinical_details.head()

In [None]:
# Patient identifiers, in the order in which they appear in the clinical details table
patients = clinical_details['Patient'].tolist()
print(patients)

### 4. Import and process blood gases and export them as pickle archive

In [None]:
def _tidy_blood_gas_sheet(sheet):
    """Turn one transposed blood gas worksheet into a datetime-indexed table.

    The first row of ``sheet`` holds the column labels. The 'Date:' and
    'Time:' columns are merged into a single DatetimeIndex and then removed.
    'Blood specimen type, POC' is converted to a categorical column and every
    other column is cast to float.

    Returns a new DataFrame; ``sheet`` itself is not modified.
    """
    # Promote the first row to column labels and keep only the data rows
    tidy = sheet.iloc[1:].copy()
    tidy.columns = sheet.iloc[0]

    # Build 'YYYY-MM-DD HH:MM:SS'-style strings: only the first 10 characters
    # of the date value are used, followed by the time value
    stamps = [str(day)[:10] + ' ' + str(clock)
              for day, clock in zip(tidy['Date:'], tidy['Time:'])]
    tidy.index = pd.to_datetime(stamps)

    # Remove original date and time columns
    tidy = tidy.drop(['Date:', 'Time:'], axis=1)

    # Convert data types as appropriate
    for column in tidy.columns:
        if column == 'Blood specimen type, POC':
            tidy[column] = tidy[column].astype('category')
        else:
            tidy[column] = tidy[column].astype('float')
    return tidy


# Processed blood gases, keyed by patient identifier
blood_gases = {}

# Open the workbook once instead of re-reading the whole file for every patient
with pd.ExcelFile(os.path.join(DIR_READ_CLIN, 'blood_gas_Debrecen.xlsx')) as workbook:
    for patient in patients:
        print(patient)

        try:
            # One worksheet per patient; it is transposed after reading
            sheet = workbook.parse(sheet_name=patient, header=None).T
        except ValueError:
            # Raised when the worksheet cannot be read: no entry is stored for this patient
            print(f'No blood gas data for {patient}')
            continue

        if sheet.empty:  # Some recordings do not have blood gases
            print(f'No blood gas data for {patient}')
            # As before, the empty table is kept as this patient's entry
            blood_gases[patient] = sheet
            continue

        blood_gases[patient] = _tidy_blood_gas_sheet(sheet)

In [None]:
# Number of blood gas rows stored for each patient
for patient, gases in blood_gases.items():
    print(patient, len(gases))

In [None]:
# Preview the processed blood gases of patient LVD003
blood_gases['LVD003'].head()

In [None]:
# Summary (columns, dtypes, non-null counts) of the processed blood gases of patient LVD003
blood_gases['LVD003'].info()

In [None]:
# Write the dictionary of blood gas tables to a pickle archive in DATA_DUMP
blood_gases_archive = os.path.join(DATA_DUMP, 'blood_gases_processed.pickle')
with open(blood_gases_archive, 'wb') as handle:
    pickle.dump(blood_gases, handle, protocol=pickle.HIGHEST_PROTOCOL)

### 5. Import transcutaneous CO2 data

In [None]:
%%time

# Columns read from each transcutaneous monitor file; 'Date/Time' becomes the index
columns_to_keep = ['Date/Time', 'pCO2 [mmHg]', 'pO2 [mmHg]', 'SpO2 [%]', 'PR [bpm]',
                   'PI [%]', 'PI Low Alarm[%]', 'Plethysmogram',
                   'Heating Power [mW]', 'Sensor Temp set [°C]', 'Sensor Temp eff [°C]',
                   'Event Category', 'Event Details',]

# transcutaneous[patient][file name] -> DataFrame of that recording
transcutaneous = {}

for patient in patients:
    patient_dir = os.path.join(DIR_READ_VENT, patient)
    # os.listdir() returns entries in arbitrary order; sorting makes the order
    # of the recordings (and of everything built from them) reproducible
    files = sorted(file for file in os.listdir(patient_dir) if 'tcm5' in file)
    if not files:
        print(f'No transcutaneous data for {patient}')
        continue

    transcutaneous[patient] = {}
    for file in files:
        # Semicolon-separated file; the first 29 lines are skipped
        # (presumably a device header block - TODO confirm)
        data = pd.read_csv(os.path.join(patient_dir, file),
                           usecols=columns_to_keep, index_col='Date/Time', sep=';', skiprows=29)
        data.index = pd.to_datetime(data.index)
        # Discard columns and rows that contain no data at all
        data = data.dropna(how='all', axis=1)
        data = data.dropna(how='all', axis=0)
        transcutaneous[patient][file] = data
    

### 6. Process transcutaneous data and export them to pickle archive

#### A. For some transcutaneous recordings the time stamp is incorrect

In [None]:
# Table of time-stamp corrections, indexed by (patient, file); the remaining
# column 'time stamp to shift' holds the correction for each recording
shift_table_path = os.path.join(DIR_READ_CLIN, 'tcpCO2_files_timestamps_shifted.xlsx')
to_time_shift = (
    pd.read_excel(shift_table_path)
    .loc[:, ['patient', 'file', 'time stamp to shift']]
    .set_index(['patient', 'file'])
)
to_time_shift

In [None]:
# Preview one example recording before its time stamps are corrected
transcutaneous['LVD013']['LVD013_tcm5_20211229_143812.csv'].head()

In [None]:
# NOTE: this cell changes the stored indices in place, so running it more than
# once shifts the time stamps repeatedly.
for patient in transcutaneous:
    for file in transcutaneous[patient]:
        # How much time (in hours) tcCO2 timestamps need to be shifted to align it with ventilator data
        time_shift = to_time_shift.loc[patient, file].values[0]

        # A blank cell is read as NaN, which is truthy; it is treated here as
        # "no shift" (TODO confirm this is the intended meaning of a blank)
        if pd.isna(time_shift) or not time_shift:
            continue

        # If the shift is not zero, restore the correct time by adding it to every time stamp
        transcutaneous[patient][file].index = (
            transcutaneous[patient][file].index + pd.to_timedelta(time_shift, unit='h')
        )

In [None]:
# Preview the example recording after the time-stamp correction
transcutaneous['LVD013']['LVD013_tcm5_20211229_143812.csv'].head()

#### B. Keep only relevant columns, remove missing values and convert data type to float

In [None]:
# Preview one example recording before the column selection and float conversion
transcutaneous['LVD016']['LVD016_tcm5_20220117_085025.csv'].head()

In [None]:
# Columns retained for analysis
columns_to_keep = ['pCO2 [mmHg]', 'pO2 [mmHg]', 'Heating Power [mW]', 'Sensor Temp set [°C]', 'Sensor Temp eff [°C]']

for patient in transcutaneous:
    for file in transcutaneous[patient]:
        transcutaneous[patient][file] = (
            transcutaneous[patient][file]
            # Columns that were completely empty in a file were dropped on import;
            # reindex() restores them as NaN instead of raising a KeyError, so
            # every recording ends up with the same set of columns
            .reindex(columns=columns_to_keep)
            # Keep only the rows that have a pCO2 reading
            .dropna(how='any', subset=['pCO2 [mmHg]'])
            .astype('float')
        )

In [None]:
# Preview the example recording after the column selection and float conversion
transcutaneous['LVD016']['LVD016_tcm5_20220117_085025.csv'].head()

In [None]:
# Summary (columns, dtypes, non-null counts) of the example recording
transcutaneous['LVD016']['LVD016_tcm5_20220117_085025.csv'].info()

#### C. Combine transcutaneous data in a single DataFrame

In [None]:
%%time

# One DataFrame per patient: the recordings are stacked and the file name
# becomes the outer index level
transcutaneous_patient = {
    patient: pd.concat(recordings)
    for patient, recordings in transcutaneous.items()
}

# One DataFrame for the whole cohort, with the patient as the outermost index level
transcutaneous_all = pd.concat(transcutaneous_patient)
transcutaneous_all.index = transcutaneous_all.index.set_names(['patient', 'file', 'date_time'])

In [None]:
# Preview the first rows of the combined table
transcutaneous_all.head()

In [None]:
# Unique (patient, file) pairs present in the combined table
transcut_recordings = (
    transcutaneous_all
    .reset_index()
    .loc[:, ['patient', 'file']]
    .drop_duplicates()
)
transcut_recordings

In [None]:
# Write the list of transcutaneous recordings to an Excel workbook in DIR_WRITE.
# The context manager saves and closes the file on exit; ExcelWriter.save()
# no longer exists in pandas 2.0+, and the sheet name is passed by keyword
# because passing it positionally is deprecated.
with pd.ExcelWriter(os.path.join(DIR_WRITE, 'tcpCO2_files.xlsx')) as writer:
    transcut_recordings.to_excel(writer, sheet_name='tcpCO2_files')

In [None]:
# Write the per-patient transcutaneous tables to a pickle archive in DATA_DUMP
transcutaneous_archive = os.path.join(DATA_DUMP, 'transcutaneous_data.pickle')
with open(transcutaneous_archive, 'wb') as handle:
    pickle.dump(transcutaneous_patient, handle, protocol=pickle.HIGHEST_PROTOCOL)