In [1]:
import numpy as np
import pandas as pd
import os
from collections import defaultdict
import matplotlib.pyplot as plt
from statistics import mean
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
import missingpy
from missingpy import MissForest
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)
pd.options.mode.chained_assignment = None

In [None]:
#load clinical
clinical_data = pd.read_csv(clinical_data.csv')
clinical_data = clinical_data[['Age', 'Sex', 'AtrialArrhythmia', 'LowLVEF', 'ReducedLVEF', 'NormalLVEF', 
              'OHCA', 'ICM', 'NICM', 'HCM', 'DCM', 'PAS', 'PCI', 'CABG', 'MyocardialInfarction', 
              'VF', 'NSVT', 'SustVT', 'CVA', 'COPD', 'DiabetesMellitus', 'BMI', 'PAF', 'QRS_Duration', 
              'Hypertension', 'CHD', 'Sodium', 'Potassium', 'Kreatinine', 'Dyslipedaemia', 'VR', 'DR', 
              'CRTD', 'SICD', 'Vitamine_K', 'Antiaritmica_soort#Sotalol', 'ImplantationDate', 'Antiaritmica_soort#Digoxine',
              'Antiaritmica_soort#Amiodarone', 'Betablokker', 'NOAC', 'Aldosteronremmer', 'Implantation_indication',
                'MonthsTillAppropriateTherapy', 'AppropriateTherapy', 'Python_ID_Index', 'StudyID_x']]

clinical_data['MonthsTillAppropriateTherapy'] = clinical_data['MonthsTillAppropriateTherapy'] * 30

#Rename columns
clinical_data = clinical_data.rename(columns={'Python_ID_Index': 'PatientID', 'AppropriateTherapy' : 'Event', 
                              'MonthsTillAppropriateTherapy' : 'Days_until_therapy',
                             'ImplantationDate' : 'Date_ICD_implantatie'})

#Convert to date-time
clinical_data['Date_ICD_implantatie'] = pd.to_datetime(clinical_data['Date_ICD_implantatie'], format='%Y-%m-%d')

clinical_data['StudyID_x'] = clinical_data['StudyID_x'].replace(np.NaN, 2)
clinical_data['StudyID_x'][clinical_data['StudyID_x'] != 2] = 1

In [None]:
df = clinical_data

#Drop columns
var_drop = ['Date_ICD_implantatie', 'StudyID_x']
df_dateim = df
df = df.drop(var_drop, axis=1)

#Select categorical variables
var_bin = ['Sex', 'AtrialArrhythmia', 'LowLVEF', 'ReducedLVEF', 'OHCA', 'ICM', 'NICM', 'HCM', 'DCM', 'PAS',
'PCI', 'CABG', 'MyocardialInfarction', 'VF', 'NSVT','SustVT', 'CVA','COPD', 'DiabetesMellitus', 'PAF', 'Hypertension', 
'CHD', 'Dyslipedaemia', 'VR', 'DR', 'CRTD', 'SICD', 'Vitamine_K', 'Antiaritmica_soort#Sotalol', 'Antiaritmica_soort#Digoxine', 
'Antiaritmica_soort#Amiodarone', 'Betablokker', 'NOAC', 'Aldosteronremmer']
df[var_bin] = df[var_bin].astype('category')
catColumnsPos = [df.columns.get_loc(col) for col in list(df.select_dtypes('category').columns)]
df = df.replace(-99.0, np.NaN)

#Make instance of MissForest
imputer = MissForest()

#Impute missing values
df_imp = imputer.fit_transform(df, cat_vars=catColumnsPos)
tsfresh_clin_bl = pd.DataFrame(df_imp, columns=df.columns)
tsfresh_clin_bl['Date_ICD_implantatie'] = df_dateim['Date_ICD_implantatie']
tsfresh_clin_bl['StudyID_x'] = df_dateim['StudyID_x']

clinical_data_imputed = tsfresh_clin_bl

clinical_data_imputed[tsfresh_clin_bl.PatientID == 2]
clinical_data_imputed.shape
#clinical_data_imputed.to_csv('clinical_data_imputed_AT.csv')


In [None]:
#Get the ECG features
ecg_timevarying = ecg_timevarying[['Ventricular Heart Rate', 'P Axis', 'T Axis',
                                     'PR Interval', 'QRS Duration', 'QT Interval', 'QTc Interval',
                                     'Python_ID_Index', 'StudyDate', 'AF', 'SB', 'ST',
                                     'LBBB', 'RBBB', '1st degree AV block']]

#Convert to float
cols = ['Ventricular Heart Rate', 'P Axis', 'T Axis', 'PR Interval', 'QRS Duration', 
        'QT Interval', 'QTc Interval']
ecg_timevarying[cols] = ecg_timevarying[cols].astype(float)

print(ecg_timevarying.shape)

clinical_data = pd.read_csv('clinical_data.csv')
blecgdate = clinical_data[['Python_ID_Index']]

#Merge clinical data with ECGs at baseline
baseline_ecg = pd.merge(ecg_timevarying, blecgdate, on='Python_ID_Index', how='left')
baseline_ecg['StudyDate'] = pd.to_datetime(baseline_ecg['StudyDate'])
baseline_ecg['StudyDate'] = baseline_ecg['StudyDate'] + pd.Timedelta(days=14)
baseline_ecg['PR Interval'] = np.where(baseline_ecg['PR Interval'] <= -99, np.NaN, baseline_ecg['PR Interval'])
baseline_ecg['Ventricular Heart Rate'] = np.where((baseline_ecg['Ventricular Heart Rate'] <= 20) | (baseline_ecg['Ventricular Heart Rate'] > 300), np.NaN, baseline_ecg['Ventricular Heart Rate'])
baseline_ecg['QT Interval'] = np.where((baseline_ecg['QT Interval'] <= 50) | (baseline_ecg['QT Interval'] > 500), np.NaN, baseline_ecg['QT Interval'])
baseline_ecg['QTc Interval'] = np.where((baseline_ecg['QTc Interval'] <= 50) | (baseline_ecg['QTc Interval'] > 500), np.NaN, baseline_ecg['QTc Interval'])
baseline_ecg['T Axis'] = np.where((baseline_ecg['T Axis'] <= -360) | (baseline_ecg['T Axis'] > 360), np.NaN, baseline_ecg['T Axis'])
baseline_ecg['P Axis'] = np.where((baseline_ecg['P Axis'] <= -360) | (baseline_ecg['P Axis'] > 360), np.NaN, baseline_ecg['P Axis'])


In [None]:
#Impute missing values
baseline_ecg_dateimp = baseline_ecg[['StudyDate', 'Python_ID_Index']]

imputer = MissForest()
df_imp = imputer.fit_transform(baseline_ecg[['Ventricular Heart Rate', 'P Axis', 'T Axis',
                                     'PR Interval', 'QRS Duration', 'QT Interval', 'QTc Interval',
                                     'AF', 'SB', 'ST', 'LBBB', 'RBBB', '1st degree AV block']])
baseline_ecg_imp = pd.DataFrame(df_imp, columns=baseline_ecg[['Ventricular Heart Rate', 'P Axis', 'T Axis',
                                     'PR Interval', 'QRS Duration', 'QT Interval', 'QTc Interval',
                                     'AF', 'SB', 'ST', 'LBBB', 'RBBB', '1st degree AV block']].columns)
baseline_ecg_imp['StudyDate'] = baseline_ecg_dateimp['StudyDate']
baseline_ecg_imp['Python_ID_Index'] = baseline_ecg_dateimp['Python_ID_Index'] 
#baseline_ecg_imp.to_csv('baseline_ecg_imp.csv')

In [3]:
import pandas as pd
baseline_ecg_imp = pd.read_csv('baseline_ecg_imp.csv', index_col=[0])
clinical_data_imputed = pd.read_csv('clinical_data_imputed_AT.csv', index_col=[0])
ecg_timevarying = pd.read_csv('predictions_and_intervals_allECG.csv', index_col=[0])

  ecg_timevarying = pd.read_csv('predictions_and_intervals_allECG.csv')


In [11]:
'''
Step 1. Merging the "baseline_ecg_imp" DataFrame with the "clinical" DataFrame based on the 'Python_ID_Index' 
column and storing it in a new DataFrame called "final".

Step 2.The code is converting the "StudyDate" column of the "final" DataFrame to a datetime format and 
multiplying the "MonthsTillAppropriateTherapy" column by 30.

Step 3. Renaming the columns 'Python_ID_Index' to 'PatientID' and 'StudyDate_x' to 'StudyDate' in the 
"final" DataFrame and storing it in a new DataFrame called "df_time_varying". It is also converting the "StudyDate"
column in "df_time_varying" to datetime format.
'''

from datetime import datetime
import pickle

#Read clinical data
clinical_data = pd.read_csv('clinical_data.csv', index_col=[0])
clinical = clinical_data[['Python_ID_Index', 'MonthsTillAppropriateTherapy', 'AppropriateTherapy']]

final = pd.merge(baseline_ecg_imp, clinical , on='Python_ID_Index', how='left')

#Convet to datetime-format
final['StudyDate'] = pd.to_datetime(final['StudyDate'])
final['MonthsTillAppropriateTherapy'] = final['MonthsTillAppropriateTherapy'] * 30
df_time_varying = final.rename(columns={'Python_ID_Index': 'PatientID', 'StudyDate_x' : 'StudyDate'})
df_time_varying['StudyDate'] = pd.to_datetime(df_time_varying['StudyDate'], format='%Y-%m-%d')

#Select rows with no missing study date
df_time_varying = df_time_varying[df_time_varying['StudyDate'].notna()]

In [12]:
from create_cpiu import *

df1 = clinical_data_imputed
df2 = df_time_varying

#Avoid QRS duration being in the final df twice
df1 = df1.drop(['QRS_Duration'], axis=1)

#Create CPIU
cpiu = create_cpiu(df1, 'PatientID', 'Days_until_therapy', 'Event', 'StudyDate', 'Date_ICD_implantatie', df2, N_INTERVALS, INTERVAL_LENGTH)
cpiu = cpiu.groupby(['PatientID'], as_index = False).apply(lambda group: group.ffill())
cpiu = cpiu.groupby(['PatientID'], as_index = False).apply(lambda group: group.bfill())

#Delete rows with missing values
cpiu = cpiu.dropna()
df = cpiu

print('event in EventDuringCPIU train\n', cpiu['EventDuringCPIU'].value_counts())

#Save the df
#df.to_csv('cpiu_AT.csv')

Unnamed: 0,Ventricular Heart Rate,P Axis,T Axis,PR Interval,QRS Duration,QT Interval,QTc Interval,AF,SB,ST,LBBB,RBBB,1st degree AV block,StudyDate,PatientID,MonthsTillAppropriateTherapy,AppropriateTherapy,Date_ICD_implantatie,DaysUntilECG
0,80.0,73.0,135.0,232.0,156.0,438.0,499.17,0.0,0.0,0.0,0.0,0.0,1.0,2013-03-05,0.0,1250.759754,1.0,2013-02-19,14
1,82.0,-40.0,82.0,152.0,146.0,402.0,469.0,0.0,0.0,0.0,1.0,0.0,1.0,2013-03-06,0.0,1250.759754,1.0,2013-02-19,15
2,62.0,126.0,100.0,136.0,158.0,456.0,462.0,0.0,0.0,0.0,1.0,0.0,1.0,2013-05-06,0.0,1250.759754,1.0,2013-02-19,1
3,61.0,18.0,95.0,152.0,148.0,440.0,442.0,0.0,0.0,0.0,0.0,1.0,1.0,2013-06-06,0.0,1250.759754,1.0,2013-02-19,107
4,67.0,65.0,104.0,152.0,150.0,428.0,452.0,0.0,0.0,0.0,1.0,0.0,1.0,2013-11-28,0.0,1250.759754,1.0,2013-02-19,282
5,78.0,119.0,74.0,144.0,160.0,448.0,498.8,0.0,0.0,0.0,0.0,0.0,1.0,2014-06-26,0.0,1250.759754,1.0,2013-02-19,492
6,69.0,63.0,89.0,160.0,154.0,428.0,458.0,0.0,0.0,0.0,0.0,0.0,0.0,2015-01-01,0.0,1250.759754,1.0,2013-02-19,681
7,60.0,113.0,91.0,112.0,168.0,476.0,476.0,0.0,0.0,0.0,0.0,0.0,0.0,2015-07-23,0.0,1250.759754,1.0,2013-02-19,884
8,66.0,69.0,95.0,142.0,156.0,452.0,473.0,0.0,0.0,0.0,0.0,1.0,0.0,2016-02-04,0.0,1250.759754,1.0,2013-02-19,1080
9,65.0,86.0,52.0,216.0,90.0,378.0,393.0,0.0,0.0,0.0,0.0,0.0,0.0,2016-08-25,0.0,1250.759754,1.0,2013-02-19,1283


!!! out of interval !!! 3766.078028747433
!!! out of interval !!! 3021.930184804929
!!! out of interval !!! 2590.225872689938
!!! out of interval !!! 2714.4147843942505
!!! out of interval !!! 2786.365503080082
!!! out of interval !!! 3615.2772073921983
!!! out of interval !!! 2403.94250513347
!!! out of interval !!! 3103.7371663244344
!!! out of interval !!! 3172.731006160164
!!! out of interval !!! 3384.640657084188
!!! out of interval !!! 3575.852156057496
!!! out of interval !!! 3095.852156057496
!!! out of interval !!! 2647.3921971252566
!!! out of interval !!! 3924.7638603696087
!!! out of interval !!! 2534.045174537988
!!! out of interval !!! 2692.7310061601643
!!! out of interval !!! 3730.5954825462
!!! out of interval !!! 3595.5646817248467
!!! out of interval !!! 3742.422997946613
!!! out of interval !!! 3094.866529774128
!!! out of interval !!! 3002.2176591375783
!!! out of interval !!! 2631.622176591376
!!! out of interval !!! 3456.591375770019
!!! out of interval !!! 2940.

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  cpiu = cpiu.groupby(['PatientID'], as_index = False).apply(lambda group: group.ffill())
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  cpiu = cpiu.groupby(['PatientID'], as_index = False).apply(lambda group: group.bfill())


event in EventDuringCPIU train
 0    37855
1      786
Name: EventDuringCPIU, dtype: int64
