# Preprocess RACat

In [1]:
import lifelines
import pandas as pd
import datetime
import matplotlib
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test
from sklearn.preprocessing import LabelEncoder
from IPython.display import Markdown as md
from lifelines import CoxPHFitter
from lifelines import AalenJohansenFitter
import numpy as np
from scipy.stats import ttest_ind
from scipy.stats import binom_test
from scipy.stats import chi2_contingency
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import iqr
import lifelines
import warnings;
warnings.filterwarnings('ignore');

In [2]:
# Load the input table from a CSV in the working directory
INPUT_CSV = 'procesed_racat.csv'
data = pd.read_csv(INPUT_CSV)

In [3]:
# Number of rows read from the CSV
data.shape[0]

25617

In [4]:
# Variable renaming: show the column names as they come from the CSV,
# before they are replaced with shorter names in the next cell.
print(data.columns)

Index(['nia', 'edat', 'edat_g', 'codisexe', 'sex', 'up', 'upnom', 'nivah',
       'nivah_nom', 'C_Ingres', 'C_Ingres_nom', 'antibio1', 'viscositat',
       'any_qx', 'dat_qx_x', 'codisituaciopacient_new', 'dat_situapac_newrac',
       'charlindex', 'elix_cnt', 'Congestive_Heart_Failure',
       'Cardiac_arrhytmias', 'Valvular_Disease', 'Pulmonary_Circulation...',
       'Peripheral_Vascular_D...', 'Hipertensió', 'Paralysis',
       'Altres_trastorns_neur...', 'MPOC', '_Diabetis_no_complicada',
       '_Diabetis_complicada', '_Hypothyroidism', '_Renal_Failure',
       '_Liver_Disease', '_Peptic_Ulcer_Disease_...', '_AIDS', '_Lymphoma',
       '_Metastatic_Cancer', '_Solid_Tumor_Without_M...',
       '_Rheumatoid_Arthritis', '_Coagulopathy', '_Obesitat', '_Weight_Loss',
       '_Fluid_and_Electrolyte...', '_Blood_Loss_Anemia', '_Deficiency_Anemia',
       '_Alcohol_Abuse', '_Drug_Abuse', '_Psychoses', '_Depressió',
       '_Hypertension,_complicated', 'AMI (Acute Myocardial) ',
       'C

In [5]:
# Replace the raw CSV headers with shorter names.
# The assignment is positional: the concatenated list must contain exactly one
# name per column of `data`, in the same order as the CSV columns.
renamed_general = [
    'nia', 'edat', 'edat_g', 'codisexe', 'sex', 'up', 'upnom', 'nivah',
    'nivah_nom', 'C_Ingres', 'C_Ingres_nom', 'antibio1', 'viscositat',
    'any_qx', 'dat_qx_x', 'codisituaciopacient_new', 'dat_situapac_newrac',
    'charlindex', 'elix_cnt',
]

# Elixhauser comorbidity flags
renamed_elixhauser = [
    'Congestive_Heart_Failure', 'Cardiac_arrhytmias', 'Valvular_Disease',
    'Pulmonary_Circulation', 'Peripheral_Vascular_dis', 'Hipertension',
    'Paralysis', 'Altres_trastorns_neur', 'MPOC', 'Diabetis_no_complicada',
    'Diabetis_complicada', 'Hypothyroidism', 'Renal_Failure',
    'Liver_Disease', 'Peptic_Ulcer_Disease', 'AIDS', 'Lymphoma',
    'Metastatic_Cancer', 'Solid_Tumor_Without_M', 'Rheumatoid_Arthritis',
    'Coagulopathy', 'Obesitat', 'Weight_Loss', 'Fluid_and_Electrolyte',
    'Blood_Loss_Anemia', 'Deficiency_Anemia', 'Alcohol_Abuse', 'Drug_Abuse',
    'Psychoses', 'Depressió', 'Hypertension_complicated',
]

# Charlson comorbidity flags
renamed_charlson = [
    'Acute_Myocardial', 'Congestive_Heart_f', 'Peripheral_Vascular_d',
    'Cerebrovascular_d', 'Dementia', 'Chronic_Obstructive_Pulmonary_d',
    'Rheumatoid_Disease', 'Peptic_Ulcer_d', 'Mild_Liver_d', 'Diabetes',
    'Diabetes_Complications', 'Hemiplegia_or_Paraplegia', 'Renal_d',
    'Cancer', 'Moderate_Severe_Liver_d', 'Metastatic_Cancer_2', 'AIDS_2',
]

# Remaining columns: second surgery date, diagnosis, duration, smoking, BMI
renamed_extra = [
    'Nia_loc', 'dat_qx_y', 'diagR', 'Durada_intervencio_minuts',
    'smoking_date', 'smoking_value', 'bmi_date', 'bmi_val',
]

data.columns = renamed_general + renamed_elixhauser + renamed_charlson + renamed_extra

In [6]:
# Select diseases and order.
# NOTE: the NaN checks further down address columns by position
# (25-49 for Elixhauser, 50-57 for Charlson), so the order below matters.
selected_ids_outcome = [
    'nia', 'Nia_loc', 'antibio1', 'dat_qx_x', 'codisituaciopacient_new',
    'dat_situapac_newrac', 'dat_qx_y', 'diagR',
]
selected_patient_centre = [
    'edat', 'edat_g', 'codisexe', 'sex', 'any_qx', 'up', 'upnom',
    'nivah', 'nivah_nom',
]
selected_surgery = [
    'C_Ingres', 'C_Ingres_nom', 'viscositat', 'Durada_intervencio_minuts',
]
selected_smoking_bmi = [
    'smoking_date', 'smoking_value',
    'bmi_date', 'bmi_val',
]
selected_elixhauser = [
    'Congestive_Heart_Failure',
    'Cardiac_arrhytmias',
    'Valvular_Disease',
    'Peripheral_Vascular_dis',
    'Altres_trastorns_neur',
    'Rheumatoid_Arthritis',
    'Fluid_and_Electrolyte',
    'Alcohol_Abuse',
    'Pulmonary_Circulation',
    'Hipertension',
    'MPOC',
    'Hypothyroidism',
    'Peptic_Ulcer_Disease',
    'Coagulopathy',
    'Blood_Loss_Anemia',
    'Drug_Abuse',
    'Paralysis',
    'AIDS',
    'Obesitat',
    'Deficiency_Anemia',
    'Psychoses',
    'Lymphoma',
    'Weight_Loss',
    'Depressió',
    'Hypertension_complicated',
]
selected_charlson = [
    'Acute_Myocardial',
    'Cerebrovascular_d',
    'Renal_d',
    'Dementia',
    'Mild_Liver_d',
    'Cancer',
    'Diabetes',
    'Moderate_Severe_Liver_d',
]
selected_scores = ['elix_cnt', 'charlindex']

data = data[
    selected_ids_outcome
    + selected_patient_centre
    + selected_surgery
    + selected_smoking_bmi
    + selected_elixhauser
    + selected_charlson
    + selected_scores
]

In [7]:
# LINE DROP OPERATION
# Keep only the rows whose antibiotic field is informed (anything other than
# 'Sense informar'), then renumber the index from 0.
antibiotic_informed = data.antibio1 != 'Sense informar'
data = data.loc[antibiotic_informed].reset_index(drop=True)

In [8]:
# Rows left after removing the 'Sense informar' antibiotic records
data.shape[0]

22871

In [9]:
# Check elixhauser columns with nans.
# For every Elixhauser column (positions 25-49 of `data`) print the number of
# NaN rows and whether those rows are the same as in the previously checked
# column. `idx_new` is left holding the NaN rows of the last column; the next
# cell uses it to drop them.
elixhauser_cols = data.columns[25:50]
idx_old = data.index[data[elixhauser_cols[0]].isna()]
for col in elixhauser_cols:
  idx_new = data.index[data[col].isna()]
  print(len(idx_new))
  # Index.equals returns False when the two sets of rows differ, including
  # when they have different lengths (an element-wise `==` raises there).
  print(idx_old.equals(idx_new))
  idx_old = idx_new

90
True
90
True
90
True
90
True
90
True
90
True
90
True
90
True
90
True
90
True
90
True
90
True
90
True
90
True
90
True
90
True
90
True
90
True
90
True
90
True
90
True
90
True
90
True
90
True
90
True


In [10]:
# LINE DROP OPERATION
# Remove the rows listed in idx_new (rows with NaN Elixhauser values found by
# the check above), then renumber the index from 0.
rows_to_keep = ~data.index.isin(idx_new)
data = data.loc[rows_to_keep].reset_index(drop=True)

In [11]:
# Rows left after removing records with missing Elixhauser values
data.shape[0]

22781

In [12]:
# Check charlson columns with nans.
# For every Charlson column (positions 50-57 of `data`) print the number of
# NaN rows and whether those rows are the same as in the previously checked
# column. `idx_new` is left holding the NaN rows of the last column; the next
# cell uses it to drop them.
charlson_cols_to_check = data.columns[50:58]
idx_old = data.index[data[data.columns[52]].isna()]
for col in charlson_cols_to_check:
  idx_new = data.index[data[col].isna()]
  print(len(idx_new))
  # Index.equals returns False when the two sets of rows differ, including
  # when they have different lengths (an element-wise `==` raises there).
  print(idx_old.equals(idx_new))
  idx_old = idx_new

0
True
0
True
0
True
0
True
0
True
0
True
0
True
0
True


In [13]:
# LINE DROP OPERATION
# Remove the rows listed in idx_new (rows with NaN Charlson values found by
# the check above), then renumber the index from 0.
rows_to_keep = ~data.index.isin(idx_new)
data = data.loc[rows_to_keep].reset_index(drop=True)

In [14]:
# Rows left after the Charlson NaN drop
data.shape[0]

22781

In [15]:
# COVARIATE MISSING FILLING OP
# Complete smoking status: a missing smoking_value is replaced by 0
# NOTE(review): presumably 0 encodes "non-smoker" — confirm against the data dictionary
data['smoking_value'] = data['smoking_value'].fillna(0)

In [16]:
# COVARIATE MISSING FILLING OP
# Impute missing BMI with the mean BMI of the row's age group (edat_g).
# Rows whose age group has no BMI at all keep their NaN.
bmi_mean_by_age = {
    age: data.loc[data.edat_g == age, 'bmi_val'].mean()
    for age in data.edat_g.unique()
}
data['bmi_val'] = data['bmi_val'].fillna(data['edat_g'].map(bmi_mean_by_age))

In [17]:
# COVARIATE MISSING FILLING OP
# Fill Durada_intervencio_minuts with the mean of the row's UP.
# A duration is treated as missing when it is NaN or recorded as 0.
#
# The previous mask `a.isna()|a==0.` was parsed as `(a.isna()|a)==0.` because
# `|` binds tighter than `==`, so NaN rows never received the per-UP mean.
# Zeros are also excluded from the means here: they stand for "missing" and
# would otherwise bias the imputed value downwards.
duration_col = 'Durada_intervencio_minuts'
duration = data[duration_col].mask(data[duration_col] == 0.)

# Mean of the valid durations of each UP, broadcast back to every row
mean_by_up = duration.groupby(data['up']).transform('mean')
duration = duration.fillna(mean_by_up)

# Fallback for rows still missing (UP with no valid duration, or UP missing):
# overall mean of the column after the per-UP fill
data[duration_col] = duration.fillna(duration.mean())

In [18]:
# Change Charlson vars to binary: 'Absent' -> 0, 'Present' -> 1.
# Any other value (including NaN) becomes NaN.
presence_to_flag = {'Absent': 0, 'Present': 1}
charlson_flag_cols = [
    'Acute_Myocardial', 'Cerebrovascular_d', 'Renal_d', 'Dementia',
    'Mild_Liver_d', 'Cancer', 'Diabetes', 'Moderate_Severe_Liver_d',
]
data[charlson_flag_cols] = data[charlson_flag_cols].apply(
    lambda column: column.map(presence_to_flag)
)

In [19]:
# Group antibiotic: 1 for every row whose antibio1 is anything other than
# 'Sense antibiòtic', 0 otherwise.
received_antibiotic = data.antibio1 != 'Sense antibiòtic'
data['Antibiotic'] = received_antibiotic.astype(int)

In [20]:
# Compute the "observed event" indicators (1 = event observed, 0 = not):
#   Ea - any revision: a second surgery date (dat_qx_y) is present
#   Ei - revision whose diagnosis (diagR) is one of the three labels below
#   En - revision with any other diagnosis
infection_diagnoses = [
    'Infecció i reacció inflamatòria',
    '2n temps Qx',
    'Afluixament sèptic',
]
has_revision = data.dat_qx_y.notna()
infection_revision = has_revision & data.diagR.isin(infection_diagnoses)

data['Ea'] = has_revision.astype('int64')
data['Ei'] = infection_revision.astype('int')
data['En'] = (has_revision & ~infection_revision).astype('int')

In [21]:
# For rows without an observed revision (Ea == 0), set the second date to the
# end of the study period so that a duration can be computed for them.
STUDY_END = '2024-01-01'
not_revised = data['Ea'] == 0
data.loc[not_revised, 'dat_qx_y'] = STUDY_END

In [22]:
# Death indicator for the competing risk analysis:
# Ed = 2 where codisituaciopacient_new is 'D', 0 otherwise (stored as float).
patient_died = data['codisituaciopacient_new'] == 'D'
data['Ed'] = np.where(patient_died, 2.0, 0.0)

In [23]:
# Convert the date columns to timezone-aware (UTC) datetimes
for date_col in ('dat_qx_x', 'dat_qx_y', 'smoking_date', 'bmi_date'):
    data[date_col] = pd.to_datetime(data[date_col], utc=True)

# Date of death: taken from dat_situapac_newrac for rows with Ed == 2 only;
# all other rows get NaT through index alignment on assignment.
death_dates = data.loc[data.Ed == 2, 'dat_situapac_newrac']
data['dat_defunc'] = pd.to_datetime(death_dates, utc=True)

In [24]:
# Compute number of months between the first surgery (dat_qx_x) and the second
# date (dat_qx_y: revision date, or end of study when no revision was observed).
# A month is approximated as 30 days.
# The day count is read with the timedelta accessor instead of parsing the
# string form ("N days"), which depends on the pandas version and fails on
# sub-day components or NaT.
data['T'] = (data.dat_qx_y - data.dat_qx_x).dt.days / 30.

In [25]:
# Compute time when there is death (for competing risk analysis):
# for rows with Ed == 2, T becomes the number of 30-day months between the
# first surgery (dat_qx_x) and the date of death (dat_defunc).
# The day count is read with the timedelta accessor instead of parsing the
# string form ("N days"); a missing death date now yields NaN instead of an error.
# NOTE(review): this overwrites T for every deceased patient, including those
# with an observed revision (Ea == 1) — confirm that time-to-death rather than
# time-to-revision is what the downstream analysis expects for them.
died = data.Ed == 2
data.loc[died, 'T'] = (data.loc[died, 'dat_defunc'] - data.loc[died, 'dat_qx_x']).dt.days / 30.

In [26]:
# Write the preprocessed table to disk. The row index is written as well
# (to_csv default), so it appears as an unnamed first column in the file.
OUTPUT_CSV = 'racat_prep.csv'
data.to_csv(OUTPUT_CSV)

In [27]:
# Number of rows in the exported table
data.shape[0]

22781