In [6]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer


In [7]:
data_path = r'C:\Users\aaani\OneDrive - Birmingham City University\Postgrad\Dissertation\Data'
#df = pd.read_csv(f'{data_path}/final_dataset.csv.gz')

In [8]:
historical = pd.read_csv(f'{data_path}/historical_cohort.csv.gz')
contemporary = pd.read_csv(f'{data_path}/contemporary_cohort.csv.gz')

In [None]:
# split the data to prevent data leakage
historical_groups = ['2008 - 2010', '2011 - 2013', '2014 - 2016']
contemporary_groups = ['2017 - 2019', '2020 - 2022']

historical = df[df['anchor_year_group'].isin(historical_groups)]
contemporary = df[df['anchor_year_group'].isin(contemporary_groups)]

In [9]:
# to display all columns, to be able to asses the na values
pd.set_option('display.max_rows', None)


missing_counts = historical.isnull().sum()
missing_values_df = pd.DataFrame({
    'missing_count': missing_counts,
})

print(missing_values_df.sort_values(by='missing_count', ascending=False))


                        missing_count
subject_id                          0
hadm_id                             0
admittime                           0
dischtime                           0
race                                0
hospital_expire_flag                0
gender                              0
anchor_age                          0
anchor_year                         0
anchor_year_group                   0
icd_code                            0
icd_version                         0
stay_id                             0
los                                 0
mean_Anion Gap                      0
mean_Bicarbonate                    0
mean_Calcium, Total                 0
mean_Chloride                       0
mean_Creatinine                     0
mean_Glucose                        0
mean_Hematocrit                     0
mean_Hemoglobin                     0
mean_MCH                            0
mean_MCHC                           0
mean_MCV                            0
mean_Magnesi

In [10]:
missing_counts = contemporary.isnull().sum()
missing_values_df = pd.DataFrame({
    'missing_count': missing_counts,
})

print(missing_values_df.sort_values(by='missing_count', ascending=False))


                        missing_count
subject_id                          0
hadm_id                             0
admittime                           0
dischtime                           0
race                                0
hospital_expire_flag                0
gender                              0
anchor_age                          0
anchor_year                         0
anchor_year_group                   0
icd_code                            0
icd_version                         0
stay_id                             0
los                                 0
mean_Anion Gap                      0
mean_Bicarbonate                    0
mean_Calcium, Total                 0
mean_Chloride                       0
mean_Creatinine                     0
mean_Glucose                        0
mean_Hematocrit                     0
mean_Hemoglobin                     0
mean_MCH                            0
mean_MCHC                           0
mean_MCV                            0
mean_Magnesi

In [None]:
# creating a new feature 'admitted_to_icu' to indicate if a patient was admitted to ICU during their hospital stay
historical['admitted_to_icu']= historical['stay_id'].notnull().astype(int)
contemporary['admitted_to_icu']= contemporary['stay_id'].notnull().astype(int)

#any empty stay ids are filled with 0
historical['stay_id'] = historical['stay_id'].fillna(0).astype(int)
contemporary['stay_id'] = contemporary['stay_id'].fillna(0).astype(int)

#calculate length of stay for hospital 
historical['admittime'] = pd.to_datetime(historical['admittime'])# ensure column is in datetime format
historical['dischtime'] = pd.to_datetime(historical['dischtime']) # type: ignore

contemporary['admittime'] = pd.to_datetime(contemporary['admittime'])
contemporary['dischtime'] = pd.to_datetime(contemporary['dischtime'])

historical['hospital_los'] = (historical['dischtime'] - historical['admittime']).dt.total_seconds() / (24 * 3600)
contemporary['hospital_los'] = (contemporary['dischtime'] - contemporary['admittime']).dt.total_seconds() / (24 * 3600)

# Fill missing ICU length of stay values with 0
historical['los'] = historical['los'].fillna(0)
contemporary['los'] = contemporary['los'].fillna(0)


missing_diagnoses_df = historical[historical['icd_code'].isnull() & historical['icd_version'].isnull()]
print(f"Found {len(missing_diagnoses_df)} rows with missing ICD codes.")
missing_diagnoses_df.head()

In [None]:
# Drop columns with high missing values 
cols_to_drop = [
    'max_Arterial Blood Pressure systolic', 'mean_Arterial Blood Pressure mean', 
    'mean_Arterial Blood Pressure systolic', 'min_Arterial Blood Pressure diastolic', 
    'min_Arterial Blood Pressure systolic', 'min_Arterial Blood Pressure mean', 
    'max_Arterial Blood Pressure diastolic', 'max_Arterial Blood Pressure mean', 
    'mean_Arterial Blood Pressure diastolic', 'max_Orientation', 'min_Orientation', 
    'mean_Orientation', 'min_Strength L Arm', 'mean_Strength L Arm', 
    'max_Strength L Arm', 'mean_ST Segment Monitoring On', 'max_ST Segment Monitoring On', 
    'min_ST Segment Monitoring On', 'mean_Temperature Fahrenheit', 'min_Temperature Fahrenheit', 
    'max_Temperature Fahrenheit', 'min_Non Invasive Blood Pressure mean', 'max_Non Invasive Blood Pressure mean', 
    'mean_Non Invasive Blood Pressure mean', 'mean_Non Invasive Blood Pressure systolic', 
    'max_Non Invasive Blood Pressure diastolic', 'max_Non Invasive Blood Pressure systolic', 
    'mean_Non Invasive Blood Pressure diastolic', 'min_Non Invasive Blood Pressure diastolic', 
    'min_Non Invasive Blood Pressure systolic', 'mean_Parameters Checked', 'max_Parameters Checked', 
    'min_Parameters Checked', 'max_Alarms On', 'mean_Alarms On', 'min_Alarms On', 
    'max_Richmond-RAS Scale', 'mean_Richmond-RAS Scale', 'min_Richmond-RAS Scale', 
    'max_GCS - Verbal Response', 'min_GCS - Verbal Response', 'mean_GCS - Verbal Response', 
    'mean_GCS - Motor Response', 'min_GCS - Motor Response', 'max_GCS - Motor Response', 
    'mean_GCS - Eye Opening', 'max_GCS - Eye Opening', 'min_GCS - Eye Opening', 
    'max_Activity / Mobility (JH-HLM)', 'min_Activity / Mobility (JH-HLM)', 
    'mean_Activity / Mobility (JH-HLM)', 'min_Respiratory Rate', 'max_Respiratory Rate', 
    'mean_Respiratory Rate', 'mean_Heart Rate', 'mean_O2 saturation pulseoxymetry', 
    'max_O2 saturation pulseoxymetry', 'max_Heart Rate', 'min_O2 saturation pulseoxymetry', 
    'min_Heart Rate', 'dod', 'outtime'
]

historical = historical.drop(columns=cols_to_drop)
contemporary = contemporary.drop(columns=cols_to_drop)

In [None]:
# removing rows with missing diagnoses
historical = historical.dropna(subset=['icd_code', 'icd_version'])
contemporary = contemporary.dropna(subset=['icd_code', 'icd_version'])

In [7]:
cols_to_impute = [
    'min_Phosphate', 'max_Phosphate', 'mean_Phosphate',
    'mean_Calcium, Total', 'max_Calcium, Total', 'min_Calcium, Total',
    'mean_Magnesium', 'min_Magnesium', 'max_Magnesium',
    'max_RDW', 'min_RDW', 'mean_RDW',
    'mean_MCH', 'max_MCH', 'min_MCH',
    'mean_MCV', 'max_MCV', 'min_MCV',
    'min_Red Blood Cells', 'mean_Red Blood Cells', 'max_Red Blood Cells',
    'min_MCHC', 'mean_MCHC', 'max_MCHC',
    'max_White Blood Cells', 'mean_White Blood Cells', 'min_White Blood Cells',
    'max_Hemoglobin', 'mean_Hemoglobin', 'min_Hemoglobin',
    'min_Platelet Count', 'mean_Platelet Count', 'max_Platelet Count',
    'min_Glucose', 'max_Glucose', 'mean_Glucose',
    'mean_Anion Gap', 'min_Anion Gap', 'max_Anion Gap',
    'max_Bicarbonate', 'min_Bicarbonate', 'mean_Bicarbonate',
    'mean_Hematocrit', 'min_Hematocrit', 'max_Hematocrit',
    'max_Chloride', 'min_Chloride', 'mean_Chloride',
    'max_Sodium', 'mean_Sodium', 'min_Sodium',
    'mean_Urea Nitrogen', 'min_Urea Nitrogen', 'max_Urea Nitrogen',
    'mean_Creatinine', 'min_Creatinine', 'max_Creatinine',
    'max_Potassium', 'min_Potassium', 'mean_Potassium'
]

# imputing missing values with mean to not change the existing distributions
imputer = SimpleImputer(strategy='mean')

historical[cols_to_impute] = imputer.fit_transform(historical[cols_to_impute])
contemporary[cols_to_impute] = imputer.transform(contemporary[cols_to_impute])


In [9]:
def create_readmission_target(df):
    """
    Calculates the 30-day readmission column for a given dataframe.
    """
    
    #create temp df to calulate the target variable
    temp_df = df[['subject_id', 'hadm_id', 'admittime', 'dischtime']].copy()

    # De-duplicate the temporary dataframe to handle multiple events per admission
    temp_df = temp_df.drop_duplicates()
    
    # Ensure date columns are datetime objects
    temp_df['admittime'] = pd.to_datetime(temp_df['admittime'])
    temp_df['dischtime'] = pd.to_datetime(temp_df['dischtime'])

    # Sort by patient and admission time to ensure correct chronological order
    temp_df = temp_df.sort_values(by=['subject_id', 'admittime'])

    # For each patient, find the timestamp of their next admission
    temp_df['next_admission_time'] = temp_df.groupby('subject_id')['admittime'].shift(-1) #groups data by a single patient, only focuses on the admit time

    # Calculate the time in days from discharge to the next admission
    temp_df['days_to_next_admission'] = (temp_df['next_admission_time'] - temp_df['dischtime']).dt.days

    # Create the binary target variable
    temp_df['readmission_30d'] = (temp_df['days_to_next_admission'] <= 30).astype(int)

    # Merge just the new target column back into the original dataframe
    df = pd.merge(df, temp_df[['hadm_id', 'readmission_30d']], on='hadm_id', how='left')
    
    return df

historical = create_readmission_target(historical)
contemporary = create_readmission_target(contemporary)

In [13]:
# Drop unnecessary columns
cols_to_drop = ['admittime', 'dischtime', 'anchor_year']

historical = historical.drop(columns=cols_to_drop)
contemporary = contemporary.drop(columns=cols_to_drop)

In [14]:
historical.to_csv(f'{data_path}/historical_cohort.csv.gz', index=False, compression='gzip')
contemporary.to_csv(f'{data_path}/contemporary_cohort.csv.gz', index=False, compression='gzip')

In [11]:
historical

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,race,hospital_expire_flag,gender,anchor_age,anchor_year,anchor_year_group,...,max_Platelet Count,max_Potassium,max_RDW,max_Red Blood Cells,max_Sodium,max_Urea Nitrogen,max_White Blood Cells,admitted_to_icu,hospital_los,readmission_30d
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,WHITE,0,F,52,2180,2014 - 2016,...,71.000000,4.500000,15.300000,3.800000,137.000000,25.000000,4.200000,0,0.786111,0
1,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,WHITE,0,F,52,2180,2014 - 2016,...,71.000000,4.500000,15.300000,3.800000,137.000000,25.000000,4.200000,0,0.786111,0
2,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,WHITE,0,F,52,2180,2014 - 2016,...,71.000000,4.500000,15.300000,3.800000,137.000000,25.000000,4.200000,0,0.786111,0
3,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,WHITE,0,F,52,2180,2014 - 2016,...,71.000000,4.500000,15.300000,3.800000,137.000000,25.000000,4.200000,0,0.786111,0
4,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,WHITE,0,F,52,2180,2014 - 2016,...,71.000000,4.500000,15.300000,3.800000,137.000000,25.000000,4.200000,0,0.786111,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5052624,19999987,23865745,2145-11-02 21:38:00,2145-11-11 12:57:00,UNKNOWN,0,F,57,2145,2011 - 2013,...,238.016113,4.188413,15.562005,3.551424,138.610387,25.545923,8.330381,1,8.638194,0
5052625,19999987,23865745,2145-11-02 21:38:00,2145-11-11 12:57:00,UNKNOWN,0,F,57,2145,2011 - 2013,...,238.016113,4.188413,15.562005,3.551424,138.610387,25.545923,8.330381,1,8.638194,0
5052626,19999987,23865745,2145-11-02 21:38:00,2145-11-11 12:57:00,UNKNOWN,0,F,57,2145,2011 - 2013,...,238.016113,4.188413,15.562005,3.551424,138.610387,25.545923,8.330381,1,8.638194,0
5052627,19999987,23865745,2145-11-02 21:38:00,2145-11-11 12:57:00,UNKNOWN,0,F,57,2145,2011 - 2013,...,238.016113,4.188413,15.562005,3.551424,138.610387,25.545923,8.330381,1,8.638194,0


In [12]:
contemporary

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,race,hospital_expire_flag,gender,anchor_age,anchor_year,anchor_year_group,...,max_Platelet Count,max_Potassium,max_RDW,max_Red Blood Cells,max_Sodium,max_Urea Nitrogen,max_White Blood Cells,admitted_to_icu,hospital_los,readmission_30d
0,10000084,23052089,2160-11-21 01:56:00,2160-11-25 14:52:00,WHITE,0,M,72,2160,2017 - 2019,...,238.016113,4.188413,15.562005,3.551424,138.610387,25.545923,8.330381,0,4.538889,0
1,10000084,23052089,2160-11-21 01:56:00,2160-11-25 14:52:00,WHITE,0,M,72,2160,2017 - 2019,...,238.016113,4.188413,15.562005,3.551424,138.610387,25.545923,8.330381,0,4.538889,0
2,10000084,23052089,2160-11-21 01:56:00,2160-11-25 14:52:00,WHITE,0,M,72,2160,2017 - 2019,...,238.016113,4.188413,15.562005,3.551424,138.610387,25.545923,8.330381,0,4.538889,0
3,10000084,23052089,2160-11-21 01:56:00,2160-11-25 14:52:00,WHITE,0,M,72,2160,2017 - 2019,...,238.016113,4.188413,15.562005,3.551424,138.610387,25.545923,8.330381,0,4.538889,0
4,10000084,23052089,2160-11-21 01:56:00,2160-11-25 14:52:00,WHITE,0,M,72,2160,2017 - 2019,...,238.016113,4.188413,15.562005,3.551424,138.610387,25.545923,8.330381,0,4.538889,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1546723,19999828,29734428,2147-07-18 16:23:00,2147-08-04 18:10:00,WHITE,0,F,46,2147,2017 - 2019,...,367.000000,4.188413,17.800000,4.380000,138.610387,25.545923,9.300000,0,17.074306,0
1546724,19999828,29734428,2147-07-18 16:23:00,2147-08-04 18:10:00,WHITE,0,F,46,2147,2017 - 2019,...,367.000000,4.188413,17.800000,4.380000,138.610387,25.545923,9.300000,0,17.074306,0
1546725,19999828,29734428,2147-07-18 16:23:00,2147-08-04 18:10:00,WHITE,0,F,46,2147,2017 - 2019,...,367.000000,4.188413,17.800000,4.380000,138.610387,25.545923,9.300000,0,17.074306,0
1546726,19999828,29734428,2147-07-18 16:23:00,2147-08-04 18:10:00,WHITE,0,F,46,2147,2017 - 2019,...,367.000000,4.188413,17.800000,4.380000,138.610387,25.545923,9.300000,0,17.074306,0
