In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from tqdm import tqdm


# Functions for demographics
def calculate_age(dob, intime):
    return intime.year - dob.year - ((intime.month, intime.day) < (dob.month, dob.day))

def categorize_age(age):
    if 15 <= age <= 29:
        return '15-29'
    elif 30 <= age <= 49:
        return '30-49'
    elif 50 <= age <= 69:
        return '50-69'
    else:
        return '70-89'

def categorize_ethnicity(ethnicity):
    ethnicity = ethnicity.upper()
    if ethnicity in ['WHITE', 'WHITE - RUSSIAN', 'WHITE - OTHER EUROPEAN', 'WHITE - BRAZILIAN', 'WHITE - EASTERN EUROPEAN']:
        return 'White'
    elif ethnicity in ['BLACK/AFRICAN AMERICAN', 'BLACK/CAPE VERDEAN', 'BLACK/HAITIAN', 'BLACK/AFRICAN', 'CARIBBEAN ISLAND']:
        return 'Black'
    elif ethnicity in ['HISPANIC OR LATINO', 'HISPANIC/LATINO - PUERTO RICAN', 'HISPANIC/LATINO - DOMINICAN', 'HISPANIC/LATINO - MEXICAN']:
        return 'Hispanic'
    elif ethnicity in ['ASIAN', 'ASIAN - CHINESE', 'ASIAN - INDIAN']:
        return 'Asian'
    else:
        return 'Other'

def categorize_insurance(insurance):
    if 'MEDICARE' in insurance.upper():
        return 'Medicare'
    elif 'PRIVATE' in insurance.upper():
        return 'Private'
    elif 'MEDICAID' in insurance.upper():
        return 'Medicaid'
    elif 'SELF PAY' in insurance.upper():
        return 'Self Pay'
    else:
        return 'Government'

# Functions for text preprocessing
def preprocess1(x):
    y = re.sub(r'\[(.*?)\]', '', x)         
    y = re.sub(r'[0-9]+\.', '', y)           
    y = re.sub(r'dr\.', 'doctor', y)
    y = re.sub(r'm\.d\.', 'md', y)
    y = re.sub(r'admission date:', '', y)
    y = re.sub(r'discharge date:', '', y)
    y = re.sub(r'--|__|==', '', y)
    return y

def preprocessing(df):
    df = df.copy()
    df['TEXT'] = df['TEXT'].fillna(' ')
    df['TEXT'] = df['TEXT'].str.replace('\n', ' ', regex=False)
    df['TEXT'] = df['TEXT'].str.replace('\r', ' ', regex=False)
    df['TEXT'] = df['TEXT'].apply(str.strip)
    df['TEXT'] = df['TEXT'].str.lower()
    df['TEXT'] = df['TEXT'].apply(lambda x: preprocess1(x))
    return df

def calculate_short_term_mortality(icu_stays):
    # If DEATHTIME is not null then label as positive (1)
    icu_stays['short_term_mortality'] = icu_stays['DEATHTIME'].notnull().astype(int)
    return icu_stays

def calculate_readmission(icu_stays):
    # Check that the required columns exist
    required = ['DISCHTIME', 'INTIME', 'hadm_id']
    for col in required:
        if col not in icu_stays.columns:
            raise KeyError(f"Column {col} is missing in the input data.")
    
    # Sort by subject, admission time, and ICU intime
    icu_stays = icu_stays.sort_values(by=['subject_id', 'ADMITTIME', 'INTIME'])
    
    # Get the discharge time of the current admission
    icu_stays['current_admission_dischtime'] = icu_stays.groupby(['subject_id', 'hadm_id'])['DISCHTIME'].transform('first')
    
    # For each subject, get the next ICU admission's intime and corresponding hadm_id
    icu_stays['next_admission_icu_intime'] = icu_stays.groupby('subject_id')['INTIME'].shift(-1)
    icu_stays['next_hadm_id'] = icu_stays.groupby('subject_id')['hadm_id'].shift(-1)
    
    # Calculate readmission if the next ICU admission happens within 30 days of the current discharge time
    icu_stays['readmission_within_30_days'] = (
        (icu_stays['next_admission_icu_intime'] - icu_stays['current_admission_dischtime']).dt.days <= 30
    ).astype(int)
    icu_stays['readmission_within_30_days'] = icu_stays['readmission_within_30_days'].fillna(0).astype(int)
    return icu_stays


admissions_path = 'ADMISSIONS.csv.gz'
icustays_path   = 'ICUSTAYS.csv.gz'
patients_path   = 'PATIENTS.csv.gz'
notes_path      = 'NOTEEVENTS.csv.gz'

df_adm = pd.read_csv(admissions_path, compression='gzip', low_memory=False, 
                     usecols=['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME', 'ETHNICITY', 'INSURANCE'])
df_icustays = pd.read_csv(icustays_path, compression='gzip', low_memory=False, 
                          usecols=['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'INTIME', 'OUTTIME'])
df_patients = pd.read_csv(patients_path, compression='gzip', low_memory=False, 
                          usecols=['SUBJECT_ID', 'DOB', 'GENDER'])
df_notes = pd.read_csv(notes_path, compression='gzip', low_memory=False, 
                       usecols=['SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'TEXT'])

# Convert datetime columns in ADMISSIONS
df_adm['ADMITTIME'] = pd.to_datetime(df_adm['ADMITTIME'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
df_adm['DISCHTIME'] = pd.to_datetime(df_adm['DISCHTIME'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
df_adm['DEATHTIME'] = pd.to_datetime(df_adm['DEATHTIME'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

# Convert datetime columns in ICUSTAYS
df_icustays['INTIME'] = pd.to_datetime(df_icustays['INTIME'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
df_icustays['OUTTIME'] = pd.to_datetime(df_icustays['OUTTIME'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

# Convert datetime columns in PATIENTS (DOB) 
df_patients['DOB'] = pd.to_datetime(df_patients['DOB'], format='%Y-%m-%d', errors='coerce')

# Convert datetime in NOTEEVENTS
df_notes['CHARTDATE'] = pd.to_datetime(df_notes['CHARTDATE'], format='%Y-%m-%d', errors='coerce')

# Rename columns for consistency across dataframes
df_adm.rename(columns={'SUBJECT_ID': 'subject_id', 'HADM_ID': 'hadm_id'}, inplace=True)
df_icustays.rename(columns={'SUBJECT_ID': 'subject_id', 'HADM_ID': 'hadm_id'}, inplace=True)
df_patients.rename(columns={'SUBJECT_ID': 'subject_id'}, inplace=True)
df_notes.rename(columns={'SUBJECT_ID': 'subject_id', 'HADM_ID': 'hadm_id'}, inplace=True)

# Merge admissions with ICU stays
df_icu = pd.merge(df_adm, df_icustays, on=['subject_id', 'hadm_id'], how='inner')

# Merge the demographics from patients into the ICU dataframe.
df_icu = pd.merge(df_icu, df_patients[['subject_id', 'DOB', 'GENDER']], on='subject_id', how='left')

# Calculate patient age at the time of ICU INTIME (first ICU stay time)
df_icu['age'] = df_icu.apply(lambda row: calculate_age(row['DOB'], row['INTIME']) if pd.notnull(row['DOB']) and pd.notnull(row['INTIME']) else np.nan, axis=1)
df_icu['age_category'] = df_icu['age'].apply(lambda x: categorize_age(x) if pd.notnull(x) else 'Unknown')

# Categorize ethnicity and insurance (if available)
df_icu['ethnicity_category'] = df_icu['ETHNICITY'].apply(lambda x: categorize_ethnicity(x) if pd.notnull(x) else 'Other')
df_icu['insurance_category'] = df_icu['INSURANCE'].apply(lambda x: categorize_insurance(x) if pd.notnull(x) else 'Other')

# Ensure gender is in a consistent format: male or female
df_icu['gender'] = df_icu['GENDER'].str.lower().apply(lambda x: 'male' if 'm' in x else ('female' if 'f' in x else x))

# Calculate short-term mortality: if DEATHTIME exists, label as 1 else 0.
df_icu = calculate_short_term_mortality(df_icu)

df_icu = calculate_readmission(df_icu)

# For each patient, we select the first ICU stay (by INTIME)
df_first_icu = df_icu.sort_values(by='INTIME').groupby('subject_id').first().reset_index()

first_icu_notes = df_notes[df_notes['hadm_id'].isin(df_first_icu['hadm_id'])]

notes_agg = first_icu_notes.groupby(['subject_id', 'hadm_id'])['TEXT'].apply(lambda texts: " ".join(texts)).reset_index()
notes_agg = preprocessing(notes_agg)  # Clean the text

df_first_icu = pd.merge(df_first_icu, notes_agg, on=['subject_id', 'hadm_id'], how='left')

df_first_icu['TEXT'] = df_first_icu['TEXT'].fillna('')

# Check the maximum size (number of characters and word count) of the concatenated notes
df_first_icu['note_length_chars'] = df_first_icu['TEXT'].apply(len)
df_first_icu['note_length_words'] = df_first_icu['TEXT'].apply(lambda x: len(x.split()))
max_chars = df_first_icu['note_length_chars'].max()
max_words = df_first_icu['note_length_words'].max()
print(f"Maximum note length (chars): {max_chars}")
print(f"Maximum note length (words): {max_words}")

# Check the overall shape of the final dataset 
print("Final dataset shape:", df_first_icu.shape)

# Count positive numbers for short-term mortality and readmission
num_mortality_positive = df_first_icu['short_term_mortality'].sum()
num_readmission_positive = df_first_icu['readmission_within_30_days'].sum()
print("Number of patients with short-term mortality (positive):", num_mortality_positive)
print("Number of patients with readmission within 30 days (positive):", num_readmission_positive)

df_first_icu.to_csv('final_first_icu_dataset.csv', index=False)


Maximum note length (chars): 3859128
Maximum note length (words): 525957
Final dataset shape: (46476, 25)
Number of patients with short-term mortality (positive): 4460
Number of patients with readmission within 30 days (positive): 4055


In [13]:
duplicate_patients = df_first_icu[df_first_icu.duplicated(subset=['subject_id'], keep=False)]
print("Number of duplicate patient entries:", duplicate_patients.shape[0])

if duplicate_patients.shape[0] > 0:
    print("Duplicate patient entries found:")
    print(duplicate_patients[['subject_id']].drop_duplicates())
else:
    print("No duplicate patient entries found.")


Number of duplicate patient entries: 0
No duplicate patient entries found.


In [17]:
df_first_icu.columns


Index(['subject_id', 'hadm_id', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME',
       'INSURANCE', 'ETHNICITY', 'ICUSTAY_ID', 'INTIME', 'OUTTIME', 'DOB',
       'GENDER', 'age', 'age_category', 'ethnicity_category',
       'insurance_category', 'gender', 'short_term_mortality',
       'current_admission_dischtime', 'next_admission_icu_intime',
       'next_hadm_id', 'readmission_within_30_days', 'TEXT',
       'note_length_chars', 'note_length_words'],
      dtype='object')

In [19]:
df_first_icu.info

<bound method DataFrame.info of        subject_id  hadm_id           ADMITTIME           DISCHTIME DEATHTIME  \
0               2   163353 2138-07-17 19:04:00 2138-07-21 15:48:00       NaT   
1               3   145834 2101-10-20 19:08:00 2101-10-31 13:58:00       NaT   
2               4   185777 2191-03-16 00:28:00 2191-03-23 18:41:00       NaT   
3               5   178980 2103-02-02 04:31:00 2103-02-04 12:15:00       NaT   
4               6   107064 2175-05-30 07:15:00 2175-06-15 16:00:00       NaT   
...           ...      ...                 ...                 ...       ...   
46471       99985   176670 2181-01-27 02:47:00 2181-02-12 17:05:00       NaT   
46472       99991   151118 2184-12-24 08:30:00 2185-01-05 12:15:00       NaT   
46473       99992   197084 2144-07-25 18:03:00 2144-07-28 17:56:00       NaT   
46474       99995   137810 2147-02-08 08:00:00 2147-02-11 13:15:00       NaT   
46475       99999   113369 2117-12-30 07:15:00 2118-01-04 16:30:00       NaT   

      I

In [21]:
import pandas as pd
import numpy as np

# Example function to split a text into chunks of a given token size.
def split_text_to_chunks(text, chunk_size=512):
    # Tokenize the text by whitespace
    tokens = text.split()
    # Split tokens into chunks of 512
    chunks = [' '.join(tokens[i:i + chunk_size]) for i in range(0, len(tokens), chunk_size)]
    return chunks

# Function that returns a Series with one column per chunk.
def split_into_512_token_columns(text, chunk_size=512):
    chunks = split_text_to_chunks(text, chunk_size)
    chunk_dict = {}
    for i, chunk in enumerate(chunks):
        chunk_dict[f"note_chunk_{i+1}"] = chunk
    return pd.Series(chunk_dict)

# Apply the splitting function to each row in the 'TEXT' column.
df_note_chunks = df_first_icu['TEXT'].apply(split_into_512_token_columns)

# concatenate the original df_first_icu with the new note chunk columns.
df_final = pd.concat([df_first_icu, df_note_chunks], axis=1)
print(df_final.head())
note_chunk_cols = [col for col in df_final.columns if col.startswith('note_chunk_')]
print("Note chunk columns:", note_chunk_cols)


   subject_id  hadm_id           ADMITTIME           DISCHTIME DEATHTIME  \
0           2   163353 2138-07-17 19:04:00 2138-07-21 15:48:00       NaT   
1           3   145834 2101-10-20 19:08:00 2101-10-31 13:58:00       NaT   
2           4   185777 2191-03-16 00:28:00 2191-03-23 18:41:00       NaT   
3           5   178980 2103-02-02 04:31:00 2103-02-04 12:15:00       NaT   
4           6   107064 2175-05-30 07:15:00 2175-06-15 16:00:00       NaT   

  INSURANCE ETHNICITY  ICUSTAY_ID              INTIME             OUTTIME  \
0   Private     ASIAN      243653 2138-07-17 21:20:07 2138-07-17 23:32:21   
1  Medicare     WHITE      211552 2101-10-20 19:10:11 2101-10-26 20:43:09   
2   Private     WHITE      294638 2191-03-16 00:29:31 2191-03-17 16:46:31   
3   Private     ASIAN      214757 2103-02-02 06:04:24 2103-02-02 08:06:00   
4  Medicare     WHITE      228232 2175-05-30 21:30:54 2175-06-03 13:39:54   

   ... note_chunk_1019 note_chunk_1020  note_chunk_1021 note_chunk_1022  \
0  ..

In [25]:
# Save the final dataframe as a CSV file named 'final_unstructured.csv'
df_final.to_csv('final_unstructured.csv', index=False)


In [23]:
df_final.shape

(46476, 1053)