In [2]:
import pandas as pd

# Load the MIMIC-III datasets
admissions = pd.read_csv('ADMISSIONS.csv.gz', compression='gzip')
patients = pd.read_csv('PATIENTS.csv.gz', compression='gzip')
icu_stays = pd.read_csv('ICUSTAYS.csv.gz', compression='gzip')

# Convert relevant columns to datetime format
admissions['ADMITTIME'] = pd.to_datetime(admissions['ADMITTIME'])
admissions['DISCHTIME'] = pd.to_datetime(admissions['DISCHTIME'])
admissions['DEATHTIME'] = pd.to_datetime(admissions['DEATHTIME'])
icu_stays['INTIME'] = pd.to_datetime(icu_stays['INTIME'])
icu_stays['OUTTIME'] = pd.to_datetime(icu_stays['OUTTIME'])

# Rename columns for consistency
admissions.rename(columns={'SUBJECT_ID': 'subject_id', 'HADM_ID': 'hadm_id'}, inplace=True)
patients.rename(columns={'SUBJECT_ID': 'subject_id'}, inplace=True)
icu_stays.rename(columns={'SUBJECT_ID': 'subject_id', 'HADM_ID': 'hadm_id'}, inplace=True)

# Merge ICU stays with admissions to include DEATHTIME, DISCHTIME, and ETHNICITY
icu_stays = pd.merge(
    icu_stays,
    admissions[['subject_id', 'hadm_id', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME', 'ETHNICITY', 'INSURANCE']],
    on=['subject_id', 'hadm_id'],
    how='left'
)

# Merge GENDER from patients into ICU stays
icu_stays = pd.merge(icu_stays, patients[['subject_id', 'GENDER', 'DOB']], on='subject_id', how='left')

# Function to calculate short-term mortality
def calculate_short_term_mortality(icu_stays):
    icu_stays['short_term_mortality'] = icu_stays['DEATHTIME'].notnull().astype(int)
    return icu_stays

def calculate_readmission(icu_stays):
    if 'DISCHTIME' not in icu_stays.columns:
        raise KeyError("DISCHTIME column is missing in the input data.")
    if 'INTIME' not in icu_stays.columns:
        raise KeyError("INTIME column is missing in the input data.")
    if 'hadm_id' not in icu_stays.columns:
        raise KeyError("hadm_id column is missing in the input data.")
    
    # Sort by subject_id, admission time, and ICU intime
    icu_stays = icu_stays.sort_values(by=['subject_id', 'ADMITTIME', 'INTIME'])
    
    # Extract DISCHTIME of the current admission
    icu_stays['current_admission_dischtime'] = icu_stays.groupby(['subject_id', 'hadm_id'])['DISCHTIME'].transform('first')
    
    # Sort by subject_id and admission time for cross-admission comparison
    icu_stays = icu_stays.sort_values(by=['subject_id', 'ADMITTIME'])
    
    # Identify the INTIME of the first ICU stay in the next admission
    icu_stays['next_admission_icu_intime'] = icu_stays.groupby('subject_id')['INTIME'].shift(-1)
    icu_stays['next_hadm_id'] = icu_stays.groupby('subject_id')['hadm_id'].shift(-1)
    
    # Calculate time difference between DISCHTIME of current admission and INTIME of next ICU stay
    icu_stays['readmission_within_30_days'] = (
        (icu_stays['next_admission_icu_intime'] - icu_stays['current_admission_dischtime']).dt.days <= 30
    ).astype(int)
    
    # Fill NaN values with 0 for readmission status (e.g., no next admission)
    icu_stays['readmission_within_30_days'] = icu_stays['readmission_within_30_days'].fillna(0).astype(int)
    
    return icu_stays

# Apply the short-term mortality and readmission functions
icu_stays = calculate_short_term_mortality(icu_stays)
icu_stays = calculate_readmission(icu_stays)

# Extract the first ICU stay for each patient
first_icu_stays = icu_stays.sort_values(by=['subject_id', 'ICUSTAY_ID']).groupby('subject_id').first().reset_index()

# Calculate age at ICU admission
def calculate_age(dob, intime):
    return intime.year - dob.year - ((intime.month, intime.day) < (dob.month, dob.day))

first_icu_stays['age'] = first_icu_stays.apply(lambda x: calculate_age(pd.to_datetime(x['DOB']), x['INTIME']), axis=1)
first_icu_stays = first_icu_stays[(first_icu_stays['age'] >= 15) & (first_icu_stays['age'] <= 90)]

# Categorize age into buckets
def categorize_age(age):
    if 15 <= age <= 29:
        return '15-29'
    elif 30 <= age <= 49:
        return '30-49'
    elif 50 <= age <= 69:
        return '50-69'
    else:
        return '70-89'

first_icu_stays['age_bucket'] = first_icu_stays['age'].apply(categorize_age)

# Categorize ethnicity
def categorize_ethnicity(ethnicity):
    ethnicity = ethnicity.upper()
    if ethnicity in ['WHITE', 'WHITE - RUSSIAN', 'WHITE - OTHER EUROPEAN', 'WHITE - BRAZILIAN', 'WHITE - EASTERN EUROPEAN']:
        return 'White'
    elif ethnicity in ['BLACK/AFRICAN AMERICAN', 'BLACK/CAPE VERDEAN', 'BLACK/HAITIAN', 'BLACK/AFRICAN', 'CARIBBEAN ISLAND']:
        return 'Black'
    elif ethnicity in [
        'HISPANIC OR LATINO', 'HISPANIC/LATINO - PUERTO RICAN', 'HISPANIC/LATINO - DOMINICAN', 'HISPANIC/LATINO - MEXICAN']:
        return 'Hispanic'
    elif ethnicity in ['ASIAN', 'ASIAN - CHINESE', 'ASIAN - INDIAN']:
        return 'Asian'
    else:
        return 'Other'

first_icu_stays['categorized_ethnicity'] = first_icu_stays['ETHNICITY'].apply(categorize_ethnicity)

# Categorize insurance
def categorize_insurance(insurance):
    if 'MEDICARE' in insurance.upper():
        return 'Medicare'
    elif 'PRIVATE' in insurance.upper():
        return 'Private'
    elif 'MEDICAID' in insurance.upper():
        return 'Medicaid'
    elif 'SELF PAY' in insurance.upper():
        return 'Self Pay'
    else:
        return 'Government'

first_icu_stays['categorized_insurance'] = first_icu_stays['INSURANCE'].apply(categorize_insurance)

# One-hot encoding for categorical columns (excluding GENDER)
first_icu_stays = pd.get_dummies(
    first_icu_stays,
    columns=['age_bucket', 'categorized_ethnicity', 'categorized_insurance'],
    drop_first=False
)

# Save the structured data
first_icu_stays.to_csv('structured_first_icu_stays.csv', index=False)

print("Processing complete. File saved as 'structured_first_icu_stays_with_gender.csv'.")


Processing complete. File saved as 'structured_first_icu_stays_with_gender.csv'.


In [4]:
# Check for duplicate patient IDs
duplicate_patients = first_icu_stays[first_icu_stays.duplicated(subset=['subject_id'], keep=False)]
if duplicate_patients.empty:
    print("No duplicate patients found in the dataset.")
else:
    print(f"Number of duplicate patients: {duplicate_patients['subject_id'].nunique()}")
    print("Details of duplicate patients:")
    print(duplicate_patients)


No duplicate patients found in the dataset.


In [6]:
import pandas as pd

# Load the processed dataset
processed_file = 'structured_first_icu_stays.csv'
df = pd.read_csv(processed_file)

# Strip whitespace from column names
df.columns = df.columns.str.strip()

# Check the shape of the dataset
dataset_shape = df.shape

# Calculate positive cases for short-term mortality
positive_mortality_count = df['short_term_mortality'].sum()

# Calculate positive cases for readmission within 30 days
positive_readmission_count = df['readmission_within_30_days'].sum()

# Output results
dataset_shape, positive_mortality_count, positive_readmission_count


((36615, 39), 4521, 2095)