In [None]:
import pandas as pd

# Load MIMIC-III data
admissions = pd.read_csv('ADMISSIONS.csv.gz', compression='gzip')
patients = pd.read_csv('PATIENTS.csv.gz', compression='gzip')
icu_stays = pd.read_csv('ICUSTAYS.csv.gz', compression='gzip')

# Convert relevant columns to datetime format
admissions['ADMITTIME'] = pd.to_datetime(admissions['ADMITTIME'])
admissions['DISCHTIME'] = pd.to_datetime(admissions['DISCHTIME'])
admissions['DEATHTIME'] = pd.to_datetime(admissions['DEATHTIME'])
icu_stays['INTIME'] = pd.to_datetime(icu_stays['INTIME'])
icu_stays['OUTTIME'] = pd.to_datetime(icu_stays['OUTTIME'])

# Rename columns for consistency
admissions.rename(columns={'SUBJECT_ID': 'subject_id', 'HADM_ID': 'hadm_id'}, inplace=True)
patients.rename(columns={'SUBJECT_ID': 'subject_id'}, inplace=True)
icu_stays.rename(columns={'SUBJECT_ID': 'subject_id', 'HADM_ID': 'hadm_id'}, inplace=True)

# Merge ICU stays with admissions to include DEATHTIME, DISCHTIME, and ETHNICITY
icu_stays = pd.merge(
    icu_stays, 
    admissions[['subject_id', 'hadm_id', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME', 'ETHNICITY', 'INSURANCE']], 
    on=['subject_id', 'hadm_id'], 
    how='left'
)

# Function to calculate short-term mortality
def calculate_short_term_mortality(icu_stays):
    # Mark as 1 if DEATHTIME exists within the ICU stay, otherwise 0
    icu_stays['short_term_mortality'] = icu_stays['DEATHTIME'].notnull().astype(int)
    return icu_stays

# Function to calculate readmission within 30 days considering all ICU stays
def calculate_readmission(icu_stays):
    # Sort by subject and ICU admission time
    icu_stays = icu_stays.sort_values(by=['subject_id', 'ICUSTAY_ID'])
    
    # Calculate the difference between OUTTIME of the current ICU stay and INTIME of the next ICU stay
    icu_stays['time_diff'] = (
        icu_stays.groupby('subject_id')['INTIME']
        .shift(-1) - icu_stays['OUTTIME']
    ).dt.days
    
    # Mark readmission within 30 days
    icu_stays['readmitted_within_30_days'] = (
        (icu_stays['time_diff'] <= 30) & (icu_stays['time_diff'] > 0)
    ).astype(int)
    
    # Fill NaN with 0 for patients with only one ICU stay
    icu_stays['readmitted_within_30_days'] = icu_stays['readmitted_within_30_days'].fillna(0).astype(int)
    
    return icu_stays

# Apply the short-term mortality function
icu_stays = calculate_short_term_mortality(icu_stays)

# Apply the readmission function
icu_stays = calculate_readmission(icu_stays)

# Extract the first ICU stay for each patient
first_icu_stays = icu_stays.sort_values(by=['subject_id', 'ICUSTAY_ID']).groupby('subject_id').first().reset_index()

# Merge first ICU stay data with patient details
first_icu_stays = pd.merge(first_icu_stays, patients, on='subject_id', how='left')

# Calculate patient age at ICU admission
def calculate_age(dob, intime):
    return intime.year - dob.year - ((intime.month, intime.day) < (dob.month, dob.day))

first_icu_stays['age'] = first_icu_stays.apply(lambda x: calculate_age(pd.to_datetime(x['DOB']), x['INTIME']), axis=1)
first_icu_stays = first_icu_stays[(first_icu_stays['age'] >= 15) & (first_icu_stays['age'] <= 90)]

# Categorize age into buckets
def categorize_age(age):
    if 15 <= age <= 29:
        return '15-29'
    elif 30 <= age <= 49:
        return '30-49'
    elif 50 <= age <= 69:
        return '50-69'
    else:
        return '70-89'

first_icu_stays['age_bucket'] = first_icu_stays['age'].apply(categorize_age)

# Categorize ethnicity
def categorize_ethnicity(ethnicity):
    ethnicity = ethnicity.upper()
    if ethnicity in [
        'WHITE', 'WHITE - RUSSIAN', 'WHITE - OTHER EUROPEAN', 
        'WHITE - BRAZILIAN', 'WHITE - EASTERN EUROPEAN'
    ]:
        return 'White'
    elif ethnicity in [
        'BLACK/AFRICAN AMERICAN', 'BLACK/CAPE VERDEAN', 
        'BLACK/HAITIAN', 'BLACK/AFRICAN', 'CARIBBEAN ISLAND'
    ]:
        return 'Black'
    elif ethnicity in [
        'HISPANIC OR LATINO', 'HISPANIC/LATINO - PUERTO RICAN', 
        'HISPANIC/LATINO - DOMINICAN', 'HISPANIC/LATINO - GUATEMALAN', 
        'HISPANIC/LATINO - CUBAN', 'HISPANIC/LATINO - SALVADORAN', 
        'HISPANIC/LATINO - CENTRAL AMERICAN (OTHER)', 
        'HISPANIC/LATINO - MEXICAN', 'HISPANIC/LATINO - COLOMBIAN', 
        'HISPANIC/LATINO - HONDURAN'
    ]:
        return 'Hispanic'
    elif ethnicity in [
        'ASIAN', 'ASIAN - CHINESE', 'ASIAN - ASIAN INDIAN', 
        'ASIAN - VIETNAMESE', 'ASIAN - FILIPINO', 'ASIAN - CAMBODIAN', 
        'ASIAN - OTHER', 'ASIAN - KOREAN', 'ASIAN - JAPANESE', 'ASIAN - THAI'
    ]:
        return 'Asian'
    else:
        return 'Other'

first_icu_stays['categorized_ethnicity'] = first_icu_stays['ETHNICITY'].apply(categorize_ethnicity)

# Categorize insurance
def categorize_insurance(insurance):
    if 'MEDICARE' in insurance.upper():
        return 'Medicare'
    elif 'PRIVATE' in insurance.upper():
        return 'Private'
    elif 'MEDICAID' in insurance.upper():
        return 'Medicaid'
    elif 'SELF PAY' in insurance.upper():
        return 'Self Pay'
    else:
        return 'Government'

first_icu_stays['categorized_insurance'] = first_icu_stays['INSURANCE'].apply(categorize_insurance)

# One-hot encoding for categorical columns
first_icu_stays = pd.get_dummies(
    first_icu_stays, 
    columns=['age_bucket', 'categorized_ethnicity', 'categorized_insurance'], 
    drop_first=False
)

# Save the structured data
first_icu_stays.to_csv('structured_first_icu_stays.csv', index=False)

print("Processing complete. File saved as 'structured_first_icu_stays.csv'.")


In [2]:
import pandas as pd

# Load the MIMIC-III datasets
admissions = pd.read_csv('ADMISSIONS.csv.gz', compression='gzip')
patients = pd.read_csv('PATIENTS.csv.gz', compression='gzip')
icu_stays = pd.read_csv('ICUSTAYS.csv.gz', compression='gzip')

# Convert relevant columns to datetime format
admissions['ADMITTIME'] = pd.to_datetime(admissions['ADMITTIME'])
admissions['DISCHTIME'] = pd.to_datetime(admissions['DISCHTIME'])
admissions['DEATHTIME'] = pd.to_datetime(admissions['DEATHTIME'])
icu_stays['INTIME'] = pd.to_datetime(icu_stays['INTIME'])
icu_stays['OUTTIME'] = pd.to_datetime(icu_stays['OUTTIME'])

# Rename columns for consistency
admissions.rename(columns={'SUBJECT_ID': 'subject_id', 'HADM_ID': 'hadm_id'}, inplace=True)
patients.rename(columns={'SUBJECT_ID': 'subject_id'}, inplace=True)
icu_stays.rename(columns={'SUBJECT_ID': 'subject_id', 'HADM_ID': 'hadm_id'}, inplace=True)

# Merge ICU stays with admissions to include DEATHTIME, DISCHTIME, and ETHNICITY
icu_stays = pd.merge(
    icu_stays,
    admissions[['subject_id', 'hadm_id', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME', 'ETHNICITY', 'INSURANCE']],
    on=['subject_id', 'hadm_id'],
    how='left'
)

# Merge GENDER from patients into ICU stays
icu_stays = pd.merge(icu_stays, patients[['subject_id', 'GENDER', 'DOB']], on='subject_id', how='left')

# Function to calculate short-term mortality
def calculate_short_term_mortality(icu_stays):
    icu_stays['short_term_mortality'] = icu_stays['DEATHTIME'].notnull().astype(int)
    return icu_stays

# Function to calculate readmission within 30 days
def calculate_readmission(icu_stays):
    icu_stays = icu_stays.sort_values(by=['subject_id', 'ICUSTAY_ID'])
    icu_stays['time_diff'] = (
        icu_stays.groupby('subject_id')['INTIME']
        .shift(-1) - icu_stays['OUTTIME']
    ).dt.days
    icu_stays['readmitted_within_30_days'] = (
        (icu_stays['time_diff'] <= 30) & (icu_stays['time_diff'] > 0)
    ).astype(int)
    icu_stays['readmitted_within_30_days'] = icu_stays['readmitted_within_30_days'].fillna(0).astype(int)
    return icu_stays

# Apply the short-term mortality and readmission functions
icu_stays = calculate_short_term_mortality(icu_stays)
icu_stays = calculate_readmission(icu_stays)

# Extract the first ICU stay for each patient
first_icu_stays = icu_stays.sort_values(by=['subject_id', 'ICUSTAY_ID']).groupby('subject_id').first().reset_index()

# Calculate age at ICU admission
def calculate_age(dob, intime):
    return intime.year - dob.year - ((intime.month, intime.day) < (dob.month, dob.day))

first_icu_stays['age'] = first_icu_stays.apply(lambda x: calculate_age(pd.to_datetime(x['DOB']), x['INTIME']), axis=1)
first_icu_stays = first_icu_stays[(first_icu_stays['age'] >= 15) & (first_icu_stays['age'] <= 90)]

# Categorize age into buckets
def categorize_age(age):
    if 15 <= age <= 29:
        return '15-29'
    elif 30 <= age <= 49:
        return '30-49'
    elif 50 <= age <= 69:
        return '50-69'
    else:
        return '70-89'

first_icu_stays['age_bucket'] = first_icu_stays['age'].apply(categorize_age)

# Categorize ethnicity
def categorize_ethnicity(ethnicity):
    ethnicity = ethnicity.upper()
    if ethnicity in ['WHITE', 'WHITE - RUSSIAN', 'WHITE - OTHER EUROPEAN', 'WHITE - BRAZILIAN', 'WHITE - EASTERN EUROPEAN']:
        return 'White'
    elif ethnicity in ['BLACK/AFRICAN AMERICAN', 'BLACK/CAPE VERDEAN', 'BLACK/HAITIAN', 'BLACK/AFRICAN', 'CARIBBEAN ISLAND']:
        return 'Black'
    elif ethnicity in [
        'HISPANIC OR LATINO', 'HISPANIC/LATINO - PUERTO RICAN', 'HISPANIC/LATINO - DOMINICAN', 'HISPANIC/LATINO - MEXICAN']:
        return 'Hispanic'
    elif ethnicity in ['ASIAN', 'ASIAN - CHINESE', 'ASIAN - INDIAN']:
        return 'Asian'
    else:
        return 'Other'

first_icu_stays['categorized_ethnicity'] = first_icu_stays['ETHNICITY'].apply(categorize_ethnicity)

# Categorize insurance
def categorize_insurance(insurance):
    if 'MEDICARE' in insurance.upper():
        return 'Medicare'
    elif 'PRIVATE' in insurance.upper():
        return 'Private'
    elif 'MEDICAID' in insurance.upper():
        return 'Medicaid'
    elif 'SELF PAY' in insurance.upper():
        return 'Self Pay'
    else:
        return 'Government'

first_icu_stays['categorized_insurance'] = first_icu_stays['INSURANCE'].apply(categorize_insurance)

# One-hot encoding for categorical columns (excluding GENDER)
first_icu_stays = pd.get_dummies(
    first_icu_stays,
    columns=['age_bucket', 'categorized_ethnicity', 'categorized_insurance'],
    drop_first=False
)

# Save the structured data
first_icu_stays.to_csv('structured_first_icu_stays_with_gender.csv', index=False)

print("Processing complete. File saved as 'structured_first_icu_stays_with_gender.csv'.")


Processing complete. File saved as 'structured_first_icu_stays_with_gender.csv'.


In [4]:
# Check for duplicate patient IDs
duplicate_patients = first_icu_stays[first_icu_stays.duplicated(subset=['subject_id'], keep=False)]
if duplicate_patients.empty:
    print("No duplicate patients found in the dataset.")
else:
    print(f"Number of duplicate patients: {duplicate_patients['subject_id'].nunique()}")
    print("Details of duplicate patients:")
    print(duplicate_patients)


No duplicate patients found in the dataset.


In [6]:
df = pd.read_csv('structured_first_icustays.csv')
print(df.head())
print(df.describe())
print(df.info())


   subject_id  ROW_ID  hadm_id  ICUSTAY_ID DBSOURCE FIRST_CAREUNIT  \
0           3       2   145834      211552  carevue           MICU   
1           4       3   185777      294638  carevue           MICU   
2           6       5   107064      228232  carevue           SICU   
3           9       9   150750      220597  carevue           MICU   
4          11      11   194540      229441  carevue           SICU   

  LAST_CAREUNIT  FIRST_WARDID  LAST_WARDID               INTIME  ...  \
0          MICU            12           12  2101-10-20 19:10:11  ...   
1          MICU            52           52  2191-03-16 00:29:31  ...   
2          SICU            33           33  2175-05-30 21:30:54  ...   
3          MICU            15           15  2149-11-09 13:07:02  ...   
4          SICU            57           57  2178-04-16 06:19:32  ...   

  categorized_ethnicity_Asian  categorized_ethnicity_Black  \
0                       False                        False   
1                     

In [8]:
import pandas as pd

# Load the processed dataset
processed_file = 'structured_first_icu_stays_with_gender.csv'
df = pd.read_csv(processed_file)

# Check the shape of the dataset
dataset_shape = first_icu_stays.shape

# Calculate positive cases for short-term mortality
positive_mortality_count = first_icu_stays['short_term_mortality'].sum()

# Calculate positive cases for readmission within 30 days
positive_readmission_count = first_icu_stays['readmitted_within_30_days'].sum()

dataset_shape, positive_mortality_count, positive_readmission_count


((36615, 37), 4521, 1547)

In [10]:
# Map boolean columns back to categorical values for ethnicity
ethnicity_columns = [
    'categorized_ethnicity_Asian',
    'categorized_ethnicity_Black',
    'categorized_ethnicity_Hispanic',
    'categorized_ethnicity_Other',
    'categorized_ethnicity_White'
]
df['categorized_ethnicity'] = df[ethnicity_columns].idxmax(axis=1).str.replace('categorized_ethnicity_', '')

# Map boolean columns back to categorical values for insurance
insurance_columns = [
    'categorized_insurance_Government',
    'categorized_insurance_Medicaid',
    'categorized_insurance_Medicare',
    'categorized_insurance_Private',
    'categorized_insurance_Self Pay'
]
df['categorized_insurance'] = df[insurance_columns].idxmax(axis=1).str.replace('categorized_insurance_', '')


In [None]:
outcomes = ['short_term_mortality', 'readmitted_within_30_days']


In [None]:
# Demographics and outcomes
demographics = ['GENDER', 'categorized_ethnicity', 'categorized_insurance']
outcomes = ['short_term_mortality', 'readmitted_within_30_days']

# Create a dictionary to store counts
counts = {}
for demographic in demographics:
    counts[demographic] = {}
    for outcome in outcomes:
        grouped = df.groupby([demographic, outcome]).size().unstack(fill_value=0)
        counts[demographic][outcome] = grouped

# Display the counts
for demographic, outcome_data in counts.items():
    print(f"\nCounts for {demographic}:")
    for outcome, grouped_counts in outcome_data.items():
        print(f"\n{outcome}:")
        print(grouped_counts)
        

In [None]:
print(df.columns)
