# 01 - Exploratory Data Analysis: Patients & Admissions

## Objective
Understand the structure and characteristics of the MIMIC-IV patient demographics and hospital admissions data.

## Dataset
- **Source:** MIMIC-IV Clinical Database Demo v2.2
- **Tables:** 
  - `patients.csv` - Patient demographics
  - `admissions.csv` - Hospital admissions

---

## 1. Setup & Imports

In [None]:
# --- Imports: standard library first, then third-party ---
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- pandas display options ---
# No cap on displayed columns; at most 100 rows per rendered frame.
for option_name, option_value in (('display.max_columns', None),
                                  ('display.max_rows', 100)):
    pd.set_option(option_name, option_value)

# --- Plot appearance: style sheet first, then the seaborn colour palette ---
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# --- Silence all warnings from here on to keep cell output uncluttered ---
warnings.filterwarnings('ignore')

print(" Libraries imported successfully")

## 2. Load Data

In [None]:
# Relative directory that holds the MIMIC-IV demo `hosp` CSV files
DATA_PATH = '../../../../mimic-iv-demo/mimic-iv-clinical-database-demo-2.2/hosp/'

# Read the patient demographics table and report its dimensions
patients = pd.read_csv(DATA_PATH + 'patients.csv')
n_rows, n_cols = patients.shape
print(f" Patients data loaded: {n_rows} rows, {n_cols} columns")

# Read the hospital admissions table and report its dimensions
admissions = pd.read_csv(DATA_PATH + 'admissions.csv')
n_rows, n_cols = admissions.shape
print(f" Admissions data loaded: {n_rows} rows, {n_cols} columns")

## 3. Patients Data Exploration

### 3.1 First Look at Patients Data

In [None]:
# Preview the first 10 rows of the patients table.
# As the last expression in the cell, the resulting frame is what gets displayed.
patients.head(10)

In [None]:
# Print a concise summary of the patients frame (pandas `DataFrame.info`):
# column names, non-null counts, dtypes and memory usage.
patients.info()

In [None]:
# Descriptive statistics via `DataFrame.describe()`. With default arguments,
# only numeric columns are summarised when the frame has mixed dtypes.
patients.describe()

### 3.2 Missing Values Analysis

In [None]:
# Null count per column, and the same count as a percentage of all rows
missing_patients = patients.isnull().sum()
missing_patients_pct = missing_patients.div(len(patients)).mul(100)

missing_df = pd.DataFrame({
    'Missing Values': missing_patients,
    'Percentage': missing_patients_pct,
})

# Report only the columns that contain at least one null, largest count first
has_missing = missing_df['Missing Values'] > 0
print("Missing Values in Patients:")
print(missing_df.loc[has_missing].sort_values('Missing Values', ascending=False))

### 3.3 Patient Demographics

In [None]:
# Absolute counts, then percentage share, of each value in the gender column
gender_col = patients['gender']

print("Gender Distribution:")
print(gender_col.value_counts())
print("\nPercentage:")
print(gender_col.value_counts(normalize=True).mul(100))

In [None]:
# Summary statistics for the anchor_age column
age = patients['anchor_age']
mean_age = age.mean()

print("Age Statistics:")
print(age.describe())

# One figure, two panels: histogram on the left, boxplot on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

# Left panel: 20-bin histogram with a dashed red line at the mean
hist_ax.hist(age, bins=20, edgecolor='black', alpha=0.7)
hist_ax.set_xlabel('Age')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Age Distribution of Patients')
hist_ax.axvline(mean_age, color='red', linestyle='--', label=f'Mean: {mean_age:.1f}')
hist_ax.legend()

# Right panel: vertical boxplot of the same column
box_ax.boxplot(age, vert=True)
box_ax.set_ylabel('Age')
box_ax.set_title('Age Distribution (Boxplot)')
box_ax.set_xticklabels(['Patients'])

plt.tight_layout()
plt.show()

In [None]:
# Boxplot of anchor_age split by gender, followed by per-gender statistics
fig, ax = plt.subplots(figsize=(10, 6))
patients.boxplot(column='anchor_age', by='gender', ax=ax)
fig.suptitle('')  # blank the figure-level title so only the axes title shows
ax.set_title('Age Distribution by Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Age')
plt.show()

age_by_gender = patients.groupby('gender')['anchor_age']
print("\nAge by Gender:")
print(age_by_gender.describe())

In [None]:
# Mortality analysis.
# A patient is counted as deceased when a date of death (`dod`) is recorded,
# i.e. the value is not null.
deceased = patients['dod'].notna().sum()
total = len(patients)
mortality_rate = (deceased / total) * 100

print(f"Mortality Analysis:")
print(f"  Total patients: {total}")
print(f"  Deceased: {deceased}")
print(f"  Alive: {total - deceased}")
print(f"  Mortality rate: {mortality_rate:.2f}%")

# Visualize
# Build the two counts in a fixed Alive/Deceased order and take the wedge
# labels from the same Series. `value_counts()` orders its result by frequency,
# so pairing it with hard-coded labels would swap the labels whenever deceased
# patients outnumber living ones, and would fail when only one group is present.
fig, ax = plt.subplots(figsize=(8, 6))
mortality_data = pd.Series({'Alive': total - deceased, 'Deceased': deceased})
colors = ['#2ecc71', '#e74c3c']  # green = alive, red = deceased
ax.pie(mortality_data, labels=mortality_data.index, autopct='%1.1f%%', colors=colors, startangle=90)
ax.set_title('Patient Mortality Rate')
plt.show()

## 4. Admissions Data Exploration

### 4.1 First Look at Admissions Data

In [None]:
# Preview the first 10 rows of the admissions table.
# As the last expression in the cell, the resulting frame is what gets displayed.
admissions.head(10)

In [None]:
# Print a concise summary of the admissions frame (pandas `DataFrame.info`):
# column names, non-null counts, dtypes and memory usage.
admissions.info()

In [None]:
# Descriptive statistics via `DataFrame.describe()`. With default arguments,
# only numeric columns are summarised when the frame has mixed dtypes.
admissions.describe()

### 4.2 Missing Values Analysis

In [None]:
# Null count per column, and the same count as a percentage of all rows
missing_admissions = admissions.isnull().sum()
missing_admissions_pct = missing_admissions.div(len(admissions)).mul(100)

missing_adm_df = pd.DataFrame({
    'Missing Values': missing_admissions,
    'Percentage': missing_admissions_pct,
})

# Report only the columns that contain at least one null, largest count first
has_missing = missing_adm_df['Missing Values'] > 0
print("Missing Values in Admissions:")
print(missing_adm_df.loc[has_missing].sort_values('Missing Values', ascending=False))

### 4.3 Admission Characteristics

In [None]:
# Frequency of each admission type: printed, then drawn as a vertical bar chart
type_counts = admissions['admission_type'].value_counts()

print("Admission Type Distribution:")
print(type_counts)

fig, ax = plt.subplots(figsize=(10, 6))
type_counts.plot(kind='bar', ax=ax, color='steelblue', edgecolor='black')
ax.set_title('Distribution of Admission Types')
ax.set_xlabel('Admission Type')
ax.set_ylabel('Count')
ax.tick_params(axis='x', rotation=45)  # tilt the category labels
plt.tight_layout()
plt.show()

In [None]:
# Frequency of each admission location: full table printed, ten largest plotted
location_counts = admissions['admission_location'].value_counts()

print("\nAdmission Location Distribution:")
print(location_counts)

fig, ax = plt.subplots(figsize=(12, 6))
top_locations = location_counts.head(10)
top_locations.plot(kind='barh', ax=ax, color='coral', edgecolor='black')
ax.set_title('Top 10 Admission Locations')
ax.set_xlabel('Count')
ax.set_ylabel('Admission Location')
plt.tight_layout()
plt.show()

In [None]:
# Frequency of each discharge location: full table printed, ten largest plotted
discharge_counts = admissions['discharge_location'].value_counts()

print("\nDischarge Location Distribution:")
print(discharge_counts)

fig, ax = plt.subplots(figsize=(12, 6))
top_discharges = discharge_counts.head(10)
top_discharges.plot(kind='barh', ax=ax, color='lightgreen', edgecolor='black')
ax.set_title('Top 10 Discharge Locations')
ax.set_xlabel('Count')
ax.set_ylabel('Discharge Location')
plt.tight_layout()
plt.show()

In [None]:
# Frequency of each insurance value: printed, then drawn as a pie chart
insurance_counts = admissions['insurance'].value_counts()

print("\nInsurance Type Distribution:")
print(insurance_counts)

fig, ax = plt.subplots(figsize=(10, 6))
insurance_counts.plot(kind='pie', ax=ax, autopct='%1.1f%%', startangle=90)
ax.set_title('Insurance Type Distribution')
ax.set_ylabel('')  # clear the y-axis label on the pie
plt.tight_layout()
plt.show()

In [None]:
# Frequency of each value in the race column: full table printed, ten largest plotted
race_counts = admissions['race'].value_counts()

print("\nRace/Ethnicity Distribution:")
print(race_counts)

fig, ax = plt.subplots(figsize=(12, 6))
top_races = race_counts.head(10)
top_races.plot(kind='barh', ax=ax, color='mediumpurple', edgecolor='black')
ax.set_title('Top 10 Race/Ethnicity Groups')
ax.set_xlabel('Count')
ax.set_ylabel('Race/Ethnicity')
plt.tight_layout()
plt.show()

In [None]:
# In-hospital mortality: sum of hospital_expire_flag over all admission rows,
# expressed as a percentage of the number of admissions
expire_flag = admissions['hospital_expire_flag']
hospital_deaths = expire_flag.sum()
total_admissions = len(admissions)
hospital_mortality_rate = (hospital_deaths / total_admissions) * 100

print("\nHospital Mortality Analysis:")
print(f"  Total admissions: {total_admissions}")
print(f"  Deaths during admission: {hospital_deaths}")
print(f"  Hospital mortality rate: {hospital_mortality_rate:.2f}%")

### 4.4 Length of Stay Analysis

In [None]:
# Parse the admission and discharge timestamps in place
for time_col in ('admittime', 'dischtime'):
    admissions[time_col] = pd.to_datetime(admissions[time_col])

# Length of stay (LOS): elapsed time from admission to discharge, in days
stay_duration = admissions['dischtime'] - admissions['admittime']
admissions['los_days'] = stay_duration.dt.total_seconds() / (24 * 3600)

print("Length of Stay Statistics:")
print(admissions['los_days'].describe())

In [None]:
# Length-of-stay distribution: histogram on the left, boxplot on the right
los = admissions['los_days']
mean_los = los.mean()
median_los = los.median()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

# Left panel: 30-bin histogram with dashed lines at the mean (red) and median (green)
hist_ax.hist(los, bins=30, edgecolor='black', alpha=0.7, color='skyblue')
hist_ax.set_xlabel('Length of Stay (days)')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of Length of Stay')
hist_ax.axvline(mean_los, color='red', linestyle='--', label=f'Mean: {mean_los:.1f} days')
hist_ax.axvline(median_los, color='green', linestyle='--', label=f'Median: {median_los:.1f} days')
hist_ax.legend()

# Right panel: vertical boxplot of the same column
box_ax.boxplot(los, vert=True)
box_ax.set_ylabel('Length of Stay (days)')
box_ax.set_title('Length of Stay (Boxplot)')
box_ax.set_xticklabels(['All Admissions'])

plt.tight_layout()
plt.show()

In [None]:
# Length-of-stay statistics per admission type, then one boxplot per type
los_by_type = admissions.groupby('admission_type')['los_days']

print("\nLength of Stay by Admission Type:")
print(los_by_type.describe())

fig, ax = plt.subplots(figsize=(12, 6))
admissions.boxplot(column='los_days', by='admission_type', ax=ax)
fig.suptitle('')  # blank the figure-level title so only the axes title shows
ax.set_title('Length of Stay by Admission Type')
ax.set_xlabel('Admission Type')
ax.set_ylabel('Length of Stay (days)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 5. Joining Patients & Admissions

### 5.1 Merge the datasets

In [None]:
# Attach patient demographics to every admission row.
# A left join keeps all admissions, including any whose subject_id has no
# matching row in `patients`.
# `validate='many_to_one'` makes pandas raise a MergeError if `subject_id` is
# duplicated in `patients`. Without it, a duplicate would silently multiply
# admission rows and inflate every admission-level statistic computed from
# the merged frame.
patient_admissions = admissions.merge(
    patients,
    on='subject_id',
    how='left',
    validate='many_to_one',
)

print(f" Merged dataset created: {patient_admissions.shape[0]} rows, {patient_admissions.shape[1]} columns")
patient_admissions.head()

### 5.2 Admissions per Patient

In [None]:
# One row per subject_id with the number of admission rows it has
admissions_per_patient = (
    admissions
    .groupby('subject_id')
    .size()
    .reset_index(name='num_admissions')
)
num_adm = admissions_per_patient['num_admissions']

print("Admissions per Patient Statistics:")
print(num_adm.describe())

print(f"\nPatients with multiple admissions: {(num_adm > 1).sum()}")
print(f"Max admissions for a single patient: {num_adm.max()}")

In [None]:
# Admissions per patient: bar chart of the counts, plus a single-vs-multiple pie
num_adm = admissions_per_patient['num_admissions']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_ax, pie_ax = axes

# Left panel: how many patients have 1, 2, 3, ... admissions (ordered by count value)
patients_per_count = num_adm.value_counts().sort_index()
patients_per_count.plot(kind='bar', ax=bar_ax, color='teal', edgecolor='black')
bar_ax.set_xlabel('Number of Admissions')
bar_ax.set_ylabel('Number of Patients')
bar_ax.set_title('Distribution of Admissions per Patient')

# Right panel: patients with exactly one admission versus more than one
single_adm = (num_adm == 1).sum()
multiple_adm = (num_adm > 1).sum()
pie_ax.pie(
    [single_adm, multiple_adm],
    labels=['Single Admission', 'Multiple Admissions'],
    autopct='%1.1f%%',
    colors=['lightblue', 'orange'],
    startangle=90,
)
pie_ax.set_title('Single vs Multiple Admissions')

plt.tight_layout()
plt.show()

### 5.3 Demographics Analysis by Admission Type

In [None]:
# Descriptive statistics of anchor_age within each admission type
age_by_type = patient_admissions.groupby('admission_type')['anchor_age']

print("Age by Admission Type:")
print(age_by_type.describe())

In [None]:
# Row-normalised cross-tabulation: within each admission type, the percentage
# of admission rows belonging to each gender
gender_admission = pd.crosstab(
    index=patient_admissions['admission_type'],
    columns=patient_admissions['gender'],
    normalize='index',
).mul(100)

print("\nGender Distribution by Admission Type (%):")
print(gender_admission)

# Grouped bar chart, one bar per gender within each admission type
gender_admission.plot.bar(figsize=(10, 6), color=['pink', 'lightblue'], edgecolor='black')
plt.title('Gender Distribution by Admission Type')
plt.xlabel('Admission Type')
plt.ylabel('Percentage')
plt.xticks(rotation=45)
plt.legend(title='Gender')
plt.tight_layout()
plt.show()

### 5.4 Mortality Analysis

In [None]:
# Contrast the in-hospital death count (hospital_expire_flag) with the number
# of patients that have a recorded date of death (dod)
in_hospital_deaths = admissions['hospital_expire_flag'].sum()
recorded_deaths = patients['dod'].notna().sum()

print("Mortality Comparison:")
print(f"Hospital deaths (during admission): {in_hospital_deaths}")
print(f"Overall deaths (patients with dod): {recorded_deaths}")
print("\nNote: Overall deaths include patients who died after discharge")

In [None]:
# Per admission type: number of deaths, number of admissions, and their ratio
expire_by_type = patient_admissions.groupby('admission_type')['hospital_expire_flag']
mortality_by_type = expire_by_type.agg(['sum', 'count', 'mean']).rename(
    columns={'sum': 'Deaths', 'count': 'Total Admissions', 'mean': 'Mortality Rate'}
)
# Express the rate as a percentage
mortality_by_type['Mortality Rate'] = mortality_by_type['Mortality Rate'].mul(100)

print("\nMortality by Admission Type:")
print(mortality_by_type.sort_values('Mortality Rate', ascending=False))

# Bar chart of the rate, highest first
fig, ax = plt.subplots(figsize=(10, 6))
rate_desc = mortality_by_type['Mortality Rate'].sort_values(ascending=False)
rate_desc.plot(kind='bar', ax=ax, color='crimson', edgecolor='black')
ax.set_title('Hospital Mortality Rate by Admission Type')
ax.set_xlabel('Admission Type')
ax.set_ylabel('Mortality Rate (%)')
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()

## 6. Key Findings & Summary

In [None]:
# Final text report. It relies on names created by earlier cells:
# `admissions['los_days']`, `hospital_mortality_rate` and `admissions_per_patient`.
rule = "=" * 80  # horizontal rule used around the banner and the footer

# Values reused by several report lines
n_patients = len(patients)
n_admissions = len(admissions)
age = patients['anchor_age']
los = admissions['los_days']
num_adm = admissions_per_patient['num_admissions']

print(rule)
print("KEY FINDINGS - PATIENTS & ADMISSIONS EXPLORATORY ANALYSIS")
print(rule)

print("\n DATASET OVERVIEW:")
print(f"  • Total unique patients: {n_patients}")
print(f"  • Total admissions: {n_admissions}")
print(f"  • Average admissions per patient: {n_admissions/n_patients:.2f}")

print("\n PATIENT DEMOGRAPHICS:")
print(f"  • Gender distribution: {patients['gender'].value_counts().to_dict()}")
print(f"  • Average age: {age.mean():.1f} years")
print(f"  • Age range: {age.min()} - {age.max()} years")
print(f"  • Overall mortality rate: {((patients['dod'].notna().sum() / n_patients) * 100):.2f}%")

print("\n ADMISSIONS CHARACTERISTICS:")
print(f"  • Most common admission type: {admissions['admission_type'].mode()[0]}")
print(f"  • Average length of stay: {los.mean():.1f} days")
print(f"  • Median length of stay: {los.median():.1f} days")
print(f"  • Hospital mortality rate: {hospital_mortality_rate:.2f}%")

print("\n READMISSIONS:")
print(f"  • Patients with multiple admissions: {(num_adm > 1).sum()}")
print(f"  • Max admissions for single patient: {num_adm.max()}")

print("\n" + rule)
print(" Exploratory analysis completed!")
print(rule)