In [7]:
"""
HEALTHCARE READMISSION ANALYTICS PROJECT
=========================================
Author: Abhishek Saxena
Date: January 2026
Purpose: End-to-end analysis of hospital readmissions using diabetes patient data

This script demonstrates:
1. Data acquisition from public sources
2. Data preprocessing and cleaning
3. Exploratory data analysis
4. Predictive modeling
5. Results visualization and reporting

"""
# ============================================================================
# SECTION 1: ENVIRONMENT SETUP AND LIBRARY IMPORTS
# ============================================================================

# Data manipulation and analysis
import pandas as pd
import numpy as np

#Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    roc_auc_score, 
    roc_curve,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

# Utility libraries
import warnings
import os
from datetime import datetime


# --- Global notebook configuration -------------------------------------------
# Silence every warning category so library chatter does not clutter cell output.
warnings.filterwarnings(action='ignore')

# pandas display limits: no cap on the number of columns, at most 100 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# One shared matplotlib style sheet for every figure in the notebook.
plt.style.use('seaborn-v0_8-darkgrid')

# Seed NumPy's global random state so random draws repeat across runs.
np.random.seed(42)

# --- Startup banner ----------------------------------------------------------
# A single print call with a newline separator emits the same five lines
# (plus the trailing blank line) as five consecutive print calls would.
print(
    "=" * 80,
    "HEALTHCARE READMISSION ANALYTICS PROJECT",
    "=" * 80,
    f"Execution started at: {datetime.now():%Y-%m-%d %H:%M:%S}",
    "All libraries imported successfully!\n",
    sep="\n",
)


HEALTHCARE READMISSION ANALYTICS PROJECT
Execution started at: 2026-01-22 00:21:50
All libraries imported successfully!



In [8]:
# ============================================================================
# SECTION 2: DATA ACQUISITION
# ============================================================================

"""
We use the Diabetes 130-US Hospitals dataset from the UCI ML Repository, loaded from a local copy of the CSV.
This dataset contains 10 years of hospital admission records for diabetes patients.

Dataset characteristics:
- 101,766 hospital admissions
- ~50 features (patient demographics, diagnoses, medications, procedures)
- Target variable: Readmission status
"""

print("\n" + "=" * 80)
print("DATA ACQUISITION")
print("=" * 80)

# Direct link to the zipped dataset on the UCI Machine Learning Repository.
# Nothing in this cell fetches it; it is shown in the instructions below so the
# reader can download the archive by hand.
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip"

# Where the CSV is expected, relative to the notebook's working directory.
# Defined before the instructions so the printed guidance always names the
# exact path that the load step reads.
data_file = 'diabetes_data/diabetic_data.csv'

# Manual download instructions (this notebook performs no automatic download)
print(f"""
DATA DOWNLOAD INSTRUCTIONS:
---------------------------
1. Visit: https://archive.ics.uci.edu/ml/datasets/diabetes+130-us+hospitals+for+years+1999-2008
   (direct ZIP link: {DATA_URL})
2. Click 'Download' to get 'dataset_diabetes.zip'
3. Extract the ZIP file
4. Save 'diabetic_data.csv' as '{data_file}' (relative to your working directory)

Note: this notebook does not download the data automatically.
""")

# Check if data already exists
if os.path.exists(data_file):
    print(f"✓ Data file '{data_file}' found.")
else:
    print(f"✗ Data file '{data_file}' not found.")
    print("Please download manually following the instructions above.")
    print("\nThe load step below will fail until the file is in place.")

# Load the dataset into df_raw.
# On a missing file we print recovery steps and then re-raise: continuing would
# either hit a NameError on df_raw further down (fresh kernel) or silently
# reuse a df_raw left over from an earlier run (stale kernel state).
try:
    print("\nLoading dataset...")
    df_raw = pd.read_csv(data_file)
except FileNotFoundError:
    print("\n⚠ ERROR: Data file not found!")
    print("Creating sample instructions for students...")
    print("\nTo continue with this tutorial:")
    print("1. Download the dataset from the UCI repository")
    print(f"2. Save 'diabetic_data.csv' as '{data_file}'")
    print("3. Re-run this script")
    raise
else:
    # Only reached when read_csv succeeded, so df_raw is guaranteed to exist here.
    print("✓ Dataset loaded successfully!")
    print(f"  - Shape: {df_raw.shape[0]:,} rows × {df_raw.shape[1]} columns")
    # deep=True makes memory_usage account for the contents of object columns.
    print(f"  - Memory usage: {df_raw.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Display basic information about the dataset: sample rows, schema, and
# summary statistics of the freshly loaded frame.
print("\n" + "-" * 80)
print("INITIAL DATA PREVIEW")
print("-" * 80)
print("\nFirst 5 rows:")
print(df_raw.head())

print("\n\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df_raw.info()

print("\n\nBasic Statistics:")
# With default arguments describe() summarises only the numeric columns.
print(df_raw.describe())




DATA ACQUISITION

DATA DOWNLOAD INSTRUCTIONS:
---------------------------
Option 1 (Recommended): Manual Download
1. Visit: https://archive.ics.uci.edu/ml/datasets/diabetes+130-us+hospitals+for+years+1999-2008
2. Click 'Download' to get 'dataset_diabetes.zip'
3. Extract the ZIP file
4. Place 'diabetic_data.csv' in your working directory

Option 2: Direct Download (if accessible)
The script will attempt to download automatically.

✓ Data file 'diabetes_data/diabetic_data.csv' found in current directory.

Loading dataset...
✓ Dataset loaded successfully!
  - Shape: 101,766 rows × 50 columns
  - Memory usage: 192.87 MB

--------------------------------------------------------------------------------
INITIAL DATA PREVIEW
--------------------------------------------------------------------------------

First 5 rows:
   encounter_id  patient_nbr             race  gender      age weight  \
0       2278392      8222157        Caucasian  Female   [0-10)      ?   
1        149190     55629189  