# PRJ-002: Diabetes 130-US Hospitals for Years 1999-2008 

In [None]:
# Lookup table: numeric code in the `admission_type_id` column -> description.
# Code 7 is "Trauma Center" in the dataset's published ID mapping; "Not Mapped"
# is code 8. The previous table labelled 7 as "Not Mapped" and had no entry
# for 8, so decoding trauma admissions gave the wrong label and code 8 raised
# a KeyError.
admission_type_id = {
    1: "Emergency",
    2: "Urgent",
    3: "Elective",
    4: "Newborn",
    5: "Not Available",
    6: "NULL",
    7: "Trauma Center",
    8: "Not Mapped",
}

# Lookup table: numeric code in the `discharge_disposition_id` column ->
# description. Built from (code, label) pairs; the pairs are listed in the
# same order as before (note that code 30 precedes 27-29), so iteration order
# is unchanged.
discharge_disposition_id = dict((
    (1, "Discharged to home"),
    (2, "Discharged/transferred to another short term hospital"),
    (3, "Discharged/transferred to SNF"),
    (4, "Discharged/transferred to ICF"),
    (5, "Discharged/transferred to another type of inpatient care institution"),
    (6, "Discharged/transferred to home with home health service"),
    (7, "Left AMA"),
    (8, "Discharged/transferred to home under care of Home IV provider"),
    (9, "Admitted as an inpatient to this hospital"),
    (10, "Neonate discharged to another hospital for neonatal aftercare"),
    (11, "Expired"),
    (12, "Still patient or expected to return for outpatient services"),
    (13, "Hospice / home"),
    (14, "Hospice / medical facility"),
    (15, "Discharged/transferred within this institution to Medicare approved swing bed"),
    (16, "Discharged/transferred/referred another institution for outpatient services"),
    (17, "Discharged/transferred/referred to this institution for outpatient services"),
    (18, "NULL"),
    (19, "Expired at home. Medicaid only, hospice."),
    (20, "Expired in a medical facility. Medicaid only, hospice."),
    (21, "Expired, place unknown. Medicaid only, hospice."),
    (22, "Discharged/transferred to another rehab fac including rehab units of a hospital"),
    (23, "Discharged/transferred to a long term care hospital"),
    (24, "Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare."),
    (25, "Not Mapped"),
    (26, "Unknown/Invalid"),
    (30, "Discharged/transferred to another Type of Health Care Institution not Defined Elsewhere"),
    (27, "Discharged/transferred to a federal health care facility."),
    (28, "Discharged/transferred/referred to a psychiatric hospital or psychiatric distinct part unit of a hospital"),
    (29, "Discharged/transferred to a Critical Access Hospital (CAH)."),
))

# Lookup table: numeric code in the `admission_source_id` column ->
# description. Built from (code, label) pairs in ascending code order; there
# is no entry for code 16.
admission_source_id = dict((
    (1, "Physician Referral"),
    (2, "Clinic Referral"),
    (3, "HMO Referral"),
    (4, "Transfer from a hospital"),
    (5, "Transfer from a Skilled Nursing Facility (SNF)"),
    (6, "Transfer from another health care facility"),
    (7, "Emergency Room"),
    (8, "Court/Law Enforcement"),
    (9, "Not Available"),
    (10, "Transfer from critical access hospital"),
    (11, "Normal Delivery"),
    (12, "Premature Delivery"),
    (13, "Sick Baby"),
    (14, "Extramural Birth"),
    (15, "Not Available"),
    (17, "NULL"),
    (18, "Transfer From Another Home Health Agency"),
    (19, "Readmission to Same Home Health Agency"),
    (20, "Not Mapped"),
    (21, "Unknown/Invalid"),
    (22, "Transfer from hospital inpt/same fac reslt in a sep claim"),
    (23, "Born inside this hospital"),
    (24, "Born outside this hospital"),
    (25, "Transfer from Ambulatory Surgery Center"),
    (26, "Transfer from Hospice"),
))

# Lookup table: numeric medication code -> dosage change during the encounter.
# The codes must mirror the mapping applied to the medication columns in the
# encoding step ({'Steady': 1, 'No': 0, 'Down': -1, 'Up': 2}); the previous
# table used 1/2/3 for Down/Up/Steady and so decoded encoded values wrongly.
dosage = {-1: "Down", 0: "No", 1: "Steady", 2: "Up"}

## Imports
Copied from previous project to have a foundation. We can delete unused ones later.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import gaussian_kde
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## Import dataset and print basic info

In [None]:
# Load the raw table from the CSV in the working directory.
data = pd.read_csv("diabetic_data.csv")

# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
data.info()
print(data.head())

# Replace the "?" and "None" placeholder strings with NaN in a single pass so
# that pandas' missing-value tooling recognises them, then show the per-column
# missing counts as the cell output.
data = data.replace(['?', 'None'], np.nan)
data.isnull().sum()

## Preprocess Data

### Handle missing values
Drop columns with too many missing values and fill in others.

In [None]:
# Mostly missing values and/or low relevance
# E.g. max_glu_serum and A1Cresult are relevant, but have 94% and 83% missing values.
data = data.drop(columns=['weight',
                          'payer_code',
                          'max_glu_serum',
                          'A1Cresult',
                          'encounter_id',
                          'patient_nbr'])

# Treat "missing" as its own category instead of dropping rows:
# medical_specialty has many missing values but is potentially highly relevant;
# race and the three diagnosis columns have only a few.
#
# The fill is assigned back to the frame rather than calling
# data[col].fillna(..., inplace=True): that form operates on a temporary
# Series, is deprecated in pandas 2.x, and leaves `data` unchanged once
# copy-on-write is enabled.
fill_unknown_columns = ['medical_specialty', 'race', 'diag_1', 'diag_2', 'diag_3']
data[fill_unknown_columns] = data[fill_unknown_columns].fillna('Unknown')

# Remove 3 rows where gender is "Unknown/Invalid".
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not act on a slice of the unfiltered data.
data = data[data['gender'] != 'Unknown/Invalid'].copy()

# Display updated summary of missing values
print(data.isnull().sum())

### Encoding

In [None]:
# Binary Encoding: two-valued columns become 0/1 flags.
data['gender'] = data['gender'].map({'Male': 1, 'Female': 0})
data['change'] = data['change'].map({'Ch': 1, 'No': 0})
data['diabetesMed'] = data['diabetesMed'].map({'Yes': 1, 'No': 0})

# Numeric mapping for medication columns
medication_columns = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
                      'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
                      'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
                      'miglitol', 'troglitazone', 'tolazamide', 'examide',
                      'citoglipton', 'insulin', 'glyburide-metformin',
                      'glipizide-metformin', 'glimepiride-pioglitazone',
                      'metformin-rosiglitazone', 'metformin-pioglitazone']

medication_codes = {'Steady': 1, 'No': 0, 'Down': -1, 'Up': 2}

# Map each column explicitly and cast to int instead of relying on
# DataFrame.replace to downcast object columns (that implicit downcast is
# deprecated). Any value outside `medication_codes` maps to NaN, so the cast
# raises here rather than letting an unencoded string reach the model.
data[medication_columns] = (
    data[medication_columns]
    .apply(lambda column: column.map(medication_codes))
    .astype(int)
)

# One-Hot Encoding for categorical columns.
# admission_source_id is included alongside the other two *_id columns: all
# three hold nominal codes, so leaving one of them as a plain number would
# impose an arbitrary ordering on its categories.
nominal_columns = ['race', 'medical_specialty', 'admission_type_id',
                   'discharge_disposition_id', 'admission_source_id',
                   'diag_1', 'diag_2', 'diag_3']
data = pd.get_dummies(data, columns=nominal_columns, drop_first=True)


# Ordinal Encoding for ordered categories.
# The category lists fix the rank of each label: age brackets ascend, and
# readmitted is coded NO=0, >30=1, <30=2.
age_order = [['[0-10)', '[10-20)', '[20-30)', '[30-40)', '[40-50)', '[50-60)', '[60-70)', '[70-80)', '[80-90)', '[90-100)']]
readmitted_order = [['NO', '>30', '<30']]
ordinal_encoder = OrdinalEncoder(categories=age_order + readmitted_order)
data[['age', 'readmitted']] = ordinal_encoder.fit_transform(data[['age', 'readmitted']])


### Splitting

In [None]:
# Split into features and target
X = data.drop(columns='readmitted')
y = data['readmitted']

# Split into training (80%) and testing (20%).
# stratify=y keeps the proportion of each readmission class the same in both
# splits, so the test metrics are not skewed by an unlucky draw of a rarer
# class; random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### Scaling

In [None]:
# Standardise the count-like numeric features to zero mean / unit variance.
scaler = StandardScaler()
# number_diagnoses is listed with the other count columns so that it is not
# left on its raw scale.
# NOTE(review): the column name is taken from the UCI schema — confirm it is
# present in your copy of the CSV.
numerical_features = ['time_in_hospital', 'num_lab_procedures', 'num_procedures',
                      'num_medications', 'number_outpatient', 'number_emergency',
                      'number_inpatient', 'number_diagnoses']

# Work on explicit copies: the frames returned by train_test_split are derived
# from X, and assigning columns into them directly can trigger pandas'
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit on the training split only, then reuse those statistics for the test
# split so no information from the test data leaks into the scaler.
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

In [None]:
# Troubleshooting aid: total number of NaN cells left in the training features
# (per-column counts summed into a single figure).
missing_per_column = X_train.isnull().sum()
print("Missing values in X_train:", missing_per_column.sum())

## Model choice
We decided to start with two classification algorithms and one clustering algorithm.
### Logistic Regression
- Effective for roughly linear relationships
- Interpretable (giving us an idea of the meaning of each feature)
- Efficient for large datasets
### Random Forest
- Captures complex, non-linear relationships (good counterpart to Logistic Regression)
- Allows determining feature importance
- Usually performs well without hyperparameter tuning
### K-Means Clustering
- Easy to interpret
- Effective for grouping patients with similar characteristics
- Might still be fast enough for a dataset of our size
- There are different ways to make K-Means faster, if needed.

## Model implementation

### Logistic Regression

In [None]:
# Logistic-regression baseline: fit on the training split, predict the test
# split, then report accuracy, the per-class report and the confusion matrix.
log_reg = LogisticRegression(random_state=42, max_iter=1000, solver='saga')
log_reg.fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred_log_reg = log_reg.predict(X_test)

# Headline accuracy first, followed by the two detailed summaries in order.
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log_reg))
for summarize in (classification_report, confusion_matrix):
    print(summarize(y_test, y_pred_log_reg))