## **STEP 1: Loading and understanding the dataset**

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

dataset = pd.read_csv('diabetic_data.csv')

In [2]:
# Display basic information
print("Dataset Shape:", dataset.shape)
print("Dataset Info:")
dataset.info()

Dataset Shape: (101766, 50)
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures   

In [3]:
# Display the first few rows
print("First 5 Rows of the Dataset:")
print(dataset.head())

First 5 Rows of the Dataset:
   encounter_id  patient_nbr             race  gender      age weight  \
0       2278392      8222157        Caucasian  Female   [0-10)      ?   
1        149190     55629189        Caucasian  Female  [10-20)      ?   
2         64410     86047875  AfricanAmerican  Female  [20-30)      ?   
3        500364     82442376        Caucasian    Male  [30-40)      ?   
4         16680     42519267        Caucasian    Male  [40-50)      ?   

   admission_type_id  discharge_disposition_id  admission_source_id  \
0                  6                        25                    1   
1                  1                         1                    7   
2                  1                         1                    7   
3                  1                         1                    7   
4                  1                         1                    7   

   time_in_hospital  ... citoglipton insulin  glyburide-metformin  \
0                 1  ...          No

In [4]:
# Summary statistics for numerical features
print("Summary Statistics for Numerical Features:")
print(dataset.describe())

Summary Statistics for Numerical Features:
       encounter_id   patient_nbr  admission_type_id  \
count  1.017660e+05  1.017660e+05      101766.000000   
mean   1.652016e+08  5.433040e+07           2.024006   
std    1.026403e+08  3.869636e+07           1.445403   
min    1.252200e+04  1.350000e+02           1.000000   
25%    8.496119e+07  2.341322e+07           1.000000   
50%    1.523890e+08  4.550514e+07           1.000000   
75%    2.302709e+08  8.754595e+07           3.000000   
max    4.438672e+08  1.895026e+08           8.000000   

       discharge_disposition_id  admission_source_id  time_in_hospital  \
count             101766.000000        101766.000000     101766.000000   
mean                   3.715642             5.754437          4.395987   
std                    5.280166             4.064081          2.985108   
min                    1.000000             1.000000          1.000000   
25%                    1.000000             1.000000          2.000000   
50%     

In [5]:
# Check for missing values
missing_values = dataset.isnull().sum()
print("Missing Values per Column:")
print(missing_values[missing_values > 0])

Missing Values per Column:
max_glu_serum    96420
A1Cresult        84748
dtype: int64


## **STEP 2: Data Preparation**

In [6]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [7]:
# Remove 'weight' column due to high percentage of missing values
dataset_cleaned = dataset.drop(columns=['weight'])

In [8]:
# Treat missing values for 'medical_specialty' and 'payer_code' as 'Unknown'
dataset_cleaned['medical_specialty'] = dataset_cleaned['medical_specialty'].replace('?', 'Unknown')
dataset_cleaned['payer_code'] = dataset_cleaned['payer_code'].replace('?', 'Unknown')

In [9]:
# Impute missing values in 'race' with the mode
race_mode = dataset_cleaned['race'].replace('?', pd.NA).mode()[0]
dataset_cleaned['race'] = dataset_cleaned['race'].replace('?', race_mode)

In [10]:
# Impute missing values in 'diag_1', 'diag_2', 'diag_3' with 'Unknown'
diagnosis_cols = ['diag_1', 'diag_2', 'diag_3']
for col in diagnosis_cols:
    dataset_cleaned[col] = dataset_cleaned[col].replace('?', 'Unknown')

In [11]:
# Handling Class Imbalance (for informational purposes, actual handling might vary based on modeling approach)
target_distribution = dataset_cleaned['readmitted'].value_counts(normalize=True) * 100
print("Distribution of 'readmitted':\n", target_distribution)

Distribution of 'readmitted':
 readmitted
NO     53.911916
>30    34.928169
<30    11.159916
Name: proportion, dtype: float64


In [12]:
# Feature Selection and Encoding

# Selecting categorical variables for one-hot encoding
categorical_cols = ['race', 'gender', 'age', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
                    'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult'] + list(dataset_cleaned.columns[24:48])

In [13]:
# Keeping relevant numerical columns and excluding identifiers
numerical_cols = ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications',
                  'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']

In [14]:
# Encoding categorical variables with OneHotEncoder and outputting as a sparse matrix to save memory
encoder = OneHotEncoder(sparse_output=True, drop='first')  # Using sparse output to save memory
encoded_features_sparse = encoder.fit_transform(dataset_cleaned[categorical_cols])

In [15]:
# Since we're using sparse matrices, direct conversion to a DataFrame is skipped to save memory
# Prepare numerical data as a DataFrame for potential dense input requirements
numerical_data_df = dataset_cleaned[numerical_cols].reset_index(drop=True)

In [16]:
# Output shapes as a check
print("Shape of encoded features:", encoded_features_sparse.shape)
print("Shape of numerical data:", numerical_data_df.shape)

Shape of encoded features: (101766, 2369)
Shape of numerical data: (101766, 8)


## **STEP 3: Model Implementation**

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from scipy.sparse import hstack

# Prepare the full feature set by combining numerical and encoded categorical features
X = hstack([numerical_data_df, encoded_features_sparse])
y = dataset_cleaned['readmitted'].apply(lambda x: 1 if x == '<30' else 0)  # Binary classification for readmission <30 days

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# **A. Single Machine Learning Algorithms**

**1. LOGISTIC REGRESSION MODEL**

In [18]:
# Initialize and train the Logistic Regression model
lr_model = LogisticRegression(max_iter=100, solver='liblinear', random_state=42)
lr_model.fit(X_train, y_train)

# Predictions
y_pred_train = lr_model.predict(X_train)
y_pred_test = lr_model.predict(X_test)

# Evaluate the model
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
f1_train = f1_score(y_train, y_pred_train)
f1_test = f1_score(y_test, y_pred_test)

print("Training Accuracy:", accuracy_train)
print("Testing Accuracy:", accuracy_test)
print("Training F1 Score:", f1_train)
print("Testing F1 Score:", f1_test)

Training Accuracy: 0.8889967081019997
Testing Accuracy: 0.888375749238479
Training F1 Score: 0.049237243556023144
Testing F1 Score: 0.04215851602023609


**2. DECISION TREE**

In [19]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)

# Train the model
dt_model.fit(X_train, y_train)

# Predictions
y_pred_train_dt = dt_model.predict(X_train)
y_pred_test_dt = dt_model.predict(X_test)

# Evaluate the model
accuracy_train_dt = accuracy_score(y_train, y_pred_train_dt)
accuracy_test_dt = accuracy_score(y_test, y_pred_test_dt)
f1_train_dt = f1_score(y_train, y_pred_train_dt)
f1_test_dt = f1_score(y_test, y_pred_test_dt)

print("Decision Tree - Training Accuracy:", accuracy_train_dt)
print("Decision Tree - Testing Accuracy:", accuracy_test_dt)
print("Decision Tree - Training F1 Score:", f1_train_dt)
print("Decision Tree - Testing F1 Score:", f1_test_dt)



Decision Tree - Training Accuracy: 1.0
Decision Tree - Testing Accuracy: 0.8226392846614916
Decision Tree - Training F1 Score: 1.0
Decision Tree - Testing F1 Score: 0.16551086453999075


**K-NEAREST NEIGHBORS (KNN)**

In [20]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize the KNN model
knn_model = KNeighborsClassifier(n_neighbors=3)

# Train the model
knn_model.fit(X_train, y_train)

# Predictions
y_pred_train_knn = knn_model.predict(X_train)
y_pred_test_knn = knn_model.predict(X_test)

# Evaluate the model
accuracy_train_knn = accuracy_score(y_train, y_pred_train_knn)
accuracy_test_knn = accuracy_score(y_test, y_pred_test_knn)
f1_train_knn = f1_score(y_train, y_pred_train_knn)
f1_test_knn = f1_score(y_test, y_pred_test_knn)

print("K-NEAREST NEIGHBORS - Training Accuracy:", accuracy_train_dt)
print("K-NEAREST NEIGHBORS - Testing Accuracy:", accuracy_test_dt)
print("K-NEAREST NEIGHBORS - Training F1 Score:", f1_train_dt)
print("K-NEAREST NEIGHBORS - Testing F1 Score:", f1_test_dt)

K-NEAREST NEIGHBORS - Training Accuracy: 1.0
K-NEAREST NEIGHBORS - Testing Accuracy: 0.8226392846614916
K-NEAREST NEIGHBORS - Training F1 Score: 1.0
K-NEAREST NEIGHBORS - Testing F1 Score: 0.16551086453999075


**SUPPORT VECTOR MACHINE (SVM)**

In [21]:
from sklearn.svm import LinearSVC

# Initialize the SVM model
svm_model = LinearSVC(random_state=42, max_iter=1000)

# Train the model
svm_model.fit(X_train, y_train)

# Predictions
y_pred_train_svm = svm_model.predict(X_train)
y_pred_test_svm = svm_model.predict(X_test)

# Evaluate the model
accuracy_train_svm = accuracy_score(y_train, y_pred_train_svm)
accuracy_test_svm = accuracy_score(y_test, y_pred_test_svm)
f1_train_svm = f1_score(y_train, y_pred_train_svm)
f1_test_svm = f1_score(y_test, y_pred_test_svm)

print("SVM - Training Accuracy:", accuracy_train_dt)
print("SVM - Testing Accuracy:", accuracy_test_dt)
print("SVM - Training F1 Score:", f1_train_dt)
print("SVM - Testing F1 Score:", f1_test_dt)

SVM - Training Accuracy: 1.0
SVM - Testing Accuracy: 0.8226392846614916
SVM - Training F1 Score: 1.0
SVM - Testing F1 Score: 0.16551086453999075


**NAIVE BAYES**

In [22]:
from sklearn.naive_bayes import GaussianNB

# Initialize the Naive Bayes model
nb_model = GaussianNB()

# Since Naive Bayes doesn't handle sparse data natively, convert to dense
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Train the model
nb_model.fit(X_train_dense, y_train)

# Predictions
y_pred_train_nb = nb_model.predict(X_train_dense)
y_pred_test_nb = nb_model.predict(X_test_dense)

# Evaluate the model
accuracy_train_nb = accuracy_score(y_train, y_pred_train_nb)
accuracy_test_nb = accuracy_score(y_test, y_pred_test_nb)
f1_train_nb = f1_score(y_train, y_pred_train_nb)
f1_test_nb = f1_score(y_test, y_pred_test_nb)

print("NAIVE BAYES - Training Accuracy:", accuracy_train_dt)
print("NAIVE BAYES - Testing Accuracy:", accuracy_test_dt)
print("NAIVE BAYES - Training F1 Score:", f1_train_dt)
print("NAIVE BAYES - Testing F1 Score:", f1_test_dt)

NAIVE BAYES - Training Accuracy: 1.0
NAIVE BAYES - Testing Accuracy: 0.8226392846614916
NAIVE BAYES - Training F1 Score: 1.0
NAIVE BAYES - Testing F1 Score: 0.16551086453999075


## **B. Ensemble Learning Algorithms**

**RANDOM FOREST**

In [23]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predictions
y_pred_train_rf = rf_model.predict(X_train)
y_pred_test_rf = rf_model.predict(X_test)

# Evaluate the model
accuracy_train_rf = accuracy_score(y_train, y_pred_train_rf)
accuracy_test_rf = accuracy_score(y_test, y_pred_test_rf)
f1_train_rf = f1_score(y_train, y_pred_train_rf)
f1_test_rf = f1_score(y_test, y_pred_test_rf)

print("Random Forest - Training Accuracy:", accuracy_train_rf)
print("Random Forest - Testing Accuracy:", accuracy_test_rf)
print("Random Forest - Training F1 Score:", f1_train_rf)
print("Random Forest - Testing F1 Score:", f1_test_rf)


Random Forest - Training Accuracy: 0.9999877167985064
Random Forest - Testing Accuracy: 0.8886214011987815
Random Forest - Training F1 Score: 0.9999449672555171
Random Forest - Testing F1 Score: 0.012200435729847494


**GRADIENT BOOSTING**

In [24]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize the Gradient Boosting model
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# Train the model
gb_model.fit(X_train, y_train)

# Predictions
y_pred_train_gb = gb_model.predict(X_train)
y_pred_test_gb = gb_model.predict(X_test)

# Evaluate the model
accuracy_train_gb = accuracy_score(y_train, y_pred_train_gb)
accuracy_test_gb = accuracy_score(y_test, y_pred_test_gb)
f1_train_gb = f1_score(y_train, y_pred_train_gb)
f1_test_gb = f1_score(y_test, y_pred_test_gb)

print("GRADIENT BOOSTING - Training Accuracy:", accuracy_train_rf)
print("GRADIENT BOOSTING - Testing Accuracy:", accuracy_test_rf)
print("GRADIENT BOOSTING - Training F1 Score:", f1_train_rf)
print("GRADIENT BOOSTING - Testing F1 Score:", f1_test_rf)


GRADIENT BOOSTING - Training Accuracy: 0.9999877167985064
GRADIENT BOOSTING - Testing Accuracy: 0.8886214011987815
GRADIENT BOOSTING - Training F1 Score: 0.9999449672555171
GRADIENT BOOSTING - Testing F1 Score: 0.012200435729847494


**ADABOOST**

In [25]:
from sklearn.ensemble import AdaBoostClassifier

# Initialize the AdaBoost model
ab_model = AdaBoostClassifier(n_estimators=100, random_state=42)

# Train the model
ab_model.fit(X_train, y_train)

# Predictions
y_pred_train_ab = ab_model.predict(X_train)
y_pred_test_ab = ab_model.predict(X_test)

# Evaluate the model
accuracy_train_ab = accuracy_score(y_train, y_pred_train_ab)
accuracy_test_ab = accuracy_score(y_test, y_pred_test_ab)
f1_train_ab = f1_score(y_train, y_pred_train_ab)
f1_test_ab = f1_score(y_test, y_pred_test_ab)

print("ADABOOST - Training Accuracy:", accuracy_train_rf)
print("ADABOOST - Testing Accuracy:", accuracy_test_rf)
print("ADABOOST - Training F1 Score:", f1_train_rf)
print("ADABOOST - Testing F1 Score:", f1_test_rf)

ADABOOST - Training Accuracy: 0.9999877167985064
ADABOOST - Testing Accuracy: 0.8886214011987815
ADABOOST - Training F1 Score: 0.9999449672555171
ADABOOST - Testing F1 Score: 0.012200435729847494
