In [1]:
from warnings import filterwarnings
filterwarnings(action='ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import plotly.express as px

import scipy.stats as stats
from scipy.stats import zscore

from sklearn.preprocessing import LabelEncoder,OneHotEncoder,OrdinalEncoder
from sklearn.preprocessing import PolynomialFeatures,PowerTransformer,StandardScaler

from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector

from sklearn.metrics import classification_report,accuracy_score,precision_score,confusion_matrix
from sklearn.metrics import recall_score,f1_score,balanced_accuracy_score
from sklearn.metrics import precision_recall_curve

from sklearn.model_selection import StratifiedKFold,cross_val_score,train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB,ComplementNB,CategoricalNB

from sklearn.impute import KNNImputer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,VotingClassifier

from imblearn.over_sampling import SMOTE,SMOTEN,SMOTENC

import re

from sklearn.feature_selection import SelectKBest,VarianceThreshold

from fancyimpute import IterativeImputer,knn,IterativeSVD

In [2]:
# Location of the two ordinal-encoded CSV exports.
# Raw strings keep the Windows backslashes literal; in a normal string
# sequences such as "\G" or "\P" are invalid escapes and trigger a warning.
DATA_DIR = r'F:\GREAT LAKES\PROJECT\CAPSTONE\SMOTENC_over_sampling_for_ decision tree (Ordinal)'
link = rf'{DATA_DIR}\Ordinal_encoded_imbalanced.csv'
link_nc = rf'{DATA_DIR}\Ordinal_encoded_smotenc.csv'

In [3]:
# Load both datasets and drop the first column of each file
# (presumably a saved row index -- TODO confirm against the CSVs).
df = pd.read_csv(link).iloc[:, 1:]

df_nc = pd.read_csv(link_nc).iloc[:, 1:]

In [4]:
# (rows, columns) of the imbalanced frame after dropping the first CSV column.
df.shape

(100985, 39)

In [6]:
def split(predictor,Target,testing_size=0.25,stratify=True):
    """Split predictors and target into train/test parts with a fixed seed.

    Parameters
    ----------
    predictor : features to split.
    Target : labels to split; also used as the stratification key when
        ``stratify`` is truthy.
    testing_size : fraction passed to ``train_test_split`` as ``test_size``.
    stratify : when truthy the split is stratified on ``Target``; otherwise
        no stratification is requested.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # One call covers both modes: ``stratify=None`` means "do not stratify".
    strata = Target if stratify else None
    train_X, test_X, train_y, test_y = train_test_split(
        predictor, Target, test_size=testing_size, stratify=strata, random_state=93
    )
    return train_X, test_X, train_y, test_y
# Stratified 75/25 split of the imbalanced data; 'readmitted' is the target.
X_train, X_test, y_train, y_test = split(
    predictor=df.drop(columns=['readmitted']),
    Target=df[['readmitted']],
)

In [7]:
def split_nc(predictor,Target,testing_size=0.25):
    """Split predictors and target without stratification, using a fixed seed.

    Parameters
    ----------
    predictor : features to split.
    Target : labels to split.
    testing_size : fraction passed to ``train_test_split`` as ``test_size``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    train_X, test_X, train_y, test_y = train_test_split(
        predictor, Target, test_size=testing_size, random_state=93
    )
    return train_X, test_X, train_y, test_y
# Split the SMOTENC-oversampled data. This calls `split` (stratified by
# default), not the `split_nc` helper defined in this cell.
# NOTE(review): confirm that is intended. A later cell reassigns these four
# names using z-scored predictors.
Xb_train, Xb_test, yb_train, yb_test = split(df_nc.drop(columns=['readmitted']),df_nc[['readmitted']])

In [8]:
def cv_report_nc(Model,Training_data,Target_imbalanced):
    """Print cross-validated balanced accuracy, precision, recall and F1.

    The supplied data is first split 75/25 (``random_state=42``); only the
    75 % part is cross-validated and the remaining 25 % is not used here.
    All four metrics come from one ``cross_validate`` run over the same
    ``StratifiedKFold()`` folds, so each fold's model is fitted once rather
    than once per metric.

    Parameters
    ----------
    Model : estimator handed unchanged to scikit-learn's cross-validation.
    Training_data : feature matrix.
    Target_imbalanced : labels aligned with ``Training_data``.

    Returns
    -------
    None
        The scores are printed as percentages.
    """
    # Local import: the notebook's import cell does not bring in cross_validate.
    from sklearn.model_selection import cross_validate

    # Keep the original inner hold-out so reported numbers stay comparable.
    X_cv, _, y_cv, _ = train_test_split(Training_data,Target_imbalanced,
                                        test_size=0.25, random_state=42)
    scores = cross_validate(Model,X_cv,y_cv,cv=StratifiedKFold(),
                            scoring=('balanced_accuracy','precision','recall','f1'))
    ba = scores['test_balanced_accuracy'].mean()
    pr = scores['test_precision'].mean()
    rc = scores['test_recall'].mean()  # not named `re`: that would shadow the imported module
    f1 = scores['test_f1'].mean()
    print("Metric            :  percentage")
    print("-"*35)
    print(f"Balanced accuracy :  {ba*100:0.2f} %")
    print(f"Precision         :  {pr*100:0.2f} %")
    print(f"recall            :  {rc*100:0.2f} %")
    print(f"F1-score          :  {f1*100:0.2f} %")

In [8]:
# (rows, columns) of the training predictors from the imbalanced split.
X_train.shape

(75738, 38)

In [9]:
# Baseline 5-nearest-neighbour classifier for the original (imbalanced) split.
# The predictors of this split are not z-scored, unlike the SMOTENC runs later on.
knn_model = KNeighborsClassifier(n_neighbors=5)

In [10]:
# Fit the baseline model on the imbalanced training split.
knn_model.fit(X=X_train, y=y_train)

In [12]:
# In-sample report for the baseline model.
print(classification_report(y_true=y_train, y_pred=knn_model.predict(X=X_train)))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94     67253
           1       0.66      0.09      0.16      8485

    accuracy                           0.89     75738
   macro avg       0.78      0.54      0.55     75738
weighted avg       0.87      0.89      0.86     75738



In [13]:
# Hold-out report for the baseline model.
print(classification_report(y_true=y_test, y_pred=knn_model.predict(X=X_test)))

              precision    recall  f1-score   support

           0       0.89      0.99      0.94     22418
           1       0.22      0.03      0.05      2829

    accuracy                           0.88     25247
   macro avg       0.55      0.51      0.49     25247
weighted avg       0.81      0.88      0.84     25247



## Fitting model on SMOTENC oversampled data

In [9]:
# Stratified split of the SMOTENC data with every predictor column z-scored.
# The z-score is computed on the full frame before splitting, so test rows
# contribute to the mean/std used for the training rows.
Xb_train, Xb_test, yb_train, yb_test = split(
    predictor=df_nc.drop(columns=['readmitted']).apply(zscore),
    Target=df_nc[['readmitted']],
)

In [31]:
# 5-NN on the z-scored SMOTENC training split.
knn_model_nc = KNeighborsClassifier(n_neighbors=5)
knn_model_nc.fit(X=Xb_train, y=yb_train)

In [32]:
# In-sample report for the SMOTENC model.
print(classification_report(y_true=yb_train, y_pred=knn_model_nc.predict(X=Xb_train)))

              precision    recall  f1-score   support

           0       0.91      0.82      0.86     67253
           1       0.84      0.92      0.87     67253

    accuracy                           0.87    134506
   macro avg       0.87      0.87      0.87    134506
weighted avg       0.87      0.87      0.87    134506



In [33]:
# Hold-out report for the SMOTENC model.
print(classification_report(y_true=yb_test, y_pred=knn_model_nc.predict(X=Xb_test)))

              precision    recall  f1-score   support

           0       0.87      0.75      0.80     22418
           1       0.78      0.89      0.83     22418

    accuracy                           0.82     44836
   macro avg       0.82      0.82      0.82     44836
weighted avg       0.82      0.82      0.82     44836



In [35]:
# Cross-validated metrics for the SMOTENC model on its training split.
cv_report_nc(Model=knn_model_nc, Training_data=Xb_train, Target_imbalanced=yb_train)

Metric            :  percentage
-----------------------------------
Balanced accuracy :  80.25 %
Precision         :  76.38 %
recall            :  87.51 %
F1-score          :  81.57 %


## `Feature Engineering`

In [36]:
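# Health_index (concept) = 1 / (number_emergency + number_inpatient + number_outpatient)
#   Note: the code below keeps the raw visit sum rather than its inverse,
#   presumably because some patients have no visits (the inverse would be undefined).
# severity_of_disease = time_in_hospital (in days) + num_procedures + num_medications + num_lab_procedures + number_diagnoses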

In [38]:
# If a patient visits the hospital frequently,
# we can say that the patient is less healthy, and
# less healthy patients tend to be readmitted sooner.
# The higher the health index (as the inverse of the visit count), the lower the chance of readmission (inversely proportional).

In [13]:
# Total prior visits per row: outpatient + emergency + inpatient counts.
# The result is an unnamed Series because the three summed columns have different names.
hospital_data = (df_nc['number_outpatient'] + df_nc['number_emergency'] + df_nc['number_inpatient'])
# The "health index" used from here on is this raw visit sum, not its inverse.
health_index = hospital_data

# Some patients have no visits at all, so the sum can be zero.

In [14]:
# Severity of disease is a feature built from the time spent in hospital,
# the number of lab procedures, procedures and medications, and the number of diagnoses.
# NOTE(review): the earlier wording said the sum was divided by the total for a
# probabilistic interpretation; no division is performed here -- the raw sum is used.

severity = (df_nc['time_in_hospital'] + df_nc['num_lab_procedures'] + df_nc['num_procedures'] + df_nc['num_medications'] + df_nc['number_diagnoses'])

# Alias kept under the feature's descriptive name.
severity_of_disease = severity

In [15]:
# Wrap each engineered Series in a one-column DataFrame for concatenation.
# to_frame(name=...) sets the column label explicitly. The previous
# pd.DataFrame(series, columns=[...]) form selects by label and would yield
# an all-NaN column if the Series ever carried a different name.
# NOTE: despite the `_std` suffix, nothing is standardised in this cell.
health_index_std = health_index.to_frame(name='Health_index')
severity_of_disease_std = severity_of_disease.to_frame(name='Severity_of_disease')

In [16]:
# Append the two engineered columns ('Health_index', 'Severity_of_disease')
# to the SMOTENC-resampled predictors; the target stays a one-column frame.

engineered_parts = [df_nc.drop(columns=['readmitted']), health_index_std, severity_of_disease_std]
predictor_nc_f = pd.concat(engineered_parts, axis=1)
Target_nc_f = df_nc[['readmitted']]

In [17]:
# Non-stratified 75/25 split of the z-scored predictors (engineered columns included).
# The z-score is computed on the full frame before splitting.
Xbf_train, Xbf_test, ybf_train, ybf_test = split_nc(
    predictor=predictor_nc_f.apply(zscore),
    Target=Target_nc_f,
    testing_size=0.25,
)

In [44]:
# 5-NN on the SMOTENC data with the two engineered features added.
knn_model_ncf = KNeighborsClassifier(n_neighbors=5)
knn_model_ncf.fit(X=Xbf_train, y=ybf_train)

In [45]:
# In-sample report for the feature-engineered model.
print(classification_report(y_true=ybf_train, y_pred=knn_model_ncf.predict(X=Xbf_train)))

              precision    recall  f1-score   support

           0       0.91      0.82      0.86     67220
           1       0.83      0.92      0.87     67286

    accuracy                           0.87    134506
   macro avg       0.87      0.87      0.87    134506
weighted avg       0.87      0.87      0.87    134506



In [46]:
# Hold-out report for the feature-engineered model.
print(classification_report(y_true=ybf_test, y_pred=knn_model_ncf.predict(X=Xbf_test)))

              precision    recall  f1-score   support

           0       0.88      0.75      0.81     22451
           1       0.78      0.89      0.84     22385

    accuracy                           0.82     44836
   macro avg       0.83      0.82      0.82     44836
weighted avg       0.83      0.82      0.82     44836



In [47]:
# Cross-validated metrics for the feature-engineered 5-NN model.
cv_report_nc(Model=knn_model_ncf, Training_data=Xbf_train, Target_imbalanced=ybf_train)

Metric            :  percentage
-----------------------------------
Balanced accuracy :  80.26 %
Precision         :  76.27 %
recall            :  87.80 %
F1-score          :  81.63 %


In [17]:
# Variant 1: 7 neighbours with distance-weighted voting.
knn_model_1 = KNeighborsClassifier(n_neighbors=7, weights='distance')
knn_model_1.fit(X=Xbf_train, y=ybf_train)

In [18]:
# Cross-validated metrics for variant 1.
cv_report_nc(Model=knn_model_1, Training_data=Xbf_train, Target_imbalanced=ybf_train)

Metric            :  percentage
-----------------------------------
Balanced accuracy :  80.77 %
Precision         :  76.37 %
recall            :  89.06 %
F1-score          :  82.23 %


In [19]:
# Variant 2: 7 neighbours with the default weighting.
knn_model_2 = KNeighborsClassifier(n_neighbors=7)
knn_model_2.fit(X=Xbf_train, y=ybf_train)

In [20]:
# Cross-validated metrics for variant 2.
cv_report_nc(Model=knn_model_2, Training_data=Xbf_train, Target_imbalanced=ybf_train)

Metric            :  percentage
-----------------------------------
Balanced accuracy :  79.98 %
Precision         :  76.01 %
recall            :  87.55 %
F1-score          :  81.37 %


In [18]:
# Variant 3: 5 neighbours with Minkowski power parameter p=1.
knn_model_3 = KNeighborsClassifier(n_neighbors=5, p=1)
knn_model_3.fit(X=Xbf_train, y=ybf_train)

In [19]:
# Cross-validated metrics for variant 3.
cv_report_nc(Model=knn_model_3, Training_data=Xbf_train, Target_imbalanced=ybf_train)

Metric            :  percentage
-----------------------------------
Balanced accuracy :  80.90 %
Precision         :  75.37 %
recall            :  91.75 %
F1-score          :  82.76 %
