In [2]:
from warnings import filterwarnings
filterwarnings(action='ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import plotly.express as px

import scipy.stats as stats
from scipy.stats import zscore

from sklearn.metrics import classification_report,accuracy_score,precision_score,confusion_matrix
from sklearn.metrics import recall_score,f1_score,balanced_accuracy_score,roc_curve
from sklearn.metrics import precision_recall_curve,log_loss,cohen_kappa_score,roc_auc_score

from sklearn.model_selection import StratifiedKFold,cross_val_score,train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,VotingClassifier

import re

from sklearn.feature_selection import SelectKBest,VarianceThreshold

from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier

In [3]:
from User_defined_functions import cv_report

In [4]:
# Location of the SMOTE-NC balanced, dummy-encoded CSV.
# Raw string: the Windows path is full of backslashes, and sequences such as
# "\G" or "\P" are invalid escapes in a normal string literal (SyntaxWarning on
# recent Python versions). The resulting value is unchanged.
# NOTE(review): this is an absolute, machine-specific path - consider a configurable data dir.
link_nc_rf = r'F:\GREAT LAKES\PROJECT\CAPSTONE\SMOTE_NC_Decision_Tree&_Random_Forest_data\SMOTENC_Dummy_encoded_data_reduced_size.csv'

In [5]:
# Read the CSV and discard its first column in one step.
# NOTE(review): the first column is presumably a saved row index - confirm.
df = pd.read_csv(link_nc_rf).iloc[:, 1:]

In [6]:
# Work on an independent copy so the loaded frame `df` stays untouched.
df_tree = df.copy()

In [7]:
# Feature matrix: every column except the target.
# `columns=` is used instead of the positional axis argument (`drop(label, 1)`),
# which newer pandas releases no longer accept.
Training_data = df_tree.drop(columns='readmitted')
# Target kept as a single-column DataFrame (double brackets), not a Series.
Target_balanced = df_tree[['readmitted']]

In [8]:
# Categorical patient / encounter descriptors.
patient_info = [
    "race", "gender", "age",
    "admission_type_id", "discharge_disposition_id", "admission_source_id",
    "medical_specialty",
    "diagnosis_1", "diagnosis_2", "diagnosis_3",
    "max_glu_serum", "A1Cresult",
    "change", "diabetesMed",
]

# Columns flagged by the author as carrying no usable information.
drop_list_patient_info = ["encounter_id", "patient_nbr", "weight", "payer_code"]

# Numeric patient / encounter features.
patient_info_numeric = [
    "time_in_hospital", "num_lab_procedures", "num_procedures", "num_medications",
    "number_outpatient", "number_emergency", "number_inpatient", "number_diagnoses",
]

# Medication columns initially chosen to be kept (16 entries).
feature_medicine = [
    "metformin", "repaglinide", "nateglinide", "chlorpropamide",
    "glimepiride", "glipizide", "glyburide", "tolbutamide",
    "pioglitazone", "rosiglitazone", "acarbose", "miglitol",
    "tolazamide", "insulin", "glyburide-metformin", "glipizide-metformin",
]

# Medication columns initially chosen for removal (7 entries).
drop_list_medicine = [
    "acetohexamide", "troglitazone", "examide", "citoglipton",
    "glimepiride-pioglitazone", "metformin-rosiglitazone", "metformin-pioglitazone",
]

# Encoding groups -----------------------------------------------------
# Medicines whose values are only 'Steady' / 'No'.
two_category_medicine = [
    "acetohexamide", "tolbutamide", "troglitazone", "glipizide-metformin",
    "glimepiride-pioglitazone", "metformin-rosiglitazone", "metformin-pioglitazone",
]

# Medicines whose values are 'Steady' / 'Up' / 'Down' / 'No' (14 entries).
four_category_medicine = [
    "metformin", "repaglinide", "nateglinide", "chlorpropamide",
    "glimepiride", "glipizide", "glyburide", "pioglitazone",
    "rosiglitazone", "acarbose", "miglitol", "tolazamide",
    "insulin", "glyburide-metformin",
]

# The two 'Steady' / 'No' medicines that are retained for encoding.
two_category_medicine_to_be_kept = ["tolbutamide", "glipizide-metformin"]

# Of the initial removal list, these two are always removed.
single_category_medicine = ["examide", "citoglipton"]

# Show which two-category medicines are also on the removal list.
print(set(drop_list_medicine) & set(two_category_medicine))

# Name of the target column.
dependent = "readmitted"

{'troglitazone', 'acetohexamide', 'glimepiride-pioglitazone', 'metformin-pioglitazone', 'metformin-rosiglitazone'}


In [9]:
def cv_report(Model,Training_data,Target_imbalanced):
    """Print mean cross-validated scores for ``Model`` on a training split.

    The data is first split 75/25 (stratified on the target, ``random_state=42``);
    only the 75% training portion is used. ``Model`` is then cross-validated on
    that portion with a default ``StratifiedKFold()`` and the mean balanced
    accuracy, precision, recall and F1 are printed.

    Note: this definition shadows the ``cv_report`` imported from
    ``User_defined_functions`` earlier in the notebook.

    Parameters
    ----------
    Model : estimator
        Unfitted scikit-learn compatible classifier.
    Training_data : array-like or DataFrame
        Feature matrix.
    Target_imbalanced : array-like or DataFrame
        Target labels; also used to stratify the split.

    Returns
    -------
    None
        Results are printed only.
    """
    # Local import keeps this cell self-contained; the notebook's import cell
    # does not bring in `cross_validate`.
    from sklearn.model_selection import cross_validate

    # The held-out 25% is intentionally discarded here; only the training
    # portion takes part in cross-validation.
    X_train, _, y_train, _ = train_test_split(Training_data,Target_imbalanced,
                                              test_size=0.25, random_state=42,
                                              stratify=Target_imbalanced)

    # One pass over the folds computes all four metrics, instead of refitting
    # the model separately for every metric.
    scores = cross_validate(Model,X_train,y_train,cv=StratifiedKFold(),
                            scoring=['balanced_accuracy','precision','recall','f1'])

    ba = scores['test_balanced_accuracy'].mean()
    pr = scores['test_precision'].mean()
    rc = scores['test_recall'].mean()  # not named `re`: that would shadow the regex module
    f1 = scores['test_f1'].mean()

    print(f"Balanced accuracy : {ba}")
    print(f"Precision : {pr}")
    print(f"recall : {rc}")
    print(f"F1-score : {f1}")

In [11]:
# 75/25 train/test split with a fixed seed; no `stratify` argument is passed.
# NOTE(review): the variable and file names suggest the data was already
# balanced with SMOTE-NC before this split; if so, synthetic rows derived from
# training rows may end up in the test set - confirm.
X_train, X_test, y_train, y_test = train_test_split(Training_data,Target_balanced, 
                                                    test_size=0.25,random_state=93)

In [12]:
# Baseline XGBoost classifier: 150 trees of depth 4, all other settings left at defaults.
Xg_model = XGBClassifier(n_estimators=150,max_depth=4)

In [13]:
# Map every original column name to an XGBoost-safe name:
#   '>' -> 'more_than_'   and   '[' -> '('
# Both replacements are applied to each name, so a column containing both
# characters is fully cleaned (previously only '>' was replaced in that case,
# leaving a '[' behind). Names with neither character map to themselves.
trans = {column: column.replace('>', 'more_than_').replace('[', '(')
         for column in X_train.columns.to_list()}

In [14]:
# Training features with columns renamed via `trans`; `X_train` itself is not modified.
X_train_xg = X_train.rename(columns=trans)

In [15]:
# Same renaming applied to the test features so both frames share column names.
X_test_xg = X_test.rename(columns=trans)

In [16]:
# Fit the baseline model on all renamed training features.
Xg_model.fit(X_train_xg,y_train)

In [17]:
# In-sample report: predictions are made on the same rows the model was fitted on.
print(classification_report(y_train,Xg_model.predict(X_train_xg)))

              precision    recall  f1-score   support

           0       0.88      0.99      0.93     67220
           1       0.99      0.86      0.92     67286

    accuracy                           0.93    134506
   macro avg       0.94      0.93      0.93    134506
weighted avg       0.94      0.93      0.93    134506



In [18]:
# Keep only the columns to which the fitted model assigned a positive importance.
# The boolean mask is aligned positionally with the training columns.
used_mask = Xg_model.feature_importances_ > 0
XG_features = X_train_xg.columns[used_mask].tolist()

In [None]:
# Number of features with positive importance (this cell has no recorded execution count).
len(XG_features)

In [19]:
# Second model with the same hyper-parameters, trained only on the
# positive-importance features selected from the baseline model.
Xg_model_imp = XGBClassifier(n_estimators=150,max_depth=4)
Xg_model_imp.fit(X_train_xg[XG_features],y_train)

In [20]:
# In-sample report for the reduced-feature model (same rows it was fitted on).
print(classification_report(y_train,Xg_model_imp.predict(X_train_xg[XG_features])))

              precision    recall  f1-score   support

           0       0.88      0.99      0.93     67220
           1       0.99      0.86      0.92     67286

    accuracy                           0.93    134506
   macro avg       0.94      0.93      0.93    134506
weighted avg       0.94      0.93      0.93    134506



In [21]:
# Held-out report for the reduced-feature model on the 25% test split.
# NOTE(review): the recorded scores are almost identical to the training report;
# if the data was oversampled before splitting, this may reflect leakage rather
# than true generalisation - confirm.
print(classification_report(y_test,Xg_model_imp.predict(X_test_xg[XG_features])))

              precision    recall  f1-score   support

           0       0.88      0.99      0.93     22451
           1       0.99      0.86      0.92     22385

    accuracy                           0.92     44836
   macro avg       0.93      0.92      0.92     44836
weighted avg       0.93      0.92      0.92     44836

