In [445]:
import pickle
import re
from warnings import filterwarnings
filterwarnings(action='ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
sns.set_style('darkgrid')
import plotly.express as px

import scipy.stats as stats
from scipy.stats import zscore

from sklearn.preprocessing import LabelEncoder,OneHotEncoder,OrdinalEncoder
from sklearn.preprocessing import PolynomialFeatures,PowerTransformer,StandardScaler

from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector

from sklearn.metrics import classification_report,accuracy_score,precision_score,confusion_matrix
from sklearn.metrics import recall_score,f1_score,balanced_accuracy_score
from sklearn.metrics import precision_recall_curve

from sklearn.model_selection import StratifiedKFold,cross_val_score,train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import GaussianNB

from sklearn.impute import KNNImputer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,VotingClassifier

from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier

from sklearn.feature_selection import RFE,SelectKBest,SequentialFeatureSelector,chi2

from imblearn.over_sampling import SMOTE,SMOTEN,SMOTENC
from imblearn.over_sampling import KMeansSMOTE

from xgboost import XGBClassifier

In [377]:
# Location of the imputed, still class-imbalanced dataset.
# Raw string: a Windows path in a normal literal relies on invalid escape
# sequences (\G, \P, \C, ...) which Python now warns about; the value is unchanged.
link_imb = r'F:\GREAT LAKES\PROJECT\CAPSTONE\Iterative_Imputatiof_diagnosis\Fully_finally_imputed_data.csv'

In [378]:
# link_over = 'F:\GREAT LAKES\PROJECT\CAPSTONE\MODELING\Logistic regression\Final_data_for_modeling\Over_sampled_data(SMOTENC)_for_logistic_regression\SMOTENC_logistic_regression_reduced_size.csv'

In [379]:
# Read the CSV and discard its first column in a single expression, so the
# cell can be re-run without slicing an already-sliced frame.
# NOTE(review): the first column is presumably a saved index - confirm.
df_imb = pd.read_csv(link_imb).iloc[:, 1:]

In [380]:
# df_over = pd.read_csv(link_over)
# df_over = df_over.iloc[:,1:]

# predictors_balanced = df_over.drop('readmitted',1)

In [381]:
# Descriptive (non-count) patient / encounter columns.
patient_info = [
    'race', 'gender', 'age',
    'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
    'medical_specialty',
    'diagnosis_1', 'diagnosis_2', 'diagnosis_3',
    'max_glu_serum', 'A1Cresult',
    'change', 'diabetesMed',
]

# Columns judged to contain no useful information.
drop_list_patient_info = [
    'encounter_id', 'patient_nbr', 'weight', 'payer_code',
]

# Numeric columns.
patient_info_numeric = [
    'time_in_hospital', 'num_lab_procedures', 'num_procedures',
    'num_medications', 'number_outpatient', 'number_emergency',
    'number_inpatient', 'number_diagnoses',
]

# Medicine columns initially kept (16).
feature_medicine = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'glipizide', 'glyburide', 'tolbutamide',
    'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol',
    'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
]

# Medicine columns initially dropped (7).
drop_list_medicine = [
    'acetohexamide', 'troglitazone', 'examide', 'citoglipton',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone',
    'metformin-pioglitazone',
]

# Encoding groups ----------------------------------------------------
# Medicines whose only categories are 'Steady' and 'No'.
two_category_medicine = [
    'acetohexamide', 'tolbutamide', 'troglitazone', 'glipizide-metformin',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone',
    'metformin-pioglitazone',
]

# Medicines with the categories 'Steady', 'Up', 'Down' and 'No' (14).
four_category_medicine = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'glipizide', 'glyburide', 'pioglitazone',
    'rosiglitazone', 'acarbose', 'miglitol', 'tolazamide',
    'insulin', 'glyburide-metformin',
]

# The two 'Steady'/'No' medicines that are retained for encoding.
two_category_medicine_to_be_kept = ['tolbutamide', 'glipizide-metformin']

# Of the initially dropped medicines, these two must always be removed.
single_category_medicine = ['examide', 'citoglipton']

# Show which dropped medicines are also two-category medicines.
print(set(drop_list_medicine) & set(two_category_medicine))

# Name of the target column.
dependent = 'readmitted'

{'metformin-rosiglitazone', 'acetohexamide', 'glimepiride-pioglitazone', 'metformin-pioglitazone', 'troglitazone'}


In [382]:
# Independent copy of the numeric columns; it is extended with engineered
# features in a later cell, so it must not be a view of df_imb.
df_numeric = df_imb.loc[:, patient_info_numeric].copy()

# Target kept as a one-column DataFrame (double brackets), not a Series.
Target_imbalanced = df_imb.loc[:, ['readmitted']]

In [383]:
# Row-wise total of the three visit-count columns.
# skipna=False keeps the NaN propagation of the `a + b + c` form.
visit_count_cols = ['number_outpatient', 'number_emergency', 'number_inpatient']
hospital_data = df_imb[visit_count_cols].sum(axis=1, skipna=False)
health_index = hospital_data.copy()

# Row-wise total of stay length, procedure/medication counts and diagnoses.
severity_cols = ['time_in_hospital', 'num_lab_procedures', 'num_procedures',
                 'num_medications', 'number_diagnoses']
severity = df_imb[severity_cols].sum(axis=1, skipna=False)
severity_of_disease = severity.copy()

In [384]:
# Display the column labels of df_numeric (bare last expression, so the
# notebook renders it as the cell output).
df_numeric.columns

Index(['time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses'],
      dtype='object')

In [385]:
# Attach the two engineered totals to the numeric frame.
df_numeric['Health_index'] = health_index
df_numeric['severity_of_disease'] = severity_of_disease

# Fit the scaler on every numeric column, then rebuild a labelled DataFrame
# from the scaled array (the transformer itself returns a bare ndarray).
# NOTE(review): the scaler is fit on the full dataset, i.e. before any
# train/test split - confirm this is intended.
std = StandardScaler()
std.fit(df_numeric)
df_numeric_std = pd.DataFrame(std.transform(df_numeric), columns=df_numeric.columns)

In [386]:
# Exporting (pickling) the fitted standard scaler

In [387]:
# Persist the fitted StandardScaler so the same scaling can be re-applied later.
# Raw string: the backslashes of a Windows path must not be read as escape
# sequences (\G, \M, \s, ... are invalid escapes and trigger warnings).
# `pickle` comes from the notebook's top import cell.
link_scale = r'F:\GREAT LAKES\PROJECT\CAPSTONE\MODELING\Logistic regression\Final_data_for_modeling\Logistic regression_numeric_model\scalar.pickle'
with open(link_scale,'wb') as sc:
    pickle.dump(std,sc)

In [397]:
from imblearn.over_sampling import KMeansSMOTE

In [364]:
# Resample the scaled data with KMeansSMOTE, then hold out 25% as a test set.
# NOTE(review): resampling is done BEFORE the train/test split, so synthetic
# rows can end up in the test set and the reports below are likely optimistic.
# Consider splitting first and resampling only the training part - confirm.
over_sample = KMeansSMOTE(
    sampling_strategy=0.99,
    k_neighbors=12,
    cluster_balance_threshold=0.134,
    random_state=55,
)
predictor_balanced, Target_balanced = over_sample.fit_resample(
    df_numeric_std, Target_imbalanced
)
Xb_train, Xb_test, yb_train, yb_test = train_test_split(
    predictor_balanced,
    Target_balanced,
    random_state=42,
    test_size=0.25,
)

In [365]:
# link_kmeans_oversampled = 'F:\GREAT LAKES\PROJECT\CAPSTONE\MODELING\Logistic regression\Final_data_for_modeling\Logistic regression_numeric_model\Kmeans_numeric_oversampled.csv'
# df_numeric_oversample = pd.concat([predictor_balanced,Target_balanced],axis=1)
# df_numeric_oversample.to_csv(link_kmeans_oversampled)

## `Logistic Regression`

In [399]:
# Logistic regression with library defaults apart from the fixed seed,
# trained on the balanced training split.
logistic_numeric = LogisticRegression(random_state=93)
logistic_numeric.fit(X=Xb_train, y=yb_train)

In [400]:
# Persist the fitted logistic-regression model.
# `pickle` comes from the notebook's top import cell, so the repeated
# in-cell import is dropped.
link_logistic = r'F:\GREAT LAKES\PROJECT\CAPSTONE\MODELING\Logistic regression\Final_data_for_modeling\Logistic regression_numeric_model\Numeric_model.pickle'
with open(link_logistic,'wb') as log:
    pickle.dump(logistic_numeric,log)

In [428]:
#logistic_numeric.predict(std.transform(df_numeric.iloc[5].values.reshape(1,-1)))[0]

In [402]:
# Training-split report for the logistic model.
# Uses the objects this notebook actually defines (logistic_numeric, Xb_train,
# yb_train); Log_reg / X_train / y_train are not defined anywhere in the
# visible notebook and raise NameError on a fresh kernel.
print(classification_report(yb_train,logistic_numeric.predict(Xb_train)))

              precision    recall  f1-score   support

          No       0.90      0.91      0.90     67431
         Yes       0.90      0.89      0.90     66403

    accuracy                           0.90    133834
   macro avg       0.90      0.90      0.90    133834
weighted avg       0.90      0.90      0.90    133834



In [429]:
# Test-split report for the logistic model.
# Uses the objects this notebook actually defines (logistic_numeric, Xb_test,
# yb_test); Log_reg / X_test / y_test are not defined anywhere in the
# visible notebook and raise NameError on a fresh kernel.
print(classification_report(yb_test,logistic_numeric.predict(Xb_test)))

              precision    recall  f1-score   support

          No       0.89      0.91      0.90     22240
         Yes       0.90      0.89      0.90     22372

    accuracy                           0.90     44612
   macro avg       0.90      0.90      0.90     44612
weighted avg       0.90      0.90      0.90     44612



## `Support vector machine`

In [451]:
# Support-vector classifier with library defaults apart from the fixed seed,
# trained on the balanced training split.
support_vector = SVC(random_state=93)
support_vector.fit(X=Xb_train, y=yb_train)

In [454]:
# Persist the fitted support-vector classifier.
# Raw string: the backslashes of a Windows path must not be read as escape
# sequences (\G, \S, ... are invalid escapes and trigger warnings).
# `pickle` comes from the notebook's top import cell.
svc_link = r'F:\GREAT LAKES\PROJECT\CAPSTONE\MODELING\Support_vector_machine\SVC.pickle'
with open(svc_link,'wb') as svc:
    pickle.dump(support_vector,svc)

In [455]:
# Single-row check: take the unscaled row at position 5, reshape it to
# (1, n_features), scale it with the fitted scaler and predict its class.
raw_row = df_numeric.iloc[5].values.reshape(1, -1)
scaled_row = std.transform(raw_row)
support_vector.predict(scaled_row)

array(['No'], dtype=object)

In [462]:
# Report for the SVC on the first 5000 rows of the test split only.
# NOTE(review): presumably limited to 5000 rows to keep SVC prediction time
# manageable - confirm.
Xb_test_head = Xb_test.head(5000)
yb_test_head = yb_test.head(5000)
print(classification_report(yb_test_head, support_vector.predict(Xb_test_head)))

              precision    recall  f1-score   support

          No       0.91      0.91      0.91      2562
         Yes       0.91      0.90      0.90      2438

    accuracy                           0.91      5000
   macro avg       0.91      0.91      0.91      5000
weighted avg       0.91      0.91      0.91      5000



## `K-Nearest-Neighbour`

In [442]:
# 9-nearest-neighbours classifier (p=2, all cores), trained on the balanced
# training split.
knn = KNeighborsClassifier(
    n_neighbors=9,
    p=2,
    n_jobs=-1,
)
knn.fit(X=Xb_train, y=yb_train)

In [443]:
# Training-split report for KNN.
# Uses Xb_train / yb_train, the split this notebook actually creates;
# X_train / y_train are not defined anywhere in the visible notebook and
# raise NameError on a fresh kernel.
print(classification_report(yb_train,knn.predict(Xb_train)))

              precision    recall  f1-score   support

          No       0.90      0.94      0.92     67431
         Yes       0.93      0.90      0.92     66403

    accuracy                           0.92    133834
   macro avg       0.92      0.92      0.92    133834
weighted avg       0.92      0.92      0.92    133834



In [444]:
# Test-split report for KNN.
# Uses Xb_test / yb_test, the split this notebook actually creates;
# X_test / y_test are not defined anywhere in the visible notebook and
# raise NameError on a fresh kernel.
print(classification_report(yb_test,knn.predict(Xb_test)))

              precision    recall  f1-score   support

          No       0.90      0.93      0.91     22240
         Yes       0.93      0.90      0.91     22372

    accuracy                           0.91     44612
   macro avg       0.91      0.91      0.91     44612
weighted avg       0.91      0.91      0.91     44612



## `Naive Bayes`

In [447]:
# Gaussian naive Bayes with default settings, trained on the balanced
# training split.
Naive = GaussianNB()
Naive.fit(X=Xb_train, y=yb_train)

In [449]:
# Training-split report for naive Bayes.
# Uses Xb_train / yb_train, the split this notebook actually creates;
# X_train / y_train are not defined anywhere in the visible notebook and
# raise NameError on a fresh kernel.
print(classification_report(yb_train,Naive.predict(Xb_train)))

              precision    recall  f1-score   support

          No       0.79      0.94      0.86     67431
         Yes       0.92      0.75      0.82     66403

    accuracy                           0.84    133834
   macro avg       0.86      0.84      0.84    133834
weighted avg       0.85      0.84      0.84    133834



In [450]:
# Test-split report for naive Bayes.
# Uses Xb_test / yb_test, the split this notebook actually creates;
# X_test / y_test are not defined anywhere in the visible notebook and
# raise NameError on a fresh kernel.
print(classification_report(yb_test,Naive.predict(Xb_test)))

              precision    recall  f1-score   support

          No       0.78      0.94      0.85     22240
         Yes       0.92      0.74      0.82     22372

    accuracy                           0.84     44612
   macro avg       0.85      0.84      0.83     44612
weighted avg       0.85      0.84      0.83     44612



## `Random Forest`

In [425]:
# Random forest trained on the balanced training split. Each tree sees 85%
# of the rows (max_samples) and all features (max_features=1.0).
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=40,
    min_samples_split=40,
    max_features=1.0,
    max_samples=0.85,
    n_jobs=-1,
    random_state=93,
)
rf.fit(X=Xb_train, y=yb_train)

In [426]:
# Training-split report for the random forest.
# Uses Xb_train / yb_train, the split this notebook actually creates;
# X_train / y_train are not defined anywhere in the visible notebook and
# raise NameError on a fresh kernel.
print(classification_report(yb_train,rf.predict(Xb_train)))

              precision    recall  f1-score   support

          No       0.90      0.98      0.94     67431
         Yes       0.97      0.89      0.93     66403

    accuracy                           0.93    133834
   macro avg       0.94      0.93      0.93    133834
weighted avg       0.94      0.93      0.93    133834



In [427]:
# Test-split report for the random forest.
# Uses Xb_test / yb_test, the split this notebook actually creates;
# X_test / y_test are not defined anywhere in the visible notebook and
# raise NameError on a fresh kernel.
print(classification_report(yb_test,rf.predict(Xb_test)))

              precision    recall  f1-score   support

          No       0.89      0.96      0.93     22240
         Yes       0.96      0.89      0.92     22372

    accuracy                           0.92     44612
   macro avg       0.93      0.92      0.92     44612
weighted avg       0.93      0.92      0.92     44612



## `Gradient Boosting Machine`

In [413]:
# Gradient-boosting classifier (100 trees of depth 4, learning rate 1.0),
# trained on the balanced training split.
Gb_model = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=4,
    learning_rate=1.0,
    random_state=93,
)
Gb_model.fit(X=Xb_train, y=yb_train)

In [430]:
# Training-split report for the gradient-boosting model.
gb_train_pred = Gb_model.predict(Xb_train)
print(classification_report(yb_train, gb_train_pred))

              precision    recall  f1-score   support

          No       0.90      0.99      0.94     67431
         Yes       0.99      0.88      0.93     66403

    accuracy                           0.94    133834
   macro avg       0.94      0.94      0.94    133834
weighted avg       0.94      0.94      0.94    133834



In [431]:
# Test-split report for the gradient-boosting model.
gb_test_pred = Gb_model.predict(Xb_test)
print(classification_report(yb_test, gb_test_pred))

              precision    recall  f1-score   support

          No       0.89      0.98      0.93     22240
         Yes       0.98      0.88      0.93     22372

    accuracy                           0.93     44612
   macro avg       0.93      0.93      0.93     44612
weighted avg       0.93      0.93      0.93     44612

