In [1]:
import os
import numpy as np
import pandas as pd
import psycopg2
import datetime
import random
import warnings
import joblib

from matplotlib import pyplot as plt
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.utils import shuffle
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost.sklearn import XGBClassifier
from scipy import stats

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings(action = "ignore")
# Pandas display: no limit on the number of columns shown ...
pd.set_option("display.max_columns", None)
# ... and no truncation of the text inside a cell.
pd.set_option("display.max_colwidth", None)

In [2]:
# parameters
# Hyper-parameter grids, each a one-element list of dicts as handed to GridSearchCV.
par_LR = [dict(C = [0.01, 0.1, 1, 10],
               class_weight = ['balanced'],
               max_iter = [5])]
# The Random Forest cell adds a 'class_weight' entry to this grid at run time.
par_RF = [dict(n_estimators = [30, 60],
               max_depth = [30, 60],
               max_leaf_nodes = [30, 60])]
par_ANN = [dict(hidden_layer_sizes = [(50, 50), (100, 100), (150, 150)])]
# The XGBoost cells add a 'scale_pos_weight' entry to the two grids below at run time.
par_XGB = [dict(n_estimators = [30, 60],
                max_depth = [30, 60],
                reg_lambda = [0.1, 1, 10],
                gamma = [0.1, 1, 10])]
par_FIN = [dict(n_estimators = [60],
                max_depth = [30],
                reg_lambda = [10],
                gamma = [0.1])]

# Upper bound on the number of grid searches each training cell may run.
max_count_LR = max_count_RF = max_count_ANN = float("inf")
max_count_XGB = 50
max_count_FIN = 50

# Switches deciding which training cells actually do work.
use_LR = True
use_RF = False
use_ANN = True
use_XGB = True
use_FIN = False

delimsText = 120 * "-"
input_data = "d1_m8.csv"
split_percentage = 0.8
# (feature start, feature stop, outcome start, outcome stop) applied as half-open
# slices [start, stop) to the column list: everything but the last three columns
# are features, the last three columns are outcomes.
train_test_range = (0, -3, -3, None)

In [3]:
class mf:
    """Namespace of static helpers used by this notebook.

    Groups PostgreSQL download, time conversion, outlier / missing-value
    handling, statistical tests and result persistence. Every public member is
    a ``@staticmethod`` and is called as ``mf.<name>(...)``.
    """

    # Characters accepted in a column name passed to ``search_table``.
    _COLUMN_CHARS = frozenset("abcdefghijklmnopqrstuvwxyz0123456789_")
    # str() forms of missing values that ``calculate_time`` maps to "NaN".
    _MISSING_TIME_TEXT = ("nan", "NaT", "None")
    # Characters replaced by "_" in file names written by ``to_csv`` / ``dump``.
    _BAD_FILENAME_CHARS = "\\/:*?\"<>|"

    @staticmethod
    def _is_column_name(token):
        """Return True when ``token`` is a plain SQL column name (letters, digits, "_")."""
        token = token.lower()
        return bool(token) and not token[0].isdigit() and set(token) <= mf._COLUMN_CHARS

    @staticmethod
    def _prepare_path(fp_path):
        """Sanitise the file-name part of ``fp_path`` and create its directory.

        Returns the path to write to. A bare file name (no directory part) is
        allowed and is written to the current working directory.
        """
        dirname, filename = os.path.split(fp_path)
        for i in mf._BAD_FILENAME_CHARS:
            filename = filename.replace(i, "_")
        if dirname:
            os.makedirs(dirname, exist_ok = True)
        return os.path.join(dirname, filename)

    @staticmethod # a function to search and download tables from PSQL
    def search_table(s = '*', f = 'admissions', w = '', like = '', equal = '', download_as = ''):
        """Run ``select <s> from mimic.<f> [where ...]`` and return the rows.

        Parameters
        ----------
        s : "*" or a comma-separated list of column names.
        f : table name inside the ``mimic`` schema.
        w : column used in the optional WHERE clause.
        like : substring; adds ``where <w> like '%<like>%'`` (takes precedence over ``equal``).
        equal : raw SQL value; adds ``where <w>=<equal>``.
        download_as : if non-empty, the result is also written to
            ``download data/<download_as>.csv``.

        Returns
        -------
        A DataFrame with the database column names, or the empty list returned
        by the driver when the query matches no rows.

        Raises
        ------
        TypeError if ``s`` is neither "*" nor a list of plain column names.

        Connection settings can be overridden with the PGDATABASE, PGUSER,
        PGPASSWORD, PGHOST and PGPORT environment variables; the previous
        hard-coded values remain the defaults.
        """
        # The previous character check rejected "*" (the default) and any name
        # containing "_" or a digit, so the default call always raised.
        if s.strip() != "*":
            for column in s.split(","):
                if not mf._is_column_name(column.strip()):
                    raise TypeError("invalid column list: {0!r}".format(s))

        # NOTE(review): f, w and equal are still concatenated into the SQL text
        # unchanged; only pass trusted values for them.
        query = "select " + s + " from mimic." + f
        params = None
        if w != '' and like != "":
            # The pattern is sent as a bound parameter so quotes inside it are escaped.
            query += ' where ' + w + ' like %s'
            params = ("%" + like + "%",)
        elif w != '' and equal != '':
            query += ' where ' + w + "=" + equal

        conn = psycopg2.connect(database = os.environ.get("PGDATABASE", "mimic"),
                                user = os.environ.get("PGUSER", "postgres"),
                                password = os.environ.get("PGPASSWORD", "pandora"),
                                host = os.environ.get("PGHOST", "127.0.0.1"),
                                port = os.environ.get("PGPORT", "5432"))
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(query, params)
                table = cursor.fetchall()
                colnames = [desc[0] for desc in cursor.description]
            finally:
                cursor.close()
        finally:
            # Always release the connection, including for empty results and errors.
            conn.close()

        if len(table) != 0:
            table = pd.DataFrame(table)
            table.columns = colnames
            if download_as != "":
                table.to_csv("download data/" + download_as + ".csv", index = False)
        elif download_as != "":
            # No rows: still write a header-only CSV instead of failing on a list.
            pd.DataFrame(columns = colnames).to_csv("download data/" + download_as + ".csv", index = False)
        return table


    @staticmethod # some functions to convert time into hours
    def calculate_time(x):
        """Convert a "%Y-%m-%d %H:%M:%S" string into a number of hours.

        The result is the hours elapsed since 1970-01-01 00:00:00 plus a fixed
        offset of 87600 + 72 hours.
        NOTE(review): the meaning of that offset is not documented in this
        notebook - confirm before changing it.

        Returns the string "NaN" for the textual missing values "nan", "NaT"
        and "None"; raises ValueError for any other unparsable text.
        """
        if x in mf._MISSING_TIME_TEXT:
            return "NaN"
        curTime = datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S")
        utcTime2 = datetime.datetime.strptime("1970-01-01 00:00:00", "%Y-%m-%d %H:%M:%S")
        metTime = curTime - utcTime2
        timeStamp = metTime.days * 24 * 3600 + metTime.seconds
        return (timeStamp / 3600 + 87600 + 72)


    @staticmethod
    def transform_time(dataframe, column_name):
        """Add ``<column_name>_h`` to ``dataframe`` holding ``calculate_time`` of each value."""
        # calculate_time lives on the class; the bare name is not in scope here.
        dataframe[column_name + "_h"] = dataframe[column_name].apply(lambda x: mf.calculate_time(str(x)))


    @staticmethod # use winsorize to deal with outlier
    def winsorize(data, percentage_one = 0.01, percentage_two = 0.99, show = False):
        """Clamp ``data`` (a Series) in place to its two given quantiles.

        Values below the ``percentage_one`` quantile are set to that quantile
        and values above the ``percentage_two`` quantile are set to that one.
        With ``show=True`` a before/after histogram is drawn and a summary is
        printed. Returns None.
        """
        if show == True:
            plt.figure(figsize = (5, 5), dpi = 300)
            plt.subplot(121)
            plt.hist(data, bins = 100)
        count1 = len(data)
        lower_bound = data.quantile(percentage_one)
        upper_bound = data.quantile(percentage_two)
        below = data < lower_bound
        above = data > upper_bound
        count2 = int(below.sum()) + int(above.sum())
        data[below] = lower_bound
        data[above] = upper_bound

        if show == True:
            # Guard the percentage against an empty Series.
            percentage = (count2 / count1) * 100 if count1 else 0.0
            print("{0}:\nlower bound is {1}\nupeer bound is {2}\ntotally dealt with {3} of {4} ({5}%)".format(data.name,
                   lower_bound, upper_bound, count2, count1, percentage))
            plt.subplot(122)
            plt.hist(data, bins=100)


    @staticmethod # by using regression to fill Nan
    def fill(dataframe, variables_list, variables_y, parameters, model, show = False):
        """Fill the missing values of each column in ``variables_y`` by prediction.

        For every target column a ``GridSearchCV(model, parameters, cv=5)`` is
        fitted on the rows where the target is present, using
        ``variables_list`` as predictors, and the missing rows are overwritten
        in place with its predictions. With ``show=True`` a before/after
        histogram is drawn. Returns None.
        """
        for i in variables_y:
            missing = dataframe[i].isnull()
            if show == True:
                plt.figure(figsize = (10, 10), dpi = 300)
                plt.subplot(121)
                plt.hist(dataframe[i].dropna(), bins = 100)
            clf = GridSearchCV(model, parameters, cv = 5)
            known = dataframe[~missing]
            clf.fit(known[variables_list], known[i])

            print("the best model is:", clf.best_estimator_, "with best score", clf.best_score_)
            print("mean test scores are", clf.cv_results_["mean_test_score"])
            # Predicting on zero rows raises, so only predict when something is missing.
            if missing.any():
                dataframe.loc[missing, i] = clf.predict(dataframe.loc[missing, variables_list])
            if show == True:
                plt.subplot(122)
                plt.hist(dataframe[i], bins = 100)
                plt.show()


    @staticmethod # k-s test
    def ksTest(dataframe, variables, show = ""):
        """Print the Kolmogorov-Smirnov test against "norm" for each column.

        Missing values are dropped first. If ``show`` names one of the columns,
        its histogram is displayed as well.
        """
        for i in variables:
            res = stats.kstest(dataframe[dataframe[i].notnull()][i], "norm")
            print(str(i) + ":", res)
            if show != "" and i == show:
                plt.hist(dataframe[i], bins=50)
                plt.show()


    @staticmethod # t-test
    def tTest(dataframe, variables, groupby, show = ""):
        """Independent two-sample t-test of each column between groupby==1 and groupby==0.

        Prints each result and returns them as a list, in the order of
        ``variables``. If ``show`` names one of the columns, the two groups are
        drawn as a histogram.
        """
        result = []
        for i in variables:
            present = dataframe[i].notnull()
            group1 = dataframe[(dataframe[groupby] == 1) & present][i]
            group2 = dataframe[(dataframe[groupby] == 0) & present][i]
            res = stats.ttest_ind(group1, group2)
            result.append(res)
            print(i, ":", res)
            if show != "" and i == show:
                fig = plt.figure(figsize = (5, 5), dpi = 300)
                ax1 = fig.add_subplot(111)
                ax1.hist([group1, group2], bins = 50)
                plt.show()
        return result


    @staticmethod # chi-square test
    def chi2Test(dataframe, variables_x, variable_y, show = False):
        """Print a chi-square independence test of each column against ``variable_y``.

        A contingency table of the non-missing category combinations is built
        and passed to ``scipy.stats.chi2_contingency``. With ``show=True`` the
        table itself is printed too. Returns None.
        """
        for i in variables_x:
            # NaN never compares equal, so keeping it as a category would add an
            # all-zero row/column that chi2_contingency rejects.
            list1 = dataframe[i].dropna().unique()
            list2 = dataframe[variable_y].dropna().unique()
            chi2_data = np.zeros((len(list1), len(list2)))
            for j in range(len(list1)):
                for k in range(len(list2)):
                    chi2_data[j, k] = len(dataframe[(dataframe[i] == list1[j])
                                                    & (dataframe[variable_y] == list2[k])])
            count = int(chi2_data.sum())
            res = stats.chi2_contingency(chi2_data)
            # The sample count used to be filled with res[2] (degrees of freedom).
            print("chi2Test between {0} and {1}:\np-value is: {2}:\nthere are totally {3} samples".format(i, variable_y, res[1], count))
            if show:
                print("chi2Test metrix between {0} and {1}:\n{2}".format(i, variable_y, chi2_data))


    @staticmethod # Mann-Whitney U test
    def wilcox(dataframe, variables, groupby, show = ""):
        """Print the Mann-Whitney U test of each column for every grouping column.

        ``groupby`` is an iterable of column names; for each one the rows with
        value 1 are compared with the rows with value 0. If ``show`` names one
        of the tested columns, the two groups are drawn as a histogram.
        """
        for j in groupby:
            for i in variables:
                present = dataframe[i].notnull()
                group1 = dataframe[(dataframe[j] == 1) & present][i]
                group2 = dataframe[(dataframe[j] == 0) & present][i]
                res = stats.mannwhitneyu(group1, group2)
                print(i, ":", res)
                if show != "" and i == show:
                    fig = plt.figure(figsize=(5, 5), dpi=300)
                    ax1 = fig.add_subplot(111)
                    ax1.hist([group1, group2], bins=50)
                    plt.show()


    @staticmethod
    def meanFill(dataframe, variables_list, show = False):
        """Replace the missing values of each listed column with that column's mean.

        ``dataframe`` is modified in place; ``show`` is accepted but unused.
        """
        for i in variables_list:
            # Assign back instead of fillna(inplace=True) on a column selection,
            # which does not reliably write through to the frame.
            dataframe[i] = dataframe[i].fillna(dataframe[i].mean())


    @staticmethod
    def to_csv(res, fp_path):
        """Write ``res`` with ``res.to_csv`` to ``fp_path``.

        Characters from ``\\/:*?"<>|`` in the file name are replaced by "_" and
        the target directory is created when needed.
        """
        res.to_csv(mf._prepare_path(fp_path))

    @staticmethod
    def dump(res, fp_path):
        """Persist ``res`` with ``joblib.dump`` to ``fp_path``.

        Characters from ``\\/:*?"<>|`` in the file name are replaced by "_" and
        the target directory is created when needed.
        """
        joblib.dump(res, mf._prepare_path(fp_path))

In [4]:
# train test data split
data = pd.read_csv(input_data)

# Column names are reused later as parts of file names, so every character
# from \ / : * ? " < > | and the space is replaced by "_".
_name_cleaner = str.maketrans(dict.fromkeys("\\/:*?\"<>| ", "_"))
columns_lists = [column.translate(_name_cleaner) for column in data.columns.to_list()]
data.columns = columns_lists

# train_test_range holds two half-open slices over the column list:
# (feature start, feature stop) and (outcome start, outcome stop).
_feature_slice = slice(train_test_range[0], train_test_range[1])
_outcome_slice = slice(train_test_range[2], train_test_range[3])
train_variables = data.columns.to_list()[_feature_slice]
outcome_variables = data.columns.to_list()[_outcome_slice]

# Missing values become the column mean, then every column is cast to int.
mf.meanFill(data, columns_lists)
data = data.astype("int")

# Positional split: the first split_percentage of the rows train, the rest test.
sub_data = data[list(train_variables) + outcome_variables]
_split_at = int(len(data) * split_percentage)
train = sub_data[:_split_at]
test = sub_data[_split_at:]

# Independent copies of the feature / outcome name lists used by later cells.
name = data.columns.to_list()[_feature_slice]
y_name = data.columns.to_list()[_outcome_slice]

TypeError: must be str, not int

In [None]:
# constants
report = []
count = 0
# Candidate values swept as scale_pos_weight / class weight by the training cells.
# The explicit head stops at 400 and the range starts at 500, so 500 is listed
# only once (it used to appear in both parts, causing one identical re-training).
enumerate_lists = [0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6,
                   7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 300, 400] + list(range(500, 9001, 100))
# "<feature>.csv" for every feature column.
test_name = [i + ".csv" for i in name]
# "./test/<outcome>.csv" for every outcome column.
file_name = ["./test/" + i + ".csv" for i in y_name]

In [None]:
# Logistic Regression
# Trains one GridSearchCV(LogisticRegression) per outcome, evaluates it on the
# test split, and saves the running report and each fitted model.
if use_LR:
    report = []
    count = 0
    columns = ["trainS", "testS", "f1S", "preS", "recS", "AUROC", "model.best_params_", "i"]
    # The grid has no per-run weight (class_weight is fixed to 'balanced'), so
    # each outcome is fitted once; sweeping enumerate_lists here only repeated
    # the same grid search for every entry.
    for i in outcome_variables:
        if count >= max_count_LR:
            print("Aborted by max_count equaled", max_count_LR)
            break
        count += 1
        parameters = par_LR
        model = GridSearchCV(LogisticRegression(n_jobs = -1), parameters, cv = 5)
        model.fit(train[train_variables], train[i])
        y_score = model.predict_proba(test[train_variables])[:, 1]
        # Column 1 of predict_proba belongs to model.classes_[1], so that label
        # is the positive one for the ROC curve (a fixed pos_label = 2 is only
        # right when the outcomes happen to be coded with 2 as positive).
        fpr, tpr, _ = roc_curve(np.array(test[i]), np.array(y_score), pos_label = model.classes_[1])
        AUROC = auc(fpr, tpr)
        trainS = model.score(train[train_variables], train[i])
        testS = model.score(test[train_variables], test[i])
        y_pred = model.predict(test[train_variables])  # predict once, reuse for all metrics
        f1S = metrics.f1_score(test[i], y_pred, average = "micro")
        preS = metrics.precision_score(test[i], y_pred, average = "micro")
        recS = metrics.recall_score(test[i], y_pred, average = "micro")
        report.append([trainS, testS, f1S, preS, recS, AUROC, model.best_params_, i])
        print("{0}: {1} - {2} - {3} - {4} - {5} - {6} - {7}".format(i, trainS, testS, f1S, preS, recS, AUROC, model.best_params_))
        print(delimsText)
        # Saved after every model so partial results survive an interruption.
        res = pd.DataFrame(report, columns = columns)
        mf.to_csv(res, "./result/trained models/LR_models/report.csv")
        mf.dump(model, "./result/trained models/LR_models/" + i + "_LR.model")

In [None]:
# Random Forest
# For every weight in enumerate_lists and every outcome, runs a
# GridSearchCV(RandomForestClassifier) with that weight on the positive class,
# evaluates it on the test split, and saves the running report and the model.
if use_RF:
    report = []
    count = 0
    columns = ["trainS", "testS", "f1S", "preS", "recS", "AUROC", "model.best_params_", "i"]
    aborted = False
    for scale_pos_weight in enumerate_lists:
        for i in outcome_variables:
            if count >= max_count_RF:
                print("Aborted by max_count equaled", max_count_RF)
                aborted = True
                break
            count += 1
            # RandomForestClassifier does not accept a bare number as class_weight;
            # build a {label: weight} dict that puts the swept weight on the
            # largest label (the class reported in predict_proba[:, 1] for a
            # two-class outcome) and weight 1 on the rest.
            labels = np.unique(train[i]).tolist()
            class_weight = {label: 1 for label in labels}
            class_weight[labels[-1]] = scale_pos_weight
            # Copy the grid instead of updating par_RF[0] in place.
            parameters = [dict(par_RF[0], class_weight = [class_weight])]
            model = GridSearchCV(RandomForestClassifier(), parameters, cv = 5)
            model.fit(train[train_variables], train[i])
            y_score = model.predict_proba(test[train_variables])[:, 1]
            # Column 1 of predict_proba belongs to model.classes_[1].
            fpr, tpr, _ = roc_curve(np.array(test[i]), np.array(y_score), pos_label = model.classes_[1])
            AUROC = auc(fpr, tpr)
            trainS = model.score(train[train_variables], train[i])
            testS = model.score(test[train_variables], test[i])
            y_pred = model.predict(test[train_variables])  # predict once, reuse for all metrics
            f1S = metrics.f1_score(test[i], y_pred, average = "micro")
            preS = metrics.precision_score(test[i], y_pred, average = "micro")
            recS = metrics.recall_score(test[i], y_pred, average = "micro")
            report.append([trainS, testS, f1S, preS, recS, AUROC, model.best_params_, i])
            print("{0}: {1} - {2} - {3} - {4} - {5} - {6} - {7}".format(i, trainS, testS, f1S, preS, recS, AUROC, model.best_params_))
            print(delimsText)
            # Save inside the inner loop: every outcome gets its own model file
            # (the latest weight overwrites earlier ones) and nothing trained
            # before an abort is lost.
            res = pd.DataFrame(report, columns = columns)
            mf.to_csv(res, "./result/trained models/RF_models/RF_report.csv")
            mf.dump(model, "./result/trained models/RF_models/" + i + "_RF.model")
        if aborted:
            break

In [None]:
# Artificial Neural Network
# Trains one GridSearchCV(MLPClassifier) per outcome, evaluates it on the test
# split, and saves the running report and each fitted model.
if use_ANN:
    report = []
    count = 0
    columns = ["trainS", "testS", "f1S", "preS", "recS", "AUROC", "model.best_params_", "i"]
    for i in outcome_variables:
        if count >= max_count_ANN:
            print("Aborted by max_count equaled", max_count_ANN)
            break
        count += 1
        parameters = par_ANN
        model = GridSearchCV(MLPClassifier(), parameters, cv = 5)
        model.fit(train[train_variables], train[i])

        y_score = model.predict_proba(test[train_variables])[:,1]
        # Column 1 of predict_proba belongs to model.classes_[1], so that label
        # is the positive one for the ROC curve (a fixed pos_label = 2 is only
        # right when the outcomes happen to be coded with 2 as positive).
        fpr, tpr, _ = roc_curve(np.array(test[i]), np.array(y_score), pos_label = model.classes_[1])
        AUROC = auc(fpr, tpr)
        trainS = model.score(train[train_variables], train[i])
        testS = model.score(test[train_variables], test[i])
        y_pred = model.predict(test[train_variables])  # predict once, reuse for all metrics
        f1S = metrics.f1_score(test[i], y_pred, average='micro')
        preS = metrics.precision_score(test[i], y_pred, average='micro')
        recS = metrics.recall_score(test[i], y_pred, average='micro')
        report.append([trainS, testS, f1S, preS, recS, AUROC, model.best_params_, i])
        print("{0}: {1} - {2} - {3} - {4} - {5} - {6} - {7}".format(i, trainS, testS, f1S, preS, recS, AUROC, model.best_params_))
        print(delimsText)
        # Saved after every model so partial results survive an interruption.
        res = pd.DataFrame(report, columns = columns)
        mf.to_csv(res, "./result/trained models/ANN_models/report.csv")
        mf.dump(model, "./result/trained models/ANN_models/" + i + "_ANN.model")

In [None]:
# XGBoost
# For every scale_pos_weight in enumerate_lists and every outcome, runs a
# GridSearchCV(XGBClassifier), evaluates it on the test split, records the five
# most important features, and saves the running report and the model.
if use_XGB:
    # Start from a clean report/counter; without this the cell continued with
    # whatever rows and count the previously executed cell left behind.
    report = []
    count = 0
    columns = ["trainS", "testS", "f1S", "preS", "recS", "AUROC", "model.best_params_", "i",
                "feature_importance.iloc[0].values", "feature_importance.iloc[1].values", "feature_importance.iloc[2].values",
                "feature_importance.iloc[3].values", "feature_importance.iloc[4].values", "scale_pos_weight"]
    aborted = False
    for scale_pos_weight in enumerate_lists:
        for i in outcome_variables:
            if count >= max_count_XGB:
                print("Aborted by max_count equaled", max_count_XGB)
                aborted = True
                break
            count += 1
            # Copy the grid instead of updating par_XGB[0] in place.
            parameters = [dict(par_XGB[0], scale_pos_weight = [scale_pos_weight])]
            model = GridSearchCV(XGBClassifier(n_jobs = -1), parameters, cv = 5)
            model.fit(train[train_variables], train[i])
            y_score = model.predict_proba(test[train_variables])[:,1]
            # Column 1 of predict_proba belongs to model.classes_[1].
            fpr, tpr, _ = roc_curve(np.array(test[i]), np.array(y_score), pos_label = model.classes_[1])
            AUROC = auc(fpr, tpr)
            trainS = model.score(train[train_variables], train[i])
            testS = model.score(test[train_variables], test[i])
            y_pred = model.predict(test[train_variables])  # predict once, reuse for all metrics
            f1S = metrics.f1_score(test[i], y_pred, average = "micro")
            preS = metrics.precision_score(test[i], y_pred, average = "micro")
            recS = metrics.recall_score(test[i], y_pred, average = "micro")
            feature_importance = pd.DataFrame({"column":train_variables, "values":model.best_estimator_.feature_importances_})
            feature_importance = feature_importance.sort_values(by = "values", axis = 0, ascending = False)
            report.append([trainS, testS, f1S, preS, recS, AUROC, model.best_params_, i,
                          feature_importance.iloc[0].values, feature_importance.iloc[1].values, feature_importance.iloc[2].values,
                          feature_importance.iloc[3].values, feature_importance.iloc[4].values, scale_pos_weight])
            print("{0}. scale: {1}\n{2} - {3} - {4} - {5} - {6} - {7} - {8}".format(i, scale_pos_weight, trainS, testS, f1S, preS, recS, AUROC, model.best_params_))
            print(delimsText)
            # Save inside the inner loop: every outcome gets its own model file
            # (the latest scale_pos_weight overwrites earlier ones) and nothing
            # trained before an abort is lost.
            res = pd.DataFrame(report, columns = columns)
            mf.to_csv(res, "./result/trained models/XGB_models/report.csv")
            mf.dump(model, "./result/trained models/XGB_models/" + i + "_XGB.model")
        if aborted:
            break

In [None]:
# ROC curve
# One figure per outcome with the ROC curve of every saved model type.
# The training cells store models as "<outcome>_<TYPE>.model" and the labels
# live in test[<outcome>], so the loop runs over outcome_variables (it used to
# run over the feature names with ".csv" appended, which match neither).
roc_dir = "./result/ROC curve/"
os.makedirs(roc_dir, exist_ok = True)  # plt.savefig does not create directories
roc_styles = [("XGB", "blue"), ("ANN", "darkorange"), ("LR", "red"), ("RF", "green")]
for outcome in outcome_variables:
    plt.figure(dpi = 500)
    for model_tag, line_color in roc_styles:
        model_path = "./result/trained models/{0}_models/{1}_{0}.model".format(model_tag, outcome)
        # A model type whose training cell was switched off has no file; skip it.
        if not os.path.exists(model_path):
            print("skipping", model_tag, "for", outcome, "- no saved model at", model_path)
            continue
        roc_model = joblib.load(model_path)
        y_score = roc_model.predict_proba(test[train_variables])[:, 1]
        # Column 1 of predict_proba belongs to roc_model.classes_[1].
        fpr, tpr, _ = roc_curve(test[outcome], y_score, pos_label = roc_model.classes_[1])
        AUROC = auc(fpr, tpr)
        plt.plot(fpr, tpr, color = line_color, label = "{0} (AUROC = {1:0.2f})".format(model_tag, AUROC))
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.plot([0, 1], [0, 1], color = "navy", linestyle = "--")
    plt.title("ROC curve of " + outcome + " model")
    plt.legend(loc = "lower right")
    plt.savefig(roc_dir + outcome + ".jpg")
    plt.show()
    plt.close()  # release the figure; one is created per outcome

In [None]:
#score table
# Re-scores every saved model on the train/test split and writes one row per
# (outcome, model type) to "result/ROC curve/score table.csv".
model_name = ["XGB", "ANN", "LR", "RF"]
# Own header for this table: the rows end with the outcome and the model type,
# which the leftover `columns` list of the training cells does not describe.
score_columns = ["trainS", "testS", "f1S", "preS", "recS", "AUROC", "outcome", "model"]
res = []
# Models are stored as "<outcome>_<TYPE>.model" and labels live in
# test[<outcome>], so iterate over the outcome columns.
for outcome in outcome_variables:
    for model_tag in model_name:
        model_path = "result/trained models/{0}_models/{1}_{0}.model".format(model_tag, outcome)
        # A model type whose training cell was switched off has no file; skip it.
        if not os.path.exists(model_path):
            print("skipping", model_tag, "for", outcome, "- no saved model at", model_path)
            continue
        scored_model = joblib.load(model_path)
        positive = scored_model.classes_[1]  # the class behind predict_proba[:, 1]
        y_score = scored_model.predict_proba(test[train_variables])[:,1]
        fpr, tpr, _ = roc_curve(test[outcome], y_score, pos_label = positive)
        trainS = scored_model.score(train[train_variables], train[outcome])
        testS = scored_model.score(test[train_variables], test[outcome])
        y_pred = scored_model.predict(test[train_variables])  # predict once, reuse for all metrics
        f1S = metrics.f1_score(test[outcome], y_pred, pos_label = positive)
        preS = metrics.precision_score(test[outcome], y_pred, pos_label = positive)
        recS = metrics.recall_score(test[outcome], y_pred, pos_label = positive)
        AUROC = auc(fpr, tpr)
        res.append([trainS, testS, f1S, preS, recS, AUROC, outcome, model_tag])
res = pd.DataFrame(res, columns = score_columns)
mf.to_csv(res, "result/ROC curve/score table.csv")

In [None]:
# XGBoost_FIN
# Fits the fixed par_FIN grid with scale_pos_weight = 30 for every outcome,
# evaluates it on the test split, and saves the report and each model (under
# the same "<outcome>_XGB.model" name the XGBoost cell uses).
if use_FIN:
    report = []
    count = 0
    columns = ["trainS", "testS", "f1S", "preS", "recS", "AUROC", "model.best_params_", "i",
                "feature_importance.iloc[0].values", "feature_importance.iloc[1].values", "feature_importance.iloc[2].values",
                "feature_importance.iloc[3].values", "feature_importance.iloc[4].values", "scale_pos_weight"]
    aborted = False
    for scale_pos_weight in [30]:
        for i in outcome_variables:
            if count >= max_count_FIN:
                # The message used to reference an undefined name (max_count).
                print("Aborted by max_count equaled", max_count_FIN)
                aborted = True
                break
            count += 1
            # dict.update() returns None, so the grid used to be [None];
            # build a copy of the grid with the extra key instead.
            parameters = [dict(par_FIN[0], scale_pos_weight = [scale_pos_weight])]
            model = GridSearchCV(XGBClassifier(n_jobs = -1), parameters, cv = 5)
            model.fit(train[train_variables], train[i])
            positive = model.classes_[1]  # the class behind predict_proba[:, 1]
            y_score=model.predict_proba(test[train_variables])[:,1]
            fpr, tpr, _ = roc_curve(test[i], y_score, pos_label = positive)
            AUROC = auc(fpr, tpr)
            trainS = model.score(train[train_variables], train[i])
            testS = model.score(test[train_variables], test[i])
            y_pred = model.predict(test[train_variables])  # predict once, reuse for all metrics
            f1S = metrics.f1_score(test[i], y_pred, pos_label = positive)
            preS = metrics.precision_score(test[i], y_pred, pos_label = positive)
            recS = metrics.recall_score(test[i], y_pred, pos_label = positive)
            feature_importance = pd.DataFrame({"column":train_variables, "values":model.best_estimator_.feature_importances_})
            feature_importance = feature_importance.sort_values(by = "values", axis = 0, ascending = False)
            report.append([trainS, testS, f1S, preS,recS,AUROC,model.best_params_, i,
                          feature_importance.iloc[0].values,feature_importance.iloc[1].values,feature_importance.iloc[2].values,
                          feature_importance.iloc[3].values,feature_importance.iloc[4].values,scale_pos_weight])
            print("{0}: \nscale:{1}".format(i, scale_pos_weight))
            # str.join only accepts strings; convert the scores first.
            print(" - ".join(map(str, [trainS, testS, f1S, preS, recS, AUROC, model.best_params_])))
            print("-" * 30)
            # Save inside the inner loop so every outcome gets its own model
            # file and nothing trained before an abort is lost.
            res = pd.DataFrame(report, columns = columns)
            mf.to_csv(res, "./result/trained models/XGB_models/report_fin.csv")
            mf.dump(model, "./result/trained models/XGB_models/" + i + "_XGB.model")
        if aborted:
            break

In [None]:
# Load the object stored at file_name[i] for every outcome and print its
# train/test accuracy, F1, precision, recall, AUROC and best grid parameters.
# NOTE(review): file_name is built as "./test/<outcome>.csv", while joblib.load
# expects a file written by joblib.dump - confirm these paths point at models.
for i in range(len(y_name)):
    model = joblib.load(file_name[i])
    y_score = model.predict_proba(test[train_variables])[:,1]
    fpr, tpr, _ = roc_curve(test[y_name[i]], y_score)
    AUROC = auc(fpr, tpr)
    trainS = model.score(train[train_variables], train[y_name[i]])
    testS = model.score(test[train_variables], test[y_name[i]])
    y_pred = model.predict(test[train_variables])  # predict once, reuse for all metrics
    f1S = metrics.f1_score(test[y_name[i]], y_pred)
    preS = metrics.precision_score(test[y_name[i]], y_pred)
    recS = metrics.recall_score(test[y_name[i]], y_pred)
    # The closing parenthesis of print() was missing, which made the cell a SyntaxError.
    print("{0}: {1} - {2} - {3} - {4} - {5} - {6} - {7}\n".format(y_name[i], trainS, testS, f1S, preS, recS, AUROC, model.best_params_))

In [None]:
# Load the persisted XGB search object for the "hospital_mortality" outcome.
# The path is hard-coded; it matches the "<outcome>_XGB.model" naming used by
# the XGBoost training cells, so one of them must have been run for an outcome
# column with that name.
model = joblib.load("./result/trained models/XGB_models/hospital_mortality_XGB.model")

In [None]:
# Display the raw feature-importance array of the loaded model's best estimator.
model.best_estimator_.feature_importances_

In [None]:
# Pair every training column with its importance in the loaded model's best
# estimator and order the rows from most to least important.
feature_importance = pd.DataFrame(
    {"column": train_variables, "values": model.best_estimator_.feature_importances_}
).sort_values(by = "values", axis = 0, ascending = False)

In [None]:
# Display the sorted feature-importance table.
feature_importance