# Preprocessing

## Import all necessary libraries

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier

## File Reading

In [None]:
# NumPy: print floats in fixed-point form (suppress scientific notation).
np.set_printoptions(suppress=True, precision=20, threshold=10, linewidth=40)
# pandas: display floats with two decimals instead of scientific notation.
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [None]:
# Location of the training CSV; adjust to wherever the file lives locally.
path = "data/train_data.csv"

# Load the data, discard fully duplicated rows, and renumber the index so it
# is contiguous again after the removal.
df_all = pd.read_csv(path).drop_duplicates().reset_index(drop=True)

df_all

In [None]:
# Look at data types and missing cases as a whole
df_all.info()

In [None]:
# Code-like columns are stored as object dtype.
for col in ['City_Code_Hospital', 'Bed Grade', 'City_Code_Patient']:
    df_all[col] = df_all[col].astype(object)

# These columns are stored as float.
# NOTE(review): 'Hospital_code' used to be cast to object and then to float in
# this same cell; the float cast always won, so only that one is kept.
# Confirm float (not object) is the intended dtype for 'Hospital_code'.
for col in ['Hospital_code', 'Available Extra Rooms in Hospital', 'patientid', 'Visitors with Patient']:
    df_all[col] = df_all[col].astype(float)

In [None]:
df_all.info()

In [None]:
# Find the columns whose missing-value rate reaches a given threshold (narate).
def filter_col_by_nan(data, narate=0.2):
    """Return the columns whose share of missing values is at least ``narate``.

    :param data: DataFrame to inspect.
    :param narate: Missing-value rate threshold as a fraction (0.2 means 20%).
        The comparison is inclusive: a column whose rate equals ``narate``
        is reported.
    :return: List of column labels, in column order, with missing rate >= narate.
    """
    # The mean of the boolean isna() mask is the per-column missing fraction.
    missing_rate = data.isna().mean()
    list_nan_cols = missing_rate[missing_rate >= narate].index.tolist()
    print(f'Variables with at least {narate * 100}% missing are: {list_nan_cols}')
    return list_nan_cols


# Columns reported by filter_col_by_nan at the 0.3 threshold are removed;
# work continues on an independent copy of the remaining columns.
list_nullfactor_todrop = filter_col_by_nan(df_all, narate=0.3)
df_select = df_all.drop(columns=list_nullfactor_todrop).copy()
df_select

## Variables Analysis

### Univariate Analysis

In [None]:
# Shorten the longest-stay label. The result is assigned back instead of
# calling replace(inplace=True) on a column selection, which does not update
# df_select when pandas copy-on-write is active.
df_select['Stay'] = df_select['Stay'].replace('More than 100 Days', '>100')

# One bar chart of value frequencies per column.
for i in ['Stay', 'Department', 'Available Extra Rooms in Hospital', 'Ward_Type', 'Ward_Facility_Code', 'Age',
          'Type of Admission', 'Severity of Illness', 'Bed Grade', 'Hospital_region_code', 'Hospital_type_code',
          'City_Code_Hospital', 'Hospital_code', 'City_Code_Patient', 'Visitors with Patient']:
    count = df_select[i].value_counts()

    plt.figure(figsize=(10, 5))
    # x and y are ready-made arrays, so no `data=` frame is passed: recent
    # seaborn versions reject array arguments whose length differs from `data`.
    sns.barplot(x=count.index.values, y=count.values)
    plt.xlabel(i)
    plt.ylabel('Value')

### Bivariate Analysis

In [None]:

# For each categorical feature, draw the counts per 'Stay' value split by
# that feature's categories (one figure per feature).
bivariate_features = ('Department', 'Ward_Type', 'Ward_Facility_Code', 'Age', 'Type of Admission',
                      'Severity of Illness', 'Bed Grade', 'Hospital_region_code', 'Hospital_type_code')
for i in bivariate_features:
    plt.figure(figsize=(10, 5))
    sns.countplot(data=df_select, x='Stay', hue=i)

### Numerical Features

In [None]:
# Density-scaled histogram with a KDE overlay. histplot replaces the
# deprecated sns.distplot.
sns.histplot(df_select['Admission_Deposit'], kde=True, stat='density')

## Missing values

In [None]:
df_select.isnull().sum()

In [None]:
def impute_missing_values(data, columns):
    """Fill missing values in ``columns`` of ``data`` with each column's mode.

    The mode is the first entry of ``value_counts()``. ``data`` is modified in
    place and the value used for each column is printed.

    :param data: DataFrame whose columns are imputed in place.
    :param columns: Iterable of column labels to impute.
    :return: None.
    """
    for column in columns:
        counts = data[column].value_counts()
        if counts.empty:
            # A column with no observed values has no mode to fill with.
            print(f'{column}: no observed values, left unchanged')
            continue
        most_frequent = counts.index[0]
        data[column] = data[column].fillna(most_frequent)
        print(most_frequent)

In [None]:
impute_columns = ['Bed Grade', 'City_Code_Patient']
impute_missing_values(df_select, impute_columns)

In [None]:
df_select.isnull().sum()

## Encoding

In [None]:
admission_encode = {'Trauma': 1, 'Urgent': 2, 'Emergency': 3}
severity_encode = {'Minor': 1, 'Moderate': 2, 'Extreme': 3}
# Ten-unit bands shared by 'Age' and 'Stay', encoded 1..10 in ascending order.
band_encode = {'0-10': 1, '11-20': 2, '21-30': 3, '31-40': 4, '41-50': 5, '51-60': 6, '61-70': 7,
               '71-80': 8, '81-90': 9, '91-100': 10}
# 'Stay' has one extra top class; both spellings of its label are accepted so
# the encoding does not depend on whether the label was shortened earlier.
stay_encode = {**band_encode, '>100': 11, 'More than 100 Days': 11}

# .map is used for every column: it yields a numeric column directly, and any
# label missing from the mapping becomes NaN instead of staying as a string.
# Like the two .map calls that were already here, this cell must run once on
# the raw labels.
df_select['Type of Admission'] = df_select['Type of Admission'].map(admission_encode)
df_select['Severity of Illness'] = df_select['Severity of Illness'].map(severity_encode)
df_select['Age'] = df_select['Age'].map(band_encode)
df_select['Stay'] = df_select['Stay'].map(stay_encode)

In [None]:
# Numeric or ordinal-encoded columns, including the target 'Stay'.
# NOTE(review): this list is not referenced elsewhere in the visible notebook.
sequence_columns = ['Available Extra Rooms in Hospital', 'Bed Grade', 'Visitors with Patient', 'Admission_Deposit',
                    'Type of Admission', 'Severity of Illness', 'Age', 'Stay']
# Categorical columns that are expanded into indicator columns by onehot_encode.
onehot_columns = ['Hospital_type_code', 'Hospital_region_code', 'Department', 'Ward_Type', 'Ward_Facility_Code']
# Code / identifier columns; they are dropped again for the Naive Bayes and KNN inputs.
other_columns = ['Hospital_code', 'City_Code_Hospital', 'patientid', 'City_Code_Patient']

In [None]:
def onehot_encode(data, columns):
    """Replace each listed column with its indicator (dummy) columns.

    Indicator columns are named after the category values (no prefix) and are
    appended on the right in the order the columns are processed. The input
    frame is not modified.

    :param data: Source DataFrame.
    :param columns: Iterable of column labels to expand.
    :return: DataFrame with the listed columns replaced by indicator columns.
    """
    encoded = data
    for name in columns:
        indicators = pd.get_dummies(encoded[name])
        encoded = pd.concat([encoded, indicators], axis=1).drop(columns=name)
    return encoded

In [None]:
df_select

In [None]:
data = onehot_encode(df_select, onehot_columns)
data

## Scaling

In [None]:
# Move the case identifier into the index so it is not used as a feature.
data.set_index('case_id', inplace=True)

# Target is the encoded 'Stay' column; the features are all remaining columns.
y = data['Stay']
X = data.drop(columns='Stay')

In [None]:
y

In [None]:
# Rescale every feature with MinMaxScaler, keeping X's index and column labels.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test-set ranges influence the scaling. Consider fitting on the
# training split only.
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)

In [None]:
X

## Split dataset

In [None]:
# Keep handles to the full, unsplit features and target. These are additional
# names for the same objects as X and y, not copies.
X_all = X
Y_all = y
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, random_state=10, test_size=0.2)

In [None]:
# Report the shape of each split and of the full data, one per line.
for part in (Xtrain, Xtest, Ytrain, Ytest, X_all, Y_all):
    print(part.shape)

In [None]:
# # K Fold Cross Validation
#
# kfold = KFold(n_splits=10, shuffle=True, random_state=10)
# Xtrain = list()
# Ytrain = list()
# for train_index, test_index in kfold.split(X):
#     print("TRAIN:", train_index, "TEST:", test_index)
#     Xtrain.append(X.loc[train_index + 1])
#     Xtest.append(X.loc[test_index + 1])
#     Ytrain.append(y.loc[train_index + 1])
#     Ytest.append(y.loc[test_index + 1])
# Xtrain
# Ytrain

## Feature Selection

In [None]:

import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn.inspection import permutation_importance

In [None]:
# 1. Build the random-forest model (used as the estimator for RFE below).
# NOTE(review): no random_state is set, so repeated runs may select different features.
RFC_ = RandomForestClassifier()  # random forest

In [None]:
# Importance of feature
# c = RFC_.fit(Xtrain, Ytrain).feature_importances_
# print("Importance: ")
# c

In [None]:
# Recursive feature elimination
selector1 = RFE(RFC_, n_features_to_select=0.5, step=1).fit(Xtrain, Ytrain)

In [None]:
# Number of selected features. It is printed explicitly because a bare
# expression in the middle of a cell is evaluated and then discarded.
print(selector1.support_.sum())
print(selector1.ranking_)
print(selector1.n_features_)
# Training matrix restricted to the selected features.
X_wrapper1 = selector1.transform(Xtrain)

In [None]:
score = cross_val_score(RFC_, X_wrapper1, Ytrain).mean()
score

In [None]:
# The number of features selected
selector1.n_features_

In [None]:
# Feature exclusion sort
selector1.ranking_

In [None]:
support_cat_list = selector1.support_

In [None]:
# Hard-coded feature-selection mask: one flag per feature column, in column
# order (True = keep). Written as runs of equal flags; zero-based positions:
# 0-11 keep, 12-21 drop, 22-24 keep, 25-26 drop, 27-29 keep, 30-36 drop, 37 keep.
support_cat_list = (
    [True] * 12
    + [False] * 10
    + [True] * 3
    + [False] * 2
    + [True] * 3
    + [False] * 7
    + [True]
)
support_cat_list

In [None]:
# 4. Recursive feature elimination: cross-validate each candidate number of
# features and plot the scores to pick the best count.
feature_counts = range(12, 22)
score = []  # mean cross-validation score per candidate count
for i in feature_counts:
    X_wrapper = RFE(RFC_, n_features_to_select=i, step=1).fit_transform(Xtrain, Ytrain)
    once = cross_val_score(RFC_, X_wrapper, Ytrain).mean()  # cross-validation
    score.append(once)
# Best score and the feature count that produced it. The count is looked up in
# feature_counts because the list index alone is offset from the real count.
best_score = max(score)
print(best_score, feature_counts[score.index(best_score)])
print(score)
plt.figure(figsize=[20, 5])
# The x axis uses the evaluated feature counts, so x and y have equal length.
plt.plot(feature_counts, score)
plt.xticks(feature_counts)
plt.show()

In [None]:
# RFE.ranking_

In [None]:
columns_list = list(Xtrain.columns)

In [None]:
# Names of the features rejected by the selection mask. Truthiness is tested
# instead of `is False`, so the mask may also be a NumPy boolean array
# (np.bool_ values are never identical to the False singleton).
if len(support_cat_list) != len(columns_list):
    raise ValueError(
        f'Selection mask has {len(support_cat_list)} entries but there are '
        f'{len(columns_list)} feature columns')
list_factor_todrop = [name for name, keep in zip(columns_list, support_cat_list) if not keep]

list_factor_todrop  # variables to discard

In [None]:
# Remove the rejected features from the train, test and full feature matrices.
# inplace=True mutates the existing objects, so running this cell a second
# time fails because the columns are already gone.
# NOTE: X_all was bound to the same object as X (X_all = X), so X loses these
# columns as well.
Xtrain.drop(list_factor_todrop, axis=1, inplace=True)
Xtest.drop(list_factor_todrop, axis=1, inplace=True)
X_all.drop(list_factor_todrop, axis=1, inplace=True)

In [None]:

# Variants of the train/test features without the code / identifier columns.
Xtrain_remove_other = Xtrain.drop(columns=other_columns)
Xtest_remove_other = Xtest.drop(columns=other_columns)


In [None]:
Xtrain

In [None]:
Xtrain_remove_other

In [None]:
Ytrain

# Train and Evaluation

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-criterion decision tree with a fixed seed.
decision_tree_classification = DecisionTreeClassifier(random_state=10, criterion='entropy')
decision_tree = decision_tree_classification.fit(Xtrain, Ytrain)
y_pred_DT = decision_tree.predict(Xtest)

# Test-set accuracy as a percentage.
100 * accuracy_score(Ytest, y_pred_DT)

In [None]:
from sklearn.preprocessing import label_binarize

# One indicator column per Stay class (labels 1..11) for truth and prediction.
stay_classes = list(range(1, 12))
y_one_hot = label_binarize(Ytest, classes=stay_classes)
y_one_hot_pred_DT = label_binarize(y_pred_DT, classes=stay_classes)
y_one_hot

In [None]:
# One-vs-rest accuracy for each of the 11 indicator columns, stored as a
# (1, 11) array, then averaged.
accuracy_score_array = np.array(
    [[accuracy_score(y_one_hot[:, k], y_one_hot_pred_DT[:, k]) for k in range(11)]],
    dtype=float)
accuracy_score_array.mean()

In [None]:
from sklearn.metrics import classification_report

print(classification_report(Ytest, y_pred_DT))

In [None]:
from numpy import interp
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn.metrics import roc_curve, auc

n_classes = 11

# One-vs-rest ROC curve and AUC for each class column.
# NOTE(review): the scores are binarized hard predictions, so each curve has
# only a single operating point; predict_proba would give a full curve.
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_one_hot[:, i], y_one_hot_pred_DT[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Macro average: pool all false-positive rates, interpolate every class curve
# at those points, then average the curves.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves.
lw = 2
plt.figure()
plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)

colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i, color in zip(range(n_classes), colors):
    # Column i holds Stay class i + 1 (labels run 1..11), so the legend shows
    # the class label rather than the column index.
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'.format(i + 1, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('multi-class ROC')
plt.legend(loc="lower right")
plt.show()

## Decision Tree with hyperparameter tuning

In [None]:
from tqdm import tqdm

from sklearn.model_selection import GridSearchCV

# 10-fold grid search over tree depth and split criterion.
# Depths start at 1: max_depth=0 is not a valid tree depth and would only
# produce failed fits.
paramGrid = dict(
    max_depth=range(1, 15),
    criterion=["gini", "entropy"])
dtModel = DecisionTreeClassifier(random_state=10)
grid = GridSearchCV(dtModel, paramGrid, cv=10, return_train_score=True)
grid.fit(Xtrain, Ytrain)
print(grid.best_params_)

In [None]:
print(grid.best_params_)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Gini-criterion tree limited to depth 10, with a fixed seed.
decision_tree_classification = DecisionTreeClassifier(max_depth=10, criterion='gini', random_state=10)
decision_tree = decision_tree_classification.fit(Xtrain, Ytrain)
y_pred_DT = decision_tree.predict(Xtest)

# Test-set accuracy as a percentage.
100 * accuracy_score(Ytest, y_pred_DT)

In [None]:
print(classification_report(Ytest, y_pred_DT))

In [None]:
from sklearn.preprocessing import label_binarize

# One indicator column per Stay class (labels 1..11) for truth and prediction.
stay_classes = list(range(1, 12))
y_one_hot = label_binarize(Ytest, classes=stay_classes)
y_one_hot_pred_DT = label_binarize(y_pred_DT, classes=stay_classes)
y_one_hot

In [None]:
# One-vs-rest accuracy for each of the 11 indicator columns, stored as a
# (1, 11) array, then averaged.
accuracy_score_array = np.array(
    [[accuracy_score(y_one_hot[:, k], y_one_hot_pred_DT[:, k]) for k in range(11)]],
    dtype=float)
accuracy_score_array.mean()

In [None]:
from numpy import interp
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn.metrics import roc_curve, auc

n_classes = 11

# One-vs-rest ROC curve and AUC for each class column.
# NOTE(review): the scores are binarized hard predictions, so each curve has
# only a single operating point; predict_proba would give a full curve.
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_one_hot[:, i], y_one_hot_pred_DT[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Macro average: pool all false-positive rates, interpolate every class curve
# at those points, then average the curves.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves.
lw = 2
plt.figure()
plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)

colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i, color in zip(range(n_classes), colors):
    # Column i holds Stay class i + 1 (labels run 1..11), so the legend shows
    # the class label rather than the column index.
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'.format(i + 1, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('multi-class ROC')
plt.legend(loc="lower right")
plt.show()

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library-default settings and a fixed seed.
rf_classification = RandomForestClassifier(random_state=10)
rf_model = rf_classification.fit(Xtrain, Ytrain)
y_pred_RF = rf_model.predict(Xtest)

# Test-set accuracy as a percentage.
100 * accuracy_score(Ytest, y_pred_RF)

In [None]:
print(classification_report(Ytest, y_pred_RF))

In [None]:
from sklearn.preprocessing import label_binarize

# One indicator column per Stay class (labels 1..11).
y_one_hot = label_binarize(Ytest, classes=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
# Binarize the random-forest predictions. This used to binarize y_pred_DT, so
# the random-forest section re-evaluated the decision tree. The variable name
# is kept because the following cells read it.
y_one_hot_pred_DT = label_binarize(y_pred_RF, classes=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
y_one_hot

In [None]:
# One-vs-rest accuracy for each of the 11 indicator columns, stored as a
# (1, 11) array, then averaged.
accuracy_score_array = np.array(
    [[accuracy_score(y_one_hot[:, k], y_one_hot_pred_DT[:, k]) for k in range(11)]],
    dtype=float)
accuracy_score_array.mean()

In [None]:
from numpy import interp
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn.metrics import roc_curve, auc

n_classes = 11

# One-vs-rest ROC curve and AUC for each class column.
# NOTE(review): the scores are binarized hard predictions, so each curve has
# only a single operating point; predict_proba would give a full curve.
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_one_hot[:, i], y_one_hot_pred_DT[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Micro average: treat every (sample, class) cell as one binary decision.
fpr["micro"], tpr["micro"], _ = roc_curve(y_one_hot.ravel(), y_one_hot_pred_DT.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Macro average: pool all false-positive rates, interpolate every class curve
# at those points, then average the curves.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves.
lw = 2
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)

colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i, color in zip(range(n_classes), colors):
    # Column i holds Stay class i + 1 (labels run 1..11), so the legend shows
    # the class label rather than the column index.
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'.format(i + 1, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('multi-class ROC')
plt.legend(loc="lower right")
plt.show()

## Random Forest with hyperparameter tuning

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.model_selection import GridSearchCV

# 10-fold grid search over the number of trees (10 evenly spaced values from
# 10 to 100) and the tree depth (4 to 10).
rfModel = RandomForestClassifier(random_state=10)
paramGrid = {
    'n_estimators': np.linspace(10, 100, 10).astype(int),
    'max_depth': np.arange(4, 11),
}
grid = GridSearchCV(rfModel, paramGrid, cv=10, return_train_score=True)
grid.fit(Xtrain, Ytrain)
print(grid.best_params_)

In [None]:
# Random forest with 100 trees of depth at most 10 and a fixed seed.
rf_classification = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=10)
rf_model = rf_classification.fit(Xtrain, Ytrain)
y_pred_RF = rf_model.predict(Xtest)

# Test-set accuracy as a percentage.
100 * accuracy_score(Ytest, y_pred_RF)

In [None]:
print(classification_report(Ytest, y_pred_RF))

In [None]:
from sklearn import metrics

# ROC of "Stay class 11 vs. the rest", using the predicted class label as the score.
# NOTE(review): a hard class label is used as a ranking score here; a
# class-11 probability would be the usual input. Confirm intent.
# NOTE: `auc` is rebound to a float, shadowing any `auc` function imported
# earlier (e.g. `from sklearn.metrics import auc`) until it is imported again.
fpr, tpr, thresholds = metrics.roc_curve(Ytest, y_pred_RF, pos_label=11)
auc = metrics.auc(fpr, tpr)
auc

In [None]:
import matplotlib.pyplot as plt

lw = 2
plt.figure()
# Model curve, then the diagonal chance line for reference.
plt.plot(fpr, tpr, color='darkorange', lw=lw, label=f'ROC curve (area = {auc:0.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

## Logistic Regression

In [None]:
# from mlxtend.classifier import LogisticRegression as LR
# from sklearn.metrics import accuracy_score

In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV
#
# paramGrid = dict(
#     penalty=['l1', 'l2'],
#     C=[0.1, 1, 10, 100, 1000])
# lrModel = LogisticRegression(random_state=10)
# grid = GridSearchCV(lrModel, paramGrid, cv=3, return_train_score=True)
# grid.fit(Xtrain, Ytrain)
# print(grid.best_params_)

In [None]:
# LR = LogisticRegression(fit_intercept=True, C=10, penalty="l2")
#
# LR.fit(Xtrain, Ytrain)
#
# y_pred_LR = LR.predict(Xtest)
# accuracy_score(Ytest, y_pred_LR) * 100

In [None]:
# print(classification_report(Ytest, y_pred_LR))

In [None]:
# from sklearn import metrics
#
# fpr, tpr, thresholds = metrics.roc_curve(Ytest, y_pred_LR, pos_label=11)
# auc = metrics.auc(fpr, tpr)
# auc

In [None]:
# import matplotlib.pyplot as plt
#
# plt.figure()
# lw = 2
# plt.plot(fpr, tpr, color='darkorange',
#          lw=lw, label='ROC curve (area = %0.2f)' % auc)
# plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('Receiver operating characteristic example')
# plt.legend(loc="lower right")
# plt.show()

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosting classifier with library-default settings and a fixed seed.
GB = GradientBoostingClassifier(random_state=10)
GB.fit(Xtrain, Ytrain)
y_pred_GB = GB.predict(Xtest)

# Test-set accuracy as a percentage.
100 * accuracy_score(Ytest, y_pred_GB)

In [None]:
print(classification_report(Ytest, y_pred_GB))

In [None]:
from sklearn import metrics

# ROC of "Stay class 11 vs. the rest", using the predicted class label as the score.
# NOTE(review): a hard class label is used as a ranking score here; a
# class-11 probability would be the usual input. Confirm intent.
# NOTE: `auc` is rebound to a float, shadowing any `auc` function imported
# earlier (e.g. `from sklearn.metrics import auc`) until it is imported again.
fpr, tpr, thresholds = metrics.roc_curve(Ytest, y_pred_GB, pos_label=11)
auc = metrics.auc(fpr, tpr)
auc

In [None]:
import matplotlib.pyplot as plt

lw = 2
plt.figure()
# Model curve, then the diagonal chance line for reference.
plt.plot(fpr, tpr, color='darkorange', lw=lw, label=f'ROC curve (area = {auc:0.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes on the features without the code / identifier columns.
NB = GaussianNB()
NB.fit(Xtrain_remove_other, Ytrain)
y_pred_NB = NB.predict(Xtest_remove_other)

# Test-set accuracy as a percentage.
100 * accuracy_score(Ytest, y_pred_NB)

In [None]:
print(classification_report(Ytest, y_pred_NB))

In [None]:
from sklearn import metrics

# ROC of "Stay class 11 vs. the rest", using the predicted class label as the score.
# NOTE(review): a hard class label is used as a ranking score here; a
# class-11 probability would be the usual input. Confirm intent.
# NOTE: `auc` is rebound to a float, shadowing any `auc` function imported
# earlier (e.g. `from sklearn.metrics import auc`) until it is imported again.
fpr, tpr, thresholds = metrics.roc_curve(Ytest, y_pred_NB, pos_label=11)
auc = metrics.auc(fpr, tpr)
auc

In [None]:
import matplotlib.pyplot as plt

lw = 2
plt.figure()
# Model curve, then the diagonal chance line for reference.
plt.plot(fpr, tpr, color='darkorange', lw=lw, label=f'ROC curve (area = {auc:0.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
Xtest_remove_other

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

# The three numeric features are separated from the remaining columns.
numeric_feature_names = ['Available Extra Rooms in Hospital', 'Visitors with Patient', 'Admission_Deposit']

Xtrain_num = Xtrain_remove_other[numeric_feature_names]
Xtrain_cat = Xtrain_remove_other.drop(columns=numeric_feature_names)
Xtest_num = Xtest_remove_other[numeric_feature_names]
Xtest_cat = Xtest_remove_other.drop(columns=numeric_feature_names)
# Five-bin discretizer fitted on the training numerics.
# NOTE(review): `kbs` is not used elsewhere in the visible notebook.
kbs = KBinsDiscretizer(n_bins=5, encode='onehot').fit(Xtrain_num)

Xtrain_cat

In [None]:
Xtrain_num

In [None]:
# Discretize each numeric feature into five equal-width bins. The bin edges
# are learned on the training data only and then reused for the test data, so
# the same value always gets the same bin code in both sets.
binned_train = {}
binned_test = {}
for column in ['Available Extra Rooms in Hospital', 'Visitors with Patient', 'Admission_Deposit']:
    train_codes, edges = pd.cut(Xtrain_num[column], 5, labels=False, retbins=True)
    # Open the outer edges so test values outside the training range fall
    # into the first or last bin instead of becoming NaN.
    edges[0], edges[-1] = -np.inf, np.inf
    binned_train[column] = train_codes
    binned_test[column] = pd.cut(Xtest_num[column], edges, labels=False)

Xtrain_num_1 = binned_train['Available Extra Rooms in Hospital']
Xtrain_num_2 = binned_train['Visitors with Patient']
Xtrain_num_3 = binned_train['Admission_Deposit']

Xtest_num_1 = binned_test['Available Extra Rooms in Hospital']
Xtest_num_2 = binned_test['Visitors with Patient']
Xtest_num_3 = binned_test['Admission_Deposit']

# Reassemble the three binned columns into one frame per split, aligned on the index.
Xtrain_NB = pd.merge(Xtrain_num_1, Xtrain_num_2, left_index=True, right_index=True)
Xtrain_NB = pd.merge(Xtrain_NB, Xtrain_num_3, left_index=True, right_index=True)

Xtest_NB = pd.merge(Xtest_num_1, Xtest_num_2, left_index=True, right_index=True)
Xtest_NB = pd.merge(Xtest_NB, Xtest_num_3, left_index=True, right_index=True)

In [None]:
Xtrain_NB

In [None]:
Xtest_NB

In [None]:
# Scale the bin codes to [0, 1]. The scaler is fitted on the training data
# only and then applied unchanged to the test data; re-fitting on the test set
# would scale the two sets differently. A dedicated scaler is used so the
# feature scaler fitted earlier is not overwritten.
nb_scaler = MinMaxScaler().fit(Xtrain_NB)
Xtrain_NB = pd.DataFrame(nb_scaler.transform(Xtrain_NB), index=Xtrain_NB.index, columns=Xtrain_NB.columns)
Xtest_NB = pd.DataFrame(nb_scaler.transform(Xtest_NB), index=Xtest_NB.index, columns=Xtest_NB.columns)

In [None]:
Xtrain_NB

In [None]:
Xtrain_cat

In [None]:
# Attach the remaining (non-binned) columns to the scaled bin codes, aligned
# on the row index.
Xtrain_NB = Xtrain_NB.merge(Xtrain_cat, left_index=True, right_index=True)
Xtest_NB = Xtest_NB.merge(Xtest_cat, left_index=True, right_index=True)

In [None]:
Xtrain_NB

In [None]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB().fit(Xtrain_NB, Ytrain)

In [None]:
y_pred_NB = mnb.predict(Xtest_NB)

In [None]:
accuracy_score(Ytest, y_pred_NB) * 100

## KNN with hyperparameter tuning

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
from sklearn.model_selection import GridSearchCV

# 3-fold grid search over the neighbourhood size and the vote weighting.
knnModel = KNeighborsClassifier()
paramGrid = {
    'n_neighbors': [10, 50, 300, 500, 1000],
    'weights': ["uniform", "distance"],
}
grid = GridSearchCV(knnModel, paramGrid, cv=3, return_train_score=True)
grid.fit(Xtrain_remove_other, Ytrain)
print(grid.best_params_)

In [None]:
# Distance-weighted KNN with 300 neighbours on the features without the
# code / identifier columns.
knn = KNeighborsClassifier(weights='distance', n_neighbors=300)
knn.fit(Xtrain_remove_other, Ytrain)
y_pred_KNN = knn.predict(Xtest_remove_other)

# Test-set accuracy as a percentage.
100 * accuracy_score(Ytest, y_pred_KNN)

In [None]:
# Report the KNN predictions. This cell used to print the report for the
# Naive Bayes predictions (y_pred_NB) by mistake.
print(classification_report(Ytest, y_pred_KNN))

In [None]:
from sklearn import metrics

# ROC of "Stay class 11 vs. the rest", using the predicted class label as the score.
# NOTE(review): a hard class label is used as a ranking score here; a
# class-11 probability would be the usual input. Confirm intent.
# NOTE: `auc` is rebound to a float, shadowing any `auc` function imported
# earlier (e.g. `from sklearn.metrics import auc`) until it is imported again.
fpr, tpr, thresholds = metrics.roc_curve(Ytest, y_pred_KNN, pos_label=11)
auc = metrics.auc(fpr, tpr)
auc

In [None]:
import matplotlib.pyplot as plt

lw = 2
plt.figure()
# Model curve, then the diagonal chance line for reference.
plt.plot(fpr, tpr, color='darkorange', lw=lw, label=f'ROC curve (area = {auc:0.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

## CatBoost

In [None]:
from catboost import CatBoostClassifier

# use_best_model keeps the iteration that scores best on eval_set. A
# validation part is carved out of the training data for that purpose, so the
# test set is not used for model selection and its metrics stay unbiased.
Xfit_cb, Xval_cb, Yfit_cb, Yval_cb = train_test_split(Xtrain, Ytrain, random_state=10, test_size=0.2)

cb = CatBoostClassifier(random_state=10, use_best_model=True, iterations=1000)
cb.fit(Xfit_cb, Yfit_cb, use_best_model=True, verbose=100, eval_set=(Xval_cb, Yval_cb))

In [None]:
cb_pred_test = cb.predict(Xtest)
accuracy_score(Ytest, cb_pred_test) * 100

In [None]:
print(classification_report(Ytest, cb_pred_test))

In [None]:
cb_pred_train = cb.predict(Xtrain)
accuracy_score(Ytrain, cb_pred_train) * 100

In [None]:
print(classification_report(Ytrain, cb_pred_train))

In [None]:
cb_pred_all = cb.predict(X_all)
accuracy_score(Y_all, cb_pred_all) * 100

In [None]:
print(classification_report(Y_all, cb_pred_all))

In [None]:
from sklearn import metrics

# ROC of "Stay class 2 vs. the rest" on the test set. The scores must be the
# test-set predictions: cb_pred_all covers every row and does not match the
# length of Ytest. np.ravel flattens the predictions to one dimension.
# NOTE(review): the other models use pos_label=11; confirm that class 2 is
# intended here.
# NOTE: `auc` is rebound to a float, shadowing any `auc` function imported
# earlier (e.g. `from sklearn.metrics import auc`) until it is imported again.
fpr, tpr, thresholds = metrics.roc_curve(Ytest, np.ravel(cb_pred_test), pos_label=2)
auc = metrics.auc(fpr, tpr)
auc

In [None]:
import matplotlib.pyplot as plt

lw = 2
plt.figure()
# Model curve, then the diagonal chance line for reference.
plt.plot(fpr, tpr, color='darkorange', lw=lw, label=f'ROC curve (area = {auc:0.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()