# Import Libraries

In [None]:
#Code Block 1

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pandas_profiling import ProfileReport

from sklearn import preprocessing

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Show up to 500 columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', 500)

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and newer
# releases no longer accept the old names, so use whichever name this install has.
_colorblind_style = ('seaborn-v0_8-colorblind'
                     if 'seaborn-v0_8-colorblind' in plt.style.available
                     else 'seaborn-colorblind')
plt.style.use(_colorblind_style)

sns.set_style('whitegrid')

# Import Data

In [None]:
#Code Block 2

heart = pd.read_csv('data/heart_failure_clinical_records_dataset.csv.xls', index_col = None, header = 0)
heart.info()

In [None]:
#Code Block 3

heart.head()

# Exploring the Data

In [None]:
#Code Block 4

heart.describe()

In [None]:
#Code Block 5

profile = ProfileReport(heart)
profile

In [None]:
#Code Block 6

heart['time'].sort_values().value_counts()
#see vlue counts for time

In [None]:
#Code Block 7

heart['sex'].value_counts()
#to see how many women and men we have in our dataset (1 for men and 0 for women)

In [None]:
#Code Block 8

heart['age'].value_counts()
#to see age value counts

In [None]:
#Code Block 9

(heart["serum_creatinine"] > 2).value_counts()
#to see value counts for serum_creatinine greater than 2 as 2 is above normal level

In [None]:
#Code Block 10

serum_creatinin = heart[heart["serum_creatinine"] > 2]
#create a dataframe of patients whose serum_creatinine is above 2
serum_creatinin['DEATH_EVENT'].value_counts()
#to see how many death happened for patients with high level of serum_creatinine

### See how many patients died and how many are men or female 

In [None]:
#Code Block 11

heart['DEATH_EVENT'].value_counts()

In [None]:
#Code Block 12

heart[heart['DEATH_EVENT'] == 1].value_counts('sex')

### See how many patients smoked

In [None]:
#Code Block 13

heart['smoking'].value_counts()

### See how many of the patients who smoked are men and how many are women
- create a dataframe for the patients who smoked

In [None]:
#Code Block 14

smoke = heart[heart['smoking'] == 1]
#create a dataframe for smoked patients 
smoke['sex'].value_counts()
#to see the number of smoked men and women

### See how many of smoked patients died 

In [None]:
#Code Block 15

smoke['DEATH_EVENT'].value_counts()

### See how many of smoked patients who died are men and how many are female

In [None]:
#Code Block 16

smoke[smoke['sex'] == 0].value_counts('DEATH_EVENT')
#number of death for smoked female patients 

In [None]:
#Code Block 17

smoke[smoke['sex'] == 1].value_counts('DEATH_EVENT')
#number of death for smoked male patients 

In [None]:
#Code Block 17

women_smoke = smoke[smoke['sex'] == 0]
#dataframe for women who smoked 

### Detect the outliers by visualizing with boxplot 

In [None]:
#Code Block 18

chart = sns.boxplot(x = "serum_creatinine",  data = heart, palette = 'deep')

In [None]:
#Code Block 19

plt.figure(figsize = (20,16))

chart = sns.boxplot(x = "creatinine_phosphokinase",  data = heart, palette = 'deep')


In [None]:
#Code Block 20

chart = sns.boxplot(x = "ejection_fraction",  data = heart, palette = 'deep')

In [None]:
#Code Block 21

chart = sns.boxplot(x = "platelets",  data = heart, palette = 'deep')

In [None]:
#Code Block 22

chart = sns.boxplot(x = "serum_creatinine",  data = heart, palette = 'deep')

In [None]:
#Code Block 23
chart = sns.boxplot(x = "serum_sodium",  data = heart, palette = 'deep')

In [None]:
#Code Block 24

chart = sns.boxplot(x = "serum_sodium",  data = heart, palette = 'deep')

## Cleansing the Data

- Check the dataset for any duplicated rows
- Check for any missing or null values
- Drop the time column from the dataset, as it doesn't have an effect on DEATH_EVENT

In [None]:
#Code Block 25

heart.duplicated().value_counts()

# Visualize the Data

In [None]:
#Code Block 26

smoke.groupby(['DEATH_EVENT']).sum().plot(kind = 'pie', y = 'sex', autopct='%1.0f%%')
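#pie chart for smokers only: one slice per DEATH_EVENT value, sized by the sum of `sex` in that group
#(sex is 1 for men and 0 for women, so the slices show how the male smokers split between the two outcomes)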

In [None]:
#Code Block 27

heart.head()

In [None]:
#Code Block 28

heart[['age', 'anaemia', 'creatinine_phosphokinase','diabetes', 'high_blood_pressure',
       'serum_creatinine','sex', 'smoking', 'DEATH_EVENT']].describe()

In [None]:
#Code Block 29

heart.groupby(['smoking']).sum().plot(kind = 'pie', y = 'sex', autopct='%1.0f%%')

In [None]:
#Code Block 30

sns.pairplot(heart)

In [None]:
#Code Block 31

sns.pairplot(heart, hue ='DEATH_EVENT')

In [None]:
#Code Block 32

plt.figure(figsize = (10,8))

heatmap = sns.heatmap(heart.corr(), vmin=-1, vmax=1, annot=True)
plt.title('Correlation Heatmap',fontweight='bold', fontsize='22', horizontalalignment='center')

In [None]:
#Code Block 33

plt.figure(figsize = (10,8))

heatmap = sns.heatmap(heart.corr()[['DEATH_EVENT']].sort_values(by='DEATH_EVENT', ascending=False),
                      vmin=-1, vmax=1, annot=True, cmap='BrBG')
plt.title('Features Correlation with DEATH_EVENT',fontweight='bold', fontsize='20', horizontalalignment='center')

In [None]:
#Code Block 34

women = heart[heart['sex'] == 0]
men =  heart[heart['sex'] == 1]
women.info()

In [None]:
#Code Block 35

plt.figure(figsize = (10,8))

heatmap = sns.heatmap(women.corr()[['DEATH_EVENT']].sort_values(by='DEATH_EVENT', ascending=False),
                      vmin=-1, vmax=1, annot=True, cmap='BrBG')
plt.title('Features Correlation with DEATH_EVENT for Women',fontweight='bold', fontsize='20', horizontalalignment='center')

In [None]:
#Code Block 36 

plt.figure(figsize = (10,8))

heatmap = sns.heatmap(men.corr()[['DEATH_EVENT']].sort_values(by='DEATH_EVENT', ascending=False),
                      vmin=-1, vmax=1, annot=True, cmap='BrBG')
plt.title('Features Correlation with DEATH_EVENT for Men',fontweight='bold', fontsize='20', horizontalalignment='center')

In [None]:
#Code Block 37

heart.groupby(['high_blood_pressure']).sum().plot(kind = 'pie', y = 'DEATH_EVENT', autopct='%1.0f%%')

In [None]:
#Code Block 38

plt.figure(figsize = (20,16))

sns.countplot(x = "age", data = heart, hue = 'DEATH_EVENT', palette = 'deep')
plt.title('Count of Age based on DEATH_EVENT', fontweight='bold', color = 'blue', fontsize='18', horizontalalignment='center')

In [None]:
#Code Block 39

(heart['age'] > 70).value_counts()

In [None]:
#Code Block 40

heart[heart['DEATH_EVENT'] == 1].value_counts('age')
#see the value counts for age for dead patients

In [None]:
#Code Block 41

death = heart[heart['DEATH_EVENT'] == 1]
plt.figure(figsize = (20,16))

sns.countplot(x = "age",  data = death, palette = 'deep')
plt.title('DEATH_EVENT Count Based on Age',fontweight='bold', fontsize='18', horizontalalignment='center')
plt.ylabel('DEATH_EVENT Count',fontweight='bold', fontsize='16', horizontalalignment='center')
plt.xlabel('Age',fontweight='bold', fontsize='16', horizontalalignment='center')

In [None]:
#Code Block 42

age70 = death[death['age'] >= 70]
age70.value_counts('age').sum()

In [None]:
#Code Block 43

plt.figure(figsize = (20,16))

sns.countplot(x = "age",  data = death, hue = 'sex', palette = 'deep')


In [None]:
#Code Block 44

smoke = heart[heart['smoking'] == 1]
smoke

In [None]:
#Code Block 45

(smoke['sex'] == 0).value_counts()
#number of women who smoke

In [None]:
#Code Block 46

smok_m = smoke[smoke['sex'] == 1]
smok_m['DEATH_EVENT'].value_counts()
#see number of men who smoked and died

In [None]:
#Code Block 47

chart = sns.countplot(x = 'DEATH_EVENT',  data = heart, palette = 'deep')
#plot a chart to show the number of death

In [None]:
#Code Block 48

plt.figure(figsize = (20,16))

chart = sns.countplot(x = 'DEATH_EVENT',  data = smoke, palette = 'deep')
plt.title('Count of dead patients who smoked', fontweight='bold', color = 'blue', fontsize='27', horizontalalignment='center')
plt.xlabel('DEATH_EVENT', fontweight='bold', color = 'green', fontsize='14', horizontalalignment='center')
plt.ylabel('Count of Death', fontweight='bold', color = 'green', fontsize='14', horizontalalignment='center')
#create a countplot for smoking patients' DEATH_EVENT

In [None]:
#Code Block 49

sns.countplot(x = 'smoking',  data = heart, hue = 'DEATH_EVENT', palette = 'deep')

In [None]:
#Code Block 50

plt.figure(figsize = (10,8))

# Bar chart of how many smokers fall in each `sex` category.
sns.countplot(x = 'sex', data = smoke, palette = 'deep')
plt.title('Count of Smoking Women and Men', fontweight='bold', color = 'blue', fontsize='27', horizontalalignment='center')
plt.xlabel('Sex', fontweight='bold', fontsize='14', horizontalalignment='center')
# The bars count smokers (rows of `smoke`), not deaths, so the y-axis is labelled accordingly.
plt.ylabel('Smoker Count', fontweight='bold', color = 'green', fontsize='14', horizontalalignment='center')

In [None]:
#Code Block 51

plt.figure(figsize = (10,8))

sns.scatterplot(x = 'time', y = 'DEATH_EVENT', data = heart, palette = 'deep')
plt.title('Scatterplot for follow-up time and DEATH EVENT', fontweight='bold', color = 'blue', fontsize='27', horizontalalignment='center')
plt.xlabel('time', fontweight='bold', fontsize='14', horizontalalignment='center')
plt.ylabel('DEATH_EVENT', fontweight='bold', color = 'green', fontsize='14', horizontalalignment='center')

In [None]:
#Code Block 52

plt.figure(figsize = (16,10))

time100 = death[death["time"] <= 100]
sns.countplot(x = 'time', data = time100, palette = 'deep')
plt.title('Count of death based on folow-up Time Under 100 Days', fontweight='bold', color = 'blue', fontsize='12', horizontalalignment='center')
plt.xlabel('Time', fontweight='bold', fontsize='12', horizontalalignment='center')
plt.ylabel('Death#', fontweight='bold', color = 'green', fontsize='12', horizontalalignment='center')

In [None]:
time100

- 72 patients died within 100 or fewer follow-up days.
 

In [None]:
#Code Block 53

time100[time100['time'] <= 30].value_counts().sum()
#the number of patients who died before and equally of 30 follow-up days

In [None]:
#Code Block 54

time100[time100['time'] <= 50].value_counts().sum()

In [None]:
#Code Block 55

plt.figure(figsize = (20,12))

time250 = death[death["time"] > 100]
sns.countplot(x = 'time', data = time250, palette = 'deep')
plt.title('Count of death based on folow-up Time Above 100 Days', fontweight='bold', color = 'blue', fontsize='27', horizontalalignment='center')
plt.xlabel('Time', fontweight='bold', fontsize='14', horizontalalignment='center')
plt.ylabel('Death#', fontweight='bold', color = 'green', fontsize='14', horizontalalignment='center')

### Check the abnormality levels in patients' clinical records and their influence on the number of deaths.

In [None]:
#Code Block 56

death['ejection_fraction'].value_counts()

In [None]:
#Code Block 57

(death['ejection_fraction'] < 50).value_counts()

In [None]:
#Code Block 58
chart = sns.histplot(x = "ejection_fraction",  data = death, palette = 'deep')
plt.title('Count of death based on ejection_fraction level', fontweight='bold', color = 'blue', fontsize='12', horizontalalignment='center')
plt.xlabel('ejection_fraction', fontweight='bold', fontsize='12', horizontalalignment='center')
plt.ylabel('Death#', fontweight='bold', fontsize='12', horizontalalignment='center')

In [None]:
#Code Block 59

chart = sns.histplot(x = "serum_creatinine",  data = death, palette = 'deep')
plt.title('Count of death based on serum creatinine level', fontweight='bold', color = 'blue', fontsize='12', horizontalalignment='center')
plt.xlabel('serum_creatinine', fontweight='bold', fontsize='12', horizontalalignment='center')
plt.ylabel('Death#', fontweight='bold', fontsize='12', horizontalalignment='center')

In [None]:
#Code Block 60

death_w = death[death['sex'] == 0]
#women who died
death_m = death[death['sex'] == 1]
#men who died
w_crt = death_w[death_w['serum_creatinine'] > 1.1]
#women who died with serum_creatinine above 1.1, the cut-off this notebook uses for women
m_crt = death_m[death_m['serum_creatinine'] > 1.3]
#men who died with serum_creatinine above 1.3, the cut-off this notebook uses for men
#(this must be filtered from death_m; filtering death_w made the "Men" chart and count describe women)

In [None]:
#Code Block 61

chart = sns.histplot(x = "serum_creatinine",  data = w_crt, palette = 'deep')
plt.title('Count of death among Women based on abnormality level of serum creatinine', fontweight='bold', color = 'blue', fontsize='12', horizontalalignment='center')
plt.xlabel('serum_creatinine', fontweight='bold', fontsize='12', horizontalalignment='center')
plt.ylabel('Women Death#' , fontweight='bold', fontsize='12', horizontalalignment='center')

In [None]:
#Code Block 62

w_crt['serum_creatinine'].value_counts().sum()
#number of death women with abnormality level of serum_creatinine 

In [None]:
#Code Block 63

chart = sns.histplot(x = "serum_creatinine",  data = m_crt, palette = 'deep')
plt.title('Count of death among Men based on abnormality level of serum creatinine', fontweight='bold', color = 'blue', fontsize='12', horizontalalignment='center')
plt.xlabel('serum_creatinine', fontweight='bold', fontsize='12', horizontalalignment='center')
plt.ylabel('Men Death#' , fontweight='bold', fontsize='12', horizontalalignment='center')

In [None]:
#Code Block 64

m_crt['serum_creatinine'].value_counts().sum()

In [None]:
#Code Block 65

# Keep the returned Axes in a local name; assigning to `plt.chart` would add a stray
# attribute to the pyplot module itself.
chart = sns.histplot(x = "serum_sodium",  data = death, palette = 'deep')
plt.title('Count of death based on serum_sodium level', fontweight='bold', color = 'blue', fontsize='12', horizontalalignment='center')
# Axis labels formatted the same way as the other histograms in this section.
plt.xlabel('serum_sodium', fontweight='bold', fontsize='12', horizontalalignment='center')
plt.ylabel('Death#', fontweight='bold', fontsize='12', horizontalalignment='center')

In [None]:
#Code Block 66

(death['serum_sodium'] < 135).value_counts()
#number of dead with lower level of serum sodium than normal 

In [None]:
#Code Block 67

(death['serum_sodium'] > 145).value_counts()
#number of dead with higher level of serum sodium than normal 

In [None]:
#Code Block 68

chart = sns.histplot(x = "platelets",  data = death, palette = 'deep')

In [None]:
#Code Block 69

(death['platelets'] < 150000).value_counts()

In [None]:
#Code Block 70

(death['platelets'] > 450000).value_counts()

In [None]:
#Code Block 71

# Histogram of creatinine_phosphokinase among patients who died.
# The Axes is kept in a local name; assigning to `plt.chart` would add a stray
# attribute to the pyplot module itself.
chart = sns.histplot(x = "creatinine_phosphokinase",  data = death, palette = 'deep')
plt.title('Count of death based on creatinine_phosphokinase level', fontweight='bold', color = 'blue', fontsize='12', horizontalalignment='center')
plt.xlabel('creatinine_phosphokinase', fontweight='bold', fontsize='12', horizontalalignment='center')
plt.ylabel('Death#', fontweight='bold', fontsize='12', horizontalalignment='center')

In [None]:
#Code Block 72

(heart['creatinine_phosphokinase'] > 120).value_counts()

In [None]:
#Code Block 73

(death['creatinine_phosphokinase'] > 120).value_counts()

In [None]:
#Code Block 74

(death_w['creatinine_phosphokinase'] > 120).value_counts()

In [None]:
#Code Block 75

(death_m['creatinine_phosphokinase'] > 120).value_counts()

In [None]:
#Code Block 76

cpk120 = death[death['creatinine_phosphokinase'] > 120]
cpk120['DEATH_EVENT']

In [None]:
#Code Block 77

anaem_d = death[death['anaemia'] == 1]
anaem_d_diabt = anaem_d[anaem_d['diabetes'] == 1]
anaem_d_diabt_high = anaem_d_diabt[anaem_d_diabt['high_blood_pressure'] == 1]
(anaem_d_diabt['high_blood_pressure'] == 1).value_counts()

In [None]:
#Code Block 78

anaem_d_diabt_high

- There are 6 patients with anaemia, diabetes, and high blood pressure who died, and they are all non-smoking men.

In [None]:
#Code Block 79

(women['diabetes']==1).value_counts()

In [None]:
#Code Block 80

(men['diabetes']==1).value_counts()

In [None]:
#Code Block 81

(death_w['diabetes']==1).value_counts()

In [None]:
#Code Block 82

(death_m['diabetes']==1).value_counts()

In [None]:
#Code Block 83

death = heart[heart['DEATH_EVENT'] == 1]
death.info()

### Compare statistical properties of some of the features among women and men, based on who survived and who died

In [None]:
#Code Block 84

death.describe()

In [None]:
#Code Block 85

heart[['age', 'anaemia', 'creatinine_phosphokinase','diabetes', 'ejection_fraction',
       'high_blood_pressure', 'platelets','serum_creatinine', 'sex', 'smoking','time',
       'serum_sodium', 'DEATH_EVENT']].describe()

In [None]:
#Code Block 86

survived = heart[heart['DEATH_EVENT'] == 0]
survived[['age', 'anaemia', 'creatinine_phosphokinase','diabetes', 'ejection_fraction',
       'high_blood_pressure', 'platelets','serum_creatinine', 'serum_sodium', 'sex', 
       'smoking','time','DEATH_EVENT']].describe()

In [None]:
#Code Block 87

death[['age', 'anaemia', 'creatinine_phosphokinase','diabetes', 'ejection_fraction',
       'high_blood_pressure', 'platelets','serum_creatinine', 'serum_sodium', 'sex', 
       'smoking','time','DEATH_EVENT']].describe()

In [None]:
#Code Block 88

death_w[['age', 'anaemia', 'creatinine_phosphokinase','diabetes', 'ejection_fraction',
       'high_blood_pressure', 'platelets','serum_creatinine', 'serum_sodium',
       'smoking','time']].describe()

In [None]:
#Code Block 78

death_m[['age', 'anaemia', 'creatinine_phosphokinase','diabetes', 'ejection_fraction',
       'high_blood_pressure', 'platelets','serum_creatinine', 'serum_sodium',
       'smoking','time']].describe()

## Logistic Regression Model

### Import all libraries we need for logistic regression, decision tree and random forest classifications

In [None]:
#Code Block 79

heart.info('type')

In [None]:
#Code Block 80

from sklearn.tree import DecisionTreeClassifier # to build a classification tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split # to split data into training and testing sets
from sklearn.model_selection import cross_val_score # for cross validation
from sklearn.metrics import confusion_matrix, classification_report # to create a confusion matrix and classification report
from sklearn.metrics import plot_confusion_matrix # to draw a confusion matrix
from sklearn import tree
from sklearn.tree import plot_tree # to draw a classification tree
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
#Code Block 81

# Cast only age and platelets to integers.
# serum_creatinine is deliberately left as float: the cut-offs used earlier in this
# notebook (1.1, 1.3 and 2) are fractional, and astype(int) would truncate e.g. 1.9
# to 1 and every value below 1 to 0, discarding most of the feature's signal.
heart['age'] = heart['age'].astype(int)
heart['platelets'] = heart['platelets'].astype(int)
heart.info()
# NOTE(review): any metrics quoted in the markdown further down were presumably
# produced with the truncated column and should be regenerated after this change.

In [None]:
#Code Block 82

X = heart.drop(['DEATH_EVENT'], axis = 1)
y = heart['DEATH_EVENT']
#create X and y dataset
y.value_counts()

In [None]:
#Code Block 83

from sklearn.model_selection import train_test_split
#import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)
#split X and y datasets

### Create the standard scaled versions of X_training and X_test datasets


In [None]:
#Code Block 84

from sklearn.preprocessing import StandardScaler

#standardize the data
sc = preprocessing.StandardScaler()
sc.fit(X_train)
X_train_sc = sc.transform(X_train)
X_train_sc = pd.DataFrame(X_train_sc, columns=X_train.columns)
X_test_sc = sc.transform(X_test)
X_test_sc = pd.DataFrame(X_test_sc, columns=X_test.columns)

In [None]:
#Code Block 85

# Baseline logistic regression with default hyper-parameters.
# NOTE(review): this cell fits and evaluates on the unscaled X_train / X_test even though
# scaled copies (X_train_sc / X_test_sc) are created in the previous cell and are what the
# later models use. Any convergence warnings are hidden by the warnings.filterwarnings('ignore')
# call at the top of the notebook - confirm that training on unscaled data is intended.
model = LogisticRegression()
model.fit(X_train, y_train)
#fit the model on the training split
model_pred = model.predict(X_test)
#predicted class labels for the test split
score = model.score(X_test, y_test)
print("LogisticRegression accuracy score is:",score)
report = classification_report(y_test, model_pred)
print(report)

In [None]:
#Code Block 86

score = model.score(X_test, y_test)
#score our model's accuracy
print(score)

In [None]:
#Code Block 87

print(confusion_matrix(y_test, model_pred))

In [None]:
#Code Block 88

display(y_test.value_counts())
round(y_test.value_counts(normalize=True),5)

In [None]:
#Code Block 89

df_cm = pd.DataFrame(confusion_matrix(y_test, model_pred), columns = ['Pred_0', 'Pred_1'], index = ['Actual_0', 'Actual_1'])
#make a dataframe for confusion matrix
df_cm

In [None]:
#Code Block 91

# Heatmap of the logistic-regression confusion matrix held in df_cm.
# plt.subplots already returns the single Axes, so no extra plt.subplot() call is needed.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 6))
ax = axes
fig.suptitle('Confusion Matrix for Logistic Regression for Heart Failure Prediction Data',
             fontsize=16, y=1.05)
sns.heatmap(df_cm, annot=True, cmap="Greens", annot_kws={"size": 16}, ax=ax, fmt="g")
ax.set_xlabel('Predicted', fontsize=15)
ax.set_ylabel('True', fontsize=15)
# Tick order follows df_cm: first row/column is class 0, second is class 1.
ax.xaxis.set_ticklabels(['Survived', 'Dead'], fontsize=12)
ax.yaxis.set_ticklabels(['Survived', 'Dead'], fontsize=12, va='center')
plt.show()

### Classification Report

In [None]:
#Code Block 92

print('Accuracy Score:')
print(score)
print('')
print('-----------------------------------------------------------')
#Display the confusion matrix
print('Confusion Matrix:')
print(confusion_matrix(y_test, model_pred))
print('')
#Print the Classification Report
print('-----------------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, model_pred, target_names=['Survived', 'Dead'], digits=4))

### View Precision and Recall Scores

In [None]:
#Code Block 93

recall = recall_score(y_test, model_pred)
recall_format = 'Recall Score: {0:.4f}'.format(recall)
print(recall_format)
print('')
precision = precision_score(y_test, model_pred)
precision_format = 'Precision Score: {0:.4f}'.format(precision)
print(precision_format)

### ROC AUC Score

In [None]:
#Code Block 94

# Predicted probability of the positive class (second column of predict_proba) for each test row.
y_pred_prob = model.predict_proba(X_test)[:,1]

# ROC points are computed from the probabilities so the curve has one point per threshold.
# The earlier trailing roc_curve(y_test, model_pred) call is dropped: it overwrote fpr/tpr
# with a degenerate curve built from hard 0/1 predictions, and nothing used the result.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

plt.plot([0, 1], [0, 1],'k--')  # chance-level diagonal for reference
plt.plot(fpr, tpr, label='Logistic Regression')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Logistic Regression ROC Curve')
plt.legend(loc='lower right')  # show the label set on the curve above
plt.show()

In [None]:
#Code Block 95

roc_auc_score(y_test, y_pred_prob)

### Create a Short Version of a Defined Function for Building a Model


In [None]:
#Code Block 96

def shorttraintest(vartrain, vartest, y_train, y_test, model):
    """Fit ``model`` on the training data and print a compact test-set evaluation.

    Parameters
    ----------
    vartrain : features used to fit the model.
    vartest : features used to evaluate the model.
    y_train : target values matching ``vartrain``.
    y_test : target values matching ``vartest``.
    model : estimator exposing ``fit``, ``predict``, ``predict_proba`` and ``score``.
        It is fitted in place, so the caller's object is the trained model afterwards.

    Returns
    -------
    None. The confusion matrix, accuracy, recall, precision and ROC AUC are printed.
    """
    # Fit the model
    model.fit(vartrain, y_train)

    # Predict with the model: hard labels for the count-based metrics, and the
    # probability of the positive class (second column) for ROC AUC.
    model_pred = model.predict(vartest)
    y_pred_prob = model.predict_proba(vartest)[:, 1]

    print('Confusion Matrix:')
    print(confusion_matrix(y_test, model_pred))
    print("")

    # Assess the model
    score = model.score(vartest, y_test)
    print('Accuracy Score: {0:.4f}'.format(score))

    recall = recall_score(y_test, model_pred)
    print('Recall Score: {0:.4f}'.format(recall))

    precision = precision_score(y_test, model_pred)
    print('Precision Score: {0:.4f}'.format(precision))

    roc_auc = roc_auc_score(y_test, y_pred_prob)
    print('ROC AUC Score: {0:.4f}'.format(roc_auc))
    print('')

In [None]:
#Code Block 97

#Set the X training and test datasets
vartrain = X_train_sc
vartest = X_test_sc
model = LogisticRegression(random_state=1)

shorttraintest(vartrain, vartest, y_train, y_test, model)

### Use a For Loop to set C


In [None]:
#Code Block 98

# finding the optimal model based on c

C_penalty = [0.0001, 0.001, 0.01, 0.1, 0.5, 1, 5, 10, 100, 150, 500]
vartrain = X_train_sc
vartest = X_test_sc
for c in C_penalty:
    print('----------------------')
    vartitle = "Model with C: " + str(c)
    varC_penalty = c
    model = LogisticRegression(random_state = 1, C=varC_penalty)
    print(vartitle)
    print('')
    shorttraintest(vartrain, vartest, y_train, y_test, model)
    print('----------------------')

In [None]:
#Code Block 99

# optimal model
# we decided to set the optimal model with C = 0.1 (the value passed to LogisticRegression below)
vartrain = X_train_sc
vartest = X_test_sc
model = LogisticRegression(random_state = 1, C = 0.1)

shorttraintest(vartrain, vartest, y_train, y_test, model)


In [None]:
#Code Block 100

#decision model
# seeing how the different settings for class weight affect the recall and precision scores with C = 0.1
cw = [None, 'balanced', {0:1, 1:1.5}, {0:1, 1:2}, {0:1, 1:2.5}, {0:1, 1:3}, {0:1, 1:3.5}, {0:1, 1:4}, 
      {0:1, 1:4.5},{0:1, 1:5}, {0:1, 1:5.5},{0:1, 1:10}, {0:1, 1:100}]

vartrain = X_train_sc
vartest = X_test_sc

for w in cw:
    print('----------------------')
    vartitle = "Model with Class Weight: " + str(w)
    varcw = w
    model = LogisticRegression(random_state = 1, C = 0.1, class_weight=varcw)
    print(vartitle)
    print('')
    shorttraintest(vartrain, vartest, y_train, y_test, model)
    print('----------------------')

### Using Logistic Regression create a model using scaled data


In [None]:
#Code Block 101

# we decided to set the optimal model with class weight = None which give us the highest accuracy score
vartrain = X_train_sc
vartest = X_test_sc
model = LogisticRegression(random_state = 1, C = 0.1, class_weight= None)

shorttraintest(vartrain, vartest, y_train, y_test, model)

### Randomized Search

In [None]:
#Code Block 102

from sklearn.model_selection import RandomizedSearchCV

# Search on the scaled training data; set here so the cell does not rely on a leftover variable.
vartrain = X_train_sc

search_C = [0.1, 0.5, 1, 5, 10]
search_class_weights = [None, 'balanced', {0:1, 1:1.5}, {0:1, 1:2.5}, {0:1, 1:3}]

# One dict per penalty so that only supported penalty/solver pairs are sampled:
# lbfgs supports l2 only, liblinear supports both l1 and l2. Mixing them in a single
# dict produced l1 + lbfgs candidates whose fits fail (silently, since warnings are ignored).
grid = [
    {"C": search_C, "penalty": ["l2"], "class_weight": search_class_weights,
     "solver": ['lbfgs', 'liblinear']},
    {"C": search_C, "penalty": ["l1"], "class_weight": search_class_weights,
     "solver": ['liblinear']},
]
logreg = LogisticRegression(random_state=1)
# random_state fixes which candidates are sampled, so re-running the cell gives the same result.
logreg_cv = RandomizedSearchCV(logreg, grid, cv=5, random_state=1)
logreg_cv.fit(vartrain, y_train)

# best_score_ is the mean cross-validated score of the best candidate.
print("tuned hyperparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

### Grid Search

In [None]:
#Code Block 103

from sklearn.model_selection import GridSearchCV

# Search on the scaled training data; set here so the cell does not rely on a leftover variable.
vartrain = X_train_sc

search_C = [0.1, 0.5, 1, 5, 10]
search_class_weights = [None, 'balanced', {0:1, 1:1.5}, {0:1, 1:2.5}, {0:1, 1:3}]

# One sub-grid per penalty so that every candidate is a supported combination:
#   l2         -> lbfgs, liblinear, saga
#   l1         -> liblinear, saga
#   elasticnet -> saga only, and it needs an explicit l1_ratio
# A single crossed grid includes unsupported pairs whose fits fail; those failures are
# hidden because warnings are ignored in this notebook.
grid = [
    {"C": search_C, "penalty": ["l2"], "class_weight": search_class_weights,
     "solver": ['lbfgs', 'liblinear', 'saga']},
    {"C": search_C, "penalty": ["l1"], "class_weight": search_class_weights,
     "solver": ['liblinear', 'saga']},
    # l1_ratio = 0.5 is an even l1/l2 mix chosen as a starting point; widen this list to tune it.
    {"C": search_C, "penalty": ["elasticnet"], "class_weight": search_class_weights,
     "solver": ['saga'], "l1_ratio": [0.5]},
]
logreg = LogisticRegression(random_state = 1)
logreg_cv = GridSearchCV(logreg, grid, cv = 5)
logreg_cv.fit(vartrain, y_train)

# best_score_ is the mean cross-validated score of the best candidate.
print("Tuned hyperparameters :(best parameters) ",logreg_cv.best_params_)
print("Accuracy :",logreg_cv.best_score_)

In [None]:
#Code Block 104

vartrain = X_train_sc
vartest = X_test_sc

#Set the model properties
model = LogisticRegression(random_state = 1, C = 0.1, class_weight = None, penalty = 'l1', solver = 'saga')

shorttraintest(vartrain, vartest, y_train, y_test, model)
#run the model

In [None]:
#Code Block 105

vartrain = X_train_sc
vartest = X_test_sc

#Set the model properties
model = LogisticRegression(random_state = 1, C = 1, class_weight = None,penalty = 'l1', solver = 'saga')

shorttraintest(vartrain, vartest, y_train, y_test, model)
#run the model

In [None]:
#Code Block 106

#decision model
# seeing how the different settings for class weight affect the recall and precision scores with C = 1 (l1 penalty, saga solver)
cw = [None, 'balanced', {0:1, 1:1.5}, {0:1, 1:2}, {0:1, 1:2.5}, {0:1, 1:3}, {0:1, 1:3.5}, {0:1, 1:4}, 
      {0:1, 1:4.5},{0:1, 1:5}, {0:1, 1:5.5},{0:1, 1:10}, {0:1, 1:100}]

vartrain = X_train_sc
vartest = X_test_sc

for w in cw:
    print('----------------------')
    vartitle = "Model with Class Weight: " + str(w)
    varcw = w
    model = LogisticRegression(random_state = 1, C = 1, class_weight=varcw,
                              penalty = 'l1', solver = 'saga')
    print(vartitle)
    print('')
    shorttraintest(vartrain, vartest, y_train, y_test, model)
    print('----------------------')

In [None]:
#Code Block 107

vartrain = X_train_sc
vartest = X_test_sc

#Set the model properties
model = LogisticRegression(random_state = 1, C = 5, class_weight ={0:1, 1:1.5}, penalty = 'l1', solver = 'saga')

shorttraintest(vartrain, vartest, y_train, y_test, model)
#run the model

In [None]:
#Code Block 108

vartrain = X_train_sc
vartest = X_test_sc

#Set the model properties
model = LogisticRegression(random_state = 1, C = 0.1, class_weight ={0:1, 1:1.5}, penalty = 'l1', solver = 'saga')

shorttraintest(vartrain, vartest, y_train, y_test, model)
#run the model

In [None]:
#Code Block 109

vartrain = X_train_sc
vartest = X_test_sc

#Set the model properties
model = LogisticRegression(random_state = 1, C = 0.1, class_weight =None, penalty = 'l1', solver = 'liblinear')

shorttraintest(vartrain, vartest, y_train, y_test, model)
#run the model

In [None]:
#Code Block 110

model_pred = model.predict(vartest)
cm_dt_dec = pd.DataFrame(confusion_matrix(y_test, model_pred))
#crete a confusion matrix dataframe for decision model
cm_dt_dec


In [None]:
#Code Block 111

vartrain = X_train_sc
vartest = X_test_sc

model = LogisticRegression(random_state = 1, C = 0.1, class_weight= None)
model.fit(vartrain, y_train)

model_pred = model.predict(vartest)

cm_dt_opt = pd.DataFrame(confusion_matrix(y_test, model_pred))
#crete a confusion matrix dataframe for optimal model

In [None]:
#Code Block 112

plt.figure(figsize=(20,6))

# Left panel: cm_dt_dec. When the notebook is run top to bottom, cm_dt_dec is built from the
# Code Block 109 model (C = 0.1, class_weight = None, l1 penalty, liblinear), so the title
# states those settings instead of the previous "class weight = {1:2}".
plt.subplot(121)
plt.title('Confusion Matrix for Logistic Regression for Decision Model-class weight = None, penalty = l1, solver = liblinear', fontweight='bold', color = 'black', fontsize='12', horizontalalignment='center')
chart = sns.heatmap(cm_dt_dec, annot=True, cmap="Greens", annot_kws={"size": 16}, fmt="g")
chart.set_xlabel('Predicted', fontsize=15)
chart.set_ylabel('True', fontsize=15)
chart.xaxis.set_ticklabels(["Survived", "Dead"], fontsize=12)
chart.yaxis.set_ticklabels(["Survived", "Dead"], fontsize=12, va='center')

# Right panel: cm_dt_opt, the confusion matrix of the model fitted in Code Block 111.
plt.subplot(122)
plt.title('Confusion Matrix for Logistic Regression for Optimal Model-class weight = None', fontweight='bold', color = 'black', fontsize='12', horizontalalignment='center')
chart = sns.heatmap(cm_dt_opt, annot=True, cmap="Blues", annot_kws={"size": 16}, fmt="g")
chart.set_xlabel('Predicted', fontsize=15)
chart.set_ylabel('True', fontsize=15)
chart.xaxis.set_ticklabels(["Survived", "Dead"], fontsize=12)
chart.yaxis.set_ticklabels(["Survived", "Dead"], fontsize=12, va='center')


In [None]:
plt.figure(figsize=(20,6))
# cm_dt_dec is built from the Code Block 109 model when the notebook is run top to bottom
# (C = 0.1, class_weight = None, l1 penalty, liblinear); the title now states those
# settings instead of the previous "class weight = {1:2}".
plt.title('Confusion Matrix for Logistic Regression for Decision Model-class weight = None, penalty = l1, solver = liblinear', fontweight='bold', color = 'black', fontsize='12', horizontalalignment='center')
chart = sns.heatmap(cm_dt_dec, annot=True, cmap="Greens", annot_kws={"size": 16}, fmt="g")
chart.set_xlabel('Predicted', fontsize=15)
chart.set_ylabel('True', fontsize=15)
chart.xaxis.set_ticklabels(["Survived", "Dead"], fontsize=12)
chart.yaxis.set_ticklabels(["Survived", "Dead"], fontsize=12, va='center')


In [None]:
#Code Block 113

# Table of logistic-regression coefficients, one row per feature column of X_train.
# NOTE(review): `model` is whichever estimator was fitted last. When the notebook is run top
# to bottom that is the Code Block 111 model (C = 0.1, class_weight = None, default penalty and
# solver), not the l1/liblinear model from Code Block 109 that the markdown below describes
# as the final model - confirm which one these coefficients are meant to come from.
importances = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': model.coef_[0]
})
# Order rows from the most positive to the most negative coefficient (signed, not by magnitude).
importances = importances.sort_values(by='Importance', ascending=False)

In [None]:
#Code Block 114

plt.bar(x=importances['Attribute'], height=importances['Importance'], color='#087E8B')
plt.title('Feature importances as Logistic Regression Coefficients', size=12)
plt.xticks(rotation='vertical')
plt.show()


- Explanation: the larger the coefficient is (in both positive and negative direction), the more influence it has on a prediction.

### Explanation for Final Logistic Regression Model:
- Among all parameter settings, I chose the last one, with C=0.1, class_weight=None, penalty='l1', solver='liblinear', as it gives us better accuracy and recall scores. 
- From the confusion matrix, this model predicts death correctly 18 times and wrongly 8 times out of all 26 deaths, while it predicts survival correctly 57 times and wrongly 7 times among the 64 patients who survived. 
- Logistic regression model ended up with accuracy score of 0.833 and recall score of 0.6923. 

## Building Classification Models

### Random Forests Model

In [None]:
#Code Block 115

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_score
#Import functions fopr Decision Tree 

### Create a Defined Function for building a model

In [None]:
#Code Block 116

def modeltraintest(vartrain, vartest, y_train, y_test, model):
    """Fit ``model`` on the training data and print a full evaluation report.

    The report contains, in order: test-set accuracy, confusion matrix,
    classification report, a ROC curve plot followed by the ROC AUC score,
    and 5-fold cross-validated accuracy and ROC AUC computed on the
    training data.

    Parameters
    ----------
    vartrain, vartest
        Feature matrices used for fitting and for evaluation.
    y_train, y_test
        Target values matching ``vartrain`` and ``vartest``.
    model
        scikit-learn style classifier exposing ``fit``, ``predict``,
        ``predict_proba`` and ``score``. It is fitted in place.

    Returns
    -------
    None
        Everything is printed or plotted.
    """

    #1) Fit the model with training data
    model.fit(vartrain, y_train)

    #2) Predict the target variable with test data
    model_pred = model.predict(vartest)
    # Column 1 of predict_proba (the second class). It is computed once and
    # reused for both the ROC curve and the AUC score.
    y_pred_prob = model.predict_proba(vartest)[:, 1]

    #3) Assess the accuracy with the test data
    score = model.score(vartest, y_test)

    print('XXXXXXXXXXXXXXXX ACCURACY SCORE XXXXXXXXXXXXXXXXXX')
    print(round(score, 6))
    print("")


    print('XXXXXXXXXXXXXXXX CONFUSION MATRIX XXXXXXXXXXXXXXXX')
    print(confusion_matrix(y_test, model_pred))
    print("")


    print('XXXXXXXXXXXXXX CLASSIFICATION REPORT XXXXXXXXXXXXXX')
    print(classification_report(y_test, model_pred))
    print('')


    print('XXXXXXXXXXXXXX ROC AUC SCORE AND CHART XXXXXXXXXXXXXXXXXX')
    print('')
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)

    # Dashed diagonal = reference line of a random classifier.
    plt.plot([0, 1], [0, 1],'k--')
    plt.plot(fpr, tpr, label='Classification Model')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show();

    roc_auc = roc_auc_score(y_test, y_pred_prob)
    roc_auc_format = 'ROC AUC Score: {0:.4f}'.format(roc_auc)
    print(roc_auc_format)
    print('')


    print('XXXXXXXXXXXXXX CROSS VALIDATION XXXXXXXXXXXXXXXXXX')
    print('')
    # Cross-validation uses the training data only.
    cv_scores = cross_val_score(model, vartrain, y_train, cv=5,
                                scoring='accuracy')
    print('CV Accuracy Scores:')
    print(cv_scores)
    print('')
    cv_rocauc = cross_val_score(model, vartrain, y_train, cv=5,
                                scoring='roc_auc')
    print('CV ROC AUC:')
    print(cv_rocauc)

    print('')
    print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

### Create a short version of the modeling results

In [None]:
#Code Block 117

def shorttraintest(vartrain, vartest, y_train, y_test, model):
    """Fit ``model`` and print a compact evaluation on the test data.

    Prints the confusion matrix, accuracy, recall, precision and ROC AUC.

    Parameters
    ----------
    vartrain, vartest
        Feature matrices used for fitting and for evaluation.
    y_train, y_test
        Target values matching ``vartrain`` and ``vartest``.
    model
        Classifier exposing ``fit``, ``predict``, ``predict_proba`` and
        ``score``. It is fitted in place.

    Returns
    -------
    None
        Everything is printed.
    """

    #Fit the model
    model.fit(vartrain, y_train)

    #Predict with the model
    model_pred = model.predict(vartest)
    # Column 1 of predict_proba (the second class), computed once.
    y_pred_prob = model.predict_proba(vartest)[:, 1]


    print('Confusion Matrix:')
    print(confusion_matrix(y_test, model_pred))
    print("")

    #Assess with the model
    score = model.score(vartest, y_test)
    score_format = 'Accuracy Score: {0:.4f}'.format(score)
    print(score_format)

    recall = recall_score(y_test, model_pred)
    recall_format = 'Recall Score: {0:.4f}'.format(recall)
    print(recall_format)

    precision = precision_score(y_test, model_pred)
    precision_format = 'Precision Score: {0:.4f}'.format(precision)
    print(precision_format)

    # Only the AUC is reported here, so the ROC curve itself is not computed.
    roc_auc = roc_auc_score(y_test, y_pred_prob)
    roc_auc_format = 'ROC AUC Score: {0:.4f}'.format(roc_auc)
    print(roc_auc_format)
    print('')

In [None]:
#Code Block 118

# Separate the target column from the predictors, then show the class counts.
y = heart['DEATH_EVENT']
X = heart.drop(columns=['DEATH_EVENT'])
y.value_counts()


In [None]:
#Code Block 119

# Hold out 30% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

### Run a Random Forest Classifier with default parameters

In [None]:
#Code Block 120

# Baseline: Random Forest with default hyper-parameters, full evaluation report.
vartrain, vartest = X_train, X_test
model = RandomForestClassifier(random_state=1)
modeltraintest(vartrain, vartest, y_train, y_test, model)

In [None]:
#Code Block 121

# Same baseline forest, compact report.
vartrain, vartest = X_train, X_test
model = RandomForestClassifier(random_state=1)
shorttraintest(vartrain, vartest, y_train, y_test, model)

### Fine-tune the model to find the OPTIMAL model
### Manually Setting the Properties for Random Forest

In [None]:
#Code Block 122

# max_depth values to try (an explicit list rather than a range).
depth = [2, 4, 5, 6, 8, 10, 12, 14, 16, 17, 18, 19, 20, 21, 22, 23, 30]

# Test-set accuracy of each fitted forest, in the same order as `depth`.
# NOTE(review): the scores come from X_test, so choosing max_depth from this
# plot tunes the hyper-parameter on the test data.
scores = []

for d in depth:
    classifier = RandomForestClassifier(max_depth = d, random_state = 1)
    classifier = classifier.fit(X_train,y_train)
    score = classifier.score(X_test, y_test)
    scores.append(score)  # reuse the value instead of scoring the model twice
    print("iteration {} done. Accuracy = ".format(d) + str(score))


plt.plot(depth, scores, '-o')
plt.xlabel('depth, d')
plt.ylabel('scores')
plt.xticks(depth)
plt.show()

In [None]:
#Code Block 123

# Second sweep: much larger max_depth values (an explicit list rather than a range).
depth = [30, 50, 70, 90, 120,150, 200, 500]

# Test-set accuracy of each fitted forest, in the same order as `depth`.
scores = []

for d in depth:
    classifier = RandomForestClassifier(max_depth = d, random_state = 1)
    classifier = classifier.fit(X_train,y_train)
    score = classifier.score(X_test, y_test)
    scores.append(score)  # reuse the value instead of scoring the model twice
    print("iteration {} done. Accuracy = ".format(d) + str(score))


plt.plot(depth, scores, '-o')
plt.xlabel('depth, d')
plt.ylabel('scores')
plt.xticks(depth)
plt.show()

#### Max_depth of 4, 5, 6, and 8 have the highest accuracy score. We select Max_depth of 6 for now 

In [None]:
#Code Block 124

# Forest limited to max_depth = 6, compact report.
vartrain, vartest = X_train, X_test
model = RandomForestClassifier(max_depth=6, random_state=1)
shorttraintest(vartrain, vartest, y_train, y_test, model)

### Manually set Max Features


In [None]:
#Code Block 125

# Sweep max_features from 1 to 12 with max_depth fixed at 6 and record the
# test-set accuracy of each forest.
figsize = (20, 5)
maxf = range(1,13)
scores = []

for d in maxf:
    classifier = RandomForestClassifier(max_depth = 6, max_features = d, random_state=1)
    classifier = classifier.fit(X_train,y_train)
    score = classifier.score(X_test, y_test)
    scores.append(score)  # reuse the value instead of scoring the model twice
    print("iteration {} done. Accuracy = ".format(d) + str(score))


# `figsize` used to be assigned but never handed to matplotlib; apply it here.
plt.figure(figsize=figsize)
plt.plot(maxf, scores, '-o')
plt.xlabel('maxf, d')
plt.ylabel('scores')
plt.xticks(maxf)
plt.show()

### Select max_features = 2, as it gives the highest accuracy score. A smaller max_features value can be chosen when the difference in accuracy is minimal. (Note: the cells that follow set max_features = 3.)

In [None]:
#Code Block 126

# Sweep n_estimators with max_depth = 6 and max_features = 3 fixed and record
# the test-set accuracy of each forest.
figsize = (20, 5)
est = [1, 20, 50, 100, 150, 500,700, 1000]
scores = []

for d in est:
    classifier = RandomForestClassifier(max_depth = 6, max_features = 3, n_estimators = d, random_state=1)
    classifier = classifier.fit(X_train,y_train)
    score = classifier.score(X_test, y_test)
    scores.append(score)  # reuse the value instead of scoring the model twice
    print("iteration {} done. Accuracy = ".format(d) + str(score))

# `figsize` used to be assigned but never handed to matplotlib; apply it here.
plt.figure(figsize=figsize)
plt.plot(est, scores, '-o')
plt.xlabel('est, d')
plt.ylabel('scores')
plt.xticks(est)
plt.show()

### Random Forest with n_estimators = 50

In [None]:
#Code Block 127

# Forest with 50 trees (max_depth = 6, max_features = 3), compact report.
vartrain, vartest = X_train, X_test
model = RandomForestClassifier(
    n_estimators=50, max_depth=6, max_features=3, random_state=1
)
shorttraintest(vartrain, vartest, y_train, y_test, model)

### Random Forest with n_estimators = 100

In [None]:
#Code Block 128

# Forest with 100 trees (max_depth = 6, max_features = 3), compact report.
vartrain, vartest = X_train, X_test
model = RandomForestClassifier(
    n_estimators=100, max_depth=6, max_features=3, random_state=1
)
shorttraintest(vartrain, vartest, y_train, y_test, model)

#### n_estimators of 100 has a higher accuracy score than n_estimators of 50.

### Run Optimal Model with class_weight = None

In [None]:
#Code Block 129

# Same forest with class_weight spelled out as None, compact report.
vartrain, vartest = X_train, X_test
model = RandomForestClassifier(
    n_estimators=100, max_depth=6, max_features=3,
    class_weight=None, random_state=1,
)
shorttraintest(vartrain, vartest, y_train, y_test, model)

### Use a For Loop to determine the optimal class_weight

In [None]:
#Code Block 130

# Candidate class weights: no weighting, 'balanced', then increasingly heavy
# weights on class 1 while class 0 stays at 1.
cw = [None, 'balanced'] + [
    {0: 1, 1: k}
    for k in (2, 2.5, 3, 3.5, 4, 4.5, 5, 5.5, 6, 10, 20, 25, 30, 50, 100)
]

vartrain, vartest = X_train, X_test

# Fit and report one forest per candidate weight.
for w in cw:
    print('----------------------')
    model = RandomForestClassifier(
        max_depth=6, max_features=3, n_estimators=100,
        random_state=1, class_weight=w,
    )
    print("Model with Class Weight: " + str(w))
    print('')
    shorttraintest(vartrain, vartest, y_train, y_test, model)

print('----------------------')



In [None]:
#Code Block 131

# Full report for the forest with class 1 weighted 5x.
vartrain = X_train
vartest = X_test
# random_state = 1 added: this was the only forest in the section built without
# a seed, so its report changed on every run.
model = RandomForestClassifier(max_depth = 6, max_features = 3, n_estimators = 100,
                               class_weight = {0: 1, 1: 5}, random_state = 1)

modeltraintest(vartrain, vartest, y_train, y_test, model)

### Random Search for Random Forest

In [None]:
#Code Block 132

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV


In [None]:
#Code Block 133

vartrain = X_train
vartest = X_test

# Hyper-parameter space sampled by the randomized search.
grid={"criterion": ['gini', 'entropy'], "max_depth" : [4, 5, 6],
      "n_estimators" : [20, 50, 100], "max_features" : [2, 3, 4],
      "class_weight": [None],
      "bootstrap": [True, False],
      "min_samples_split": [2, 5, 10]}
model_random = RandomForestClassifier(random_state = 1)
# random_state seeds the sampling of candidates; without it the search (and the
# "best parameters" printed below) differ from run to run.
model_cv = RandomizedSearchCV(model_random, grid, cv = 5, random_state = 1)
model_cv.fit(vartrain,y_train)

print("tuned hyperparameters :(best parameters) ",model_cv.best_params_)
print("accuracy :", model_cv.best_score_)

### Grid Search for Random Forest

In [None]:
#Code Block 134

vartrain, vartest = X_train, X_test

# Exhaustive search: 2 * 3 * 3 * 3 * 1 * 2 * 3 = 324 parameter combinations,
# each evaluated with 5-fold cross-validation on the training data.
grid = {
    "criterion": ['gini', 'entropy'],
    "max_depth": [4, 5, 6],
    "n_estimators": [20, 50, 100],
    "max_features": [2, 3, 4],
    "class_weight": [None],
    "bootstrap": [True, False],
    "min_samples_split": [2, 5, 10],
}
model_grid = RandomForestClassifier(random_state=1, n_jobs=-2)
model_cv = GridSearchCV(model_grid, grid, cv=5)
model_cv.fit(vartrain, y_train)

print("tuned hyperparameters :(best parameters) ", model_cv.best_params_)
print("accuracy :", model_cv.best_score_)

In [None]:

#Code Block 135

# Training / test feature sets
vartrain, vartest = X_train, X_test

# Entropy-based forest built without bootstrap sampling, compact report.
model = RandomForestClassifier(
    criterion='entropy', n_estimators=100, max_depth=4, max_features=3,
    min_samples_split=10, bootstrap=False, class_weight=None, random_state=1,
)
shorttraintest(vartrain, vartest, y_train, y_test, model)

In [None]:
#Code Block 136

# Training / test feature sets
vartrain, vartest = X_train, X_test

# Same entropy-based forest, this time with bootstrap sampling enabled.
model = RandomForestClassifier(
    criterion='entropy', n_estimators=100, max_depth=4, max_features=3,
    min_samples_split=10, bootstrap=True, class_weight=None, random_state=1,
)
shorttraintest(vartrain, vartest, y_train, y_test, model)

In [None]:
#Code Block 137

# Candidate class weights: no weighting, 'balanced', then class 1 weighted
# from 2 up to 13 while class 0 stays at 1.
cw = [None, 'balanced'] + [
    {0: 1, 1: k}
    for k in (2, 2.5, 3, 3.5, 4, 4.5, 5, 5.5, 6, 6.5, 7, 7.5, 8, 8.5,
              9, 10, 11, 12, 13)
]

vartrain, vartest = X_train, X_test

# Re-run the gini / max_depth = 4 forest once per candidate class weight.
for w in cw:
    print('----------------------')
    model = RandomForestClassifier(
        criterion='gini', n_estimators=100, max_depth=4, max_features=3,
        min_samples_split=5, bootstrap=True, class_weight=w, random_state=1,
    )
    print("Model with Class Weight: " + str(w))
    print('')
    shorttraintest(vartrain, vartest, y_train, y_test, model)

print('----------------------')


In [None]:
#Code Block 138

# Training / test feature sets
vartrain, vartest = X_train, X_test

# gini / max_depth = 4 forest without class weighting, compact report.
model = RandomForestClassifier(
    criterion='gini', n_estimators=100, max_depth=4, max_features=3,
    min_samples_split=5, bootstrap=True, class_weight=None, random_state=1,
)
shorttraintest(vartrain, vartest, y_train, y_test, model)

- We want a higher recall score, so we sacrifice a little accuracy to achieve it. We go back to the class-weight comparison above to find a setting with good accuracy and recall scores.

In [None]:
#Code Block 139

# Training / test feature sets
vartrain, vartest = X_train, X_test

# max_depth = 6 forest with class 1 weighted 5x, compact report.
model = RandomForestClassifier(
    criterion='gini', n_estimators=100, max_depth=6, max_features=3,
    class_weight={0: 1, 1: 5}, random_state=1,
)
shorttraintest(vartrain, vartest, y_train, y_test, model)

In [None]:
#Code Block 140

# Training / test feature sets
vartrain, vartest = X_train, X_test

# Same class-weighted forest, full report.
model = RandomForestClassifier(
    criterion='gini', n_estimators=100, max_depth=6, max_features=3,
    class_weight={0: 1, 1: 5}, random_state=1,
)
modeltraintest(vartrain, vartest, y_train, y_test, model)

In [None]:
#Code Block 141

# NOTE(review): this cell fits on X_train_sc / X_test_sc, whereas the reports
# above used the raw X_train / X_test - confirm that is intended.
vartrain, vartest = X_train_sc, X_test_sc

model = RandomForestClassifier(
    criterion='gini', n_estimators=100, max_depth=6, max_features=3,
    class_weight={0: 1, 1: 5}, random_state=1,
)
model.fit(vartrain, y_train)

# Confusion matrix of the class-weighted forest, wrapped in a DataFrame for the heatmap.
model_pred = model.predict(vartest)
cm_dt_dec = pd.DataFrame(confusion_matrix(y_test, model_pred))

In [None]:
#Code Block 142

# Heatmap of the confusion matrix stored in cm_dt_dec.
# Title typo fixed ("Rndom" -> "Random"); the x-label font size now matches the
# y-label and the other confusion-matrix plots (15).
plt.title('Confusion Matrix for the Random Forest Classifier-class weight=1:5', fontweight='bold', color = 'black', fontsize='12', horizontalalignment='center')
chart = sns.heatmap(cm_dt_dec, annot=True, cmap="Greens", annot_kws={"size": 16}, fmt="g")
chart.set_xlabel('Predicted', fontsize=15)
chart.set_ylabel('True', fontsize=15)
chart.xaxis.set_ticklabels(["Survived", "Dead"], fontsize=12)
chart.yaxis.set_ticklabels(["Survived", "Dead"], fontsize=12, va='center')



In [None]:
#Code Block 143

# Fitted on the scaled feature sets, like the previous confusion-matrix cell.
vartrain, vartest = X_train_sc, X_test_sc

model = RandomForestClassifier(
    criterion='gini', n_estimators=100, max_depth=4, max_features=3,
    min_samples_split=5, bootstrap=True, class_weight=None, random_state=1,
)
model.fit(vartrain, y_train)

# Confusion matrix of the unweighted forest, wrapped in a DataFrame for the heatmap.
model_pred = model.predict(vartest)
cm_dt_opt = pd.DataFrame(confusion_matrix(y_test, model_pred))

In [None]:
#Code Block 144

# Side-by-side confusion matrices: cm_dt_opt on the left, cm_dt_dec on the right.
plt.figure(figsize=(20,6))

# (subplot position, title, matrix, colour map) for each panel.
# Left title typo fixed: "Classifiermodel" -> "Classifier model".
panels = [
    (121, 'Confusion Matrix for Optimal Model in Random Forest Classifier model-class weight=None',
     cm_dt_opt, "Blues"),
    (122, 'Confusion Matrix for Decision model in Random Forest Classifier model-class weight={1:5}',
     cm_dt_dec, "Greens"),
]

for position, heading, matrix, palette in panels:
    plt.subplot(position)
    plt.title(heading, fontweight='bold', color = 'black', fontsize='12', horizontalalignment='center')
    chart = sns.heatmap(matrix, annot=True, cmap=palette, annot_kws={"size": 16}, fmt="g")
    chart.set_xlabel('Predicted', fontsize=15)
    chart.set_ylabel('True', fontsize=15)
    chart.xaxis.set_ticklabels(["Survived", "Dead"], fontsize=12)
    chart.yaxis.set_ticklabels(["Survived", "Dead"], fontsize=12, va='center')

##### Explanation:
- Our final Random Forest model's parameters are max_depth = 6, max_features = 3, n_estimators = 100, class_weight = {0: 1, 1: 5}, criterion = 'gini', random_state = 1

In [None]:
#Code Block 145

vartrain, vartest = X_train, X_test

# Class-weighted forest refitted on the raw features, compact report.
model = RandomForestClassifier(
    criterion='gini', n_estimators=100, max_depth=6, max_features=3,
    class_weight={0: 1, 1: 5}, random_state=1,
)
shorttraintest(vartrain, vartest, y_train, y_test, model)

### Importance Levels of the Variables


In [None]:
#Code Block 146

# Feature importances of the fitted `model`, scaled by 100 and indexed by the
# training column names.
importance = pd.DataFrame({"Importance": model.feature_importances_ * 100},
                          index = X_train.columns)
importance.sort_values(by = "Importance",
                       axis = 0,
                       ascending = True).plot(kind = "bar", color = "green")
plt.title('Feature Importances in Random Forest Classifier')
# kind="bar" draws vertical bars: variables are on the x-axis and importance on
# the y-axis, so the two labels (previously swapped) go this way round.
plt.xlabel("Variables")
plt.ylabel("Importance levels of the variables")

In [None]:
#Code Block 147

# Row-normalised confusion matrix (normalize='true') of `model` on the raw test set.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions use
# ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, ...) instead.
plot_confusion_matrix(model,
                      X_test,
                      y_test,
                      cmap = plt.cm.Reds,
                      normalize='true');

In [None]:
#Code Block 148

# Training / test feature sets
vartrain, vartest = X_train, X_test

# Class-weighted forest, full report.
model = RandomForestClassifier(
    criterion='gini', n_estimators=100, max_depth=6, max_features=3,
    class_weight={0: 1, 1: 5}, random_state=1,
)
modeltraintest(vartrain, vartest, y_train, y_test, model)

### Explanation:
- The final Random Forest model's parameters are max_depth = 6, max_features = 3, n_estimators = 100, class_weight = {0: 1, 1: 5}, criterion = 'gini', random_state = 1.

## Classification using Decision Tree Modeling

### Run the Decision Tree model with default parameters

In [None]:
#Code Block 149

# 70/30 train/test split with a fixed seed (same arguments as the split used
# for the Random Forest section).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [None]:
#Code Block 150

# Baseline: Decision Tree with default hyper-parameters, full report.
vartrain, vartest = X_train, X_test
model = DecisionTreeClassifier(random_state=1)
modeltraintest(vartrain, vartest, y_train, y_test, model)

In [None]:
#Code Block 151

# Fit the default tree and predict the test set.
model = DecisionTreeClassifier(random_state=1)
model.fit(X_train, y_train)
model_pred = model.predict(X_test)
score = model.score(X_test, y_test)

# Accuracy, blank line, confusion matrix, classification report - one item per line.
print(
    score,
    "",
    confusion_matrix(y_test, model_pred),
    classification_report(y_test, model_pred),
    sep="\n",
)

### Visualize a Decision Tree

In [None]:
#Code Block 152

model
#display the estimator object fitted in the previous cell

In [None]:
#Code Block 153

X_names=X_train.columns.values
#NumPy array (not a list) holding the training column names

In [None]:
#Code Block 154

# Tree capped at 13 leaf nodes; fit, predict the test set, then show the estimator.
model = DecisionTreeClassifier(random_state=1, max_leaf_nodes=13)
model.fit(X_train, y_train)
model_pred = model.predict(X_test)
model

In [None]:
#Code Block 155

# Confusion matrix followed by the classification report for the current predictions.
print(
    confusion_matrix(y_test, model_pred),
    classification_report(y_test, model_pred),
    sep="\n",
)

In [None]:
#Code Block 156

from sklearn import tree
#import tree from sklearn library

plt.rcParams['text.color'] = 'black'
plt.figure(figsize=(27, 16), dpi=400)

# Call plot_tree through the `tree` module imported above, so the cell does not
# depend on a bare `plot_tree` name imported elsewhere. Feature names are read
# straight from X_train as a plain list, so the plot does not depend on the
# current value of X_names (which is reassigned later in the notebook).
tree.plot_tree(model,
               filled=True,
               rounded=True,
               fontsize=14,
               proportion=False,
               class_names=["Survived", "Dead"],
               feature_names=list(X_train.columns));
#create a decision tree

In [None]:
#Code Block 157

# Confusion matrix for the current `model_pred` (rows: true class, columns: predicted class).
print(confusion_matrix(y_test, model_pred))

### Using Decision Tree Modeling create a model using scaled data


In [None]:
#Code Block 158

# Scaled training / test feature sets
vartrain, vartest = X_train_sc, X_test_sc

# 13-leaf tree on the scaled data, full report.
model = DecisionTreeClassifier(random_state=1, max_leaf_nodes=13)
modeltraintest(vartrain, vartest, y_train, y_test, model)

In [None]:
#Code Block 159

# 13-leaf tree on the raw data, full report (for comparison with the scaled run).
vartrain, vartest = X_train, X_test
model = DecisionTreeClassifier(random_state=1, max_leaf_nodes=13)
modeltraintest(vartrain, vartest, y_train, y_test, model)

#### Explanation:
- The standard-scaled data doesn't perform as well as the raw data; decision tree classifiers work better with the raw data here. So, we use the raw data for the OPTIMAL and DECISION models.
- Just by setting max_leaf_nodes=13 we got an accuracy score of 0.83

### Fine-tune the model to find the OPTIMAL model
#### Manually Setting the Properties for Decision Tree

In [None]:
#Code Block 160

# Test-set accuracy of a decision tree for every max_depth from 1 to 19.
var_est = 'max depth'
est = range(1, 20)
scores = []

for d in est:
    classifier = DecisionTreeClassifier(max_depth=d, random_state=1).fit(X_train, y_train)
    scores.append(classifier.score(X_test, y_test))
    print(f"iteration {d} done")

# Accuracy against max_depth.
plt.plot(est, scores, '-o')
plt.xlabel(var_est)
plt.ylabel('scores')
plt.xticks(est)
plt.show()

In [None]:
#Code Block 161

# Test-set accuracy of a decision tree for every max_depth from 20 to 39.
var_est = 'max depth'
est = range(20, 40)
scores = []

for d in est:
    classifier = DecisionTreeClassifier(max_depth=d, random_state=1).fit(X_train, y_train)
    scores.append(classifier.score(X_test, y_test))
    print(f"iteration {d} done")

# Accuracy against max_depth.
plt.plot(est, scores, '-o')
plt.xlabel(var_est)
plt.ylabel('scores')
plt.xticks(est)
plt.show()

In [None]:
#Code Block 162

# Test-set accuracy of a decision tree for every max_depth from 40 to 59.
var_est = 'max depth'
est = range(40, 60)
scores = []

for d in est:
    classifier = DecisionTreeClassifier(max_depth=d, random_state=1).fit(X_train, y_train)
    scores.append(classifier.score(X_test, y_test))
    print(f"iteration {d} done")

# Accuracy against max_depth.
plt.plot(est, scores, '-o')
plt.xlabel(var_est)
plt.ylabel('scores')
plt.xticks(est)
plt.show()

#### We see the max score for max_depth 2 and 3 (note: the next cell fits the model with max_depth = 9)

In [None]:
#Code Block 163

vartrain = X_train
vartest = X_test
model = DecisionTreeClassifier(max_depth=9, random_state = 1)

shorttraintest(vartrain, vartest, y_train, y_test, model)
#run the model with max_depth = 9

In [None]:
#Code Block 164

# Sweep max_leaf_nodes from 9 to 29 with max_depth fixed at 9 and record the
# test-set accuracy of each tree.
figsize=(20, 6)
est = range(9, 30)
scores = []
var_est = 'max leaf nodes'

for d in est:
    classifier=DecisionTreeClassifier(max_depth = 9, max_leaf_nodes = d, random_state = 1)
    classifier=classifier.fit(X_train,y_train)
    scores.append(classifier.score(X_test, y_test))
    print("iteration {} done".format(d))


# `figsize` used to be assigned but never handed to matplotlib; apply it here.
plt.figure(figsize=figsize)
plt.plot(est, scores, '-o')
plt.xlabel(var_est)
plt.ylabel('scores')
plt.xticks(est)
plt.show()
##show the scores for max leaf nodes range (9,30) in a plot 

In [None]:
#Code Block 165

# Sweep max_leaf_nodes from 30 to 49 with max_depth fixed at 9 and record the
# test-set accuracy of each tree.
figsize=(20, 5)
est = range(30, 50)
scores = []
var_est = 'max leaf nodes'

for d in est:
    classifier=DecisionTreeClassifier(max_depth = 9, max_leaf_nodes = d, random_state = 1)
    classifier=classifier.fit(X_train,y_train)
    scores.append(classifier.score(X_test, y_test))
    print("iteration {} done".format(d))


# `figsize` used to be assigned but never handed to matplotlib; apply it here.
plt.figure(figsize=figsize)
plt.plot(est, scores, '-o')
plt.xlabel(var_est)
plt.ylabel('scores')
plt.xticks(est)
plt.show()
##show the scores for max leaf nodes range (30,50) in a plot 

In [None]:
#Code Block 166

vartrain, vartest = X_train, X_test

# Tree with max_depth = 9 and max_leaf_nodes = 23, no class weighting.
model = DecisionTreeClassifier(
    max_depth=9, max_leaf_nodes=23, class_weight=None, random_state=1
)

# Compact report for the new parameters.
shorttraintest(vartrain, vartest, y_train, y_test, model)

### Checking our model with different class_weight

In [None]:
#Code Block 167

# Candidate class weights: no weighting, 'balanced', then class 1 weighted
# from 1.5 up to 20 while class 0 stays at 1.
cw = [None, 'balanced'] + [
    {0: 1, 1: k}
    for k in (1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5, 6, 8, 10, 11, 12, 13, 14, 15, 18, 20)
]

vartrain, vartest = X_train, X_test

# Fit and report one tree per candidate weight.
for w in cw:
    print('----------------------')
    model = DecisionTreeClassifier(
        max_depth=9, max_leaf_nodes=23, class_weight=w, random_state=1
    )
    print("Model with Class Weight: " + str(w))
    print('')
    shorttraintest(vartrain, vartest, y_train, y_test, model)

print('----------------------')



### Grid Search for Decision Tree

In [None]:
#Code Block 168

from sklearn.model_selection import GridSearchCV


# Exhaustive search over criterion, depth, leaf count and class weight,
# scored with 5-fold cross-validation on the training data.
grid={"criterion": ['gini', 'entropy'], "max_depth" : range(9,19),
      "max_leaf_nodes" : range(22,29),
      "class_weight": [None, 'balanced',{0:1, 1:2}, {0:1, 1:2.5}, {0:1, 1:3}, {0:1, 1:3.5},
                       {0:1, 1:10}, {0:1, 1:11}, {0:1, 1:12}]}
model_random = DecisionTreeClassifier(random_state = 1)
model_cv=GridSearchCV(model_random,grid,cv=5)
# Fit on X_train explicitly: the cell used to rely on `vartrain` left over from
# whichever cell ran last.
model_cv.fit(X_train,y_train)


print("tuned hyperparameters :(best parameters) ",model_cv.best_params_)
print("accuracy :",model_cv.best_score_)

### Running Decision Tree model with best parameters

In [None]:
#Code Block 169

#Set the X training and test datasets
vartrain = X_train
vartest = X_test

#Set the model properties
model_dt = DecisionTreeClassifier(max_depth = 9, max_leaf_nodes = 26, 
                               class_weight={0: 1, 1: 11}, criterion = 'entropy', random_state = 1)

#Fit the model
model_dt.fit(vartrain, y_train)

#Predict with the model (class labels and per-class probabilities)
model_pred = model_dt.predict(vartest)
model_prob = model_dt.predict_proba(vartest)


print('Confusion Matrix:')
cm_dt = confusion_matrix(y_test, model_pred)
print(cm_dt)
print("")

#Assess with the model
score = model_dt.score(vartest, y_test)
score_format = 'Accuracy Score: {0:.4f}'.format(score)
print(score_format)

recall = recall_score(y_test, model_pred)
recall_format = 'Recall Score: {0:.4f}'.format(recall)
print(recall_format)

precision = precision_score(y_test, model_pred)
precision_format = 'Precision Score: {0:.4f}'.format(precision)
print(precision_format)

# Column 1 of the probabilities computed above (no second predict_proba call).
y_pred_prob = model_prob[:, 1]

# dt_fpr / dt_tpr are kept at cell level; the overall ROC comparison plot reuses them.
dt_fpr, dt_tpr, thresholds = roc_curve(y_test, y_pred_prob)

plt.plot([0, 1], [0, 1],'k--')
plt.plot(dt_fpr, dt_tpr, label='Decision Tree')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# Title fixed: it previously read 'Decision Tree ROC Curve ROC Curve'.
plt.title('Decision Tree ROC Curve')
plt.show();


In [None]:
#Code Block 170

vartrain = X_train
vartest = X_test
model = DecisionTreeClassifier(max_depth = 9, max_leaf_nodes = 26, class_weight={0:1, 1:11},
                               criterion = 'entropy', random_state = 1)
#set max_depth = 9, max_leaf_nodes = 26, class_weight = {0:1, 1:11} and criterion = 'entropy'

shorttraintest(vartrain, vartest, y_train, y_test, model)
#run the model with the new parameters 

In [None]:
#Code Block 171

vartrain = X_train
vartest = X_test
model = DecisionTreeClassifier(max_depth = 9, max_leaf_nodes =23, class_weight={0:1, 1:11},
                               criterion = 'gini', random_state = 1)
#set max_depth = 9, max_leaf_nodes = 23, class_weight = {0:1, 1:11} and criterion = 'gini'

shorttraintest(vartrain, vartest, y_train, y_test, model)
#run the model with the new parameters 

### Fine-tune the model to find the DECISION model


In [None]:
#Code Block 172

# Candidate class weights, with a finer grid between 2.5 and 3.
# The entry between 2.65 and 2.75 was {0:1, 1:12.7}; it breaks the ascending
# sequence and is a typo for 2.7.
cw = [None, 'balanced', {0:1, 1:1.5}, {0:1, 1:2}, {0:1, 1:2.5},{0:1, 1:2.6},{0:1, 1:2.65},{0:1, 1:2.7},{0:1, 1:2.75},{0:1, 1:3}, {0:1, 1:3.5}, {0:1, 1:4},
      {0:1, 1:4.5}, {0:1, 1:5}, {0:1, 1:10},{0:1, 1:11}, {0:1, 1:12},{0:1, 1:13}, {0:1, 1:16.5},{0:1, 1:17},
      {0:1, 1:17.5},{0:1, 1:18}]

vartrain = X_train
vartest = X_test

for w in cw:
    print('----------------------')
    vartitle = "Model with Class Weight: " + str(w) 
    varcw = w
    #Set the model properties
    model = DecisionTreeClassifier(max_depth = 9, max_leaf_nodes = 23, 
                               class_weight=varcw, criterion = 'gini', random_state = 1)
    #Fit and report the model for this class weight
    print(vartitle)
    print('')
    shorttraintest(vartrain, vartest, y_train, y_test, model)


          
print('----------------------')


### Explanation: the model with Class Weight {0: 1, 1: 3} works well, as it gives us an accuracy score of 0.7889 and a recall score of 0.8108. The Optimal and Decision models are the same here. (Note: the cells that follow fit the model with class weight {0: 1, 1: 10}.)

In [None]:
#Code Block 173

#Set the X training and test datasets
vartrain = X_train
vartest = X_test

#Set the model properties
model_dt_10 = DecisionTreeClassifier(max_depth = 9, max_leaf_nodes = 23, 
                               class_weight={0: 1, 1: 10}, criterion = 'gini', random_state = 1)

#Fit the model
model_dt_10.fit(vartrain, y_train)

#Predict with the model (class labels and per-class probabilities)
model_pred = model_dt_10.predict(vartest)
model_prob = model_dt_10.predict_proba(vartest)


print('Confusion Matrix:')
cm_dt_10 = confusion_matrix(y_test, model_pred)
print(cm_dt_10)
print("")

#Assess with the model
score = model_dt_10.score(vartest, y_test)
score_format = 'Accuracy Score: {0:.4f}'.format(score)
print(score_format)

recall = recall_score(y_test, model_pred)
recall_format = 'Recall Score: {0:.4f}'.format(recall)
print(recall_format)

precision = precision_score(y_test, model_pred)
precision_format = 'Precision Score: {0:.4f}'.format(precision)
print(precision_format)

# Column 1 of the probabilities computed above (no second predict_proba call).
y_pred_prob = model_prob[:, 1]

# dt_10_fpr / dt_10_tpr are kept at cell level; the overall ROC comparison plot reuses them.
dt_10_fpr, dt_10_tpr, thresholds = roc_curve(y_test, y_pred_prob)

plt.plot([0, 1], [0, 1],'k--')
plt.plot(dt_10_fpr, dt_10_tpr, label='Decision Tree ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Decision Tree Classifier ROC Curve')
plt.show();

In [None]:
#Code Block 174

vartrain = X_train
vartest = X_test
model = DecisionTreeClassifier(max_depth = 9, max_leaf_nodes =23, class_weight={0:1, 1:10},
                               criterion = 'gini', random_state = 1)
#set max_depth = 9, max_leaf_nodes = 23, class_weight = {0:1, 1:10} and criterion = 'gini'

shorttraintest(vartrain, vartest, y_train, y_test, model)
#run the model with the new parameters 

In [None]:
#Code Block 175

from sklearn import tree

plt.rcParams['text.color'] = 'black'
plt.figure(figsize=(27, 16), dpi=400)

# Draw the tree held in `model`. plot_tree is called through the `tree` module
# imported above, and feature names are read straight from X_train as a plain
# list, so the plot does not depend on the current value of X_names (which is
# reassigned to a DataFrame later in the notebook).
tree.plot_tree(model,
               filled=True,
               rounded=True,
               fontsize=14,
               proportion=False,
               class_names=["Survived", "Dead"],
               feature_names=list(X_train.columns));

In [None]:
#Code Block 176

# NOTE(review): this cell fits on the scaled feature sets, while the decision
# tree reports above use the raw X_train / X_test - confirm that is intended.
vartrain, vartest = X_train_sc, X_test_sc

model = DecisionTreeClassifier(
    criterion='gini', max_depth=9, max_leaf_nodes=23,
    class_weight=None, random_state=1,
)
model.fit(vartrain, y_train)
model_pred = model.predict(vartest)

# Confusion matrix of the unweighted (optimal) tree as a DataFrame for the heatmap.
cm_dt_opt = pd.DataFrame(confusion_matrix(y_test, model_pred))

In [None]:
#Code Block 177

# Side-by-side confusion matrices: cm_dt_opt on the left, cm_dt_10 on the right.
plt.figure(figsize=(20,6))

# (subplot position, title, matrix, colour map) for each panel.
panels = [
    (121, 'Confusion Matrix for Optimal Model in Decision Tree model-class weight=None',
     cm_dt_opt, "Blues"),
    (122, 'Confusion Matrix for Decision model in Decision Tree model-class weight={1:10}',
     cm_dt_10, "Greens"),
]

for position, heading, matrix, palette in panels:
    plt.subplot(position)
    plt.title(heading, fontweight='bold', color='black', fontsize='12', horizontalalignment='center')
    chart = sns.heatmap(matrix, annot=True, cmap=palette, annot_kws={"size": 16}, fmt="g")
    chart.set_xlabel('Predicted', fontsize=15)
    chart.set_ylabel('True', fontsize=15)
    chart.xaxis.set_ticklabels(["Survived", "Dead"], fontsize=12)
    chart.yaxis.set_ticklabels(["Survived", "Dead"], fontsize=12, va='center')

In [None]:
#Code Block 178

# Create Overall ROC Curve 
plt.figure(figsize = (13,12))
plt.suptitle('ROC Curves - Competing Classification Methodologies', fontweight='bold', fontsize=18, y=.92)
# dt_fpr / dt_tpr come from model_dt, which is built with
# class_weight={0: 1, 1: 11}; the curve used to be labelled 'cw=None'.
plt.plot(dt_fpr, dt_tpr, label='cw={1:11}')
plt.plot(dt_10_fpr, dt_10_tpr, label='cw={1:10}')
# Dashed diagonal = reference line of a random classifier.
plt.plot([0,1], [0,1], linestyle='--')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('FPR', fontsize=14)
plt.ylabel('TPR', fontsize=14)
plt.legend(loc="lower right", frameon=False, fontsize=12)
plt.show()

- The orange one with cw = {1:10} seems the best.

In [None]:
#Code Block 179

# Predictions of the DECISION tree (model_dt_10) on the raw test set, stored in
# a one-column DataFrame. They are computed here explicitly: the `model_pred`
# left over from the preceding cells belongs to the class_weight=None model
# fitted on scaled data, which does not match the 'Pred_DEC_dt' column name.
model_pred = pd.DataFrame(model_dt_10.predict(X_test), columns=['Pred_DEC_dt'])
model_pred.head()

In [None]:
#Code Block 180

# One-column DataFrame of the feature names, used to build the importance tables below.
# NOTE(review): this rebinds X_names, which earlier held X_train.columns.values
# and is passed as feature_names to plot_tree; re-running those cells after
# this one would hand them a DataFrame.
X_names = pd.DataFrame(list(X.columns))

In [None]:
#Code Block 181

df_fi = pd.DataFrame(model_dt.feature_importances_)
#create a dataframe of feature importances for the OPTIMAL model (model_dt)
df_fi

In [None]:
#Code Block 182

df_fi_10 = pd.DataFrame(model_dt_10.feature_importances_)
#create a dataframe of feature importances for the DECISION model (model_dt_10)

In [None]:
#Code Block 183

# Feature-importance table: feature names side by side with the importances
# from both decision-tree models, with descriptive column labels.
df_feat_imp = pd.concat([X_names, df_fi, df_fi_10], axis=1).set_axis(
    ['Features', 'dt_importance', 'dt_10_importance'], axis=1
)

df_feat_imp

### Show all features with a non-zero importance

In [None]:
#Code Block 184

# Keep only features whose dt_10 importance is non-zero, most important first
df_feat_imp.query('dt_10_importance != 0').sort_values(by='dt_10_importance', ascending=False)

In [None]:
#Code Block 185

# Feature-importance table restricted to the dt_10 model:
# feature names next to df_fi_10, with descriptive column labels.
df_feat_imp1 = pd.concat([X_names, df_fi_10], axis=1).set_axis(
    ['Features', 'dt_10_importance'], axis=1
)

df_feat_imp1

In [None]:
#Code Block 186

# Rows with a non-zero dt_10 importance, sorted from most to least important
df_feat_imp1.loc[df_feat_imp1['dt_10_importance'].ne(0)].sort_values(by='dt_10_importance', ascending=False)

In [None]:
#Code Block 187

# Plot the feature importances of `model` as percentages, sorted ascending.
# NOTE(review): `model` must be a fitted estimator exposing `feature_importances_`
# when this cell runs. Code Block 188 rebinds `model` to a list, so re-running this
# cell after that one raises AttributeError. Consider referencing the intended
# decision tree by an explicit name — confirm which estimator is meant.
importance = pd.DataFrame({"Importance": model.feature_importances_ * 100},
                         index = pd.DataFrame(X_train).columns)

# Horizontal bars: importance runs along the x-axis and the variables along the
# y-axis, which is what the axis labels below describe. (With vertical bars the
# two labels were attached to the wrong axes.)
importance.sort_values(by = "Importance",
                       axis = 0,
                       ascending = True).plot(kind = "barh", color = "green")
plt.title('Feature Importances in Decision Tree Classifier')
plt.xlabel("Importance levels of the variables")
plt.ylabel("Variables")

In [None]:
#Code Block 188

# Candidate classifiers to compare. Each entry is
# (train features, test features, train labels, display name, unfitted estimator).
# The logistic regression is given the *_sc feature sets; the tree-based models
# are given X_train / X_test.
model = [

    # Logistic Regression
    (X_train_sc, X_test_sc, y_train, 'Logistic',
     LogisticRegression(random_state = 1, C = 0.1, class_weight = None, penalty = 'l1', solver = 'liblinear')),

    # Decision Tree
    (X_train, X_test, y_train, 'DecisionTree',
     DecisionTreeClassifier(max_depth = 9, max_leaf_nodes = 23, class_weight = {0: 1, 1: 10},
                            criterion = 'gini', random_state = 1)),

    # Random Forest
    (X_train, X_test, y_train, 'RandomForest',
     RandomForestClassifier(max_depth = 6, max_features = 3, n_estimators = 100,
                            class_weight = {0: 1, 1: 5}, criterion = 'gini', random_state = 1)),
]

# Column order of the combined summary table
summary_columns = ['Type', 'pred_survived', 'pred_dead', 'Score', 'Recall', 'Precision', 'F1']

# Per-model confusion-matrix frames; concatenated once after the loop instead of
# growing a DataFrame inside it.
cm_frames = []

# ROC data keyed by model name: {name: (fpr, tpr, thresholds)}
roc_by_model = {}

for tr, tst, yt, n, m in model:
    m.fit(tr, yt)
    model_pred = m.predict(tst)
    model_prob = m.predict_proba(tst)

    # Test-set metrics, each also formatted to four decimals for display
    score = m.score(tst, y_test)
    score_format = '{0:.4f}'.format(score)

    recall = recall_score(y_test, model_pred)
    recall_format = '{0:.4f}'.format(recall)

    f1 = f1_score(y_test, model_pred)
    f1_format = '{0:.4f}'.format(f1)

    precision = precision_score(y_test, model_pred)
    precision_format = '{0:.4f}'.format(precision)

    # Second probability column (class at index 1), reused from the
    # predict_proba call above rather than predicting a second time.
    y_pred_prob = model_prob[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
    roc_by_model[n] = (fpr, tpr, thresholds)

    # Publish the per-model names that later cells read (fpr_Logistic,
    # tpr_RandomForest, ...). Explicit globals() assignment replaces exec() on
    # generated source text and creates exactly the same names.
    globals()[f'fpr_{n}'] = fpr
    globals()[f'tpr_{n}'] = tpr
    globals()[f'thresholds_{n}'] = thresholds
    globals()[n] = n  # kept for compatibility: a variable named after the model, holding its name

    cm = pd.DataFrame(confusion_matrix(y_test, model_pred))
    cm = cm.rename(columns = {0: 'pred_survived', 1: 'pred_dead'})

    # cm_<name> refers to the same frame that receives the metric columns below
    globals()[f'cm_{n}'] = cm
    cm['Type'] = n
    cm['Score'] = score_format
    cm['Recall'] = recall_format
    cm['Precision'] = precision_format
    cm['F1'] = f1_format
    cm_frames.append(cm)

    print(f"{n} - Score: {score_format} - Recall: {recall_format}"
          f" - Precision: {precision_format} - F1: {f1_format}")
    print('----------------------------------------------------------------')


# Stack all per-model frames once and apply the summary column order
cm_all = pd.concat(cm_frames, axis=0).reindex(columns=summary_columns)

# The confusion-matrix row index (0 / 1) becomes a readable 'actual' column
cm_all = cm_all.reset_index()
cm_all['index'] = np.where(cm_all['index']==0, 'Survived', 'Dead')
cm_all = cm_all.rename(columns={'index':'actual'})

display(cm_all)

print('--------------------------------------------------------------------')


In [None]:
# Inspect the false-positive-rate array that the model-comparison loop stored for the 'Logistic' model
fpr_Logistic

In [None]:
#Code Block 189

# Overall ROC comparison: one curve per classifier plus the diagonal reference line.
plt.figure(figsize=(13, 12))
plt.suptitle('ROC Curves - Competing Classifications', fontweight='bold', fontsize=18, y=.92)

# (fpr, tpr, legend label) per classifier, drawn in this order
for curve_fpr, curve_tpr, curve_label in (
    (fpr_Logistic, tpr_Logistic, 'Logistic'),
    (fpr_DecisionTree, tpr_DecisionTree, 'DecisionTree'),
    (fpr_RandomForest, tpr_RandomForest, 'RandomForest'),
):
    plt.plot(curve_fpr, curve_tpr, label=curve_label)

# Dashed diagonal from (0, 0) to (1, 1) as a visual baseline
plt.plot([0, 1], [0, 1], linestyle='--')

# Small margin around the unit square so curves touching the border stay visible
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])

plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.legend(loc="lower right", frameon=False, fontsize=12)
plt.show()

### Explanation: 
- The graph above shows that the random forest model performs best among all the models in this project.

In [None]:
#Code Block 190

# ROC comparison limited to the logistic and random-forest classifiers,
# plus the diagonal reference line.
plt.figure(figsize=(13, 12))
plt.suptitle('ROC Curves - Competing Classifications', fontweight='bold', fontsize=18, y=.92)

# (fpr, tpr, legend label) per classifier, drawn in this order
for curve_fpr, curve_tpr, curve_label in (
    (fpr_Logistic, tpr_Logistic, 'Logistic'),
    (fpr_RandomForest, tpr_RandomForest, 'RandomForest'),
):
    plt.plot(curve_fpr, curve_tpr, label=curve_label)

# Dashed diagonal from (0, 0) to (1, 1) as a visual baseline
plt.plot([0, 1], [0, 1], linestyle='--')

# Small margin around the unit square so curves touching the border stay visible
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])

plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.legend(loc="lower right", frameon=False, fontsize=12)
plt.show()