## Import libraries and data


In [None]:
import numpy as np
import pandas as pd
from plotly.offline import iplot, init_notebook_mode
import plotly.express as px 
from plotly.subplots import make_subplots
import plotly.graph_objects as go 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn import preprocessing 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

import pickle # to save and load the model

In [None]:
# Load the heart-disease CSV into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; adjust it when running elsewhere.
data = pd.read_csv(filepath_or_buffer=r"C:\Users\amris\Downloads\DPS LAB\heart.csv")

## Understand the data

In [None]:
# Preview the first five rows of the dataset.
data.head()

In [None]:
# Summarise the dataset: column names, dtypes and non-null counts.
data.info()

<font size="3">The dataset contains 918 rows and 12 columns. Seven columns are numeric and the remaining five are categorical; there are no missing values.</font>



In [None]:
# Descriptive statistics for the numeric columns.
data.describe()

In [None]:
# Descriptive statistics for the object-dtype (categorical) columns.
data.select_dtypes(include=['object']).describe()

In [None]:
# For every object-dtype column, show one table with the patient count per
# category and that category's share among the rows where HeartDisease == 1.
col = list(data.select_dtypes(include=['object']).columns)
for column_name in col:
    # Number of patients in each category of this column.
    patients = data[column_name].value_counts()
    # Normalised category frequencies per HeartDisease group; keep group 1 only.
    by_target = data.groupby(['HeartDisease'])[column_name].value_counts(normalize=True)
    share = by_target[1]
    summary = pd.DataFrame({"Patients": patients, "Percent": share * 100})
    summary = summary.sort_values("Percent", ascending=False)
    styled = summary.style.set_caption('Variable: {}'.format(column_name))
    display(styled.format({"Percent": "{:,.1f}%"}))

## Outliers

In [None]:
# Draw one horizontal boxplot per numeric column on a 2x3 grid
# (the sixth cell stays empty).
numeric_columns = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
fig = make_subplots(rows=2, cols=3, subplot_titles=numeric_columns)
for position, column_name in enumerate(numeric_columns):
    # Fill the grid row by row: positions 0-2 go to row 1, positions 3-4 to row 2.
    grid_row, grid_col = divmod(position, 3)
    fig.add_trace(
        go.Box(x=data[column_name], name='', showlegend=False),
        row=grid_row + 1,
        col=grid_col + 1,
    )

# Fix the figure size, then render it.
fig.update_layout(height=500, width=900)
fig.show()

In [None]:
# Rows whose Cholesterol or RestingBP fall outside the accepted ranges.
conC = (data["Cholesterol"] < 78) | (data["Cholesterol"] > 457)
conR = (data["RestingBP"] < 80) | (data["RestingBP"] > 192)
# Blank out the flagged readings. Series.mask builds a new column with NaN
# where the condition holds, instead of assigning NaN in place into columns
# that may be integer-typed (in-place upcasting is deprecated in pandas).
data['Cholesterol'] = data['Cholesterol'].mask(conC)
data['RestingBP'] = data['RestingBP'].mask(conR)
# Fill each blanked value with the mean of its HeartDisease group.
# NOTE(review): the imputation uses the target label and runs on the full
# dataset before the train/test split, so it can leak target information
# into the features - confirm this is acceptable.
data['Cholesterol'] = data['Cholesterol'].fillna(data.groupby('HeartDisease')['Cholesterol'].transform('mean'))
data['RestingBP'] = data['RestingBP'].fillna(data.groupby('HeartDisease')['RestingBP'].transform('mean'))

In [None]:
# Re-draw the boxplots of the two cleaned columns side by side.
cleaned_columns = ['RestingBP', 'Cholesterol']
fig = make_subplots(rows=1, cols=2, subplot_titles=cleaned_columns)
for grid_col, column_name in enumerate(cleaned_columns, start=1):
    fig.add_trace(
        go.Box(x=data[column_name], name='', showlegend=False),
        row=1,
        col=grid_col,
    )
# Fix the figure size, then render it.
fig.update_layout(height=300, width=700)
fig.show()

# 🔎 3. Data Exploration

In [None]:
# Work on a copy so the relabelling below does not touch `data`.
eda = data.copy()
# Replace coded values with readable labels for the charts.
eda['Sex'] = np.where(eda['Sex'] == 'F', 'Female', 'Male')
eda['HeartDisease'] = np.where(eda['HeartDisease'] == 0, 'Normal', 'Heart Disease')
eda['ExerciseAngina'] = np.where(eda['ExerciseAngina'] == 'N', 'No', 'Yes')
# Assign the result back rather than calling replace(inplace=True) on the
# selected column: the chained in-place form is deprecated in pandas and
# leaves `eda` unchanged when Copy-on-Write is enabled.
eda['ChestPainType'] = eda['ChestPainType'].replace({
    'TA': 'Typical Angina',
    'ATA': 'Atypical Angina',
    'NAP': 'Non-Anginal Pain',
    'ASY': 'Asymptomatic',
})



In [None]:
# Grouped bar chart: patient counts per heart-disease status, split by sex.
# Custom two-colour palette used for the Sex groups.
my_scale = ['rgb(70,81,242)', 'rgb(255,105,180)']
fig = px.histogram(eda, x="HeartDisease",
                   color='Sex', barmode='group',
                   color_discrete_sequence=my_scale, opacity=0.9, text_auto=True,
                   height=450, width=450)
# Centre the title and hide the axis titles.
fig.update_layout(title_text='Heart failure by Sex', title_font_size=16, title_x=0.5,
                  font_family='Bahnschrift SemiBold',
                  yaxis_title=None, xaxis_title=None)
# Put the count labels outside the bars and outline each bar in black.
fig.update_traces(textfont_size=14, textangle=0, textposition="outside", cliponaxis=False,
                  marker_line_width=1, marker_line_color="black")

fig.show()

In [None]:
# Grouped bar chart: patient counts per heart-disease status, split by
# exercise-induced angina.
fig = px.histogram(eda, x="HeartDisease",
                   color='ExerciseAngina', barmode='group',
                   color_discrete_sequence=px.colors.qualitative.T10, opacity=0.9, text_auto=True,
                   height=450, width=450)
# Centre the title and hide the axis titles.
fig.update_layout(title_text='Heart failure by Exercise Angina', title_font_size=16, title_x=0.5,
                  font_family='Bahnschrift SemiBold',
                  yaxis_title=None, xaxis_title=None)
# Put the count labels outside the bars and outline each bar in black.
fig.update_traces(textfont_size=14, textangle=0, textposition="outside", cliponaxis=False,
                  marker_line_width=1, marker_line_color="black")

fig.show()

In [None]:
# Grouped bar chart: patient counts per heart-disease status, split by
# chest pain type.
fig = px.histogram(eda, x="HeartDisease",
                   color='ChestPainType', barmode='group',
                   color_discrete_sequence=px.colors.qualitative.T10, opacity=0.9, text_auto=True,
                   height=450, width=450)
# Centre the title and hide the axis titles.
fig.update_layout(title_text='Heart failure by Chest pain type', title_font_size=16, title_x=0.5,
                  font_family='Bahnschrift SemiBold',
                  yaxis_title=None, xaxis_title=None)
# Put the count labels outside the bars and outline each bar in black.
fig.update_traces(textfont_size=14, textangle=0, textposition="outside", cliponaxis=False,
                  marker_line_width=1, marker_line_color="black")

fig.show()

In [None]:
# Grouped bar chart: patient counts per heart-disease status, split by
# resting ECG result.
fig = px.histogram(eda, x="HeartDisease",
                   color='RestingECG', barmode='group',
                   color_discrete_sequence=px.colors.qualitative.T10, opacity=0.9, text_auto=True,
                   height=450, width=450)
# Centre the title and hide the axis titles.
fig.update_layout(title_text='Heart failure by Resting ECG', title_font_size=16, title_x=0.5,
                  font_family='Bahnschrift SemiBold',
                  yaxis_title=None, xaxis_title=None)
# Put the count labels outside the bars and outline each bar in black.
fig.update_traces(textfont_size=14, textangle=0, textposition="outside", cliponaxis=False,
                  marker_line_width=1, marker_line_color="black")

fig.show()

In [None]:
# Histogram of Age (40 bins), coloured by heart-disease status.
fig = px.histogram(eda, x="Age",
                   color='HeartDisease', color_discrete_sequence=px.colors.qualitative.T10,
                   nbins=40, opacity=0.8, height=500, width=700)
# Centre the title and hide the axis titles.
fig.update_layout(title_text='Heart failure by Age', title_font_size=16, title_x=0.5,
                  font_family='Bahnschrift SemiBold',
                  yaxis_title=None, xaxis_title=None)
# Text styling plus a thin black outline around each bar.
fig.update_traces(textfont_size=14, textangle=0, textposition="outside", cliponaxis=False,
                  marker_line_width=1, marker_line_color="black")

fig.show()

In [None]:
# Histogram of Cholesterol (40 bins), coloured by heart-disease status.
fig = px.histogram(eda, x="Cholesterol",
                   color='HeartDisease', color_discrete_sequence=px.colors.qualitative.T10,
                   nbins=40, opacity=0.8, height=500, width=700)
# Centre the title and hide the axis titles.
fig.update_layout(title_text='Heart failure by Cholesterol', title_font_size=16, title_x=0.5,
                  font_family='Bahnschrift SemiBold',
                  yaxis_title=None, xaxis_title=None)
# Text styling plus a thin black outline around each bar.
fig.update_traces(textfont_size=14, textangle=0, textposition="outside", cliponaxis=False,
                  marker_line_width=1, marker_line_color="black")

fig.show()

In [None]:
# Histogram of RestingBP (60 bins), coloured by heart-disease status.
fig = px.histogram(eda, x="RestingBP",
                   color='HeartDisease', color_discrete_sequence=px.colors.qualitative.T10,
                   nbins=60, opacity=0.8, height=500, width=700)
# Centre the title and hide the axis titles.
fig.update_layout(title_text='Heart failure by RestingBP', title_font_size=16, title_x=0.5,
                  font_family='Bahnschrift SemiBold',
                  yaxis_title=None, xaxis_title=None)
# Text styling plus a thin black outline around each bar.
fig.update_traces(textfont_size=14, textangle=0, textposition="outside", cliponaxis=False,
                  marker_line_width=1, marker_line_color="black")

fig.show()

# 4. Model building

## 4.1 Prepare Data

In [None]:
# Encode the two two-valued text columns as 0/1: the listed value becomes 1,
# everything else becomes 0.
for binary_column, positive_value in (('Sex', 'M'), ('ExerciseAngina', 'Y')):
    data[binary_column] = np.where(data[binary_column] == positive_value, 1, 0)

# Target vector: the HeartDisease column.
y = data['HeartDisease']
# Feature matrix: every other column, with the remaining categorical columns
# expanded into indicator columns by get_dummies.
X = pd.get_dummies(data.drop(columns='HeartDisease'))

## 4.2 Create model

In [None]:
# Hold out 25% of the rows for testing, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Search space: 15 log-spaced values of C from 1e-5 to 1e8, for both the
# 'l1' and the 'l2' penalty.
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space, 'penalty': ['l1', 'l2']}

# Base estimator that the grid search clones for every parameter combination.
logreg = LogisticRegression(solver='liblinear', max_iter=10000)

# 5-fold cross-validated grid search, fitted on the training split only.
logreg_cv = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)
logreg_cv.fit(X_train, y_train)

print(f"Tuned Logistic Regression Parameter: {logreg_cv.best_params_}")
print(f"Tuned Logistic Regression Accuracy: {logreg_cv.best_score_}")



In [None]:
# Build the final model from the hyperparameters the grid search selected,
# using the same solver settings they were tuned with, rather than
# hand-copied constants that go stale whenever the data or grid changes.
logreg2 = LogisticRegression(solver='liblinear', max_iter=10000, **logreg_cv.best_params_)
logreg2.fit(X_train, y_train)
# Report the model's score on the held-out test split.
print("score", logreg2.score(X_test, y_test))

In [None]:
# Predict a label for every row of the test split, so that y_pred has the
# same length as y_test and can be passed to the metrics that compare them.
y_pred = logreg2.predict(X_test)

In [None]:
# Serialise the fitted model to disk. The context manager closes the file
# handle even if pickling fails.
filename = 'model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(logreg2, model_file)

In [None]:
# Display the predicted labels stored in y_pred.
y_pred

In [None]:
# Display the test-split feature matrix.
X_test

# 📈 5. Results

## 5.1 Confusion Matrix

In [None]:
# Cross-tabulate the true test labels against the predicted labels.
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf_matrix

In [None]:
# Plot the confusion matrix as an annotated heatmap.
class_names = [0, 1]  # class labels shown on both axes
fig, ax = plt.subplots()
# Label the rows and columns on the frame itself so the heatmap takes its
# tick labels from class_names; ticks set before sns.heatmap would be
# overwritten by it.
cnf_frame = pd.DataFrame(cnf_matrix, index=class_names, columns=class_names)
sns.heatmap(cnf_frame, annot=True, cmap="YlGnBu", fmt='g', ax=ax)
# Put the x-axis label above the matrix.
ax.xaxis.set_label_position("top")
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# Run tight_layout last so the title and axis labels are part of the fit.
plt.tight_layout()

In [None]:
# Print accuracy, precision and recall of the predictions against the test labels.
for metric_label, metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

## 5.2 ROC Curve

In [None]:
# Predicted probability for the class in column 1 of predict_proba.
y_pred_proba = logreg2.predict_proba(X_test)[:, 1]
# ROC points (thresholds discarded) and the area under the curve.
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

# Dashed black diagonal as a reference line, then the model's ROC curve.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label=f"data 1, auc={auc!s}")
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)
plt.show()