In [3]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.datasets import make_blobs


In [4]:
# Load the heart-disease table and give every column a descriptive name.
column_labels = {
    'age': 'Age',
    'sex': 'Sex',
    'cp': 'Chest_pain',
    'trestbps': 'Resting_blood_pressure',
    'chol': 'Cholesterol',
    'fbs': 'Fasting_blood_sugar',
    'restecg': 'Resting_ECG_results',
    'thalach': 'Max_heart_rate',
    'exang': 'Exercise_induced_angina',
    'oldpeak': 'ST_depression',
    'slope': 'ST_slope',
    'ca': 'Major_vessels',
    'thal': 'Thalassemia_types',
    'target': 'Heart_disease',
}
df = pd.read_csv('heart.csv').rename(columns=column_labels)

# Index labels of the records with wrong Major_vessels / Thalassemia_types
# values.
# NOTE(review): rows are removed by hard-coded index label, not by value.
# The 27-feature sample used in the prediction cell implies 9 indicator
# columns for Major_vessels + Thalassemia_types combined, so please confirm
# these labels are 0-based pandas index values that really match the
# invalid rows.
rows_to_drop = [49, 93, 159, 164, 165, 252, 282]
df = df.drop(index=rows_to_drop)
print(f'The length of the data now is {len(df)} instead of 303')


The length of the data now is 296 instead of 303


In [6]:
# One-hot encode the categorical columns.
# Each indicator column is prefixed with the name of the feature it came from.
# Without a prefix the columns are labelled only by the raw category value, so
# two features sharing a category code would yield duplicate, ambiguous labels
# in the merged frame. dtype=int keeps the indicators numeric regardless of the
# pandas default, so `.values` below is a plain numeric array.
categorical_cols = ['Chest_pain','Resting_ECG_results','ST_slope','Major_vessels','Thalassemia_types']
dummy1 = pd.get_dummies(df.Chest_pain, prefix='Chest_pain', dtype=int)
dummy2 = pd.get_dummies(df.Resting_ECG_results, prefix='Resting_ECG_results', dtype=int)
dummy3 = pd.get_dummies(df.ST_slope, prefix='ST_slope', dtype=int)
dummy4 = pd.get_dummies(df.Major_vessels, prefix='Major_vessels', dtype=int)
dummy5 = pd.get_dummies(df.Thalassemia_types, prefix='Thalassemia_types', dtype=int)

# Append the indicator columns, then drop the original categorical columns they
# replace. Column order (remaining original columns first, then the dummies in
# the order above) is unchanged from the previous implementation.
merge = pd.concat([df, dummy1, dummy2, dummy3, dummy4, dummy5], axis='columns')
fdf = merge.drop(categorical_cols, axis=1)

# Target vector and feature matrix.
y = fdf["Heart_disease"]
X = fdf.drop('Heart_disease', axis=1)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Fit the scaler on the training rows only, then apply the same fitted
# transform to the test rows so no test information leaks into the scaling.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train.values)
X_test = scaler.transform(X_test.values)

In [7]:
#Logistic Regression
Log = LogisticRegression()

# Candidate inverse-regularisation strengths to search over.
parameters_log = {'C': [0.01, 0.1, 0.2, 0.5, 2, 5, 10]}

# 5-fold cross-validated grid search. Accuracy is used as the selection metric
# because this is a classifier; for the 0/1 labels used here it ranks the
# candidates the same way the previous 'neg_mean_squared_error' did.
log_reg = GridSearchCV(Log, parameters_log, scoring='accuracy', cv=5)
log_reg.fit(X_train, y_train)
print(log_reg.best_params_)

# Use the estimator that GridSearchCV refit on the full training set with the
# best parameters, rather than hard-coding C and fitting a second model that
# silently goes stale whenever the data or the grid changes.
model = log_reg.best_estimator_
accuracy_test = model.score(X_test, y_test)
accuracy_train = model.score(X_train, y_train)

# Training-set report.
print('Logistic Regression Train Accuracy: {:.2f}%'.format((accuracy_train)*100))
pred_train = model.predict(X_train)
log_train_matrix = confusion_matrix(y_train, pred_train)
print("Confusion Matrix")
print(log_train_matrix)
print(classification_report(y_train, pred_train))

# Held-out test-set report.
print('Logistic Regression Test Accuracy: {:.2f}%'.format((accuracy_test)*100))
pred = model.predict(X_test)
# Class probabilities for the test rows; not used within this cell.
pred_prob = model.predict_proba(X_test)
log_matrix = confusion_matrix(y_test, pred)
print("Confusion Matrix")
print(log_matrix)
print(classification_report(y_test, pred))

{'C': 0.5}
Logistic Regression Train Accuracy: 85.59%
Confusion Matrix
[[ 90  17]
 [ 17 112]]
              precision    recall  f1-score   support

           0       0.84      0.84      0.84       107
           1       0.87      0.87      0.87       129

    accuracy                           0.86       236
   macro avg       0.85      0.85      0.85       236
weighted avg       0.86      0.86      0.86       236

Logistic Regression Test Accuracy: 86.67%
Confusion Matrix
[[24  4]
 [ 4 28]]
              precision    recall  f1-score   support

           0       0.86      0.86      0.86        28
           1       0.88      0.88      0.88        32

    accuracy                           0.87        60
   macro avg       0.87      0.87      0.87        60
weighted avg       0.87      0.87      0.87        60



In [12]:
#predict new
# One new sample in raw (unscaled) units, with the same 27-column layout as
# the training matrix.
# NOTE(review): every entry after the first eight is 0. If those positions are
# the one-hot indicator columns, no category is selected for any categorical
# feature -- confirm this is the intended input.
Xnew = [[53,1,150,200,0,160,0,2.1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]]

# The model was trained on MinMax-scaled features, so the new sample must go
# through the same fitted scaler before prediction; raw values such as 150 or
# 200 are far outside the range the coefficients were learned on.
Xnew_scaled = scaler.transform(Xnew)
ynew = model.predict(Xnew_scaled)
print("X=%s, Predicted=%s" % (Xnew[0], ynew[0]))

X=[53, 1, 150, 200, 0, 160, 0, 2.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], Predicted=0
