In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
# Blanket filter: hides every warning category for the rest of the session.
# NOTE(review): this would also hide any solver-convergence or feature-name
# warnings raised by the models fitted below — confirm nothing is being masked.
warnings.filterwarnings("ignore")

In [2]:
# Location of the heart-disease CSV, held in one named constant so the path
# only has to be changed in a single place on another machine.
DATA_PATH = r"D:\2.DS heart disease\heart_disease_data.csv"

df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(303, 14)

In [4]:
# Number of missing values in each column.
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [5]:
# Class balance of the target column: number of rows per label.
df['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

In [6]:
# Same per-class row counts as value_counts(), here indexed by target label.
df.groupby('target')['target'].agg('count')

target
0    138
1    165
Name: target, dtype: int64

In [7]:
# Separate the predictor columns (x) from the label column (y).
x = df.drop(columns=['target'])
y = df['target']

## Note: Data cleaning not needed this time

## Train/test split (TTS)

In [8]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,stratify=y,random_state=101)
## 80/20 split with a fixed seed; stratify=y keeps the proportion of 0s and 1s in
## y_train and y_test the same as in the full target column

## MODEL BUILDING

Two methods are used: scikit-learn's `LogisticRegression` and statsmodels' `Logit`.

In [9]:
# scikit-learn logistic regression with all default hyperparameters.
model = LogisticRegression()

In [10]:
# Fit on the training split only; the test split is used later for evaluation.
model.fit(x_train,y_train)

## USING STATSMODELS AND LOGIT

In [11]:
# Re-creates the same target/feature split as the earlier cell (x and y are not
# modified in between); the statsmodels model below is fitted on the full data.
y = df['target']
x = df.drop(['target'],axis=1)
import statsmodels.api as sm

In [12]:
# Logit is fitted on every row of x (no train/test split here).
# NOTE(review): statsmodels does not add an intercept automatically and x is
# passed as-is, so this model has no constant term — confirm that is intended
# (sm.add_constant(x) would add one).
reg_log = sm.Logit(y,x)
res = reg_log.fit()

Optimization terminated successfully.
         Current function value: 0.351932
         Iterations 7


In [13]:
# Print floats with two decimals in every NumPy array shown from here on.
np.set_printoptions(formatter={'float': "{0:0.2f}".format})

# In-sample probability of heart disease for each row the model was fitted on.
res.predict()

array([0.81, 0.67, 0.95, 0.93, 0.84, 0.75, 0.82, 0.85, 0.82, 0.94, 0.65,
       0.98, 0.89, 0.73, 0.98, 0.97, 0.99, 0.81, 0.62, 0.95, 0.43, 0.92,
       0.84, 0.52, 0.87, 0.85, 0.88, 0.89, 0.90, 0.64, 0.98, 0.55, 0.93,
       0.50, 0.51, 0.78, 0.99, 0.68, 0.97, 0.94, 0.84, 0.82, 0.10, 0.73,
       0.93, 0.91, 0.95, 0.89, 0.99, 0.91, 0.97, 0.45, 0.13, 0.99, 0.99,
       0.74, 0.85, 0.84, 0.98, 0.82, 0.95, 0.79, 0.99, 0.81, 0.94, 0.95,
       0.71, 0.94, 0.93, 0.97, 0.63, 0.72, 0.94, 0.62, 0.99, 0.87, 0.71,
       0.81, 0.96, 0.53, 0.98, 0.84, 0.98, 0.86, 0.63, 0.76, 0.71, 0.85,
       0.97, 0.65, 0.88, 0.50, 0.54, 0.80, 0.95, 0.07, 0.63, 0.13, 0.73,
       0.64, 0.83, 0.11, 0.95, 0.84, 0.96, 0.86, 0.68, 0.59, 0.97, 0.93,
       0.73, 0.87, 0.93, 0.70, 0.88, 0.99, 0.75, 0.80, 0.99, 0.62, 0.19,
       0.80, 0.97, 0.99, 1.00, 0.99, 0.75, 0.98, 0.98, 0.65, 0.98, 0.96,
       0.89, 0.90, 0.97, 0.95, 0.96, 0.78, 0.35, 0.03, 0.98, 0.67, 0.99,
       0.79, 0.95, 0.71, 0.95, 0.99, 0.97, 0.94, 0.

In [14]:
res.pred_table()
# Confusion matrix for the full dataset the model was fitted on
# (rows = actual class, columns = predicted class):
# [ TN FP ]
# [ FN TP ]

array([[106.00, 32.00],
       [13.00, 152.00]])

In [15]:
def confusion_matrix(data, actual, model, threshold=0.5):
    """Build a 2x2 confusion matrix and accuracy from predicted probabilities.

    Parameters
    ----------
    data : array-like
        Inputs handed unchanged to ``model.predict``.
    actual : array-like of 0/1
        True class label for each row of ``data``.
    model : object
        Anything whose ``predict(data)`` returns one probability in [0, 1]
        per row.
    threshold : float, default 0.5
        Probability cut-off; a prediction ``>= threshold`` counts as class 1.
        The default reproduces the previously hard-coded cut-off.

    Returns
    -------
    cm : numpy.ndarray of shape (2, 2)
        ``[[TN, FP], [FN, TP]]`` (rows = actual, columns = predicted).
    accuracy : float
        ``(TN + TP) / total``; NaN when there are no observations.

    Raises
    ------
    ValueError
        If ``threshold`` is not in the interval (0, 1].
    """
    if not 0 < threshold <= 1:
        raise ValueError("threshold must be in the interval (0, 1]")

    pred = np.asarray(model.predict(data), dtype=float)
    labels = np.asarray(actual, dtype=float)

    # Labels are always split at 0.5 (0 vs 1); only the probability axis uses
    # the caller's threshold. The last histogram bin is closed on both sides,
    # so a probability of exactly 1.0 is still counted.
    label_edges = np.array([0, 0.5, 1])
    prob_edges = np.array([0, threshold, 1])
    cm = np.histogram2d(labels, pred, bins=[label_edges, prob_edges])[0]

    total = cm.sum()
    accuracy = (cm[0, 0] + cm[1, 1]) / total if total else float("nan")
    return cm, accuracy

# NOTE: despite its name, cm holds the (matrix, accuracy) tuple returned by the helper.
cm = confusion_matrix(x,y,res) 
cm

(array([[106.00, 32.00],
        [13.00, 152.00]]),
 0.8514851485148515)

## EVALUATION

of sklearn logistic regression

In [16]:
# NOTE: this import rebinds the name confusion_matrix, so from here on it refers
# to scikit-learn's function and no longer to the hand-written helper defined
# in the previous cell.
from sklearn.metrics import confusion_matrix

In [17]:
## accuracy on TRAINING data
x_train_pred = model.predict(x_train)
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
train_acc = accuracy_score(y_train, x_train_pred)
train_acc

0.871900826446281

In [18]:
## accuracy on TESTING data
predicted_y = model.predict(x_test)
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
test_acc = accuracy_score(y_test, predicted_y)
test_acc

0.819672131147541

In [19]:
confusion_matrix(y_test,predicted_y)
# Confusion matrix on the held-out test split
# (rows = actual class, columns = predicted class):
# [ TN FP ]
# [ FN TP ]

array([[21,  7],
       [ 4, 29]], dtype=int64)

## CROSSVALIDATION

Cross-validation is run only for the scikit-learn model; the statsmodels `Logit` fit is not cross-validated here.

In [20]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Five random 80/20 splits of the full dataset, seeded for repeatability.
# NOTE(review): ShuffleSplit does not stratify by class, unlike the stratified
# train_test_split used earlier — confirm that is acceptable here.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=10)

# A fresh, unfitted LogisticRegression is scored on each split; no scoring
# argument is passed, so the estimator's default score is reported.
cross_val_score(LogisticRegression(), x, y, cv=cv)

array([0.77, 0.85, 0.87, 0.87, 0.87])

## PREDICTIVE SYSTEM BUILDING:
1. testing on logistic regression

In [21]:
# Hand-written sample records with 14 values each — presumably the 13 feature
# values followed by the known target label (TODO confirm).
# NOTE: b and c are identical, and none of a–f is referenced by name in the
# cells below; the prediction cells re-type their inputs.
a = (41,1,1,120,157,0,1,182,0,0,2,0,2,1)
b =  (38,1,2,138,175,0,1,173,0,0,2,4,2,1)
c =  (38,1,2,138,175,0,1,173,0,0,2,4,2,1)
d =  (67,1,0,160,286,0,0,108,1,1.5,1,3,2,0)
e =  (67,1,0,120,229,0,0,129,1,2.6,1,2,3,0)
f =  (62,0,0,140,268,0,0,160,0,3.6,0,2,2,0)

In [22]:
ip = (38,1,2,138,175,0,1,173,0,0,2,4,2)

# change to np array
ip_data = np.asarray(ip)

# Build a one-row frame (a single instance) labelled with the training columns,
# so the input carries the same feature names the model was fitted with rather
# than relying on position alone. A wrong number of values raises immediately.
final = pd.DataFrame(ip_data.reshape(1,-1), columns=x_train.columns)
print(model.predict(final)[0])

1


2. testing on logit

In [23]:
ip = (62,0,0,140,268,0,0,160,0,3.6,0,2,2)

ip_data = np.asarray(ip)

# Predict for one instance (explicit 1 x n_features input) and classify with the
# same 0.5 cut-off the confusion matrices use: probability >= 0.5 -> class 1.
# round() would send a probability of exactly 0.5 to class 0 instead.
int(res.predict(ip_data.reshape(1,-1))[0] >= 0.5)

0