# Logistic Regression on Heart Disease Dataset

### Anshu Kumar Agrawal
### 207108 CSE A

In [25]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

In [26]:
# Load the heart-disease CSV from the current working directory into a
# DataFrame and preview its first rows.
data = pd.read_csv("./heart.csv")
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [27]:
# Per-column summary statistics (count, mean, std, min, quartiles, max);
# used here to check value ranges and whether any column has missing rows.
data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


The count row shows 1025 values for every column, so there are no missing values.
Next, we normalize each column into the range [0, 1] using min-max normalization.

In [28]:
def min_max(column):
    """Rescale *column* linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        column: A one-dimensional pandas Series or NumPy array of numbers.

    Returns:
        A new object of the same kind holding ``(column - min) / (max - min)``.
        When every value is identical the range is zero; in that case all
        entries are returned as 0 instead of NaN from a 0/0 division.
    """
    # Vectorised reductions instead of the builtin min()/max(), which walk
    # the values one by one in Python.
    mn, mx = column.min(), column.max()
    span = mx - mn
    if span == 0:
        # Constant column: dividing by 1 keeps the result float-typed and
        # leaves every entry at 0.
        span = 1
    return (column - mn) / span

In [29]:
# Rescale every column of `data` in place with min_max. The loop runs over
# all columns, so the 'target' label column is rescaled as well.
# NOTE(review): min/max are computed over the full dataset, before the
# train/test split further down, so test rows influence the scaling —
# confirm this is acceptable for the reported metrics.
for x in data.columns:
    data[x] = min_max(data[x])
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,0.479167,1.0,0.0,0.292453,0.196347,0.0,0.5,0.740458,0.0,0.16129,1.0,0.5,1.0,0.0
1,0.5,1.0,0.0,0.433962,0.175799,1.0,0.0,0.641221,1.0,0.5,0.0,0.0,1.0,0.0
2,0.854167,1.0,0.0,0.481132,0.109589,0.0,0.5,0.412214,1.0,0.419355,0.0,0.0,1.0,0.0
3,0.666667,1.0,0.0,0.509434,0.175799,0.0,0.5,0.687023,0.0,0.0,1.0,0.25,1.0,0.0
4,0.6875,0.0,0.0,0.415094,0.383562,1.0,0.5,0.267176,0.0,0.306452,0.5,0.75,0.666667,0.0


In [30]:
def test_train_split(x, y, percent=0.75):
    """Randomly partition rows of *x* and *y* into train and test subsets.

    Each row independently draws a uniform number in [0, 1) and goes to the
    training subset when the draw is below *percent*, so the train share is
    *percent* only in expectation, not exactly.

    Args:
        x: Feature table exposing ``to_numpy()`` (e.g. a pandas DataFrame).
        y: Labels exposing ``to_numpy()`` (e.g. a pandas Series), one per
            row of *x*.
        percent: Probability that a row is assigned to the training subset.

    Returns:
        A tuple ``(xtest, ytest, xtrain, ytrain)`` of NumPy arrays. Note the
        order: the test arrays come first. Row order within each subset
        follows the input, and the arrays keep the dtype of the inputs.
    """
    x = x.to_numpy()
    y = y.to_numpy()
    # One draw per row, then a boolean mask selects each subset in a single
    # pass. Growing arrays with np.append copies them on every call, and
    # reshaping via len(x[0]) fails on empty input.
    in_train = np.random.uniform(size=len(x)) < percent
    in_test = ~in_train
    return x[in_test], y[in_test], x[in_train], y[in_train]


# The name starts with "test_", so pytest would try to collect this helper as
# a test case if it is ever imported into a test module; opt out explicitly.
test_train_split.__test__ = False

In [31]:
class LogisticRegression:
    """Binary logistic regression fitted with batch gradient descent.

    Attributes are populated by ``train``:
        ncols: Number of columns of the design matrix (features + intercept).
        params: Weight vector of length ``ncols``; index 0 is the intercept.
    """

    def __init__(self):
        # None marks a model that has not been trained yet.
        self.ncols = None
        self.params = None

    @staticmethod
    def _sigmoid(z):
        """Return 1 / (1 + exp(-z)) element-wise without overflowing."""
        # exp() only ever receives non-positive arguments here, so large
        # negative logits cannot overflow and trigger a RuntimeWarning.
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))

    def train(self, x, y, learning_rate=0.01, max_iter=10000):
        """Fit the weights, starting from zeros, by gradient descent.

        Args:
            x: Feature matrix with one row per sample (array-like or
                DataFrame); the intercept column is added internally.
            y: Target values, one per row of *x*.
            learning_rate: Step size applied to the averaged gradient.
            max_iter: Number of gradient steps to run.

        Raises:
            ValueError: If *x* is empty or *y* does not hold exactly one
                value per row of *x*.
        """
        design = np.c_[np.ones(len(x)), np.asarray(x, dtype=float)]
        y = np.asarray(y, dtype=float)
        n_rows = len(design)
        if n_rows == 0:
            raise ValueError("cannot train on an empty dataset")
        if y.shape != (n_rows,):
            # Otherwise the residual below would silently broadcast.
            raise ValueError(
                f"y has shape {y.shape}, expected ({n_rows},) to match x"
            )
        self.ncols = design.shape[1]
        self.params = np.zeros(self.ncols)
        step = learning_rate / n_rows
        for _ in range(max_iter):
            residual = self._sigmoid(design @ self.params) - y
            self.params = self.params - step * (design.T @ residual)

    def predict(self, x):
        """Return the predicted probability of the positive class per row.

        Args:
            x: Two-dimensional feature matrix (array-like or DataFrame). If it
                already has ``ncols`` columns it is taken to include the
                leading intercept column; otherwise that column is prepended.

        Returns:
            One-dimensional NumPy array of values in [0, 1].

        Raises:
            AttributeError: If called before ``train``.
            ValueError: If *x* is not two-dimensional or its column count
                does not match the fitted weights.
        """
        if getattr(self, "params", None) is None:
            raise AttributeError("model is not trained; call train() first")
        x = np.asarray(x, dtype=float)
        if x.ndim != 2:
            raise ValueError(f"x must be 2-dimensional, got {x.ndim} dimension(s)")
        if x.shape[1] != self.ncols:
            x = np.c_[np.ones(len(x)), x]
        return self._sigmoid(x @ self.params)

In [32]:
# Features are every column except 'target'. Note the unpacking order: the
# helper returns the test arrays first, then the train arrays.
x_test, y_test, x_train, y_train = test_train_split(data.loc[:,data.columns != 'target'], data['target'])

In [33]:
from sklearn.metrics import accuracy_score, classification_report

# Fit on the training split, then round the predicted probabilities to
# 0/1 labels for the held-out rows.
lr = LogisticRegression()
lr.train(x_train, y_train)
y_pred = np.round(lr.predict(x_test))

# sklearn metrics take (y_true, y_pred) in that order; accuracy is symmetric
# in its arguments, so the printed value is unchanged.
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.93      0.81      0.87       134
         1.0       0.82      0.93      0.87       119

    accuracy                           0.87       253
   macro avg       0.87      0.87      0.87       253
weighted avg       0.88      0.87      0.87       253

0.8695652173913043
