<a href="https://colab.research.google.com/github/As2909/Projects/blob/main/ML_Heart_Disease_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection and Preprocessing

In [None]:
# Read the heart-disease CSV into a DataFrame and preview its first rows
DATA_PATH = "/content/heart_disease_data.csv"
heart_df = pd.read_csv(DATA_PATH)
heart_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
heart_df.shape

(303, 14)

In [None]:
# Remove exact duplicate rows. The result is reassigned instead of using
# inplace=True so the frame is not mutated behind the reader's back; the
# original index labels are kept, so the index may contain gaps afterwards.
heart_df = heart_df.drop_duplicates()

In [None]:
# Data type of each column
heart_df.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [None]:
# Number of missing values in each column
heart_df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
heart_df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0
mean,54.42053,0.682119,0.963576,131.602649,246.5,0.149007,0.52649,149.569536,0.327815,1.043046,1.397351,0.718543,2.31457,0.543046
std,9.04797,0.466426,1.032044,17.563394,51.753489,0.356686,0.526027,22.903527,0.470196,1.161452,0.616274,1.006748,0.613026,0.49897
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,133.25,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.5,1.0,1.0,130.0,240.5,0.0,1.0,152.5,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.75,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [None]:
# Class balance: number of rows for each value of the target column
heart_df["target"].value_counts()

target
1    164
0    138
Name: count, dtype: int64

1 --> Defective Heart\
0 --> Healthy Heart

Splitting the data into Features and target

In [None]:
# Separate the label column (Y) from the predictor columns (X)
Y = heart_df["target"]
X = heart_df.drop(columns=["target"])

In [None]:
# Plain-text dump of the feature frame (every column except "target")
print(X)

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     63    1   3       145   233    1        0      150      0      2.3   
1     37    1   2       130   250    0        1      187      0      3.5   
2     41    0   1       130   204    0        0      172      0      1.4   
3     56    1   1       120   236    0        1      178      0      0.8   
4     57    0   0       120   354    0        1      163      1      0.6   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
298   57    0   0       140   241    0        1      123      1      0.2   
299   45    1   3       110   264    0        1      132      0      1.2   
300   68    1   0       144   193    1        1      141      0      3.4   
301   57    1   0       130   131    0        1      115      1      1.2   
302   57    0   1       130   236    0        0      174      0      0.0   

     slope  ca  thal  
0        0   0     1  
1        0   0     2  
2        2   0    

In [None]:
# Plain-text dump of the target labels
print(Y)

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 302, dtype: int64


In [None]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class
# ratio the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=5,
)

In [None]:
# Shapes of the full, training and test feature frames, in that order
print(*(frame.shape for frame in (X, X_train, X_test)))

(302, 13) (241, 13) (61, 13)


Model Training

Logistic Regression

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so the test data never influences them.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a logistic regression classifier (default settings) on the scaled features
model = LogisticRegression()
model.fit(X_train_scaled, Y_train)


In [None]:


# Model Evaluation
# Accuracy on both splits. accuracy_score is documented as
# accuracy_score(y_true, y_pred), so the true labels are passed first; accuracy
# itself is symmetric, but keeping the documented order avoids wrong results
# if this pattern is reused with asymmetric metrics such as precision or recall.
X_train_scaled_prediction = model.predict(X_train_scaled)
training_data_accuracy = accuracy_score(Y_train, X_train_scaled_prediction)
print("Accuracy on Training data: ", training_data_accuracy)

X_test_scaled_prediction = model.predict(X_test_scaled)
test_data_accuracy = accuracy_score(Y_test, X_test_scaled_prediction)
print("Accuracy on Test data: ", test_data_accuracy)


Accuracy on Training data:  0.8423236514522822
Accuracy on Test data:  0.7868852459016393


Building a Predictive System

In [19]:
# Feature values for a single patient; expected to follow the column order of
# the training features. Named input_data so the built-in input() is not shadowed.
input_data = (57,1,0,165,289,1,0,124,0,1,1,3,3)

# Wrap the values in a one-row DataFrame that carries the training column
# names. The scaler was fitted on a DataFrame, so a bare NumPy array would
# trigger scikit-learn's "X does not have valid feature names" warning; this
# also raises immediately if the number of values does not match the columns.
input_df = pd.DataFrame([input_data], columns=X_train.columns)

# Standardise with the statistics learned from the training split
input_std = scaler.transform(input_df)

# Make prediction
prediction = model.predict(input_std)
print(prediction)

if prediction[0] == 0:
  print("The person does not have a Heart Disease")
else:
  print("The person has Heart Disease")

[0]
The person does not have a Heart Disease


