In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


**Dependencies**

In [33]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# **Data Collection and Processing**

In [6]:
heart_data = pd.read_csv("/content/drive/MyDrive/NIELIT/Heart Disease Prediction/Datasets/heart_disease.csv")
print(heart_data)

      Gender  age  currentSmoker  cigsPerDay  BPMeds  prevalentStroke  \
0          1   39              0         0.0     0.0                0   
1          0   46              0         0.0     0.0                0   
2          1   48              1        20.0     0.0                0   
3          0   61              1        30.0     0.0                0   
4          0   46              1        23.0     0.0                0   
...      ...  ...            ...         ...     ...              ...   
4233       1   50              1         1.0     0.0                0   
4234       1   51              1        43.0     0.0                0   
4235       0   48              1        20.0     NaN                0   
4236       0   44              1        15.0     0.0                0   
4237       0   52              0         0.0     0.0                0   

      prevalentHyp  diabetes  totChol  sysBP  diaBP    BMI  heartRate  \
0                0         0    195.0  106.0   70.

In [7]:
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Gender           4238 non-null   int64  
 1   age              4238 non-null   int64  
 2   currentSmoker    4238 non-null   int64  
 3   cigsPerDay       4209 non-null   float64
 4   BPMeds           4185 non-null   float64
 5   prevalentStroke  4238 non-null   int64  
 6   prevalentHyp     4238 non-null   int64  
 7   diabetes         4238 non-null   int64  
 8   totChol          4188 non-null   float64
 9   sysBP            4238 non-null   float64
 10  diaBP            4238 non-null   float64
 11  BMI              4219 non-null   float64
 12  heartRate        4237 non-null   float64
 13  glucose          3850 non-null   float64
 14  Target           4238 non-null   int64  
dtypes: float64(8), int64(7)
memory usage: 496.8 KB


In [8]:
heart_data.isnull().sum()

Gender               0
age                  0
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
Target               0
dtype: int64

Dealing With the missing values

In [9]:
heart_data.fillna(heart_data.mean(), inplace=True)

In [10]:
heart_data.isnull().sum()

Gender             0
age                0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
Target             0
dtype: int64

In [11]:
heart_data.head()

Unnamed: 0,Gender,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,Target
0,1,39,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [12]:
heart_data.tail()

Unnamed: 0,Gender,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,Target
4233,1,50,1,1.0,0.0,0,1,0,313.0,179.0,92.0,25.97,66.0,86.0,1
4234,1,51,1,43.0,0.0,0,0,0,207.0,126.5,80.0,19.71,65.0,68.0,0
4235,0,48,1,20.0,0.02963,0,0,0,248.0,131.0,72.0,22.0,84.0,86.0,0
4236,0,44,1,15.0,0.0,0,0,0,210.0,126.5,87.0,19.16,86.0,81.966753,0
4237,0,52,0,0.0,0.0,0,0,0,269.0,133.5,83.0,21.47,80.0,107.0,0


In [13]:
heart_data.describe()

Unnamed: 0,Gender,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,Target
count,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0
mean,0.429212,49.584946,0.494101,9.003089,0.02963,0.005899,0.310524,0.02572,236.721585,132.352407,82.893464,25.802008,75.878924,81.966753,0.151958
std,0.495022,8.57216,0.500024,11.87923,0.16852,0.076587,0.462763,0.158316,44.326453,22.038097,11.91085,4.070953,12.025177,22.836603,0.359023
min,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,107.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,0.0,0.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.08,68.0,72.0,0.0
50%,0.0,49.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.41,75.0,80.0,0.0
75%,1.0,56.0,1.0,20.0,0.0,0.0,1.0,0.0,262.0,144.0,89.875,28.0375,83.0,85.0,0.0
max,1.0,70.0,1.0,70.0,1.0,1.0,1.0,1.0,696.0,295.0,142.5,56.8,143.0,394.0,1.0


Checking the distribution of the target variable

In [15]:
heart_data['Target'].value_counts()

0    3594
1     644
Name: Target, dtype: int64

1 = Defective heart
2 = Not defevctive heart

# **Spleating the features and the target**

In [18]:
X = heart_data.drop(columns='Target', axis=1)
Y = heart_data['Target']

In [20]:
print(X)

      Gender  age  currentSmoker  cigsPerDay   BPMeds  prevalentStroke  \
0          1   39              0         0.0  0.00000                0   
1          0   46              0         0.0  0.00000                0   
2          1   48              1        20.0  0.00000                0   
3          0   61              1        30.0  0.00000                0   
4          0   46              1        23.0  0.00000                0   
...      ...  ...            ...         ...      ...              ...   
4233       1   50              1         1.0  0.00000                0   
4234       1   51              1        43.0  0.00000                0   
4235       0   48              1        20.0  0.02963                0   
4236       0   44              1        15.0  0.00000                0   
4237       0   52              0         0.0  0.00000                0   

      prevalentHyp  diabetes  totChol  sysBP  diaBP    BMI  heartRate  \
0                0         0    195.0 

In [21]:
print(Y)

0       0
1       0
2       0
3       1
4       0
       ..
4233    1
4234    0
4235    0
4236    0
4237    0
Name: Target, Length: 4238, dtype: int64


**Splitting the data into training and testing data**

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)

In [25]:
print(X.shape, X_train.shape, X_test.shape)

(4238, 14) (3390, 14) (848, 14)


# **Model Training**
**Logestic regression**

In [27]:
model = LogisticRegression()

In [28]:
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# **Model Evaluation**


Accuracy scores on training data

In [29]:
X_train_pre = model.predict(X_train)
training_data_acc = accuracy_score(X_train_pre, Y_train)
print('Accuracy on training data : ', training_data_acc)

Accuracy on training data :  0.852212389380531


Accuracy scores on test prediction

In [31]:
X_test_pre = model.predict(X_test)
test_data_acc = accuracy_score(X_test_pre, Y_test)
print('Accuracy on test data : ', test_data_acc)

Accuracy on test data :  0.8490566037735849


**Other Scores**

In [34]:
predictions = model.predict(X_test)
precision = precision_score(Y_test, predictions)
recall = recall_score(Y_test, predictions)
f1 = f1_score(Y_test, predictions)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Precision: 0.5384615384615384
Recall: 0.05426356589147287
F1-score: 0.09859154929577466


# **Model Building**

In [41]:
input_data = (0,61,1,30,0,0,1,0,225,150,95,28.58,65,103)

#change input into a numpy array

input_data_as_np_array = np.asarray(input_data)

#reshaping the array as prediction is only for 1 instance
input_data_reshape = input_data_as_np_array.reshape(1,-1)
pred = model.predict(input_data_reshape)
print(pred)

if (pred[0] == 0):
  print("The person does not have heart disease.")
else:
  print("The person has heart disease.")

[0]
The person does not have heart disease.


