Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection and Processing

In [2]:
# loading the csv data to a Pandas DataFrame
heart_data = pd.read_csv('/content/data.csv')

In [3]:
# print first 5 rows of the dataset
heart_data.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [4]:
# print last 5 rows of the dataset
heart_data.tail()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
915,916,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,917,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,918,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,919,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0
919,920,62,Male,VA Long Beach,atypical angina,120.0,254.0,False,lv hypertrophy,93.0,True,0.0,,,,1


In [5]:
# number of rows and columns in the dataset
heart_data.shape

(920, 16)

In [6]:
# getting some info about the data
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [7]:
# checking for missing values
heart_data.isnull().sum()

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

In [8]:
# statistical measures about the data
heart_data.describe()

Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num
count,920.0,920.0,861.0,890.0,865.0,858.0,309.0,920.0
mean,460.5,53.51087,132.132404,199.130337,137.545665,0.878788,0.676375,0.995652
std,265.725422,9.424685,19.06607,110.78081,25.926276,1.091226,0.935653,1.142693
min,1.0,28.0,0.0,0.0,60.0,-2.6,0.0,0.0
25%,230.75,47.0,120.0,175.0,120.0,0.0,0.0,0.0
50%,460.5,54.0,130.0,223.0,140.0,0.5,0.0,1.0
75%,690.25,60.0,140.0,268.0,157.0,1.5,1.0,2.0
max,920.0,77.0,200.0,603.0,202.0,6.2,3.0,4.0


In [10]:
heart_data['sex']=heart_data['sex'].replace(['Male','Female'],[1,2])

In [11]:
# checking the distribution of Target Variable
heart_data['chol'].value_counts()

0.0      172
220.0     10
254.0     10
223.0      9
230.0      9
        ... 
360.0      1
412.0      1
358.0      1
321.0      1
385.0      1
Name: chol, Length: 217, dtype: int64

1 --> Defective Heart

0 --> Healthy Heart

Splitting the Features and Target

In [12]:
X = heart_data.drop(columns='chol', axis=1)
Y = heart_data['chol']

In [13]:
print(X)

      id  age  sex        dataset               cp  trestbps    fbs  \
0      1   63    1      Cleveland   typical angina     145.0   True   
1      2   67    1      Cleveland     asymptomatic     160.0  False   
2      3   67    1      Cleveland     asymptomatic     120.0  False   
3      4   37    1      Cleveland      non-anginal     130.0  False   
4      5   41    2      Cleveland  atypical angina     130.0  False   
..   ...  ...  ...            ...              ...       ...    ...   
915  916   54    2  VA Long Beach     asymptomatic     127.0   True   
916  917   62    1  VA Long Beach   typical angina       NaN  False   
917  918   55    1  VA Long Beach     asymptomatic     122.0   True   
918  919   58    1  VA Long Beach     asymptomatic       NaN   True   
919  920   62    1  VA Long Beach  atypical angina     120.0  False   

              restecg  thalch  exang  oldpeak        slope   ca  \
0      lv hypertrophy   150.0  False      2.3  downsloping  0.0   
1      lv hyp

In [14]:
print(Y)

0      233.0
1      286.0
2      229.0
3      250.0
4      204.0
       ...  
915    333.0
916    139.0
917    223.0
918    385.0
919    254.0
Name: chol, Length: 920, dtype: float64


Splitting the Data into Training data & Test Data

In [15]:
from sklearn.model_selection import train_test_split
import numpy as np

# Assuming X and Y are your feature and target variables

# Remove rows with NaN values from both X and Y
mask = ~np.isnan(Y)
X = X[mask]
Y = Y[mask]

# Perform train-test split without stratification
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

In [16]:
print(X.shape, X_train.shape, X_test.shape)

(890, 15) (712, 15) (178, 15)


Model Training

Logistic Regression

In [36]:
import tensorflow as tf
from tensorflow.keras import layers, models

In [37]:
model=models.Sequential()

In [38]:
heart_data['sex']=heart_data['sex'].replace(['Male','Female'],[1,2])

In [43]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Assuming X and Y are your feature and target variables

# Remove rows with NaN values from both X and Y
mask = ~np.isnan(Y)
X = X[mask]
Y = Y[mask]

# Identify categorical columns (replace 'categorical_column_names' with your actual categorical column names)
categorical_column_names = ['sex', 'cp']

# Identify numerical columns (replace 'numerical_column_names' with your actual numerical column names)
numerical_column_names = ['oldpeak', 'num']

# Create transformers for numerical and categorical columns
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_column_names),
        ('cat', categorical_transformer, categorical_column_names)
    ])

# Apply preprocessing and train-test split
X_processed = preprocessor.fit_transform(X)

X_train, X_test, Y_train, Y_test = train_test_split(X_processed, Y, test_size=0.2, random_state=2)

# Instantiate and fit your model
model = RandomForestClassifier()  # You can replace this with your preferred classifier
model.fit(X_train, Y_train)


Model Evaluation

Accuracy Score

In [44]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [50]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.4311797752808989


In [46]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [47]:
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.1348314606741573


In [48]:
from sklearn.model_selection import cross_val_score

# Replace 'model' with your instantiated model
cv_scores = cross_val_score(model, X_processed, Y, cv=5, scoring='accuracy')
print(f'Cross-validated Accuracy: {np.mean(cv_scores):.2f} +/- {np.std(cv_scores):.2f}')



Cross-validated Accuracy: 0.13 +/- 0.02


In [49]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Replace 'model' with RandomForestClassifier or another algorithm you want to try
model = RandomForestClassifier(random_state=42)

# Cross-validated accuracy
cv_scores = cross_val_score(model, X_processed, Y, cv=5, scoring='accuracy')
print(f'Cross-validated Accuracy: {np.mean(cv_scores):.2f} +/- {np.std(cv_scores):.2f}')



Cross-validated Accuracy: 0.12 +/- 0.03


Building a Predictive System

In [61]:
input_data = (62,20,10,140,268,78,160,40,36,45,78,65,96,37,21,85)

# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 0):
  print('The Person does not have a Heart Disease')
else:
  print('The Person has Heart Disease')

[0.]
The Person does not have a Heart Disease
