In [1]:
# Importing modules
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#Loading the dataset
data = pd.read_csv('Cardiovascular_Disease_Dataset.csv')
data

Unnamed: 0,patientid,age,gender,chestpain,restingBP,serumcholestrol,fastingbloodsugar,restingrelectro,maxheartrate,exerciseangia,oldpeak,slope,noofmajorvessels,target
0,103368,53,1,2,171,0,0,1,147,0,5.3,3,3,1
1,119250,40,1,0,94,229,0,1,115,0,3.7,1,1,0
2,119372,49,1,2,133,142,0,0,202,1,5.0,1,0,0
3,132514,43,1,0,138,295,1,1,153,0,3.2,2,2,1
4,146211,31,1,1,199,0,0,2,136,0,5.3,3,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,9949544,48,1,2,139,349,0,2,183,1,5.6,2,2,1
996,9953423,47,1,3,143,258,1,1,98,1,5.7,1,0,0
997,9965859,69,1,0,156,434,1,0,196,0,1.4,3,1,1
998,9988507,45,1,1,186,417,0,1,117,1,5.9,3,2,1


In [6]:
# Exploratory data analysis
# print(data.describe())
print(data.shape)
data.info()

(1000, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   patientid          1000 non-null   int64  
 1   age                1000 non-null   int64  
 2   gender             1000 non-null   int64  
 3   chestpain          1000 non-null   int64  
 4   restingBP          1000 non-null   int64  
 5   serumcholestrol    1000 non-null   int64  
 6   fastingbloodsugar  1000 non-null   int64  
 7   restingrelectro    1000 non-null   int64  
 8   maxheartrate       1000 non-null   int64  
 9   exerciseangia      1000 non-null   int64  
 10  oldpeak            1000 non-null   float64
 11  slope              1000 non-null   int64  
 12  noofmajorvessels   1000 non-null   int64  
 13  target             1000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 109.5 KB


In [4]:
#Checking to ensure data is clean, no null values
data.isnull().sum()

patientid            0
age                  0
gender               0
chestpain            0
restingBP            0
serumcholestrol      0
fastingbloodsugar    0
restingrelectro      0
maxheartrate         0
exerciseangia        0
oldpeak              0
slope                0
noofmajorvessels     0
target               0
dtype: int64

In [22]:
# Feature selection
# Using correlation analysis to determine which features have the higher correlation with heart disease
from scipy.stats import pearsonr
import numpy as np
correlation_p_values = {}

for column in data.columns:
    if column != 'target':  # Exclude 'target' itself
        correlation, p_value = pearsonr(data[column], data['target'])
        correlation_p_values[column] = {'Correlation': correlation, 'P-value': p_value}

# Convert the dictionary to a DataFrame
correlation_df = pd.DataFrame(correlation_p_values).T
correlation_df['P-value'] = np.round(correlation_df['P-value'], 4)
correlation_df


Unnamed: 0,Correlation,P-value
patientid,-0.005637,0.8587
age,0.008356,0.7918
gender,0.015769,0.6184
chestpain,0.554228,0.0
restingBP,0.482387,0.0
serumcholestrol,0.19534,0.0
fastingbloodsugar,0.303233,0.0
restingrelectro,0.426837,0.0
maxheartrate,0.228343,0.0
exerciseangia,-0.039874,0.2077


Looking at the correlations and p-values, any attribute with a p-value > 0.001 will not be used, it is also important to note that the attributes with large p-values also had no correlation with heart disease.

In [25]:
# Removing attributes with p-value > 0.001 and 'target' for the next steps
selected_data = data.drop(['patientid', 'age', 'gender', 'exerciseangia', 'oldpeak', 'target'], axis=1)
selected_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   chestpain          1000 non-null   int64
 1   restingBP          1000 non-null   int64
 2   serumcholestrol    1000 non-null   int64
 3   fastingbloodsugar  1000 non-null   int64
 4   restingrelectro    1000 non-null   int64
 5   maxheartrate       1000 non-null   int64
 6   slope              1000 non-null   int64
 7   noofmajorvessels   1000 non-null   int64
dtypes: int64(8)
memory usage: 62.6 KB


In [28]:
# Using a machine learning model here to train and test the data provided
# If it has a good accuracy rate we can use it to estimate if other people have heart disease

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

x = selected_data
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

# Initialize and train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Show classification report
print("\nClassification Report:")
print(classification_report(y_test, predictions))

# Show confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, predictions))

Accuracy: 0.955

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.94      0.94        81
           1       0.96      0.97      0.96       119

    accuracy                           0.95       200
   macro avg       0.95      0.95      0.95       200
weighted avg       0.95      0.95      0.95       200


Confusion Matrix:
[[ 76   5]
 [  4 115]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
# The model has a 95% accuracy rate so we will use it to predict heart disease based on the attribute values
# Refer to the dataset description to determine what the attribute values are for the individual
chestpain = 1
restingBP = 130
serumcholestrol = 220
fastingbloodsugar = 0
restingelectro = 0
maxheartrate = 132
slope = 1
noofmajorvessels = 0

# Create a new data sample for prediction
new_data_sample = [[chestpain, restingBP, serumcholestrol, fastingbloodsugar, restingelectro, maxheartrate, slope, noofmajorvessels]]

# Use the model to predict
predicted = model.predict(new_data_sample)
print(predicted)


[0]


