<a href="https://colab.research.google.com/github/Fabbb16/Story/blob/master/HeartDiseasePrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

Data collection and processing

In [None]:
# Read the heart-disease CSV from the working directory into a pandas DataFrame
heart_data = pd.read_csv(filepath_or_buffer="data.csv")

In [None]:
# Preview the first five records to get a feel for the columns and values
print(heart_data.head(n=5))

   Age Sex ChestPainType  RestingBP  ...  ExerciseAngina  Oldpeak ST_Slope  HeartDisease
0   40   M           ATA        140  ...               N      0.0       Up             0
1   49   F           NAP        160  ...               N      1.0     Flat             1
2   37   M           ATA        130  ...               N      0.0       Up             0
3   48   F           ASY        138  ...               Y      1.5     Flat             1
4   54   M           NAP        150  ...               N      0.0       Up             0

[5 rows x 12 columns]


In [None]:
# Preview the final five records of the dataset
print(heart_data.tail(n=5))

     Age Sex ChestPainType  RestingBP  ...  ExerciseAngina  Oldpeak ST_Slope  HeartDisease
913   45   M            TA        110  ...               N      1.2     Flat             1
914   68   M           ASY        144  ...               N      3.4     Flat             1
915   57   M           ASY        130  ...               Y      1.2     Flat             1
916   57   F           ATA        130  ...               N      0.0     Flat             1
917   38   M           NAP        138  ...               N      0.0       Up             0

[5 rows x 12 columns]


In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
heart_data.shape

(918, 12)

In [None]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [None]:
# Count the missing entries in every column (isna is the canonical name for isnull)
heart_data.isna().sum()

Unnamed: 0,0
Age,0
Sex,0
ChestPainType,0
RestingBP,0
Cholesterol,0
FastingBS,0
RestingECG,0
MaxHR,0
ExerciseAngina,0
Oldpeak,0


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
heart_data.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [None]:
# Class balance of the target: how many records fall under each HeartDisease value
heart_data.loc[:, "HeartDisease"].value_counts()

Unnamed: 0_level_0,count
HeartDisease,Unnamed: 1_level_1
1,508
0,410


1 --> represents that the person has heart disease
0 --> represents that the person does not have heart disease

Splitting the features and targets

In [None]:
# Y holds the label column; X holds every remaining column as the features.
Y = heart_data['HeartDisease']
X = heart_data.drop(columns=['HeartDisease'])

In [None]:
# Inspect the feature matrix (all columns except HeartDisease)
print(X)

     Age Sex ChestPainType  RestingBP  ...  MaxHR  ExerciseAngina Oldpeak  ST_Slope
0     40   M           ATA        140  ...    172               N     0.0        Up
1     49   F           NAP        160  ...    156               N     1.0      Flat
2     37   M           ATA        130  ...     98               N     0.0        Up
3     48   F           ASY        138  ...    108               Y     1.5      Flat
4     54   M           NAP        150  ...    122               N     0.0        Up
..   ...  ..           ...        ...  ...    ...             ...     ...       ...
913   45   M            TA        110  ...    132               N     1.2      Flat
914   68   M           ASY        144  ...    141               N     3.4      Flat
915   57   M           ASY        130  ...    115               Y     1.2      Flat
916   57   F           ATA        130  ...    174               N     0.0      Flat
917   38   M           NAP        138  ...    173               N     0.0   

In [None]:
# Inspect the target labels (the HeartDisease column)
print(Y)

0      0
1      1
2      0
3      1
4      0
      ..
913    1
914    1
915    1
916    1
917    0
Name: HeartDisease, Length: 918, dtype: int64


Splitting the data into Training and Test data

In [None]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# Shapes of the full feature matrix, the training split and the test split
print(*(frame.shape for frame in (X, X_train, X_test)))

(918, 11) (734, 11) (184, 11)


Model Training

Logistic Regression

In [None]:
# Logistic-regression classifier. max_iter is raised above scikit-learn's
# default (100) because fitting the unscaled features with the default budget
# stops early with "TOTAL NO. of ITERATIONS REACHED LIMIT" (ConvergenceWarning).
# NOTE(review): standardizing the numeric features would be the more thorough
# remedy; a larger iteration budget is the least invasive one.
model = LogisticRegression(max_iter=1000)

In [None]:
# Encode the categorical feature columns as integers, then train the model.
categorical_columns = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

# Work on explicit copies so that assigning the encoded columns never writes
# into a frame pandas may treat as a slice of X (SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# One fitted encoder per column, kept so the same mapping can be applied to
# new records at prediction time.
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    # Always read the raw string values from X (which is never modified) so
    # that re-running this cell does not re-fit the encoder on integers that
    # were already encoded by a previous run.
    raw_train_values = X.loc[X_train.index, column]
    raw_test_values = X.loc[X_test.index, column]
    X_train[column] = le.fit_transform(raw_train_values)
    # Apply the mapping learned on the training data to the test data; a
    # category unseen during training raises ValueError here.
    X_test[column] = le.transform(raw_test_values)
    label_encoders[column] = le

# Train the model
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy Score

In [None]:
# Accuracy on the training data: fraction of training rows whose predicted
# label matches the true label. accuracy_score expects (y_true, y_pred).
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the training-set accuracy computed in the previous cell
training_report = f"Accuracy on training data: {training_data_accuracy}"
print(training_report)

Accuracy on training data: 0.8501362397820164


In [None]:
# Accuracy on the held-out test data: fraction of test rows whose predicted
# label matches the true label. accuracy_score expects (y_true, y_pred).
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the test-set accuracy computed in the previous cell
test_report = f"Accuracy on test data: {test_data_accuracy}"
print(test_report)

Accuracy on test data: 0.8532608695652174


Building a predictive system

In [None]:
# Predict heart disease for a single patient record.
# The values must follow the order of the training feature columns:
# Age, Sex, ChestPainType, RestingBP, Cholesterol, FastingBS, RestingECG,
# MaxHR, ExerciseAngina, Oldpeak, ST_Slope
input_data = (40, "M", "ATA", 140, 289, 0, "Normal", 172, "N", 0, "Up")

# Label the values with the training column names so the model receives the
# same named feature layout it was fitted on (a bare numpy array would lose
# the column names).
input_frame = pd.DataFrame([input_data], columns=X_train.columns)

# Reuse the encoders fitted on the training data (label_encoders, built in the
# training cell). Fitting fresh encoders on hand-typed category lists can
# produce a different string-to-integer mapping than the one the model was
# trained with, and fails on any category left out of the list.
for column, encoder in label_encoders.items():
    input_frame[column] = encoder.transform(input_frame[column])

# Make prediction: 1 means heart disease, 0 means no heart disease
prediction = model.predict(input_frame)
print(f"Prediction: {prediction}")




Prediction: [0]


