# بسم الله الرحمن الرحيم

# Heart Disease Prediction

## Problem Statement

Develop a machine learning model to predict the likelihood of heart disease in individuals based on their medical history, lifestyle factors, and physiological attributes. The model should provide a binary classification (presence or absence of heart disease) to aid in early detection and prevention. This project aims to improve cardiac healthcare by enabling timely interventions for at-risk individuals.

<center>
<img src="https://www.cardio.com/hubfs/human%20heart%20illustration.jpeg" alt="drawing" width="500" />
</center>

Dataset Description:
> 1. age - age in years
2. sex - (1 = male; 0 = female)
3. cp - chest pain type
4. trestbps - resting blood pressure (in mm Hg on admission to the hospital)
5. chol - serum cholesterol in mg/dl
6. fbs - (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
7. restecg - resting electrocardiographic results
8. thalach - maximum heart rate achieved
9. exang - exercise induced angina (1 = yes; 0 = no)
10. oldpeak - ST depression induced by exercise relative to rest
11. slope - the slope of the peak exercise ST segment
12. ca - number of major vessels (0-3) colored by fluoroscopy
13. thal - 3 = normal; 6 = fixed defect; 7 = reversible defect
14. target - have disease or not (1=yes, 0=no)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.tree import plot_tree


import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from sklearn.naive_bayes import GaussianNB


In [None]:
# Load the heart-disease dataset into a DataFrame
df = pd.read_csv(filepath_or_buffer='/content/heart.csv')

# Preview the first 5 rows
df.head(n=5)

In [None]:
# Preview the last 5 rows
df.tail(n=5)

In [None]:
# Dimensions of the dataset as (number of rows, number of columns)
df.shape

`df.info()` provides a concise summary of the DataFrame's structure, including the number of rows and columns, the data type of each column, and the presence of missing values.

In [None]:
# Print the structural summary of the DataFrame (columns, non-null counts, dtypes)
df.info()

`df.describe()` generates summary statistics for each numerical column in the DataFrame. These statistics include:
1. Count: The number of non-null (non-missing) values in each column.
2. Mean: The average value of each column.
3. Std: The standard deviation, which measures the amount of variation or dispersion in each column.
4. Min: The minimum value in each column.
5. 25%: The 25th percentile value, also known as the first quartile.
6. 50%: The 50th percentile value, also known as the median or second quartile.
7. 75%: The 75th percentile value, also known as the third quartile.
8. Max: The maximum value in each column.

In [None]:
# Summary statistics, transposed so that each row describes one column of df
df.describe().transpose()

In [None]:
# Flag every row that exactly repeats an earlier row, then count the flags
duplicate_mask = df.duplicated()
num_duplicate_rows = duplicate_mask.sum()
print("Number of duplicate rows: ", num_duplicate_rows)

## Data Visualization

In [None]:
# Pairwise relationships between all columns, colored by target class.
# sns.pairplot is a figure-level function: it creates its own figure and returns
# a PairGrid, so no plt.figure() call precedes it (that call would only leave an
# empty figure behind in the output).
pair_grid = sns.pairplot(data=df, hue='target', corner=True)
plt.show()

## correlation matrix

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
correlations = df.corr()
color_map = sns.cubehelix_palette(as_cmap=True)

plt.figure(figsize=(12, 12))
sns.heatmap(correlations, annot=True, fmt='.1f', cmap=color_map, linewidth=.5)
plt.show()

The correlation between ['trestbps', 'fbs', 'restecg', 'chol'] and the target is close to 0, so I will drop these features.

In [None]:
# display counts of each class
sns.countplot(data=df, x='target', palette='deep')
plt.legend(["not normal"])
plt.xlabel('target')
plt.ylabel('Frequency')

In [None]:
# Record counts per sex, split by target class
# NOTE(review): the legend names the hue levels in order (0 first, then 1), while
# the dataset description lists target 1 = disease. Confirm the label order.
sex_ax = sns.countplot(data=df, x='sex', hue='target', palette='deep')
sex_ax.set_xlabel("Sex (0 = female, 1= male)")
sex_ax.legend(["Disease", "Normal"])
plt.show()

In [None]:
# Share of each sex in the dataset.
# value_counts() orders its result by frequency, so the wedge labels are derived
# from the index of the counts rather than hard-coded in an assumed order
# (dataset encoding: 1 = male, 0 = female).
sex_names = {1: 'Male', 0: 'Female'}
sex_counts = df['sex'].value_counts()
sex_labels = [sex_names.get(code, str(code)) for code in sex_counts.index]

palette_color = sns.color_palette('dark')
plt.pie(sex_counts, labels=sex_labels, colors=palette_color, autopct='%.0f%%')
plt.show()

In [None]:
# Number of records per age, with one bar per target class
# NOTE(review): the legend names the crosstab columns in order (0 first, then 1),
# while the dataset description lists target 1 = disease. Confirm the label order.
age_by_target = pd.crosstab(df['age'], df['target'])
age_ax = age_by_target.plot(kind="bar", figsize=(15, 6))
age_ax.set_title('Heart Disease Frequency for Ages')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')
age_ax.legend(["Abnormal", "Normal"])
plt.show()

In [None]:
# 2x2 grid of bar plots: target (y) against each of four categorical features (x).
# Only the subplot figure is created; an extra plt.figure() here would show up as
# an empty second figure when plt.show() runs.
fig, axes = plt.subplots(2, 2, figsize=(10, 8))

# Panels are filled row by row: exang, slope on top; cp, thal below
for panel, feature in zip(axes.flat, ["exang", "slope", "cp", "thal"]):
    sns.barplot(ax=panel, x=df[feature], y=df['target'])

plt.show()

In [None]:


# Maximum heart rate against age, colored by target class.
# The class names are mapped onto the hue values so that seaborn builds the legend
# itself; plt.legend([...]) with bare labels pairs them with artists purely by
# position, which does not reliably follow the hue levels here.
# NOTE(review): the mapping keeps the original legend order (0 -> "Abnormal",
# 1 -> "Normal"), while the dataset description lists target 1 = disease. Confirm.
target_names = {0: "Abnormal", 1: "Normal"}
sns.scatterplot(
    data=df,
    x="age",
    y="thalach",
    hue=df['target'].map(target_names),
    hue_order=list(target_names.values()),
    palette='deep',
)
plt.xlabel("Age")
plt.ylabel("Maximum Heart Rate")
plt.show()



In [None]:
# Age distribution per target class, with a KDE curve for each class.
# The class names are mapped onto the hue values so that seaborn builds the legend
# itself; plt.legend([...]) with bare labels would attach them to artists in draw
# order, which need not match the hue levels.
# NOTE(review): the mapping keeps the original legend order (0 -> "Abnormal",
# 1 -> "Normal"), while the dataset description lists target 1 = disease. Confirm.
target_names = {0: "Abnormal", 1: "Normal"}
sns.histplot(
    data=df,
    x='age',
    hue=df['target'].map(target_names),
    hue_order=list(target_names.values()),
    kde=True,
)
plt.show()


In [None]:
# Record counts per chest-pain type, split by target class
# NOTE(review): the legend names the hue levels in order (0 first, then 1), while
# the dataset description lists target 1 = disease. Confirm the label order.
cp_ax = sns.countplot(data=df, x='cp', hue='target', palette='deep')
cp_ax.set_title('Heart Disease Frequency According To Chest Pain Type')
cp_ax.set_xlabel('Chest Pain Type')
cp_ax.set_ylabel('Frequency of Disease or Not')
cp_ax.legend(["Abnormal", "Normal"])
plt.show()

In [None]:
# Record counts per exercise-induced-angina flag, split by target class.
# The title names the feature that is actually plotted (exang).
# NOTE(review): the legend names the hue levels in order (0 first, then 1), while
# the dataset description lists target 1 = disease. Confirm the label order.
sns.countplot(data=df, x='exang', hue='target', palette='deep')
plt.title('Heart Disease Frequency According To Exercise Induced Angina')
plt.xlabel('exercise induced angina (1 = yes; 0 = no)')
plt.ylabel('Frequency of Disease or Not')
plt.legend(["Abnormal", "Normal"])
plt.show()

In [None]:
# Record counts per fasting-blood-sugar flag, split by target class
# NOTE(review): the legend names the hue levels in order (0 first, then 1), while
# the dataset description lists target 1 = disease. Confirm the label order.
fbs_ax = sns.countplot(data=df, x='fbs', hue='target', palette='deep')
fbs_ax.set_title('Heart Disease Frequency According To FBS')
fbs_ax.set_xlabel('FBS - (Fasting Blood Sugar > 120 mg/dl) (1 = true; 0 = false)')
fbs_ax.set_ylabel('Frequency of Disease or Not')
fbs_ax.legend(["Abnormal", "Normal"])
plt.show()

In [None]:
# Record counts per ST-segment slope category, split by target class
# NOTE(review): the legend names the hue levels in order (0 first, then 1), while
# the dataset description lists target 1 = disease. Confirm the label order.
slope_ax = sns.countplot(data=df, x='slope', hue='target', palette='deep')
slope_ax.set_title('Heart Disease Frequency for Slope')
slope_ax.set_xlabel('The Slope of The Peak Exercise ST Segment ')
slope_ax.set_ylabel('Frequency')
slope_ax.legend(["Abnormal", "Normal"])
plt.show()

# Modeling

In [None]:
# Remove the features whose correlation with the target is close to 0
weak_features = ['trestbps', 'fbs', 'restecg', 'chol']
df = df.drop(weak_features, axis=1)

### Split Data

In [None]:
# Separate the feature columns from the label column
X = df.drop("target", axis=1)
Y = df["target"]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Accuracy (in percent) of each model, keyed by model name
accuracies = {}

## LogisticRegression

In [None]:
# Train a logistic-regression model and predict the held-out rows
classifier = LogisticRegression().fit(X_train, y_train)
predictions = classifier.predict(X_test)

In [None]:
# Share of correct test predictions, as a percentage
accuracy = 100 * accuracy_score(y_test, predictions)
accuracies['Logistic Regression'] = accuracy
print(f'Logistic Regression Accuracy: {accuracy:.2f}')

## Normalize Data

In [None]:
# Put every feature on a common scale (zero mean, unit variance per column).
# preprocessing.normalize() rescales each *row* (sample) to unit norm, which mixes
# features measured in different units instead of scaling the features themselves.
# The scaler is fitted on the training split only and then applied to both splits,
# so no statistics from the test set leak into training.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


## SVM

In [None]:
# Support-vector classifier; only the random seed is set explicitly
sv = svm.SVC(random_state = 1)
# Train on the training split
sv.fit(X_train, y_train)

In [None]:
# Predict the held-out rows and record the share of correct predictions (percent)
predictions = sv.predict(X_test)
accuracy = 100 * accuracy_score(y_test, predictions)
accuracies['SVM'] = accuracy
print(f'SVM Accuracy: {accuracy:.2f}')

 ## Decision tree

In [None]:
# Train a decision tree (fixed seed) and predict the held-out rows
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
predictions = dt.predict(X_test)

In [None]:
# Share of correct test predictions, as a percentage
accuracy = 100 * accuracy_score(y_test, predictions)
accuracies['Decision tree'] = accuracy
print(f'Decision Tree Classifier Accuracy: {accuracy:.2f}')

## Random Forest

In [None]:
# Train a random forest (fixed seed) and predict the held-out rows
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)

# Record the share of correct predictions (percent); the printed label names
# the model that was actually evaluated
accuracy = accuracy_score(y_test, predictions) * 100
accuracies['Random Forest'] = accuracy
print(f'Random Forest Accuracy: {accuracy:.2f}')

## Naive Bayes

In [None]:
# Train a Gaussian naive Bayes model and predict the held-out rows
nb = GaussianNB().fit(X_train, y_train)
predictions = nb.predict(X_test)

In [None]:
# Record the share of correct predictions (percent); the printed label names
# the model that was actually evaluated
accuracy = accuracy_score(y_test, predictions) * 100
accuracies['Naive Bayes'] = accuracy
print(f'Naive Bayes Accuracy: {accuracy:.2f}')

## KNN

In [None]:
# KNN Model
from sklearn.neighbors import KNeighborsClassifier

n_neighbors = 30  # k: number of neighbours consulted for each prediction
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Compute this model's own accuracy before storing it, so the 'KNN' entry does
# not reuse the value left in `accuracy` by an earlier cell
accuracy = accuracy_score(y_test, predictions) * 100
accuracies['KNN'] = accuracy
print("{} NN Score: {:.2f}%".format(n_neighbors, accuracy))

In [None]:

# Horizontal bar chart comparing the recorded accuracy of every model
model_names = list(accuracies)
model_scores = list(accuracies.values())

plt.figure(figsize=(16, 5))
chart = sns.barplot(x=model_scores, y=model_names, palette='deep')
chart.set_xticks(np.arange(0, 100, 5))
chart.set_xlabel("Accuracy %")
chart.set_ylabel("Algorithms")

# Hide the left and bottom axis spines
sns.despine(left=True, bottom=True)

plt.show()

# Thank You :)