Importing all the necessary libraries

# Heart Disease Prediction System Using Machine Learning

##### In this project we will try to predict whether a person has heart disease or not, using clinical data and machine learning. We will initially aim for 85% accuracy. For this project we will use the Heart dataset from Kaggle.

The dataset contains the following fields:

1. age
2. sex (1 = male; 0 = female) 
3. chest pain type (4 values)
    0. Typical angina: chest pain related to decreased blood supply to the heart.
    1. Atypical angina: chest pain not related to the heart.
    2. Non-anginal pain: typically esophageal spasms (non-heart related).
    3. Asymptomatic: chest pain not showing signs of disease.
4. resting blood pressure
5. serum cholesterol in mg/dl
6. fasting blood sugar > 120 mg/dl
7. resting electrocardiographic results (values 0,1,2)
8. maximum heart rate achieved
9. exercise induced angina
10. oldpeak = ST depression induced by exercise relative to rest
11. the slope of the peak exercise ST segment
12. number of major vessels (0-3) colored by fluoroscopy
13. thal: 0 = normal; 1 = fixed defect; 2 = reversible defect
14. target: has heart disease or not (1 = yes, 0 = no) — this is the value to predict

In [397]:
# importing libraries for Exploratory Data Analysis(EDA)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk

#importing libraries for machine learning
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score
from sklearn.metrics import RocCurveDisplay


In [None]:
# Load the heart-disease data from "heart.csv" (path is relative to the working directory)
dataset = pd.read_csv("heart.csv")
# Preview the first three rows
dataset.head(3)

In [None]:
# Check the shape of the dataset; `shape` is reported as (rows, columns)
dataset.shape
# The dataset has 1025 rows and 14 columns

Checking Null Values In The Dataset
##### We will use the `isnull()` function to identify the null values in the dataset.

In [None]:
# Total number of null cells across every column of the dataset.
# The author reports that there are no null values in the dataset.
dataset.isnull().sum().sum()

In [None]:
# Take a first look for outliers using the describe function.
# describe() reports count, mean, std, min, max and the 25%/50%/75% quartiles per column
# (the IQR is not reported directly; it is the 75% value minus the 25% value).
dataset.describe()

In [None]:
# The target column contains two values, 0 and 1:
#   1 means the person has heart disease
#   0 means the person does not have heart disease
dataset.target.unique()

In [None]:
# dataset.target.value_counts() gives the number of 1's and 0's in the target column.
# Dividing each count by the number of rows gives the percentage of people with
# heart disease and the percentage without.
#
# The author's run reported roughly 51% with heart disease and 48% without.

temp_target = dataset.target.value_counts()
# Derive the denominator from the data instead of hard-coding the row count,
# so the percentages stay correct if the CSV changes.
total_people = len(dataset)
people_Without_Heart_Disease = temp_target[0] / total_people * 100
people_With_Heart_Disease = temp_target[1] / total_people * 100

print("People With Heart Disease is : ", people_With_Heart_Disease)
print("People Without Heart Disease is : ", people_Without_Heart_Disease)

In [None]:
# Bar chart of the target counts computed above.
# Colours are given as a list (ordered) rather than a set, so each bar always gets the same colour.
# Tick labels are looked up from the target value of each bar (1 = disease, 0 = no disease)
# instead of assuming the order in which value_counts() returns them.
target_names = {1: "With Heart Disease", 0: "Without Heart Disease"}
target_axes = temp_target.plot(kind="bar", color=["salmon", "lightblue"])
target_axes.set_xticklabels([target_names[value] for value in temp_target.index], rotation=0)

In [None]:
# Count how many males and females are in the dataset (sex: 1 = male, 0 = female).
# The author reports 713 males and 312 females.
dataset.sex.value_counts()

In [None]:
# Heart-disease outcome (x axis) split by sex (one bar per sex).
# crosstab orders its rows by target value, so the first group is target 0 (no disease)
# and the second is target 1 (disease); the tick labels must follow that order.
# NOTE(review): the author's reading of this figure was that heart disease is more common
# in males than in females — re-check that conclusion against the relabelled bars.
target_by_sex = pd.crosstab(dataset.target, dataset.sex)
target_by_sex_axes = target_by_sex.plot(kind="bar")
target_by_sex_axes.set_xticklabels(["Without Heart Disease", "Heart Disease"], rotation=0)
# Columns are sex 0 then sex 1, i.e. female then male
plt.legend(["Female", "Male"])

In [None]:
# Chest-pain type (x axis) split by heart-disease outcome (one bar per target value).
# Observations recorded by the author from this plot:
#   - most patients suffer from non-anginal chest pain
#     (NOTE(review): presumably this refers to patients with heart disease — confirm)
#   - typical angina is the most common pain type, but the majority of typical-angina
#     cases had no heart disease
#   - most heart-disease cases come with non-anginal pain or atypical angina
chest_pain_names = ["Typical Angina","ATypical Angina","Non-anginal pain","Asymptomatic"]
cp_by_target = pd.crosstab(dataset.cp, dataset.target)
cp_by_target_axes = cp_by_target.plot(kind="bar")
cp_by_target_axes.set_xticklabels(chest_pain_names, rotation=0)
plt.legend(["No Heart Disease", "Heart Disease"])

In [None]:
# Chest-pain type (x axis) split by sex (one bar per sex).
# The author's reading of this figure: every chest-pain type is more common in males than in females.
pain_type_names = ["Typical Angina","ATypical Angina","Non-anginal pain","Asymptomatic"]
cp_by_sex = pd.crosstab(dataset.cp, dataset.sex)
cp_by_sex_axes = cp_by_sex.plot(kind="bar")
cp_by_sex_axes.set_xticklabels(pain_type_names, rotation=0)
plt.legend(["Female","Male"])

In [None]:
# Histogram of the age column
dataset["age"].plot(kind="hist")

In [None]:
# Pairwise correlation between the dataset columns, drawn as an annotated heatmap
corr_matrix = dataset.corr()
fig, ax = plt.subplots(figsize=(15, 5))
heatmap_style = dict(annot=True, linewidths=0.5, fmt=".2f", cmap="YlGnBu")
ax = sns.heatmap(corr_matrix, ax=ax, **heatmap_style);
# Shift the first y-limit by +0.5 and the second by -0.5.
# NOTE(review): this looks like the common workaround for heatmap rows being cut off
# in some matplotlib releases — confirm it is still needed.
y_first, y_second = ax.get_ylim()
ax.set_ylim(y_first + 0.5, y_second - 0.5)

Starting Machine Learning

In [411]:
# Candidate classifiers, keyed by the display name used when reporting scores
model = {
    "Logistic Regression": LogisticRegression(),
    "K Nearest Neighbour": KNeighborsClassifier(),
}

# Features are every column except the label; the label is the "target" column
x = dataset.drop(columns="target")
y = dataset["target"]

# Shuffle the rows (seeded), then hold out 20% as a test split stratified on the label
x, y = shuffle(x, y, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [412]:
def fit_and_score_model(model,x_train,x_test,y_train,y_test):
    """Fit each estimator on the training split and score it on the test split.

    Args:
        model: mapping of display name -> estimator object exposing
            ``fit(x, y)`` and ``score(x, y)``.
        x_train: training features passed to ``fit``.
        x_test: test features passed to ``score``.
        y_train: training labels passed to ``fit``.
        y_test: test labels passed to ``score``.

    Returns:
        dict mapping each display name to whatever that estimator's
        ``score(x_test, y_test)`` returned.

    Note:
        The estimators in ``model`` are fitted in place.
    """
    test_scores = {}

    for label, estimator in model.items():
        estimator.fit(x_train, y_train)
        test_scores[label] = estimator.score(x_test, y_test)

    return test_scores

In [None]:
# Fit every candidate model and keep its score on the test split, keyed by model name
model_score = fit_and_score_model(
    model,
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)

In [414]:
def fit_and_score_train_model(model,x_train,x_test,y_train,y_test):
    """Fit each estimator on the training split and score it on that same split.

    Args:
        model: mapping of display name -> estimator object exposing
            ``fit(x, y)`` and ``score(x, y)``.
        x_train: training features, used for both ``fit`` and ``score``.
        x_test: not used by this function.
        y_train: training labels, used for both ``fit`` and ``score``.
        y_test: not used by this function.

    Returns:
        dict mapping each display name to whatever that estimator's
        ``score(x_train, y_train)`` returned.

    Note:
        The estimators in ``model`` are fitted in place.
    """
    training_scores = {}

    for label, estimator in model.items():
        estimator.fit(x_train, y_train)
        training_scores[label] = estimator.score(x_train, y_train)

    return training_scores

In [None]:
# Re-fit every candidate model and display its score on the training split
fit_and_score_train_model(
    model,
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)


In [None]:
# One-row table of the scores in model_score, transposed so each model becomes one bar
model_compare = pd.DataFrame(model_score, index=["accuracy"])
model_compare.transpose().plot(kind="bar");

#### Hyperparameter Tuning for KNN

In [None]:
# Sweep n_neighbors over 1..99, recording train and test accuracy for each setting
train_score = []
test_score = []
neighbors = range(1, 100)
knn = KNeighborsClassifier()

for k in neighbors:
    # Reconfigure the same classifier object and refit it for this neighbour count
    knn.set_params(n_neighbors=k)
    knn.fit(x_train, y_train)
    train_score.append(knn.score(x_train, y_train))
    test_score.append(knn.score(x_test, y_test))

# Test curve is drawn first, then the train curve
for curve, curve_label in ((test_score, "Test Accuracy"), (train_score, "Train Accuracy")):
    plt.plot(neighbors, curve, label=curve_label)

plt.xlabel("Number Of Neighbors")
plt.ylabel("Accuracy")
plt.title("KNN: Varying number of Neighbors")
plt.legend()
plt.xlim(0, 100)
plt.ylim(0.65, 1.0)
plt.grid()
plt.show()



#### Hyperparameter Tuning for Logistic Regression and Random Forest using Randomized Search CV

In [418]:
# Hyperparameter Tuning for Logistic Regression
# Initial Score : 0.8682926829268293

# Search space for LogisticRegression, written as a list of sub-grids so that every
# sampled combination is one the solver supports:
#   - liblinear supports both the l1 and l2 penalties
#   - lbfgs and newton-cg support only l2
# 'elasticnet' is left out: it needs the saga solver plus an l1_ratio, neither of which
# is part of this search, so those candidates could only fail to fit.
# Both RandomizedSearchCV and GridSearchCV accept a list of dicts.
log_reg_grid = [
    {
        "solver": ["liblinear"],
        "penalty": ["l1", "l2"],
        "max_iter": [100, 200, 300],
        "C": [0.01, 0.1, 1, 10, 100],
    },
    {
        "solver": ["lbfgs", "newton-cg"],
        "penalty": ["l2"],
        "max_iter": [100, 200, 300],
        "C": [0.01, 0.1, 1, 10, 100],
    },
]


In [None]:
# Randomized search over log_reg_grid: 20 sampled candidates, each scored with 5-fold
# cross-validation on the training split.
# random_state is fixed (same seed as the shuffle and the train/test split) so the sampled
# candidates, and therefore the reported best parameters, are reproducible between runs.
rs_log_reg = RandomizedSearchCV(
    LogisticRegression(),
    param_distributions=log_reg_grid,
    cv=5,
    n_iter=20,
    verbose=True,
    random_state=42,
)
rs_log_reg.fit(x_train, y_train)

In [None]:
# Parameter combination the randomized search selected as best
rs_log_reg.best_params_

In [None]:
# Score of the randomized-search model on the held-out test split
rs_log_reg.score(x_test,y_test)

In [422]:
# Exhaustive search over log_reg_grid with 5-fold cross-validation (fitted in a later cell)
gs_log_reg = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=log_reg_grid,
    cv=5,
    verbose=True,
)

In [None]:
# Run the grid search on the training split
gs_log_reg.fit(x_train,y_train)


In [None]:
# Score of the grid-search model on the held-out test split
gs_log_reg.score(x_test,y_test)

In [None]:
# Plot the ROC curve of the grid-search model on the test split
RocCurveDisplay.from_estimator(gs_log_reg,x_test,y_test)

In [426]:
# Predictions of the grid-search model for the test split
y_preds = gs_log_reg.predict(x_test)

In [None]:
# Confusion matrix: true labels are passed first, predictions second
confusion_matrix(y_test,y_preds)

In [None]:
# Per-class precision, recall and F1 for the test split.
# classification_report expects (y_true, y_pred); passing them the other way round swaps
# precision with recall and reports support for the predictions instead of the true labels.
print(classification_report(y_test, y_preds))

In [None]:
# Number of rows per class in the full label series
y.value_counts()