# HEART DISEASE PREDICTION

## Importing Libraries

In [None]:
import warnings
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from scipy.stats import boxcox
from sklearn.pipeline import Pipeline
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.metrics import accuracy_score,classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline
# All libraries used in this notebook are imported above.


In [None]:
# Global plotting style: high-resolution figures, dark grid, pale pink axes background.
plt.rcParams.update({'figure.dpi': 400})
sns.set(style='darkgrid', rc={'axes.facecolor': '#faded9'})

In [None]:
# Load the dataset from a CSV file located in the working directory.
data=pd.read_csv('Project 4 dataset.csv')
# NOTE(review): under Jupyter's default display settings only the last
# expression of a cell is rendered, so this bare `data` shows nothing.
data
# Displayed cell output: the 'age' column.
data["age"]

## Dataset Description for the model understanding


Variable	Description:
age:Age of the patient in years
sex:Gender of the patient (0 = male, 1 = female)
cp:Chest pain type:
  0: Typical angina
  1: Atypical angina
  2: Non-anginal pain
  3: Asymptomatic
trestbps:Resting blood pressure in mm Hg
chol:Serum cholesterol in mg/dl
fbs:Fasting blood sugar level, categorized as above 120 mg/dl (1 = true, 0 = false)
restecg:Resting electrocardiographic results:
  0: Normal
  1: Having ST-T wave abnormality
  2: Showing probable or definite left ventricular hypertrophy
thalach:Maximum heart rate achieved during a stress test
exang:Exercise-induced angina (1 = yes, 0 = no)
oldpeak:ST depression induced by exercise relative to rest
slope:Slope of the peak exercise ST segment:
  0: Upsloping
  1: Flat
  2: Downsloping
ca:Number of major vessels (0-4) colored by fluoroscopy
thal:Thallium stress test result:
  0: Normal
  1: Fixed defect
  2: Reversible defect
  3: Not described
target:Heart disease status (0 = no disease, 1 = presence of disease)

In [None]:
# Column names, non-null counts and dtypes of the loaded frame.
data.info()

In [None]:
# Summary statistics of the loaded frame.
data.describe()

In [None]:
# Features measured on a numeric scale. 'sex' is a two-level category (see the
# dataset description), so it is left out here and handled together with the
# other categorical columns.
continuous_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Every remaining column (including the target) is categorical: cast it to
# 'object' so describe(include='object') and the categorical plots pick it up.
featurestoconvert = [feature for feature in data.columns if feature not in continuous_features]
data[featurestoconvert] = data[featurestoconvert].astype('object')
data.dtypes

In [None]:
# Summary statistics, transposed so that each described column is one row.
data.describe().T

In [None]:
# Summary restricted to the columns that were cast to object dtype.
data.describe(include='object')

In [None]:
df_continuous = data[continuous_features]

figure, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
# Flatten the 2x3 grid so that panel i belongs to the i-th feature.
panels = axes.ravel()

# Histogram with KDE overlay for each continuous feature.
for i, col in enumerate(df_continuous.columns):
    ax = panels[i]
    series = df_continuous[col]

    # Default 10 equal-width bins spanning the integer-rounded min..max, so the
    # tick labels land on the bin edges.
    _, bin_edges = np.histogram(series, range=(np.floor(series.min()), np.ceil(series.max())))

    graph = sns.histplot(data=df_continuous, x=col, bins=bin_edges, kde=True, ax=ax,
                         edgecolor='none', color='red', alpha=0.6, line_kws={'lw': 3})
    ax.set_xlabel(col, fontsize=15)
    ax.set_ylabel('Count', fontsize=12)
    ax.set_xticks(np.round(bin_edges, 1))
    ax.set_xticklabels(ax.get_xticks(), rotation=45)
    ax.grid(color='lightgrey')

    # Write the count above every bar.
    for p in graph.patches:
        ax.annotate('{}'.format(p.get_height()), (p.get_x() + (p.get_width()) / 2, p.get_height() + 1),
                    ha='center', fontsize=10, fontweight="bold")

    # Mean / standard deviation box in the upper-right corner of the panel.
    textstr = '\n'.join((
        r'$\mu=%.2f$' % series.mean(),
        r'$\sigma=%.2f$' % series.std()
    ))
    ax.text(0.75, 0.9, textstr, transform=ax.transAxes, fontsize=12, verticalalignment='top',
            color='white', bbox=dict(boxstyle='round', facecolor='blue', edgecolor='white', pad=0.5))

# Hide only the panels that received no feature, instead of always blanking
# the last one (which would erase a real plot when all six panels are used).
for unused_ax in panels[len(df_continuous.columns):]:
    unused_ax.axis('off')

plt.suptitle('Distribution of Continuous Variables', fontsize=20)
plt.tight_layout()
plt.subplots_adjust(top=.8)
plt.show()



In [None]:
figure, axes = plt.subplots(nrows=5, ncols=2, figsize=(15, 12))
# Flatten the 5x2 grid so that panel i belongs to the i-th feature.
panels = axes.ravel()

# Every column that is not continuous is treated as categorical.
categorical_features = data.columns.difference(continuous_features)
df_categorical = data[categorical_features]

# One horizontal bar chart of level frequencies (in percent) per feature.
for i, col in enumerate(categorical_features):
    ax = panels[i]
    value_counts = data[col].value_counts(normalize=True).mul(100).sort_values()
    value_counts.plot(kind='barh', ax=ax, width=0.8, color='red')

    # Label each bar with its percentage. This and the axis formatting below
    # must run inside the loop so that every panel is annotated.
    for index, value in enumerate(value_counts):
        ax.text(value, index, str(round(value, 1)) + '%', fontsize=15, weight='bold', va='center')

    ax.set_xlim([0, 100])
    ax.set_xlabel('Frequency Percentage', fontsize=12)
    ax.set_title(f'{col}', fontsize=20)

# Hide only the panels that received no feature.
for unused_ax in panels[len(categorical_features):]:
    unused_ax.axis('off')

plt.suptitle('Distribution of Categorical Variables', fontsize=30)
plt.tight_layout()
plt.subplots_adjust(top=0.8)
plt.show()


In [None]:
# Number of missing values per column.
data.isnull().sum()

In [None]:
# Column labels that are not listed in continuous_features.
data.columns.difference(continuous_features)


In [None]:
continuous_features

In [None]:
# Tukey's rule: a value is an outlier when it lies more than 1.5 * IQR below
# the first quartile or above the third quartile. The IQR is Q3 - Q1, i.e. the
# spread between the 25th and 75th percentiles.
Q1 = data[continuous_features].quantile(0.25)
Q3 = data[continuous_features].quantile(0.75)
IQR = Q3 - Q1

below_fence = data[continuous_features] < (Q1 - 1.5 * IQR)
above_fence = data[continuous_features] > (Q3 + 1.5 * IQR)
# Number of flagged observations per continuous feature.
outliers_count_specified = (below_fence | above_fence).sum()

outliers_count_specified

In [None]:
# One-hot encode the multi-level nominal features. 'restecg' has three unordered
# levels just like 'cp', 'thal' and 'slope', so it is encoded with them instead
# of being left behind as an object-dtype column.
# drop_first=True removes one level per feature to avoid redundant columns.
df_encoded = pd.get_dummies(data, columns=['cp', 'restecg', 'thal', 'slope'], drop_first=True)

# The remaining categorical columns are numerically coded; restore an integer
# dtype after the earlier cast to 'object'.
features_to_convert = ['sex', 'fbs', 'exang', 'ca', 'target']
for feature in features_to_convert:
    df_encoded[feature] = df_encoded[feature].astype(int)

df_encoded.dtypes

In [None]:
df_encoded.describe()


In [None]:
df_encoded.head()

In [None]:
X=df_encoded.drop('target',axis=1)
X

In [None]:
df_encoded

In [None]:
y=df_encoded['target']
y

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=.30,random_state=42)
X_train

In [None]:
X_test

In [None]:
y_test

In [None]:
y_train

In [None]:
sc= StandardScaler()
X_train = sc.fit_transform(X_train)

X_train



In [None]:
X_test=sc.transform(X_test)
X_test

## LOGISTIC REGRESSION

In [None]:
classifier=LogisticRegression(random_state=1)
classifier.fit(X_train,y_train)
y_pred=classifier.predict(X_test)
y_pred


In [None]:
y_test

In [None]:
cm=confusion_matrix(y_test,y_pred)
cm=print(cm)

In [None]:
accuracy_score(y_test,y_pred)

### Hyperparameter tuning for Logistic Regression

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Grid-search logistic regression on the scaled heart-disease training split.
# The search must run on X_train / y_train: generating synthetic data here
# would tune the model on noise and overwrite the notebook's X and y.
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
grid = dict(solver=solvers, penalty=penalty, C=c_values)

# 10-fold stratified cross-validation repeated 3 times.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                           scoring='accuracy', error_score=0)
grid_result = grid_search.fit(X_train, y_train)

print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")

### K-NN MODEL

In [None]:
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
y_pred


In [None]:
y_test

In [None]:
cm=confusion_matrix(y_test,y_pred)
cm=print(cm)


In [None]:
accuracy_score(y_test,y_pred)

## Hyperparameter tuning for k-NN

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Grid-search k-NN on the scaled heart-disease training split (not on
# synthetic data, which would also overwrite the notebook's X and y).
model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)  # odd k from 1 to 19
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
grid = dict(n_neighbors=n_neighbors, weights=weights, metric=metric)

# 10-fold stratified cross-validation repeated 3 times.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                           scoring='accuracy', error_score=0)
grid_result = grid_search.fit(X_train, y_train)

print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")

## SVM modelling

In [None]:
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
y_pred

In [None]:
y_test

In [None]:
cm=confusion_matrix(y_test,y_pred)
cm=print(cm)

In [None]:
accuracy_score(y_test,y_pred)

## Hyperparameters in SVM

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Grid-search the SVM on the scaled heart-disease training split. Fitting on
# the notebook-level X / y is unsafe because other tuning cells may have
# rebound those names to unrelated data.
model = SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']
grid = dict(kernel=kernel, C=C, gamma=gamma)

# 10-fold stratified cross-validation repeated 3 times.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                           scoring='accuracy', error_score=0)
grid_result = grid_search.fit(X_train, y_train)

print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")

## Kernel SVM

In [None]:
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
y_pred

In [None]:
y_test

In [None]:
cm=confusion_matrix(y_test,y_pred)
cm=print(cm)

In [None]:
accuracy_score(y_test,y_pred)

## Hyperparameter tuning for Kernel SVM

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Grid-search the SVM (all four kernels) on the scaled heart-disease training
# split. Fitting on the notebook-level X / y is unsafe because other tuning
# cells may have rebound those names to unrelated data.
model = SVC()
kernel = ['poly', 'rbf', 'sigmoid', 'linear']
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']
grid = dict(kernel=kernel, C=C, gamma=gamma)

# 10-fold stratified cross-validation repeated 3 times.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                           scoring='accuracy', error_score=0)
grid_result = grid_search.fit(X_train, y_train)

print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")

## Naive Bayes modelling

In [None]:
classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
y_pred

In [None]:
y_test

In [None]:
cm=confusion_matrix(y_test,y_pred)
cm=print(cm)

In [None]:
accuracy_score(y_test,y_pred)

### Decision Tree Modelling

In [None]:
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
y_pred

In [None]:
y_test

In [None]:
cm=confusion_matrix(y_test,y_pred)
cm=print(cm)

In [None]:
accuracy_score(y_test,y_pred)

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier

# Grid-search a bagging ensemble on the scaled heart-disease training split
# (not on synthetic data, which would also overwrite the notebook's X and y).
# NOTE(review): this section is titled "Decision Tree" but tunes
# BaggingClassifier; confirm that bagged trees are the intended model.
model = BaggingClassifier()
n_estimators = [10, 100, 1000]
grid = dict(n_estimators=n_estimators)

# 10-fold stratified cross-validation repeated 3 times.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                           scoring='accuracy', error_score=0)
grid_result = grid_search.fit(X_train, y_train)

print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")

### Random Forest Classification Modelling





In [None]:
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
y_pred

In [None]:
y_test

In [None]:
cm=confusion_matrix(y_test,y_pred)
cm=print(cm)

In [None]:
accuracy_score(y_test,y_pred)

### Hyperparameters in rfc

In [None]:

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Grid-search the random forest on the scaled heart-disease training split
# (not on synthetic data, which would also overwrite the notebook's X and y).
model = RandomForestClassifier()
n_estimators = [10, 100, 1000]
max_features = ['sqrt', 'log2']
grid = dict(n_estimators=n_estimators, max_features=max_features)

# 10-fold stratified cross-validation repeated 3 times.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                           scoring='accuracy', error_score=0)
grid_result = grid_search.fit(X_train, y_train)

print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")

### Gradient Boosting Model

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators=300,
                                 learning_rate=0.05,
                                 random_state=100,
                                 max_features=5 )
gbc.fit(X_train,y_train)
y_pred=gbc.predict(X_test)
y_pred

In [None]:
y_test

In [None]:
cm=confusion_matrix(y_test,y_pred)
cm=print(cm)

In [None]:
accuracy_score(y_test,y_pred)

In [None]:

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

# Grid-search gradient boosting on the scaled heart-disease training split
# (not on synthetic data, which would also overwrite the notebook's X and y).
# The grid has 3 * 3 * 3 * 3 = 81 combinations, each fitted 30 times by the
# repeated cross-validation below, so this cell is slow.
model = GradientBoostingClassifier()
n_estimators = [10, 100, 1000]
learning_rate = [0.001, 0.01, 0.1]
subsample = [0.5, 0.7, 1.0]
max_depth = [3, 7, 9]
grid = dict(learning_rate=learning_rate, n_estimators=n_estimators,
            subsample=subsample, max_depth=max_depth)

# 10-fold stratified cross-validation repeated 3 times.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                           scoring='accuracy', error_score=0)
grid_result = grid_search.fit(X_train, y_train)

print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")

## XGBoost Model

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
from xgboost import XGBClassifier

# Train XGBoost with its default settings on the scaled training split.
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

# Score the held-out test split: print the confusion matrix, then show accuracy.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)