# **HEART FAILURE CLASSIFICATION**
This project uses a dataset named heart.csv, which has 12 columns: 11 attributes used to train the classifier plus the HeartDisease target. The file contains 918 records with no duplicates.

# Import the required Libraries

In [667]:
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Reading the Dataset
The dataset consists of 12 columns, including HeartDisease, which is the target to be predicted.

In [668]:
# Load heart.csv from the notebook's input directory into `df` and preview
# the first rows (the cell's displayed output is `df.head()`).
df = pd.read_csv("../input/heart-failure-prediction/heart.csv")
df.head()
# df['ChestPainType'].unique()

In [669]:
# Summary statistics of the loaded frame.
df.describe()

In [670]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [671]:
# Pairwise correlation of the numeric columns only. The categorical columns are
# still strings at this point (they are label-encoded further down), and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
df.select_dtypes(include="number").corr()

# Data Visualization

In [672]:
# Heatmap of the numeric-column correlations. String columns are excluded
# explicitly because DataFrame.corr() raises on them in pandas >= 2.0.
sns.heatmap(df.select_dtypes(include="number").corr()).set_title("Correlation Map")

In [673]:
# Grid of pairwise plots across the columns of df.
sns.pairplot(df)

In [674]:
# Scatter of Oldpeak against Age with a fitted regression line.
sns.regplot(data=df, x="Age", y="Oldpeak")

# 1. Gender Plot

In [675]:
# Number of patients per sex: printed table followed by a bar chart.
# Uses `df` directly; `df_copy` is never defined anywhere in this notebook.
print(df["Sex"].value_counts())
sns.set_theme(style="darkgrid")
ax = sns.countplot(data=df, x="Sex")
plt.show()

In [676]:
# Contingency table: Sex (rows) against HeartDisease (columns).
pd.crosstab(index=df["Sex"], columns=df["HeartDisease"])

In [677]:
# Display names for the raw codes in the Sex column; any other code falls back to str(code).
SEX_NAMES = {"M": "Male", "F": "Female"}
sex_counts = df["Sex"].value_counts()
# Labels are derived from the value_counts index so they always match the wedge order.
sex_labels = [SEX_NAMES.get(code, str(code)) for code in sex_counts.index]
sex_colors = ['#99ff99', '#e4f7e4']

# Defined here so the cell runs on a fresh kernel (same bins as the Age Plot section).
age_groups = pd.cut(df['Age'], bins=[27, 40, 52, 64, np.inf])

fig, (ax_pie, ax_count) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: share of each sex.
ax_pie.pie(sex_counts, labels=sex_labels, autopct='%.0f%%', shadow=True, colors=sex_colors)
ax_pie.set_title("Gender Percentage")

# Right panel: patients per age group, split by sex. The hue values are mapped
# to display names so the automatic legend is correct without overriding it.
sns.countplot(
    x=age_groups,
    hue=df["Sex"].map(lambda code: SEX_NAMES.get(code, str(code))),
    hue_order=sex_labels,
    palette=sex_colors,
    ax=ax_count,
)
ax_count.set_title("Count plot of age groups with gender information")

plt.tight_layout()
plt.show()


# 2. Age Plot

In [678]:
# Bucket Age into four bands. pd.cut intervals are right-closed by default, so
# the first band is (27, 40] and any age <= 27 would become NaN.
# Equal-width alternative: pd.cut(df['Age'], bins=4)
age_groups = pd.cut(df['Age'], bins=[27, 40, 52, 64, np.inf])
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(age_groups.value_counts())
sns.set_theme(style="darkgrid")
# Pass the series by keyword; newer seaborn releases treat the first positional
# argument of countplot as `data`, not `x`.
ax = sns.countplot(x=age_groups)
plt.show()

**Gender Count in each Age Group**

In [679]:
# Contingency table: age group (rows) against Sex (columns).
pd.crosstab(index=age_groups, columns=df["Sex"])

In [680]:
# Contingency table: age group (rows) against HeartDisease (columns).
pd.crosstab(index=age_groups, columns=df["HeartDisease"])

# 3. ChestPainType Plot

In [681]:
# Frequency of each ChestPainType value: printed table, then bar chart.
chest_pain_counts = df["ChestPainType"].value_counts()
print(chest_pain_counts)
sns.set_theme(style="darkgrid")
ax = sns.countplot(x="ChestPainType", data=df)
plt.show()

In [682]:
# Contingency table: ChestPainType (rows) against age group (columns).
pd.crosstab(index=df["ChestPainType"], columns=age_groups)

In [683]:
# Contingency table: ChestPainType (rows) against Sex (columns).
pd.crosstab(index=df["ChestPainType"], columns=df["Sex"])

In [684]:
# Contingency table: ChestPainType (rows) against HeartDisease (columns).
pd.crosstab(index=df["ChestPainType"], columns=df["HeartDisease"])

# 4. ExerciseAngina Plot

In [685]:
# Frequency of each ExerciseAngina value: printed table, then bar chart.
exercise_angina_counts = df["ExerciseAngina"].value_counts()
print(exercise_angina_counts)
sns.set_theme(style="darkgrid")
ax = sns.countplot(x="ExerciseAngina", data=df)
plt.show()

In [686]:
# Contingency table: ExerciseAngina (rows) against Sex (columns).
pd.crosstab(index=df["ExerciseAngina"], columns=df["Sex"])

In [687]:
# Contingency table: ExerciseAngina (rows) against HeartDisease (columns).
pd.crosstab(index=df["ExerciseAngina"], columns=df["HeartDisease"])

# 5. RestingECG Plot

In [688]:
# Frequency of each RestingECG value: printed table, then bar chart.
resting_ecg_counts = df["RestingECG"].value_counts()
print(resting_ecg_counts)
sns.set_theme(style="darkgrid")
ax = sns.countplot(x="RestingECG", data=df)
plt.show()

In [689]:
# Contingency table: RestingECG (rows) against age group (columns).
pd.crosstab(index=df["RestingECG"], columns=age_groups)

In [690]:
# Contingency table: RestingECG (rows) against Sex (columns).
pd.crosstab(index=df["RestingECG"], columns=df["Sex"])

In [691]:
# Contingency table: RestingECG (rows) against HeartDisease (columns).
pd.crosstab(index=df["RestingECG"], columns=df["HeartDisease"])

# 6. ST_Slope Plot


In [692]:
# Frequency of each ST_Slope value: printed table, then bar chart.
st_slope_counts = df["ST_Slope"].value_counts()
print(st_slope_counts)
sns.set_theme(style="darkgrid")
ax = sns.countplot(x="ST_Slope", data=df)
plt.show()

In [693]:
# Contingency table: ST_Slope (rows) against age group (columns).
pd.crosstab(index=df["ST_Slope"], columns=age_groups)

In [694]:
# Contingency table: ST_Slope (rows) against Sex (columns).
pd.crosstab(index=df["ST_Slope"], columns=df["Sex"])

In [695]:
# Contingency table: ST_Slope (rows) against HeartDisease (columns).
pd.crosstab(index=df["ST_Slope"], columns=df["HeartDisease"])

# Data Cleaning and Manipulation
> The data is checked for null values. Since there are no null entries in this dataset, no cleaning was done.
The string (object-type) columns are converted to integer values using LabelEncoder, with one integer per category.
The converted columns are listed below.

In [696]:
# Visual check for missing data: heatmap of the boolean null mask.
null_mask = df.isnull()
null_ax = sns.heatmap(null_mask)
null_ax.set_title("Null Values in Dataset")

plt.show()

# One-Hot Encoding trial

In [697]:
# from sklearn.preprocessing import OneHotEncoder
# ins_enc = OneHotEncoder(handle_unknown='ignore')
# ins_enc_df = pd.DataFrame(ins_enc.fit_transform(df[['Sex']]).toarray())

In [698]:
# ins_enc_df.head()
# df_ohe = pd.merge(left=df, right=ins_enc_df, left_index=True, right_index=True)
# # df_ohe.info()
# df_ohe.rename(columns={"0":"Male", "1":"Female"})
# df_ohe.head()

In [699]:
# sns.heatmap(df_ohe.corr())

# Label Encoding

Using Label Encoder the following fields are categorized.
* **Sex:**
        Female - 0
        Male   - 1
* **ChestPainType:**
        ASY - 0
        ATA - 1
        NAP - 2
        TA  - 3    
* **ExerciseAngina:** 
        N - 0
        Y - 1    
* **RestingECG:**  
        LVH    - 0
        Normal - 1
        ST     - 2    
* **ST_Slope:**
        Up - 2
        Flat - 1
        Down - 0

In [700]:
from sklearn.preprocessing import LabelEncoder

# One shared encoder is refit on each categorical column in turn and the column
# is overwritten in `df` with its integer codes. Because the encoder is reused,
# only the last column's classes remain on `label_enc` afterwards.
label_enc = LabelEncoder()
for column in ('Sex', 'ExerciseAngina', 'ChestPainType', 'RestingECG', 'ST_Slope'):
    df[column] = label_enc.fit_transform(df[column])
df.head()

In [701]:
# df['ChestPainType'].unique()

In [702]:
# Correlation matrix recomputed after the categorical columns were label-encoded in place.
df.corr()

In [703]:
# Heatmap of the post-encoding correlation matrix on a diverging pink/green colormap.
encoded_corr = df.corr()
corr_ax = sns.heatmap(encoded_corr, cmap="PiYG")
corr_ax.set_title("Correlation Map")

# Model Training with just one Feature - Age

***Trying to understand how Logistic Regression works with just one feature, chosen here to be Age.***

In [704]:
from sklearn.model_selection import train_test_split

# Experiment 1: Age is the only predictor; hold out 20% of rows with a fixed seed.
train_df1 = df[['Age']]
X1, y1 = train_df1.values, df['HeartDisease'].values
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X1, y1, test_size=0.20, random_state=42
)

In [705]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the Age-only split and report held-out accuracy.
log_model = LogisticRegression(solver='lbfgs', max_iter=1000)
log_model.fit(X_train1, y_train1)
age_only_score = log_model.score(X_test1, y_test1)
print("model score: %.3f" % age_only_score)

# Model Training Using just 2 columns - Age and Sex

Trying out Logistic regression and checking the accuracy using 2 random features

In [706]:
# Experiment 2: Age and Sex as predictors; same 80/20 split and seed as before.
train_df2 = df[['Age', 'Sex']]
X2, y2 = train_df2.values, df['HeartDisease'].values
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2, y2, test_size=0.20, random_state=42
)

In [707]:
# Refit the same estimator object on the two-feature split and report held-out accuracy.
log_model.fit(X_train2, y_train2)
age_sex_score = log_model.score(X_test2, y_test2)
print("model score: %.3f" % age_sex_score)

# Model Training - All features

In [708]:
# Experiment 3: every column except the target is a predictor.
# `train_df3` is another name for `df`, not a copy.
train_df3 = df
data3 = train_df3.values
y3 = train_df3['HeartDisease'].values
X3 = train_df3.drop(columns=['HeartDisease']).values

In [709]:
# 80/20 split on the full feature matrix, then refit and report held-out accuracy.
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X3, y3, test_size=0.20, random_state=42
)
log_model.fit(X_train3, y_train3)
all_features_score = log_model.score(X_test3, y_test3)
print("model score: %.3f" % all_features_score)

# Model Training without Resting ECG and Resting BP

***Since RestingECG and RestingBP correlate only weakly with HeartDisease, these columns are dropped before training.***

In [710]:
# Experiment 4: drop RestingBP and RestingECG (plus the target) from the predictors.
# `train_df4` is another name for `df`, not a copy.
train_df4 = df
excluded_columns = ['RestingBP', 'RestingECG', 'HeartDisease']
y4 = train_df4['HeartDisease'].values
X4 = train_df4.drop(columns=excluded_columns).values
print(X4)

In [711]:
# 80/20 split on the reduced feature matrix, then refit and report held-out accuracy.
X_train4, X_test4, y_train4, y_test4 = train_test_split(
    X4, y4, test_size=0.20, random_state=42
)
log_model.fit(X_train4, y_train4)
reduced_score = log_model.score(X_test4, y_test4)
print("model score: %.3f" % reduced_score)

In [712]:
# Generates a synthetic 1000-sample, two-centre dataset and prints its shapes.
# NOTE(review): `X` is not referenced again in the visible notebook and `y` is
# reassigned in a later cell, so this looks like a leftover experiment; confirm
# before relying on it.
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=1000, centers=2, random_state=1)
print(X.shape, y.shape)

In [713]:
from collections import Counter

# Tally how many rows carry each HeartDisease label.
counter = Counter(y4)
print(counter)

In [714]:
# Show the first ten feature rows next to their target labels.
for i in range(10):
    features_row, target_label = X4[i], y4[i]
    print(features_row, target_label)

In [715]:
from matplotlib import pyplot
from numpy import where

# Scatter the first two columns of X4, one colour per HeartDisease label.
# Axis labels come from the frame X4 was built from, so the plot can be read on
# its own.
feature_names = train_df4.drop(columns=['RestingBP', 'RestingECG', 'HeartDisease']).columns
for label in counter:
    row_ix = where(y4 == label)[0]
    pyplot.scatter(X4[row_ix, 0], X4[row_ix, 1], label=str(label))
pyplot.xlabel(feature_names[0])
pyplot.ylabel(feature_names[1])
pyplot.legend(title="HeartDisease")
pyplot.show()

# Multiple Model Trainings

In [716]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [717]:
from sklearn.metrics import (
    accuracy_score,
    auc,
    average_precision_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import cross_val_score

# plot_roc_curve and plot_precision_recall_curve were removed in scikit-learn 1.2.
# Importing them separately keeps the rest of the metrics available on newer
# releases; there the Display classes are imported instead
# (e.g. RocCurveDisplay.from_estimator).
try:
    from sklearn.metrics import plot_roc_curve, plot_precision_recall_curve
except ImportError:
    from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay

In [718]:
# Feature matrix: every column except the target. The previous positional slice
# `iloc[:, 1:-1]` also dropped column 0, silently excluding the first feature.
x = df.drop(columns=['HeartDisease']).values
# Select the target by name rather than by "last column" position.
y = df['HeartDisease'].values

In [719]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

In [720]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training rows only, then apply the same
# transform to both the training and the test rows.
sc = StandardScaler()
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [721]:
# Candidate classifiers, each stored as a [display name, estimator] pair.
lgr = ['Logistic Regression', LogisticRegression(solver='lbfgs', max_iter=1000, random_state=0)]
svm = ['SVM', SVC(random_state=0)]
gaus = ['GaussianNB', GaussianNB()]
bern = ['BernoulliNB', BernoulliNB()]
dectree = ['Decision Tree', DecisionTreeClassifier(random_state=0)]
randF = ['Random Forest', RandomForestClassifier(random_state=45)]
xgb = ['XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='error')]

In [722]:
# Models to evaluate, in evaluation order. The two naive Bayes pairs
# (gaus, bern) are defined above but are not part of this list.
models = [lgr, svm, xgb, randF, dectree]

In [723]:
# Sanity check: how many models will be evaluated.
print(len(models))

In [724]:
def _positive_class_scores(model, features):
    """Return one continuous score per row for the positive class.

    Uses ``predict_proba`` when the fitted estimator exposes it and falls back
    to ``decision_function`` otherwise (e.g. an SVC built without
    ``probability=True``).
    """
    if hasattr(model, "predict_proba"):
        return model.predict_proba(features)[:, 1]
    return model.decision_function(features)


# One row per model, in the column order the results DataFrame expects:
# name, accuracy %, k-fold mean %, k-fold std %, ROC AUC, precision, recall, F1.
lst_1 = []

for name, model in models:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    # ROC AUC needs a ranking score, not hard 0/1 predictions; with hard labels
    # the curve collapses to a single operating point.
    y_score = _positive_class_scores(model, x_test)

    cm = confusion_matrix(y_test, y_pred)  # Confusion Matrix
    accuracy = accuracy_score(y_test, y_pred)
    # NOTE(review): x_train was scaled before this call, so each CV fold sees
    # scaling statistics computed on the whole training set.
    accuracies = cross_val_score(estimator=model, X=x_train, y=y_train, cv=10)  # K-Fold Validation
    roc = roc_auc_score(y_test, y_score)  # ROC AUC Score
    precision = precision_score(y_test, y_pred)  # Precision Score
    recall = recall_score(y_test, y_pred)  # Recall Score
    f1 = f1_score(y_test, y_pred)  # F1 Score

    print(name, ':')
    print(cm)
    print('Accuracy Score: ', accuracy)
    print('')
    print("K-Fold Validation Mean Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print('')
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))
    print('')
    print('ROC AUC Score: {:.2f}'.format(roc))
    print('')
    print('Precision: {:.2f}'.format(precision))
    print('')
    print('Recall: {:.2f}'.format(recall))
    print('')
    print('F1: {:.2f}'.format(f1))
    print('-----------------------------------')
    print('')

    lst_1.append([
        name,
        accuracy * 100,
        accuracies.mean() * 100,
        accuracies.std() * 100,
        roc,
        precision,
        recall,
        f1,
    ])

In [725]:
# Collect the per-model metric rows into a labelled comparison table.
result_columns = [
    'Model',
    'Accuracy',
    'K-Fold Mean Accuracy',
    'Std. Deviation',
    'ROC AUC',
    'Precision',
    'Recall',
    'F1',
]
df_model = pd.DataFrame(lst_1, columns=result_columns)

In [726]:
# Rank models best-first by test accuracy, breaking ties on k-fold mean accuracy.
ranking_keys = ['Accuracy', 'K-Fold Mean Accuracy']
df_model.sort_values(by=ranking_keys, ascending=False, inplace=True)
df_model