In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Load the heart-failure dataset from the Kaggle input directory.
DATA_PATH = '/kaggle/input/heart-failure-prediction/heart.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows.
df.head(5)

In [None]:
# Print the column names, dtypes and non-null counts.
df.info()

In [None]:
# Count the missing values in every column.
df.isna().sum()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# List the column labels of the frame.
df.columns

## Data Visualization

In [None]:
# Pairwise relationship grid over the columns of the frame.
viz = sns.pairplot(data=df)
viz

## Resting BP

In [None]:
# Summary statistics of resting blood pressure, followed by a scatter
# plot of resting blood pressure (x) against age (y).
resting_bp = df['RestingBP']
print(resting_bp.describe())

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(resting_bp, df['Age'])
ax.set_xlabel("RestingBP")
ax.set_ylabel("Age")

## Chest Pain

In [None]:
# Number of rows per chest-pain category, shown as a pie chart.
chest_pain = df['ChestPainType'].value_counts()

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(chest_pain, labels=chest_pain.index, autopct='%1.1f%%', startangle=140)
ax.set_title('Distribution of Chest Pain Types')
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle

plt.show()

## Comparison

In [None]:
from scipy.stats import mannwhitneyu, ttest_ind

# Split the rows once by the HeartDisease flag, then pull out the two
# columns that are compared between the groups.
without_hd = df[df['HeartDisease'] == 0]
with_hd = df[df['HeartDisease'] == 1]

exercise_angina_no_heart_disease = without_hd['ExerciseAngina']
exercise_angina_heart_disease = with_hd['ExerciseAngina']

oldpeak_no_heart_disease = without_hd['Oldpeak']
oldpeak_heart_disease = with_hd['Oldpeak']

# One entry per panel: (series for both groups, title, x-axis label).
group_labels = ['No Heart Disease', 'Heart Disease']
panels = [
    ([exercise_angina_no_heart_disease, exercise_angina_heart_disease],
     'Exercise Angina Distribution',
     'Exercise Angina (0: No, 1: Yes)'),
    ([oldpeak_no_heart_disease, oldpeak_heart_disease],
     'Oldpeak Distribution',
     'Oldpeak'),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (groups, title, xlabel) in zip(axes, panels):
    ax.hist(groups, bins=20, alpha=0.5, label=group_labels)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.legend()

plt.tight_layout()
plt.show()

## Oldpeak

In [None]:
# Overlaid kernel density estimates of Oldpeak for the two HeartDisease
# groups (series created in the comparison cell above).
# `fill=True` is the current seaborn spelling of the deprecated
# `shade=True` argument and draws the same filled area under each curve.
sns.kdeplot(oldpeak_heart_disease, color='red', label='Heart Disease', fill=True)
sns.kdeplot(oldpeak_no_heart_disease, color='blue', label='No Heart Disease', fill=True)
plt.title('Oldpeak Distribution (Density Plot)')
plt.xlabel('Oldpeak')
plt.ylabel('Density')
plt.legend()

plt.tight_layout()
plt.show()

## Cholesterol

In [None]:
# Histogram of the Cholesterol column (40 bins, bars at 80% width).
fig, ax = plt.subplots()
ax.hist(df['Cholesterol'], bins=40, rwidth=0.8)
ax.set_xlabel('cholesterol')
ax.set_ylabel('Count')
plt.show()

In [None]:
from scipy.stats import norm
import numpy as np

# Histogram of Cholesterol with a normal curve (same mean and standard
# deviation) drawn on the SAME axes.
# - density=True normalises the bars so they share the pdf's scale;
#   raw counts would dwarf the curve.
# - The curve is plotted before plt.show(); drawing it afterwards puts it
#   on a new, separate figure.
cholesterol = df.Cholesterol
plt.hist(cholesterol, bins=40, rwidth=0.8, density=True)
rng = np.arange(cholesterol.min(), cholesterol.max(), 0.1)
plt.plot(rng, norm.pdf(rng, cholesterol.mean(), cholesterol.std()))
plt.xlabel('Cholesterol')
plt.ylabel('Density')
plt.show()

In [None]:
# Mean of the Cholesterol column.
df['Cholesterol'].mean()

In [None]:
# Standard deviation of the Cholesterol column.
df['Cholesterol'].std()

In [None]:
# Full set of summary statistics for the Cholesterol column.
df.Cholesterol.describe()

## Outlier

In [None]:
# Upper outlier bound for Cholesterol: three standard deviations above
# the mean, the same 3-sigma threshold the z-score cells use (|z| > 3).
# A multiplier of ~108 would put the bound far beyond any real value.
upper_limit = df.Cholesterol.mean() + 3*df.Cholesterol.std()
upper_limit

In [None]:
# Lower outlier bound for Cholesterol: three standard deviations below
# the mean, the same 3-sigma threshold the z-score cells use (|z| > 3).
# A multiplier of ~108 would put the bound far beyond any real value.
lower_limit = df.Cholesterol.mean() - 3*df.Cholesterol.std()
lower_limit

In [None]:
# Keep only rows whose Cholesterol lies strictly between the two limits.
# Both conditions must hold, so they are combined with `&`; with `|`
# every value satisfies at least one side whenever lower_limit < upper_limit
# and nothing is filtered out.
df1 = df[(df.Cholesterol < upper_limit) & (df.Cholesterol > lower_limit)]
df1

In [None]:
# Refer to the limit-filtered frame under a descriptive name and preview it.
df_no_outlier = df1
df_no_outlier.head(5)

In [None]:
# (rows, columns) of the limit-filtered frame.
df_no_outlier.shape

### Z-score: $z = (x - \mu) / \sigma$

In [None]:
# Store the Cholesterol z-score, (x - mean) / std, in a 'zscore' column.
chol = df['Cholesterol']
df['zscore'] = (chol - chol.mean()) / chol.std()
df.head()

In [None]:
# Rows more than three standard deviations above the mean.
df.loc[df.zscore > 3]


In [None]:
# Rows more than three standard deviations below the mean.
df.loc[df.zscore < -3]


In [None]:
# All rows whose z-score is beyond three standard deviations on either side.
df[df['zscore'].abs() > 3]

In [None]:
# Rows whose z-score lies strictly inside (-3, 3).
df0 = df[df['zscore'].abs() < 3]
df0.head(5)

In [None]:
# (rows, columns) after the z-score filter.
df0.shape

In [None]:
# (rows, columns) of the unfiltered frame, for comparison with df0.
df.shape

In [None]:
# Outlier bounds for RestingBP: mean +/- three standard deviations, the
# same 3-sigma threshold the z-score cells use (|z| > 3). A multiplier of
# ~108 would put both bounds far beyond any real value.
bp_mean = df.RestingBP.mean()
bp_std = df.RestingBP.std()

upper_limit = bp_mean + 3*bp_std
print(upper_limit)
lower_limit = bp_mean - 3*bp_std
print(lower_limit)

In [None]:
# Keep only rows whose RestingBP lies strictly between the two limits.
# Both conditions must hold, so they are combined with `&`; with `|`
# every value satisfies at least one side whenever lower_limit < upper_limit
# and nothing is filtered out.
df2 = df[(df.RestingBP < upper_limit) & (df.RestingBP > lower_limit)]
df2

In [None]:
# (rows, columns) of the RestingBP limit-filtered frame.
df2.shape

In [None]:
# Overwrite the 'zscore' column with the RestingBP z-score, (x - mean) / std.
resting_bp_values = df['RestingBP']
df['zscore'] = (resting_bp_values - resting_bp_values.mean()) / resting_bp_values.std()
df.head(5)

In [None]:
# Rows more than three standard deviations above the mean.
df.loc[df.zscore > 3]


In [None]:
# Rows more than three standard deviations below the mean.
df.loc[df.zscore < -3]


In [None]:
# All rows whose z-score is beyond three standard deviations on either side.
df[df['zscore'].abs() > 3]

In [None]:
# Rows whose z-score lies strictly inside (-3, 3); rebinds df1.
df1 = df[df['zscore'].abs() < 3]
df1

## Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder
#from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
# One LabelEncoder instance; it is re-fitted on each categorical column
# it is applied to in the next cell.
Encoder = LabelEncoder()
# Alternative scaler, currently disabled:
#scalar = StandardScaler()
# Min-max scaler applied to the numeric feature columns below.
scalar = MinMaxScaler()

In [None]:
# Replace each categorical column with integer codes. The shared encoder
# is re-fitted per column, in the order listed.
categorical_columns = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
for column in categorical_columns:
    df[column] = Encoder.fit_transform(df[column])

In [None]:
# Rescale the listed numeric columns in place with the scaler created above.
scaled_columns = ['MaxHR', 'Cholesterol', 'Age', 'RestingBP']
df[scaled_columns] = scalar.fit_transform(df[scaled_columns])

In [None]:
# Preview the frame after encoding and scaling.
df.head(5)

In [None]:
# Select the target and features by column name rather than position.
# Positional slicing (all but the last two columns / second-to-last
# column) is only right while the helper 'zscore' column is the last
# column; if it is missing, a feature is silently used as the target.
# errors='ignore' lets the cell work whether or not 'zscore' was added.
X = df.drop(columns=['HeartDisease', 'zscore'], errors='ignore')
y = df['HeartDisease']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the
# split, and every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
pip install lazypredict

In [None]:
from lazypredict.Supervised import LazyClassifier

In [None]:
# Fit LazyClassifier on the train/test split and print the resulting
# model comparison table.
clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
print(models)

# MACHINE LEARNING MODELS

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
#from tpot import TPOTClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Logistic regression: fit on the training split, then report accuracy
# (in percent, 3 decimals) on the training and test splits.
logreg = LogisticRegression().fit(X_train, y_train)

train_accuracy = logreg.score(X_train, y_train)
test_accuracy = logreg.score(X_test, y_test)
acc_log = round(train_accuracy * 100, 3)
log = round(test_accuracy * 100, 3)

print("Train: ", acc_log)
print("Test: ", log)

In [None]:
# Linear-kernel support vector classifier.
# The fitted estimator gets its own name so the imported SVC class is not
# shadowed; rebinding the class name to an instance makes the cell fail
# ("object is not callable") when it is run a second time.
svc_model = SVC(kernel='linear', C=1)
svc_model.fit(X_train, y_train)

# Accuracy in percent (3 decimals) on the training and test splits.
acc_svc = round(svc_model.score(X_train, y_train) * 100, 3)
svc = round(svc_model.score(X_test, y_test) * 100, 3)
print("Train: ", acc_svc)
print("Test: ", svc)

In [None]:
# Random forest with 10 trees.
# The fitted estimator gets its own name so the imported
# RandomForestClassifier class is not shadowed (rebinding the class name
# to an instance breaks the cell on a second run). random_state fixes
# the otherwise random tree construction so the scores are reproducible.
random_forest_model = RandomForestClassifier(n_estimators=10, random_state=42)
random_forest_model.fit(X_train, y_train)

# Accuracy in percent (3 decimals) on the training and test splits.
acc_random_forest = round(random_forest_model.score(X_train, y_train) * 100, 3)
random_forest = round(random_forest_model.score(X_test, y_test) * 100, 3)
print("Train: ", acc_random_forest)
print("Test: ", random_forest)

In [None]:
# Single decision tree with default hyper-parameters.
# The fitted estimator gets its own name so the imported
# DecisionTreeClassifier class is not shadowed (rebinding the class name
# to an instance breaks the cell on a second run). random_state makes
# tie-breaking between equally good splits reproducible.
decision_tree_model = DecisionTreeClassifier(random_state=42)
decision_tree_model.fit(X_train, y_train)

# Accuracy in percent (3 decimals) on the training and test splits.
acc_decision_tree = round(decision_tree_model.score(X_train, y_train) * 100, 3)
decision_tree = round(decision_tree_model.score(X_test, y_test) * 100, 3)
print("Train: ",acc_decision_tree)
print("Test: ", decision_tree)

In [None]:
# k-nearest-neighbours classifier with k = 3.
# The fitted estimator gets its own name so the imported
# KNeighborsClassifier class is not shadowed; rebinding the class name to
# an instance makes the cell fail when it is run a second time.
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train)

# Accuracy in percent (3 decimals) on the training and test splits.
acc_knn = round(knn_model.score(X_train, y_train) * 100, 3)
knn = round(knn_model.score(X_test, y_test) * 100, 3)
print("Train: ", acc_knn)
print("Test: ", knn)

In [None]:
# Gradient boosting classifier with hand-set hyper-parameters.
# The fitted estimator gets its own name so the imported
# GradientBoostingClassifier class is not shadowed; rebinding the class
# name to an instance makes the cell fail when it is run a second time.
gbc_model = GradientBoostingClassifier(
    learning_rate=0.01,
    max_depth=8,
    max_features=0.5,
    min_samples_leaf=17,
    min_samples_split=6,
    n_estimators=100,
    subsample=0.6,
    random_state=42
)
gbc_model.fit(X_train, y_train)

# Accuracy in percent (3 decimals) on the training and test splits.
acc_GBC = round(gbc_model.score(X_train, y_train) * 100, 3)
GBC = round(gbc_model.score(X_test, y_test) * 100, 3)
print("Train: ", acc_GBC)
print("Test: ", GBC)

In [None]:
# Extra-trees ensemble with hand-set hyper-parameters.
# The fitted estimator gets its own name so the imported
# ExtraTreesClassifier class is not shadowed; rebinding the class name to
# an instance makes the cell fail when it is run a second time.
extra_trees_model = ExtraTreesClassifier(
    bootstrap=False,
    max_features=0.2,
    min_samples_leaf=2,
    min_samples_split=13,
    n_estimators=100,
    random_state=20)
extra_trees_model.fit(X_train, y_train)

# Accuracy in percent (3 decimals) on the training and test splits.
acc_extra_tree = round(extra_trees_model.score(X_train, y_train) * 100, 3)
extra_tree = round(extra_trees_model.score(X_test, y_test) * 100, 3)
print("Train: ", acc_extra_tree)
print("Test: ", extra_tree)

In [None]:
import lightgbm as lgb
from lightgbm import LGBMClassifier


In [None]:
# LightGBM classifier with default hyper-parameters.
# The fitted model is kept under its own name so it stays available
# after this cell; reusing `clf` for both the model and its score would
# discard the model.
lgbm_model = lgb.LGBMClassifier()
lgbm_model.fit(X_train, y_train)

# Accuracy in percent (3 decimals) on the training and test splits.
# `clf` keeps holding the test score because the score card reads it.
acc_clf = round(lgbm_model.score(X_train, y_train) * 100, 3)
clf = round(lgbm_model.score(X_test, y_test) * 100, 3)
print("Train: ", acc_clf)
print("Test: ", clf)

## SCORE CARD

In [None]:
# Score card: one row per model with its training and test accuracy
# (percent), shown sorted from best to worst test accuracy.
score_rows = [
    ('Support Vector Machines', acc_svc, svc),
    ('KNN', acc_knn, knn),
    ('Logistic Regression', acc_log, log),
    ('Random Forest', acc_random_forest, random_forest),
    ('Decision Tree', acc_decision_tree, decision_tree),
    ('ExtraTreesClassifier', acc_extra_tree, extra_tree),
    ('GradientBoostingClassifier', acc_GBC, GBC),
    ('LGBMClassifier', acc_clf, clf),
]
models = pd.DataFrame(
    score_rows,
    columns=['Model', 'Training_score', 'Testing_score'],
)

models.sort_values(by='Testing_score', ascending=False)

According to these models, Random Forest and ExtraTreesClassifier give the best results.