<a href="https://www.kaggle.com/code/franciscomesquita/predict-heart-disease-all-preprocessing?scriptVersionId=102879846" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<a id="1"></a>

<h1 style="background-color:orange;font-family:newtimeroman;font-size:250%;text-align:center;border-radius: 15px 50px;">Libraries And Utilities</h1>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

<a id="2"></a>

<h1 style="background-color:orange;font-family:newtimeroman;font-size:250%;text-align:center;border-radius: 15px 50px;">Pre-Processing</h1>

**TODO:**


**Problems:**


**Similar example**
- https://www.kaggle.com/cdabakoglu/heart-disease-classifications-machine-learning

In [None]:
# Load the combined heart-disease CSV into the working DataFrame
data_path = "../input/heart-disease-data-combined/heart_statlog_cleveland_hungary_final.csv"
df = pd.read_csv(data_path)

In [None]:
# Categorical ordinal variable from numeric variable:
# turn the numeric 'ST slope' codes into readable labels.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `df['ST slope']`, because the in-place call
# works on an intermediate Series and is not guaranteed to update `df`.
# Codes other than 1/2/3 (e.g. '0') are left unchanged as strings.
slope_labels = {'1': 'upsloping', '2': 'flat', '3': 'downsloping'}
df['ST slope'] = df['ST slope'].astype(str).replace(slope_labels)

# Seaborn Graphs before Pre-Processing

In [None]:
# Distribution plot of the 'cholesterol' column
sns.displot(data=df, x="cholesterol");

In [None]:
# Distribution plot of the 'max heart rate' column
sns.displot(data=df, x="max heart rate");

In [None]:
# Distribution plot of the 'age' column
sns.displot(data=df, x="age");

In [None]:
# Number of rows per target class (original note: the dataset is balanced)
sns.countplot(data=df, x="target");
plt.xticks([0, 1]);

In [None]:
# Number of rows per sex code (original note: many more men than women)
sns.countplot(data=df, x="sex", palette="bwr");
plt.show();

In [None]:
# Relation between target and sex: target counts split by sex
sns.countplot(data=df, x="target", hue="sex");

In [None]:
# Bar chart of target counts for every age value
age_vs_target = pd.crosstab(df.age, df.target)
age_vs_target.plot(kind="bar", figsize=(20, 6));
plt.title('Heart Disease Frequency for Ages');
plt.xlabel('Age');
plt.ylabel('Frequency');
plt.show();

In [None]:
# Kernel-density estimate of age, one curve per sex code
sns.displot(data=df, x="age", hue="sex", kind="kde");

In [None]:
# Number of rows per 'ST slope' category
sns.countplot(data=df, x="ST slope");

In [None]:
# Pairwise relationship grid over the columns of df
sns.pairplot(data=df)

In [None]:
# Relation between slope and target: target counts per 'ST slope' category
slope_vs_target = pd.crosstab(df['ST slope'], df.target)
# rot=0 keeps the category names horizontal
slope_vs_target.plot(kind="bar", figsize=(15, 6), color=['#DAF7A6', '#FF5733'], rot=0)
plt.title('Heart Disease Frequency for Slope')
plt.xlabel('The Slope of The Peak Exercise ST Segment ')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Target counts for each 'fasting blood sugar' value
fbs_vs_target = pd.crosstab(df['fasting blood sugar'], df.target)
# rot=0 keeps the tick labels horizontal
fbs_vs_target.plot(kind="bar", figsize=(15, 6), color=['#FFC300', '#581845'], rot=0)
plt.title('Heart Disease Frequency According To FBS')
plt.xlabel('FBS - (Fasting Blood Sugar > 120 mg/dl) (1 = true; 0 = false)')
plt.legend(["Haven't Disease", "Have Disease"])
plt.ylabel('Frequency of Disease or Not')
plt.show()

In [None]:
# Relation between chest pain and target: target counts per chest pain type
chest_pain_vs_target = pd.crosstab(df['chest pain type'], df.target)
# rot=0 keeps the tick labels horizontal
chest_pain_vs_target.plot(kind="bar", figsize=(15, 6), color=['#11A5AA', '#AA1190'], rot=0)
plt.title('Heart Disease Frequency According To Chest Pain Type')
plt.xlabel('Chest Pain Type')
plt.ylabel('Frequency of Disease or Not')
plt.show()

In [None]:
# Joint plot of age against max heart rate, coloured by target
sns.jointplot(x="age", y="max heart rate", hue="target", data=df);

In [None]:
# Pearson correlation between the numeric columns.
# 'ST slope' holds string labels at this point, and DataFrame.corr() raises on
# non-numeric columns in recent pandas versions, so only numeric columns are used.
pearson = df.select_dtypes(include='number').corr()
sns.heatmap(pearson)

In [None]:
# Spearman rank correlation between the numeric columns.
# 'ST slope' holds string labels at this point, and DataFrame.corr() raises on
# non-numeric columns in recent pandas versions, so only numeric columns are used.
spearman = df.select_dtypes(include='number').corr(method='spearman')
sns.heatmap(spearman)

# Dataset analysis

In [None]:
# Show the dataset dimensions followed by its first rows
dataset_shape = df.shape
print('Shape of the dataset: ', dataset_shape)
first_rows = df.head()
print('\n', first_rows)

In [None]:
# Descriptive statistics of the dataset
summary_stats = df.describe()
print(summary_stats)

In [None]:
# Class balance check: number of rows per target value (original note: it is balanced)
df['target'].value_counts()

In [None]:
# Share of each target class as a percentage of all rows
total_rows = len(df.target)
countNoDisease = int((df.target == 0).sum())
countHaveDisease = int((df.target == 1).sum())
pct_no_disease = countNoDisease / total_rows * 100
pct_have_disease = countHaveDisease / total_rows * 100
print("Percentage of Patients Haven't Heart Disease: {:.2f}%".format(pct_no_disease))
print("Percentage of Patients Have Heart Disease: {:.2f}%".format(pct_have_disease))

In [None]:
# Share of each sex code as a percentage of all rows (original note: very imbalanced)
total_rows = len(df.sex)
countFemale = int((df.sex == 0).sum())
countMale = int((df.sex == 1).sum())
pct_female = countFemale / total_rows * 100
pct_male = countMale / total_rows * 100
print("Percentage of Female Patients: {:.2f}%".format(pct_female))
print("Percentage of Male Patients: {:.2f}%".format(pct_male))

In [None]:
# Mean value of each numeric feature, per target class.
# numeric_only=True skips the string-valued 'ST slope' column, on which
# mean() would otherwise raise in recent pandas versions.
df.groupby('target').mean(numeric_only=True)

In [None]:
# Count the missing values in every column
missing_per_feature = df.isna().sum()
print('Count of missing values by feature: \n', missing_per_feature)

In [None]:
# Remove duplicated rows in place, printing the shape before and after
shape_before = df.shape
print('Duplicated dataset: ', shape_before)
df.drop_duplicates(inplace=True)
shape_after = df.shape
print('Dataset after drop duplicates: ', shape_after)

In [None]:
# Replace the invalid '0' value in 'ST slope' with the most frequent category.

# Imputer that treats the string '0' as the missing marker
categorical_transformer = Pipeline([
    ('catInput', SimpleImputer(missing_values='0', strategy="most_frequent"))
])

# Apply the imputer to the 'ST slope' column only
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, ['ST slope'])
    ])

# fit_transform returns a 2-D array with a single column; flatten it so that a
# plain 1-D array is assigned to the DataFrame column.
df['ST slope'] = preprocessor.fit_transform(df).ravel()

In [None]:
#Categorical to Numerical - ST Slope

# Encode with an explicit category order so that the integer codes are ordinal
# (upsloping=0, flat=1, downsloping=2) and do not depend on which label happens
# to appear first in the data, as pd.factorize codes would.
# Any label outside this list is encoded as -1.
slope_order = ['upsloping', 'flat', 'downsloping']
df['ST slope'] = pd.Categorical(df['ST slope'], categories=slope_order, ordered=True).codes

df['ST slope']


In [None]:
# Remove outliers: keep only rows whose |z-score| is within the threshold
# for every column listed below.
print('Before of remove the outliers: ', df.shape)

Z_THRESHOLD = 3
outlier_columns = ['cholesterol', 'resting bp s', 'max heart rate']

# np.abs is used instead of the pandas-only `.abs()` method because
# stats.zscore is not guaranteed to return a pandas object.
within_limits = np.ones(len(df), dtype=bool)
for column in outlier_columns:
    column_z = np.asarray(stats.zscore(df[column]))
    within_limits &= np.abs(column_z) <= Z_THRESHOLD
df = df[within_limits]

print('After remove the outliers: ', df.shape)

# Seaborn Graphs after Pre-Processing

In [None]:
# Distribution plot of the 'cholesterol' column after pre-processing
sns.displot(data=df, x="cholesterol");

In [None]:
# Distribution plot of the 'max heart rate' column after pre-processing
sns.displot(data=df, x="max heart rate");

In [None]:
# Distribution plot of the 'age' column after pre-processing
sns.displot(data=df, x="age");

In [None]:
# Number of rows per target class after pre-processing (original note: balanced)
sns.countplot(data=df, x="target");
plt.xticks([0, 1]);

In [None]:
# Number of rows per sex code after pre-processing (original note: many more men than women)
sns.countplot(data=df, x="sex", palette="bwr");
plt.show();

In [None]:
# Relation between target and sex after pre-processing: target counts split by sex
sns.countplot(data=df, x="target", hue="sex");

In [None]:
# Bar chart of target counts for every age value, after pre-processing
age_vs_target = pd.crosstab(df.age, df.target)
age_vs_target.plot(kind="bar", figsize=(20, 6));
plt.title('Heart Disease Frequency for Ages');
plt.xlabel('Age');
plt.ylabel('Frequency');
plt.show();

In [None]:
# Kernel-density estimate of age, one curve per sex code, after pre-processing
sns.displot(data=df, x="age", hue="sex", kind="kde");

In [None]:
# Number of rows per encoded 'ST slope' value
sns.countplot(data=df, x="ST slope");

In [None]:
# Relation between slope and target after pre-processing
slope_vs_target = pd.crosstab(df['ST slope'], df.target)
# rot=0 keeps the tick labels horizontal
slope_vs_target.plot(kind="bar", figsize=(15, 6), color=['#DAF7A6', '#FF5733'], rot=0)
plt.title('Heart Disease Frequency for Slope')
plt.xlabel('The Slope of The Peak Exercise ST Segment ')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Target counts for each 'fasting blood sugar' value, after pre-processing
fbs_vs_target = pd.crosstab(df['fasting blood sugar'], df.target)
# rot=0 keeps the tick labels horizontal
fbs_vs_target.plot(kind="bar", figsize=(15, 6), color=['#FFC300', '#581845'], rot=0)
plt.title('Heart Disease Frequency According To FBS')
plt.xlabel('FBS - (Fasting Blood Sugar > 120 mg/dl) (1 = true; 0 = false)')
plt.legend(["Haven't Disease", "Have Disease"])
plt.ylabel('Frequency of Disease or Not')
plt.show()

In [None]:
# Relation between chest pain and target after pre-processing
chest_pain_vs_target = pd.crosstab(df['chest pain type'], df.target)
# rot=0 keeps the tick labels horizontal
chest_pain_vs_target.plot(kind="bar", figsize=(15, 6), color=['#11A5AA', '#AA1190'], rot=0)
plt.title('Heart Disease Frequency According To Chest Pain Type')
plt.xlabel('Chest Pain Type')
plt.ylabel('Frequency of Disease or Not')
plt.show()

In [None]:
# Joint plot of age against max heart rate, coloured by target, after pre-processing
sns.jointplot(x="age", y="max heart rate", hue="target", data=df);

In [None]:
# Pearson correlation matrix of the pre-processed data, shown as a heatmap
pearson = df.corr(method='pearson')
sns.heatmap(data=pearson)

In [None]:
# Spearman rank correlation matrix of the pre-processed data, shown as a heatmap
rank_method = 'spearman'
spearman = df.corr(method=rank_method)
sns.heatmap(data=spearman)

In [None]:
# Train and test data split (80% / 20%)
feature_columns = [
    'age', 'sex', 'chest pain type', 'resting bp s', 'cholesterol',
    'fasting blood sugar', 'resting ecg', 'max heart rate',
    'exercise angina', 'oldpeak', 'ST slope',
]
X, y = df[feature_columns], df['target']

# A fixed random_state makes the split, and therefore every accuracy reported
# below, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

print("Training dataset shape: ",X_train.shape)
print("Testing dataset shape: ",X_test.shape)

# Preview only the first rows instead of displaying the whole training frame
X_train.head()

**META 2 - CLASSIFICATION**

In [None]:
#Binary classification
# Random Forest
# Naive Bayes
# Logistic Regression
# SVM
# Decision Tree


#SVM - default parameters and linear kernel

from sklearn import svm
from sklearn import metrics

# plot_confusion_matrix was removed in scikit-learn 1.2. Keep the name
# available for any cell that still calls it by falling back to a thin wrapper
# around its replacement, ConfusionMatrixDisplay.from_estimator.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Draw the confusion matrix of a fitted estimator on (X, y_true)."""
        return metrics.ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)


clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
metrics.ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)
plt.show()

In [None]:
#Naive Bayes -  Gaussian Naive Bayes algorithm

from sklearn.naive_bayes import GaussianNB


gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
# Plot the confusion matrix of the Naive Bayes model itself (`gnb`), not of the
# estimator left in `clf` by an earlier cell. ConfusionMatrixDisplay.from_estimator
# replaces plot_confusion_matrix, which was removed in scikit-learn 1.2.
metrics.ConfusionMatrixDisplay.from_estimator(gnb, X_test, y_test)
plt.show()

In [None]:
#Decision Tree

from sklearn import tree

# random_state fixes the tree's internal randomness so results are repeatable
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix,
# which was removed in scikit-learn 1.2.
metrics.ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)
plt.show()

In [None]:
# NOTE(review): the original note on this cell was "I am getting an error here",
# with no details. Presumably it was the solver's convergence warning on the
# unscaled features - confirm. max_iter is raised to give the solver more
# iterations to converge.

#Logistic Regression
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(max_iter=1000)
# training the model
clf.fit(X_train, y_train)
# report accuracy like the other classifier cells do
y_pred = clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix,
# which was removed in scikit-learn 1.2.
metrics.ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)
plt.show()

In [None]:
#Random Forest - ensemble 

from sklearn.ensemble import RandomForestClassifier

# Shallow forest (max_depth=2) with a fixed seed for repeatable results
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix,
# which was removed in scikit-learn 1.2.
metrics.ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)
plt.show()
