In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/heart-failure-prediction/heart.csv


# Import data


In [None]:
# Load the heart-failure dataset from the read-only Kaggle input directory.
heart = pd.read_csv("/kaggle/input/heart-failure-prediction/heart.csv")

### Look at the first five rows to check that the data has been loaded correctly

In [None]:
# Preview the first five rows as a quick sanity check of the load.
heart.head()

### Let's check whether the target variable is imbalanced. If it is, we would have to use techniques such as undersampling or oversampling.

In [None]:
# Count how many rows fall into each HeartDisease class.
heart["HeartDisease"].value_counts()

### Because the classes are balanced (roughly 50% each), we do not need to use those techniques

### Let's see the info of the columns

In [None]:
# Show each column's dtype and non-null count.
heart.info()

### From this we conclude that there are no null values, so no null handling is needed at this stage.

## *Exploratory Data Analysis*

In [None]:
import seaborn as sns
# Enlarge the default figure size for every plot that follows.
figure_size = (11.7, 8.27)
sns.set(rc={"figure.figsize": figure_size})

### Correlation matrix

In [None]:
# Correlation is only defined for numeric columns. The frame still contains
# string-typed categorical columns at this point, and recent pandas versions
# raise on them instead of silently dropping them, so select the numeric
# columns explicitly before computing the correlation matrix.
sns.heatmap(heart.select_dtypes("number").corr(), cmap="YlGnBu")

In [None]:
# Pairwise scatter plots of the features, colored by the HeartDisease class.
sns.pairplot(heart,hue="HeartDisease")

## Outliers

With `hue` we can color the points according to heart disease.
In these plots we can see that Oldpeak and RestingBP have very long tails.
Let's look at the box plots to see the outliers more clearly.

In [None]:
# One box per column of the frame, to make outliers visible.
sns.boxplot(data=heart)

A quick look shows some values of RestingBP near zero, which is not possible. There are also some values of
Cholesterol near zero, which is not possible either. Let's analyze how many of them there are.
Here I will replace the RestingBP values equal to zero with the median.

In [None]:

# Show the rows whose resting blood pressure is below 50.
low_resting_bp = heart["RestingBP"] < 50
heart[low_resting_bp]

Let's see which rows have a low RestingBP.
Here we see that the entry has no RestingBP, Cholesterol, or FastingBS. I will drop it because recovering
a single entry is not worth the effort.

In [None]:
# Show the rows whose Cholesterol is recorded as exactly zero.
zero_cholesterol = heart["Cholesterol"] == 0
heart[zero_cholesterol]

Here I will split the work in two. Because a Cholesterol value of 0 represents a missing value,
I will replace those values with the median of the data that excludes them.

In [None]:
# Median Cholesterol over the rows where it is not zero.
nonzero_cholesterol = heart.loc[heart["Cholesterol"] != 0, "Cholesterol"]
nonzero_cholesterol.median()

In [None]:
# Replace every zero Cholesterol value with the median of the non-zero values.
cholesterol_is_zero = heart["Cholesterol"] == 0
cholesterol_median = heart.loc[~cholesterol_is_zero, "Cholesterol"].median()
heart.loc[cholesterol_is_zero, "Cholesterol"] = cholesterol_median


I will impute those values with the median of the Cholesterol column computed without the zero values, which were used as the default value.

In [None]:
# Redraw the pair plot after the Cholesterol imputation, colored by HeartDisease.
sns.pairplot(data=heart,hue="HeartDisease")

Let's see if the distribution has been modified

In [None]:
# Take an independent copy: a plain assignment would only alias `heart`, so the
# column overwrites applied to `heart_tree` later would silently modify
# `heart` as well and make those cells non-repeatable.
heart_tree = heart.copy()

Get the name of the columns with categorical type

In [None]:
# Names of the columns stored with object dtype, i.e. the categorical features.
cat_cols = list(heart.select_dtypes(include="object").columns)
cat_cols

Let's encode the labels in order to use tree models such as XGBoost, DecisionTreeClassifier, RandomForest, etc., because with these estimators there is no need to scale the data.


In [None]:
from sklearn.preprocessing import LabelEncoder


def _encode_with_fallback(series, codes, fallback):
    """Map each value through `codes`; any value not listed gets `fallback`."""
    return series.apply(lambda value: codes.get(value, fallback))


# Manual encoding: one (explicit code table, fallback code) pair per column.
manual_codes = {
    "Sex": ({"M": 0}, 1),
    "ChestPainType": ({"ATA": 0, "NAP": 1, "ASY": 2}, 3),
    "ExerciseAngina": ({"Y": 0}, 1),
    "RestingECG": ({"Normal": 0, "ST": 1}, 2),
    "ST_Slope": ({"Flat": 0, "Up": 1}, 2),
}
for column, (codes, fallback) in manual_codes.items():
    heart_tree[column] = _encode_with_fallback(heart[column], codes, fallback)

# Automatic encoding: label-encode only the categorical columns. Running the
# encoder over the whole frame would also replace the numeric features and the
# target with rank codes, discarding their real values.
heart_tree_2 = heart.copy()
heart_tree_2[cat_cols] = heart_tree_2[cat_cols].apply(LabelEncoder().fit_transform)

See the modified dataset

In [None]:
# Preview the first five rows of the label-encoded frame.
heart_tree_2.head()

# Tree based algorithm
These algorithms do not care about the scale of the data, so we do not need to use a one-hot encoder to split the categorical data into dummy variables.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

Separate the data

In [None]:
# Separate the target column from the feature matrix.
target_column = "HeartDisease"
y = heart_tree_2[target_column]
X = heart_tree_2.drop(columns=target_column)

Split the data

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

In [None]:
# Candidate tree depths 1 through 9 for the grid search.
grid = {"max_depth": list(range(1, 10))}
# Entropy-based decision tree with a fixed seed.
Stimator = DecisionTreeClassifier(random_state=101, criterion="entropy")

In [None]:
# 5-fold cross-validated search over the depth grid.
gso = GridSearchCV(estimator=Stimator, param_grid=grid, cv=5)

In [None]:
# Run the grid search on the training split.
gso.fit(x_train,y_train)

In [None]:
# Predict the held-out rows with the fitted grid search.
y_pred = gso.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [None]:
# Confusion matrix of the decision-tree predictions on the test split.
confusion_matrix(y_test,y_pred)

In [None]:
# Accuracy of the decision-tree predictions on the test split.
accuracy_score(y_test,y_pred)

In [None]:
# Hyper-parameter combination selected by the grid search.
gso.best_params_

In [None]:
# Entropy criterion to optimize information gain. The forest is seeded like
# the decision tree above so that repeated runs give the same scores; without
# a seed the fitted model and its accuracy change on every run.
Stimator_forest = RandomForestClassifier(criterion="entropy", random_state=101)
# Forest sizes to compare in the grid search.
grid_forest = {"n_estimators": [100, 200, 300, 400, 500, 600, 700, 800, 1000, 2000, 3000]}
# 5-fold cross-validated search over the forest sizes.
gso_forest = GridSearchCV(Stimator_forest, grid_forest, cv=5)

In [None]:
# Run the forest grid search on the training split, predict the held-out
# rows, and report the test accuracy.
gso_forest.fit(x_train,y_train)
y_predforest = gso_forest.predict(x_test)
accuracy_score(y_test,y_predforest)

In [None]:
# Fit a single XGBoost classifier (no grid search), predict the held-out rows,
# and report the test accuracy.
# NOTE(review): `use_label_encoder` looks deprecated in recent xgboost
# releases — confirm against the installed version.
Stimator_XGBC = XGBClassifier(use_label_encoder=False)
Stimator_XGBC.fit(x_train,y_train)
y_predXGBC = Stimator_XGBC.predict(x_test)
accuracy_score(y_test,y_predXGBC)

# Distance Based Algorithm
Let's import the libraries we are going to use

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


Get the dummy variables so that we can scale the data without losing information

In [None]:
# Start the distance-based section from a fresh read of the raw file.
heart = pd.read_csv("/kaggle/input/heart-failure-prediction/heart.csv")
# A fresh read discards the earlier cleaning, so re-apply the Cholesterol
# imputation: zeros stand for missing values and are replaced with the median
# of the non-zero values. Otherwise these models would train on the zeros.
cholesterol_is_zero = heart["Cholesterol"] == 0
heart.loc[cholesterol_is_zero, "Cholesterol"] = heart.loc[~cholesterol_is_zero, "Cholesterol"].median()

In [None]:
# One-hot encode the categorical columns into dummy columns.
heart_distance = pd.get_dummies(heart)

In [None]:
# Preview the first five rows of the one-hot encoded frame.
heart_distance.head()

In [None]:
# Separate the target column from the one-hot encoded feature matrix.
target_column = "HeartDisease"
y = heart_distance[target_column]
X = heart_distance.drop(columns=target_column)

In [None]:
# Hold out 30% of the rows for testing, with the same seed as the tree-based split.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(x_train)
scaled_x_train = scaler.transform(x_train)
scaled_x_test = scaler.transform(x_test)

In [None]:
# RBF-kernel support vector classifier.
# NOTE(review): despite the `logReg` / `logreg` names, this is an SVC, not a
# logistic regression; the names are kept because the next cell uses them.
Stimator_logReg = SVC(kernel="rbf")
# Candidate values for the kernel width (gamma) and the regularization strength (C).
grid_svc = {"gamma":[0.001,0.01,0.1,1,10],"C":[0.1,1,10,100,1000]}
# 5-fold cross-validated search over the gamma / C grid.
gso_logreg = GridSearchCV(Stimator_logReg,grid_svc,cv = 5)

In [None]:
# Run the SVC grid search on the scaled training split, predict the scaled
# held-out rows, and report the test accuracy.
gso_logreg.fit(scaled_x_train,y_train)
y_predlogreg = gso_logreg.predict(scaled_x_test)
accuracy_score(y_test,y_predlogreg)
# The author considers the resulting accuracy a good score.