In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

warnings.filterwarnings('ignore')

# Data Preprocessing

In [None]:
# https://www.kaggle.com/fedesoriano/heart-failure-prediction
df = pd.read_csv("heart.csv")
df

### Remove outliers

In [3]:
def remove_outliers(df, feat):
    df = df[feat<=(feat.mean()+3*feat.std())]
    if feat.mean()-3*feat.std() > 0:
        df = df[feat>=(feat.mean()-3*feat.std())]
    return df

In [4]:
df = remove_outliers(df, df.RestingBP)
df = remove_outliers(df, df.Cholesterol)
df = remove_outliers(df, df.MaxHR)
df = remove_outliers(df, df.Oldpeak)

In [None]:
df

### Label Encoder

In [6]:
le = LabelEncoder()
df.Sex = le.fit_transform(df.Sex)
df.ChestPainType = le.fit_transform(df.ChestPainType)
df.RestingECG = le.fit_transform(df.RestingECG)
df.ST_Slope = le.fit_transform(df.ST_Slope)
df.ExerciseAngina = le.fit_transform(df.ExerciseAngina)

In [None]:
df

### Scaler Data

In [10]:
y = df.HeartDisease
X = df.drop('HeartDisease', axis='columns')

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

### Split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=30)

# Model Example Random Forest

In [13]:
model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train)
model_rf.score(X_test, y_test)

0.8777777777777778

# Remove uninformative features

In [45]:
pca = PCA(n_components=0.999)
X_pca = pca.fit_transform(X)
X_pca.shape

(900, 4)

In [46]:
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=30)

In [47]:
model_rf = RandomForestClassifier()
model_rf.fit(X_train_pca, y_train)
model_rf.score(X_test_pca, y_test)

0.7666666666666667