In [38]:
import pandas as pd

df = pd.read_csv('heart_statlog_cleveland_hungary_final.csv')
df.head()

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
0,40,1,2,140,289,0,0,172,0,0.0,1,0
1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,54,1,3,150,195,0,0,122,0,0.0,1,0


In [39]:
df.info()
# Tidak ada data null

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  1190 non-null   int64  
 1   sex                  1190 non-null   int64  
 2   chest pain type      1190 non-null   int64  
 3   resting bp s         1190 non-null   int64  
 4   cholesterol          1190 non-null   int64  
 5   fasting blood sugar  1190 non-null   int64  
 6   resting ecg          1190 non-null   int64  
 7   max heart rate       1190 non-null   int64  
 8   exercise angina      1190 non-null   int64  
 9   oldpeak              1190 non-null   float64
 10  ST slope             1190 non-null   int64  
 11  target               1190 non-null   int64  
dtypes: float64(1), int64(11)
memory usage: 111.7 KB


In [40]:
def print_null_report(df):
    for i, j in enumerate(df.isna().sum()):
        print(f'{df.columns[i]:20} | {j*100/len(df):.2f}%')

print_null_report(df)

age                  | 0.00%
sex                  | 0.00%
chest pain type      | 0.00%
resting bp s         | 0.00%
cholesterol          | 0.00%
fasting blood sugar  | 0.00%
resting ecg          | 0.00%
max heart rate       | 0.00%
exercise angina      | 0.00%
oldpeak              | 0.00%
ST slope             | 0.00%
target               | 0.00%


In [41]:
# Handle Outliers
import numpy as np

def check_outliers(df):
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3-Q1

    outliers = df[(df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))]
    return outliers

def print_outliers(df):
    for i in df.columns:
        outliers = check_outliers(df[i])
        print(f'Outliers for column {i}: {len(outliers)}')

def handle_outliers(df): 
    for i in df.columns:
        Q1 = df[i].quantile(0.25)
        Q3 = df[i].quantile(0.75)
        IQR = Q3-Q1

        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        df[i] = np.where(df[i] < lower_bound, lower_bound, df[i])
        df[i] = np.where(df[i] > upper_bound, upper_bound, df[i])

        print(f'Column {i} has been processed')

    print('It is done UAHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH')

print(print_outliers(df))
print(handle_outliers(df))

Outliers for column age: 0
Outliers for column sex: 281
Outliers for column chest pain type: 66
Outliers for column resting bp s: 37
Outliers for column cholesterol: 193
Outliers for column fasting blood sugar: 254
Outliers for column resting ecg: 0
Outliers for column max heart rate: 1
Outliers for column exercise angina: 0
Outliers for column oldpeak: 11
Outliers for column ST slope: 0
Outliers for column target: 0
None
Column age has been processed
Column sex has been processed
Column chest pain type has been processed
Column resting bp s has been processed
Column cholesterol has been processed
Column fasting blood sugar has been processed
Column resting ecg has been processed
Column max heart rate has been processed
Column exercise angina has been processed
Column oldpeak has been processed
Column ST slope has been processed
Column target has been processed
It is done UAHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
None


In [42]:
print_outliers(df)

Outliers for column age: 0
Outliers for column sex: 0
Outliers for column chest pain type: 0
Outliers for column resting bp s: 0
Outliers for column cholesterol: 0
Outliers for column fasting blood sugar: 0
Outliers for column resting ecg: 0
Outliers for column max heart rate: 0
Outliers for column exercise angina: 0
Outliers for column oldpeak: 0
Outliers for column ST slope: 0
Outliers for column target: 0


In [43]:
df.head()

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
0,40.0,1.0,2.0,140.0,289.0,0.0,0.0,172.0,0.0,0.0,1.0,0.0
1,49.0,1.0,3.0,160.0,180.0,0.0,0.0,156.0,0.0,1.0,2.0,1.0
2,37.0,1.0,2.0,130.0,283.0,0.0,1.0,98.0,0.0,0.0,1.0,0.0
3,48.0,1.0,4.0,138.0,214.0,0.0,0.0,108.0,1.0,1.5,2.0,1.0
4,54.0,1.0,3.0,150.0,195.0,0.0,0.0,122.0,0.0,0.0,1.0,0.0


In [45]:
# Data Normalization
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler()

X = df.drop(['target'], axis=1)
y = df['target']

X = sc.fit_transform(X)

In [47]:
# Model selection
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=0)

X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
# Modeling
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score