In [51]:
import pandas as pd


In [52]:
# Load the heart-disease table from the CSV file next to the notebook.
df = pd.read_csv(filepath_or_buffer='heart.csv')
# Preview the first five rows (head's default) as the cell output.
df.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [53]:
# Show the distinct values of each string-valued column, one array per line.
for column in ('ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'):
    print(df[column].unique())

['ATA' 'NAP' 'ASY' 'TA']
['Normal' 'ST' 'LVH']
['N' 'Y']
['Up' 'Flat' 'Down']


In [54]:
# Count missing values per column (isna is the same check as isnull).
print(df.isna().sum())

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64


In [55]:
from sklearn.model_selection import train_test_split

# Columns that are handed to the categorical encoder.
cat = ['ChestPainType', 'RestingECG', 'ST_Slope', 'Sex', 'ExerciseAngina']

# Target is HeartDisease; the features are every remaining column.
y = df['HeartDisease']
X = df.drop(columns='HeartDisease')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [65]:

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

# Ordinal-encode the columns listed in `cat`; all other columns pass through untouched.
preprocessor = ColumnTransformer(
    [('cat', OrdinalEncoder(), cat)],
    remainder='passthrough',
)

# Preprocessing and the random forest are bundled so both are fitted on the training split only.
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    (
        'classifier',
        RandomForestClassifier(
            n_estimators=100,
            criterion='gini',
            max_depth=15,
            max_features='sqrt',
            class_weight='balanced',
            min_samples_leaf=5,
            min_samples_split=10,
            random_state=42,
        ),
    ),
])
pipeline.fit(X_train, y_train)

# Hard predictions for both splits, plus the probability column of the second class for ROC AUC.
y_pred = pipeline.predict(X_test)
y_pred_t = pipeline.predict(X_train)
y_proba = pipeline.predict_proba(X_test)[:, 1]

# Test vs. train accuracy gives a quick look at over-fitting.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Accuracy train:", accuracy_score(y_train, y_pred_t))
print(roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8695652173913043
Accuracy train: 0.9209809264305178
0.9338511955334385
              precision    recall  f1-score   support

           0       0.82      0.88      0.85        77
           1       0.91      0.86      0.88       107

    accuracy                           0.87       184
   macro avg       0.87      0.87      0.87       184
weighted avg       0.87      0.87      0.87       184

[[68  9]
 [15 92]]


In [57]:

from sklearn.metrics import accuracy_score, classification_report

# Refit the existing pipeline on the training split, then predict the held-out split.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8695652173913043
              precision    recall  f1-score   support

           0       0.84      0.86      0.85        77
           1       0.90      0.88      0.89       107

    accuracy                           0.87       184
   macro avg       0.87      0.87      0.87       184
weighted avg       0.87      0.87      0.87       184



In [58]:
# Impute missing ExerciseAngina values with the most frequent category.
# Series.mode() returns a Series (there may be ties), and fillna() given a
# Series aligns it on index labels, so passing the mode Series directly could
# only ever fill rows labelled 0, 1, ... -- not every missing row. Use the
# first mode as a scalar instead.
exercise_angina_modes = df['ExerciseAngina'].mode()
if not exercise_angina_modes.empty:  # empty only when the whole column is missing
    df['ExerciseAngina'] = df['ExerciseAngina'].fillna(exercise_angina_modes.iloc[0])

# Confirm that no column has missing values left.
print(df.isnull().sum())

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64


In [59]:
# Impute missing ExerciseAngina values with the most frequent category.
# Series.mode() returns a Series, and fillna() given a Series aligns it on
# index labels, so the first mode is taken as a scalar to fill every gap.
exercise_angina_modes = df['ExerciseAngina'].mode()
if not exercise_angina_modes.empty:  # empty only when the whole column is missing
    df['ExerciseAngina'] = df['ExerciseAngina'].fillna(exercise_angina_modes.iloc[0])

# Binary-encode Y/N. The identity entries (1 -> 1, 0 -> 0) make the cell safe
# to re-run: without them a second run would map the already-encoded values
# to NaN and wipe the column.
df['ExerciseAngina'] = df['ExerciseAngina'].map({'Y': 1, 'N': 0, 1: 1, 0: 0})

# ExerciseAngina is numeric now, so it is no longer listed as categorical.
cat = ['ChestPainType', 'RestingECG', 'ST_Slope', 'Sex']

from sklearn.model_selection import train_test_split

# Features are every column except the HeartDisease target.
X = df.drop('HeartDisease', axis = 1)
y = df['HeartDisease']

# Same 80/20 split and seed as before so results stay comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
# Sanity check: the training features must not contain missing values.
print(X_train.isnull().sum())

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
dtype: int64


In [60]:

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

# Ordinal-encode the columns listed in `cat`; all other columns pass through untouched.
preprocessor = ColumnTransformer(
    [('cat', OrdinalEncoder(), cat)],
    remainder='passthrough',
)

# Preprocessing and the random forest are bundled so both are fitted on the training split only.
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    (
        'classifier',
        RandomForestClassifier(
            n_estimators=100,
            criterion='gini',
            max_depth=15,
            max_features='sqrt',
            class_weight='balanced',
            min_samples_leaf=5,
            min_samples_split=10,
            random_state=42,
        ),
    ),
])

# Fit on the training split and predict the held-out split in one chain.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))




Accuracy: 0.8695652173913043
              precision    recall  f1-score   support

           0       0.82      0.88      0.85        77
           1       0.91      0.86      0.88       107

    accuracy                           0.87       184
   macro avg       0.87      0.87      0.87       184
weighted avg       0.87      0.87      0.87       184



In [61]:

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

# Rebuild features/target from the current state of `df` and redo the 80/20 split.
y = df['HeartDisease']
X = df.drop(columns='HeartDisease')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Ordinal-encode the columns listed in `cat`; all other columns pass through untouched.
preprocessor = ColumnTransformer(
    [('cat', OrdinalEncoder(), cat)],
    remainder='passthrough',
)

# Preprocessing and the random forest are bundled so both are fitted on the training split only.
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    (
        'classifier',
        RandomForestClassifier(
            n_estimators=100,
            criterion='gini',
            max_depth=15,
            max_features='sqrt',
            class_weight='balanced',
            min_samples_leaf=5,
            min_samples_split=10,
            random_state=42,
        ),
    ),
])

# Fit on the training split and predict the held-out split in one chain.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))




Accuracy: 0.8695652173913043
              precision    recall  f1-score   support

           0       0.82      0.88      0.85        77
           1       0.91      0.86      0.88       107

    accuracy                           0.87       184
   macro avg       0.87      0.87      0.87       184
weighted avg       0.87      0.87      0.87       184

