In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor, XGBClassifier

In [2]:
df=pd.read_csv("/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv")
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
df.describe()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


Here are all key features:  
Age: Age of the patient |
Sex: Gender of the patient (1 = male; 0 = female) |
CP (Chest Pain Type): Type of chest pain experienced |
Trestbps (Resting Blood Pressure): Resting blood pressure in mm Hg |
Chol (Cholesterol): Cholesterol level in mg/dl |
FBS (Fasting Blood Sugar): Whether fasting blood sugar is greater than 120 mg/dl (1 = true; 0 = false) |
Restecg (Resting Electrocardiographic Results): Results of the resting electrocardiogram |
Thalachh (Maximum Heart Rate Achieved): The maximum heart rate achieved |
Exng (Exercise Induced Angina): Whether exercise-induced angina is present (1 = yes; 0 = no) |
Oldpeak: ST depression induced by exercise relative to rest |
Slope: The slope of the peak exercise ST segment |
Caa: Number of major vessels (0-3) colored by fluoroscopy |
Thall: Thallium stress test result |
Output/Target: The target variable, indicating the presence (1) or absence (0) of heart disease |

In [4]:
df.columns

Index(['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh',
       'exng', 'oldpeak', 'slp', 'caa', 'thall', 'output'],
      dtype='object')

Read the data

In [5]:
X = df.drop(['output'], axis=1)
y = df.output


X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8,
                                                      test_size=0.2, random_state=0)


#Data Preprocessing
1.   Handling Null Values
2.   Feature Scaling





#Handling Null Values


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trtbps    303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalachh  303 non-null    int64  
 8   exng      303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slp       303 non-null    int64  
 11  caa       303 non-null    int64  
 12  thall     303 non-null    int64  
 13  output    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [7]:
df.isnull().sum()

age         0
sex         0
cp          0
trtbps      0
chol        0
fbs         0
restecg     0
thalachh    0
exng        0
oldpeak     0
slp         0
caa         0
thall       0
output      0
dtype: int64

So in our data we don't have any null values but we have missing values, so we can handle with them with imputation. I'll use mean imputation

In [8]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
cols_with_miss_vals = [col for col in X_train.columns if X_train[col].isnull().any()]

imputed_X_train = pd.DataFrame(imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(imputer.transform(X_valid))

#Imputer removed colum names, so we put them back

imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

#Feature Scaling

For the moment I don't know this topic. I will return here after studying it

#Training The Model

 Here I will use Regression and Classification models: XGBRegressor and XGBClassifier

In [9]:
from xgboost import XGBRegressor, XGBClassifier


xgb_regressor = XGBRegressor(n_estimators=500, learning_rate=0.05, n_jobs=4)

xgb_regressor.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=False
)

xgb_classifier = XGBClassifier(n_estimators=500, learning_rate=0.05, n_jobs=4)

xgb_classifier.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=False
)

#Making predictions

In [10]:
from sklearn.metrics import mean_absolute_error, classification_report
predictions_regressor = xgb_regressor.predict(X_valid)
predictions_classifier = xgb_classifier.predict(X_valid)

errors = mean_absolute_error(predictions_regressor, y_valid)
print(errors)
errors2 = classification_report(predictions_classifier, y_valid)
print(errors2)
print(mean_absolute_error(predictions_classifier, y_valid))


0.25722518533880473
              precision    recall  f1-score   support

           0       0.81      0.76      0.79        29
           1       0.79      0.84      0.82        32

    accuracy                           0.80        61
   macro avg       0.80      0.80      0.80        61
weighted avg       0.80      0.80      0.80        61

0.19672131147540983
