In [1]:
import os
import warnings

import catboost as catb
import dill
import numpy as np
import pandas as pd
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import f1_score, roc_auc_score, precision_score, precision_recall_curve, log_loss
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler

### About this dataset ###

Age : Age of the patient

Sex : Sex of the patient

exng : exercise induced angina (1 = yes; 0 = no)

caa : number of major vessels (0-3)

cp : chest pain type (the value list below is numbered 1-4 as in the dataset description; in this file the column takes the values 0-3)

Value 1: typical angina

Value 2: atypical angina

Value 3: non-anginal pain

Value 4: asymptomatic

trtbps : resting blood pressure (in mm Hg)

chol : cholesterol in mg/dl fetched via BMI sensor

fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

restecg : resting electrocardiographic results

Value 0: normal

Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)

Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

thalachh : maximum heart rate achieved

target (column `output`) : 0 = less chance of heart attack; 1 = more chance of heart attack

In [3]:
# Location of the heart-disease CSV. Set the HEART_DATA_PATH environment variable to
# run the notebook on another machine; when it is unset, the original local path is used.
DATA_PATH = os.environ.get(
    "HEART_DATA_PATH",
    "C:/Users/emmik/Downloads/Course Project/data/heart.csv",
)
data = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first rows of the loaded frame to check the column names and values.
data.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [5]:
# Name of the label column in the loaded frame.
target_feature = 'output'

# Columns fed to the model as input features.
feats = [
    'age',
    'sex',
    'cp',
    'exng',
    'trtbps',
    'thalachh',
]

In [6]:
# Split the loaded frame into the label column and the feature columns.
target = data[target_feature]
df = data[feats]

In [7]:
# Summary statistics (count, mean, std, quantiles) of the selected feature columns.
df.describe()

Unnamed: 0,age,sex,cp,exng,trtbps,thalachh
count,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,0.326733,131.623762,149.646865
std,9.082101,0.466011,1.032052,0.469794,17.538143,22.905161
min,29.0,0.0,0.0,0.0,94.0,71.0
25%,47.5,0.0,0.0,0.0,120.0,133.5
50%,55.0,1.0,1.0,0.0,130.0,153.0
75%,61.0,1.0,2.0,1.0,140.0,166.0
max,77.0,1.0,3.0,1.0,200.0,202.0


In [8]:
# Number of rows per class label, computed once and reused below.
class_counts = target.value_counts()

# Ratio of class-0 rows to class-1 rows; passed to the model later as the second
# entry of `class_weights`.
disbalance = class_counts[0] / class_counts[1]

display(class_counts, disbalance)

1    165
0    138
Name: output, dtype: int64

0.8363636363636363

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    target,
    random_state=42,
    test_size=0.20,
)

# Only the test labels are converted to a one-column frame, with the column named 'target'.
y_test = y_test.to_frame(name='target')

# Show the shape of every part of the split.
display(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(242, 6)

(61, 6)

(242,)

(61, 1)

In [10]:
# Save the split as CSV files in the current working directory.
# `index` is documented as a boolean: index=False leaves the row index out of the
# files explicitly instead of relying on None being treated as falsy.
X_test.to_csv('X_test.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

X_train.to_csv('X_train.csv', index=False)
y_train.to_csv('y_train.csv', index=False)

In [11]:
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that narrows the input to one named column.

    The column is looked up through a one-element list (``X[[key]]``), so for a
    DataFrame the result stays a one-column frame rather than a Series and can be
    passed straight to a following transformer.

    Parameters
    ----------
    key : label of the column to pick out.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to the single column ``self.key``."""
        wanted = [self.key]
        return X[wanted]

In [12]:
# One (name, pipeline) pair per feature: pick the column, then pass it through StandardScaler.
transformers = [
    (
        col,
        Pipeline([
            ('selector', NumberSelector(key=col)),
            ('scaler', StandardScaler()),
        ]),
    )
    for col in feats
]

In [13]:
# Combine the per-column pipelines into a single FeatureUnion.
feats_prep = FeatureUnion(transformer_list=transformers)

# The same union wrapped as a one-step pipeline.
feature_processing = Pipeline(steps=[('feats_prep', feats_prep)])

In [14]:
# Keyword arguments forwarded to the CatBoost classifier.
params_model = dict(
    eval_metric='F1',
    # Weight 1 for the first class, the class-count ratio for the second.
    class_weights=[1, disbalance],
    silent=True,
    # one_hot_max_size=15,
    # NOTE(review): early stopping presumably only takes effect when a validation
    # set is passed to fit(); this notebook passes none — confirm.
    early_stopping_rounds=20,
    boosting_type='Ordered',
    allow_writing_files=False,
)

In [15]:
# Full model: the feature union followed by a seeded CatBoost classifier.
pipeline = Pipeline(steps=[
    ('features', feats_prep),
    ('classifier', catb.CatBoostClassifier(random_state=42, **params_model)),
])

In [16]:
# Fit the feature union and the classifier on the training split.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('age',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='age')),
                                                                 ('scaler',
                                                                  StandardScaler())])),
                                                ('sex',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='sex')),
                                                                 ('scaler',
                                                                  StandardScaler())])),
                                                ('cp',
                                                 Pipeline(steps=[('selector',
                                                      

In [17]:
# Serialise the pipeline with dill into the current working directory.
with open("sv_pipeline.dill", mode="wb") as pipeline_file:
    dill.dump(pipeline, pipeline_file)