```
Column     Description                                             Feature Type
---------------------------------------------------------------------------------
Age        Age in years                                            Numerical
Sex        (1 = male; 0 = female)                                  Categorical
CP         Chest pain type (0, 1, 2, 3, 4)                         Categorical
Trestbps   Resting blood pressure (in mm Hg on admission)          Numerical
Chol       Serum cholesterol in mg/dl                              Numerical
FBS        Fasting blood sugar > 120 mg/dl (1 = true; 0 = false)   Categorical
RestECG    Resting electrocardiogram results (0, 1, 2)             Categorical 
Thalach    Maximum heart rate achieved                             Numerical
Exang      Exercise induced angina (1 = yes; 0 = no)               Categorical
Oldpeak    ST depression induced by exercise relative to rest      Numerical
Slope      Slope of the peak exercise ST segment                   Numerical
CA         Number of major vessels (0-3) colored by fluoroscopy    Categorical
Thal       3 = normal; 6 = fixed defect; 7 = reversible defect     Categorical
Target     Diagnosis of heart disease (1 = true; 0 = false)        Target
```

In [1]:
def load_data(path="../files/input/heart_disease.csv"):
    """Load the heart-disease dataset and split it into features and target.

    The ``thal`` column is normalised so that it only contains the labels
    ``"fixed"``, ``"reversible"`` and ``"normal"``: every value that is not
    ``"fixed"`` or ``"reversible"`` (including missing values) is replaced by
    ``"normal"``.

    Parameters
    ----------
    path : str or path-like, default "../files/input/heart_disease.csv"
        Location of the CSV file. It must contain a ``target`` column and a
        ``thal`` column.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is the feature DataFrame (every column except
        ``target``) and ``y`` is the ``target`` Series.
    """
    import pandas as pd

    dataset = pd.read_csv(path)
    y = dataset.pop("target")
    x = dataset.copy()

    # Labels that are kept as-is; anything else collapses into "normal".
    defect_labels = ("fixed", "reversible")
    x["thal"] = x["thal"].map(
        lambda label: label if label in defect_labels else "normal"
    )

    return x, y


# Load the features (x) and the target (y) at notebook level; the bare `x` on
# the last line makes the feature table this cell's displayed output.
x, y = load_data()
x

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,52,1,1,118,186,0,2,190,0,0.0,2,0,fixed
299,43,0,4,132,341,1,2,136,1,3.0,2,0,reversible
300,65,1,4,135,254,0,2,127,0,2.8,2,1,reversible
301,48,1,4,130,256,1,2,150,1,0.0,1,2,reversible


In [3]:
def make_train_test_split(x, y):
    """Split features and target into train and test partitions.

    The split holds out 10% of the rows for testing and uses a fixed
    ``random_state`` of 0 so that repeated calls give the same partition.

    Parameters
    ----------
    x : features to split.
    y : target to split, aligned with ``x``.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    split_options = {"test_size": 0.10, "random_state": 0}
    x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

    return x_train, x_test, y_train, y_test


In [4]:
def make_pipeline(estimator):
    """Wrap ``estimator`` in a preprocessing and feature-selection pipeline.

    The returned pipeline has three steps, in this order:

    1. ``"tranformer"``: one-hot encodes the ``thal`` column (integer output)
       and passes every other column through unchanged.
    2. ``"selectkbest"``: ``SelectKBest`` scored with ``f_classif``.
    3. ``"estimator"``: the estimator given by the caller.

    Parameters
    ----------
    estimator : the final estimator placed at the end of the pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    from sklearn.compose import ColumnTransformer
    from sklearn.feature_selection import SelectKBest, f_classif
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder

    thal_encoder = OneHotEncoder(dtype="int")
    preprocessing = ColumnTransformer(
        transformers=[("ohe", thal_encoder, ["thal"])],
        remainder="passthrough",
    )

    feature_selector = SelectKBest(score_func=f_classif)

    # The step key "tranformer" (sic) is kept verbatim: step names are part of
    # the pipeline's parameter-addressing scheme (e.g. "tranformer__...").
    steps = [
        ("tranformer", preprocessing),
        ("selectkbest", feature_selector),
        ("estimator", estimator),
    ]

    return Pipeline(steps=steps, verbose=False)
  

In [5]:
def make_grid_search(estimator, param_grid, cv=5):
    """Build a ``GridSearchCV`` scored by balanced accuracy.

    Parameters
    ----------
    estimator : estimator (or pipeline) whose hyper-parameters are searched.
    param_grid : parameter grid forwarded to ``GridSearchCV``.
    cv : cross-validation setting forwarded to ``GridSearchCV`` (default 5).

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The configured, not yet fitted, search object.
    """
    from sklearn.model_selection import GridSearchCV

    search_settings = {
        "estimator": estimator,
        "param_grid": param_grid,
        "cv": cv,
        "scoring": "balanced_accuracy",
    }

    return GridSearchCV(**search_settings)
  

In [11]:
def save_estimator(estimator):
    """Pickle ``estimator`` to ``estimator.pickle`` in the working directory.

    An existing file with that name is overwritten.

    Parameters
    ----------
    estimator : any picklable object (here, a fitted estimator).
    """
    import pickle

    with open("estimator.pickle", mode="wb") as handle:
        pickle.dump(estimator, handle)
        

In [12]:
def load_estimator():
    """Load the object previously pickled to ``estimator.pickle``.

    The file is looked up in the current working directory.

    Returns
    -------
    object or None
        The unpickled object, or ``None`` when ``estimator.pickle`` does not
        exist.

    Notes
    -----
    Unpickling can execute arbitrary code embedded in the file, so only load
    an ``estimator.pickle`` that this notebook (or another trusted source)
    produced.
    """
    import pickle

    # Open directly instead of testing os.path.exists() first: this avoids the
    # window in which the file could disappear between the check and the open.
    try:
        handle = open("estimator.pickle", "rb")
    except FileNotFoundError:
        return None

    with handle:
        # SECURITY: pickle.load trusts the file content completely.
        return pickle.load(handle)
  
  


In [13]:
def train_estimator(estimator):
    """Fit ``estimator`` and persist it unless the saved model scores better.

    The data is loaded with ``load_data`` and split with
    ``make_train_test_split``. After fitting on the training partition, the
    new estimator is compared with the one returned by ``load_estimator``
    (if any) on the test partition, and the winner is written with
    ``save_estimator``.

    The comparison uses balanced accuracy, the same metric the grid search in
    this notebook optimises, instead of a regression error measure. Higher is
    better; on a tie the newly trained estimator replaces the saved one.

    Parameters
    ----------
    estimator : object exposing ``fit(x, y)`` and ``predict(x)``.
        It is fitted in place.

    Returns
    -------
    None
    """
    from sklearn.metrics import balanced_accuracy_score

    data, target = load_data()

    x_train, x_test, y_train, y_test = make_train_test_split(
        x=data,
        y=target,
    )

    estimator.fit(x_train, y_train)

    best_estimator = load_estimator()

    if best_estimator is not None:

        saved_score = balanced_accuracy_score(
            y_true=y_test, y_pred=best_estimator.predict(x_test)
        )

        current_score = balanced_accuracy_score(
            y_true=y_test, y_pred=estimator.predict(x_test)
        )

        # Keep the previously saved model only when it is strictly better.
        if saved_score > current_score:
            estimator = best_estimator

    save_estimator(estimator)

In [14]:
def train_logistic_regression():
    """Grid-search a logistic-regression pipeline and pass it to ``train_estimator``.

    The search covers the number of selected features (1 to 10), the penalty
    type (``"l1"`` or ``"l2"``) and the inverse regularisation strength ``C``,
    i.e. 10 x 2 x 6 = 120 candidates, each evaluated with 5-fold
    cross-validation.

    Returns
    -------
    None
    """
    from sklearn.linear_model import LogisticRegression

    pipeline = make_pipeline(
        # The saga solver shuffles the data internally; a fixed seed keeps
        # reruns of the notebook reproducible.
        estimator=LogisticRegression(
            max_iter=10000,
            solver="saga",
            random_state=0,
        ),
    )

    param_grid = {
        "selectkbest__k": range(1, 11),
        "estimator__penalty": ["l1", "l2"],
        "estimator__C": [0.001, 0.01, 0.1, 1, 10, 100],
    }

    estimator = make_grid_search(
        estimator=pipeline,
        param_grid=param_grid,
        cv=5,
    )

    train_estimator(estimator)


# Runs the full grid search (120 candidates x 5 folds) and persists the winner.
train_logistic_regression()
  

In [15]:
def eval_metrics(
    y_train_true,
    y_test_true,
    y_train_pred,
    y_test_pred,
):
    """Compute accuracy and balanced accuracy on the train and test partitions.

    Every score is rounded to 4 decimal places.

    Parameters
    ----------
    y_train_true, y_test_true : true labels of the train / test partition.
    y_train_pred, y_test_pred : predicted labels of the train / test partition.

    Returns
    -------
    tuple
        ``(accuracy_train, accuracy_test, balanced_accuracy_train,
        balanced_accuracy_test)``.
    """
    from sklearn.metrics import accuracy_score, balanced_accuracy_score

    train_pair = (y_train_true, y_train_pred)
    test_pair = (y_test_true, y_test_pred)

    # Outer loop over metrics, inner loop over partitions, which yields the
    # order: accuracy (train, test), then balanced accuracy (train, test).
    scores = [
        round(metric(*pair), 4)
        for metric in (accuracy_score, balanced_accuracy_score)
        for pair in (train_pair, test_pair)
    ]

    return tuple(scores)

In [16]:
def report(
    estimator,
    accuracy_train,
    accuracy_test,
    balanced_accuracy_train,
    balanced_accuracy_test,
):
    """Print a short score summary for ``estimator``.

    The output is four lines: the estimator's string form followed by a colon,
    an 80-character dashed rule, then balanced accuracy and accuracy. Each
    score line shows the test value first and the train value in parentheses.
    """
    title = str(estimator) + ":"
    divider = "-" * 80

    print(title)
    print(divider)
    print(f"Balanced Accuracy: {balanced_accuracy_test} ({balanced_accuracy_train})")
    print(f"         Accuracy: {accuracy_test} ({accuracy_train})")

In [None]:
def train_mlp_classifier():
    """Grid-search an MLP-classifier pipeline and pass it to ``train_estimator``.

    The search covers the number of selected features (1 to 10), a single
    hidden layer of 1 to 10 units and five initial learning rates, i.e.
    10 x 10 x 5 = 500 candidates, each evaluated with 5-fold cross-validation.

    Returns
    -------
    None
    """
    from sklearn.neural_network import MLPClassifier

    pipeline = make_pipeline(
        # Weight initialisation is random; a fixed seed keeps reruns of the
        # notebook reproducible.
        estimator=MLPClassifier(max_iter=10000, random_state=0),
    )

    param_grid = {
        "selectkbest__k": range(1, 11),
        "estimator__hidden_layer_sizes": [(h,) for h in range(1, 11)],
        "estimator__learning_rate_init": [0.0001, 0.001, 0.01, 0.1, 1.0],
    }

    estimator = make_grid_search(
        estimator=pipeline,
        param_grid=param_grid,
        cv=5,
    )

    train_estimator(estimator)


# Runs the full grid search (500 candidates x 5 folds) and persists the winner.
train_mlp_classifier()
# NOTE(review): check_estimator is not defined in any cell visible in this
# notebook - confirm it exists, otherwise this line raises NameError on a
# fresh kernel.
check_estimator()