In [1]:
# Load the lab_black IPython extension (presumably auto-formats code cells with Black — confirm it is installed).
%load_ext lab_black

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

In [3]:
# Read the raw heart-disease records; the path is relative to the working directory.
csv_path = "heart_disease_data.csv"
df = pd.read_csv(csv_path)

In [4]:
# sex: (1 = male; 0 = female), per the UCI heart-disease attribute documentation.
# Keep the labels aligned with that encoding so downstream feature names
# (e.g. the one-hot "sex" indicator) describe the right group.
df = df.assign(sex=lambda df: df["sex"].replace({0: "female", 1: "male"}))

In [5]:
# thal codes -> readable labels (0 = normal; 1 = fixed defect; 2 = reversible defect).
# Values not listed in the mapping are left untouched by `replace`.
thal_labels = {0: "normal", 1: "fixed defect", 2: "reversible defect"}
df = df.assign(thal=df["thal"].replace(thal_labels))

In [6]:
# cp: chest pain type, recoded from integer codes to descriptive labels.
cp_labels = {
    0: "typical angina",
    1: "atypical angina",
    2: "non-anginal pain",
    3: "asymptomatic",
}
df = df.assign(cp=df["cp"].replace(cp_labels))

In [7]:
# fbs: whether fasting blood sugar exceeds 120 mg/dl (1 = true; 0 = false).
# The flag is spelled out as the strings "true"/"false" rather than booleans.
fbs_labels = {0: "false", 1: "true"}
df = df.assign(fbs=df["fbs"].replace(fbs_labels))

In [8]:
# restecg: resting electrocardiographic results, recoded to descriptive labels.
restecg_labels = {
    0: "normal",
    1: "ST-T wave abnormality",
    2: "left ventricular hypertrophy",
}
df = df.assign(restecg=df["restecg"].replace(restecg_labels))

In [9]:
# exang: exercise-induced angina flag (1 = yes; 0 = no), recoded to "yes"/"no".
exang_labels = {0: "no", 1: "yes"}
df = df.assign(exang=df["exang"].replace(exang_labels))

In [10]:
# slope: the slope of the peak exercise ST segment
# (0 = upsloping; 1 = flat; 2 = downsloping)
# This must read and write the "slope" column: "restecg" already holds string
# labels at this point, so targeting it would be a silent no-op and leave
# "slope" as raw integer codes.
df = df.assign(
    slope=lambda df: df["slope"].replace(
        {0: "upsloping", 1: "flat", 2: "downsloping"}
    )
)

In [11]:
# Name of the label column; it is dropped from the features and used as y in the next cell.
target = "condition"

In [12]:
# Feature frame: every column except the label column.
X = df.drop(columns=[target])
# Label values pulled out of the Series as its underlying array.
y = df[target].values

In [13]:
# Display the feature names so the column lists given to the ColumnTransformer can be checked against them.
X.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object')

In [21]:
# Columns forwarded unchanged vs. columns that are one-hot encoded.
numerical_columns = ["age", "trestbps", "chol", "thalach", "oldpeak", "ca"]
categorical_columns = ["thal", "sex", "cp", "fbs", "restecg", "exang", "slope"]

# drop="first" omits the first category of each categorical feature, so every
# feature contributes one fewer indicator column than it has categories.
cf = ColumnTransformer(
    transformers=[
        ("numerical", "passthrough", numerical_columns),
        ("categorical", OneHotEncoder(drop="first"), categorical_columns),
    ]
)

In [15]:
# One pipeline per candidate model, each starting with the shared preprocessing `cf`.
# The tree-based estimators are randomized, so their seeds are pinned (same
# value as the KFold splitter) to make the cross-validation scores repeatable
# across kernel restarts. Scores recorded before seeding will not match exactly.
lr_pipeline = make_pipeline(cf, LinearRegression())
dt_pipeline = make_pipeline(cf, DecisionTreeRegressor(random_state=42))
rf_pipeline = make_pipeline(cf, RandomForestRegressor(random_state=42))

In [16]:
# Fit the linear-regression pipeline on the full X, y (no hold-out split); the cell output is the fitted pipeline's repr.
lr_pipeline.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('numerical', 'passthrough',
                                                  ['age', 'trestbps', 'chol',
                                                   'thalach', 'oldpeak',
                                                   'ca']),
                                                 ('categorical',
                                                  OneHotEncoder(drop='first'),
                                                  ['thal', 'sex', 'cp', 'fbs',
                                                   'restecg', 'exang',
                                                   'slope'])])),
                ('linearregression', LinearRegression())])

In [17]:
# Shuffled 6-fold splitter with a fixed seed; the same splitter object is
# passed to every cross_validate call so all models see identical folds.
k_fold = KFold(n_splits=6, random_state=42, shuffle=True)
k_fold

KFold(n_splits=6, random_state=42, shuffle=True)

In [18]:
# Cross-validate the linear-regression pipeline and average the per-fold
# negated mean squared error (closer to zero is better).
lr_cv_results = cross_validate(
    lr_pipeline, X, y, cv=k_fold, scoring="neg_mean_squared_error"
)
cv_lr = lr_cv_results["test_score"].mean()
cv_lr

-0.1277330014062823

In [19]:
# Cross-validate the decision-tree pipeline and average the per-fold
# negated mean squared error (closer to zero is better).
dt_cv_results = cross_validate(
    dt_pipeline, X, y, cv=k_fold, scoring="neg_mean_squared_error"
)
cv_dt = dt_cv_results["test_score"].mean()
cv_dt

-0.2895918367346939

In [20]:
# Cross-validate the random-forest pipeline and average the per-fold
# negated mean squared error (closer to zero is better).
rf_cv_results = cross_validate(
    rf_pipeline, X, y, cv=k_fold, scoring="neg_mean_squared_error"
)
cv_rf = rf_cv_results["test_score"].mean()
cv_rf

-0.13840003401360543