In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_curve,
    roc_auc_score,
    f1_score,
    make_scorer,
)
from sklearn.tree import DecisionTreeRegressor, plot_tree, DecisionTreeClassifier
from category_encoders import LeaveOneOutEncoder

import matplotlib.pyplot as plt

%matplotlib inline

# https://www.kaggle.com/ronitf/heart-disease-uci?select=heart.csv

<IPython.core.display.Javascript object>

In [3]:
def eval_preds(y_true, y_pred):
    """Display some metrics for evaluating regression predictions"""
    error = y_true - y_pred

    rmse = np.mean(np.sqrt((error ** 2)))
    mae = np.mean(np.abs(error))
    mape = np.mean(np.abs((error / y_true)))

    print(f"rmse {rmse:.2f}")
    print(f"mae {mae:.2f}")
    print(f"mape {mape:.2f}")

    line_pts = [np.min(y_true), np.max(y_true)]
    plt.scatter(y_true, y_pred)
    plt.plot(line_pts, line_pts, c="red", ls="--", alpha=0.5)
    plt.xlabel("Actual")
    plt.ylabel("Fit")
    plt.show()

<IPython.core.display.Javascript object>

In [4]:
heart_df = pd.read_csv("data/heart.csv")

<IPython.core.display.Javascript object>

In [5]:
heart_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


<IPython.core.display.Javascript object>

In [6]:
heart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


<IPython.core.display.Javascript object>

In [7]:
heart_df.corr()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
age,1.0,-0.098447,-0.068653,0.279351,0.213678,0.121308,-0.116211,-0.398522,0.096801,0.210013,-0.168814,0.276326,0.068001,-0.225439
sex,-0.098447,1.0,-0.049353,-0.056769,-0.197912,0.045032,-0.058196,-0.04402,0.141664,0.096093,-0.030711,0.118261,0.210041,-0.280937
cp,-0.068653,-0.049353,1.0,0.047608,-0.076904,0.094444,0.044421,0.295762,-0.39428,-0.14923,0.119717,-0.181053,-0.161736,0.433798
trestbps,0.279351,-0.056769,0.047608,1.0,0.123174,0.177531,-0.114103,-0.046698,0.067616,0.193216,-0.121475,0.101389,0.06221,-0.144931
chol,0.213678,-0.197912,-0.076904,0.123174,1.0,0.013294,-0.15104,-0.00994,0.067023,0.053952,-0.004038,0.070511,0.098803,-0.085239
fbs,0.121308,0.045032,0.094444,0.177531,0.013294,1.0,-0.084189,-0.008567,0.025665,0.005747,-0.059894,0.137979,-0.032019,-0.028046
restecg,-0.116211,-0.058196,0.044421,-0.114103,-0.15104,-0.084189,1.0,0.044123,-0.070733,-0.05877,0.093045,-0.072042,-0.011981,0.13723
thalach,-0.398522,-0.04402,0.295762,-0.046698,-0.00994,-0.008567,0.044123,1.0,-0.378812,-0.344187,0.386784,-0.213177,-0.096439,0.421741
exang,0.096801,0.141664,-0.39428,0.067616,0.067023,0.025665,-0.070733,-0.378812,1.0,0.288223,-0.257748,0.115739,0.206754,-0.436757
oldpeak,0.210013,0.096093,-0.14923,0.193216,0.053952,0.005747,-0.05877,-0.344187,0.288223,1.0,-0.577537,0.222682,0.210244,-0.430696


<IPython.core.display.Javascript object>

In [8]:
# possible colinearity between thalach and [exang, oldpeak, ca]
cols_of_interest = [
    "cp",
    "thalach",
    "exang",
    "oldpeak",
    "slope",
    "ca",
    "thal",
    "age",
    "sex",
]

<IPython.core.display.Javascript object>

In [12]:
X = heart_df.drop(columns="target")
y = heart_df["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

tree1 = GridSearchCV(DecisionTreeClassifier(), )
tree1.fit(X_train, y_train)

print(tree1.score(X_train, y_train))
print(tree1.score(X_test, y_test))

1.0
0.7377049180327869


<IPython.core.display.Javascript object>

In [11]:
# rerun tree with only highly correlated columns

X = heart_df[cols_of_interest]
y = heart_df["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

tree2 = DecisionTreeClassifier()
tree2.fit(X_train, y_train)

print(tree2.score(X_train, y_train))
print(tree2.score(X_test, y_test))

1.0
0.7213114754098361


<IPython.core.display.Javascript object>