In [1]:
# Load the `lab_black` IPython extension (presumably Black auto-formatting of cells — confirm).
%load_ext lab_black

In [21]:
import joblib
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# from sklearn.tree import export_graphviz
# from sklearn.externals.six import StringIO
# from IPython.display import Image
# import pydotplus

from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree

In [3]:
# Read the heart-disease table from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer="heart_disease_data.csv")

In [4]:
# Column names together with their dtypes (a superset of what `df.columns` shows).
df.dtypes

age            int64
sex            int64
cp             int64
trestbps       int64
chol           int64
fbs            int64
restecg        int64
thalach        int64
exang          int64
oldpeak      float64
slope          int64
ca             int64
thal           int64
condition      int64
dtype: object

In [5]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


In [6]:
# Remove the columns that are not carried into the model, in one call.
dropped_columns = ["restecg", "oldpeak", "slope", "ca", "thal"]
df = df.drop(columns=dropped_columns)
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'thalach', 'exang',
       'condition'],
      dtype='object')

In [7]:
# Class balance of the target column.
df.loc[:, "condition"].value_counts()

0    160
1    137
Name: condition, dtype: int64

In [8]:
# Display labels for the two target classes.
# `condition` itself stays 0/1 in `df` because it is the prediction target.
# The labels must not be written into a feature column: storing them in `exang`
# would overwrite that feature with a copy of the target (label leakage into X)
# and turn the later 0/1 -> "no"/"yes" recode of `exang` into a no-op.
condition_labels = {0: "No Heart Disease", 1: "Yes Heart Disease"}

In [9]:
# exang = exercise induced angina; recode 1 -> "yes", 0 -> "no".
exang_labels = {0: "no", 1: "yes"}
df = df.assign(exang=df["exang"].replace(exang_labels))

In [10]:
# fbs = fasting blood sugar > 120 mg/dl; recode 1 -> "true", 0 -> "false".
fbs_labels = {0: "false", 1: "true"}
df = df.assign(fbs=df["fbs"].replace(fbs_labels))

In [11]:
# cp = chest pain type; swap the integer codes for readable labels.
cp_labels = {
    0: "typical angina",
    1: "atypical angina",
    2: "non-anginal pain",
    3: "asymptomatic",
}
df = df.assign(cp=df["cp"].replace(cp_labels))

In [12]:
# Name of the label column; it is split off from the feature columns below.
target: str = "condition"

In [13]:
# Features: every column except the target. Labels: the target as a NumPy array.
X = df.drop(columns=target)
y = df[target].to_numpy()

In [14]:
# Display the label array.
y

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1])

In [15]:
# Feature columns that remain after removing the target.
X.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'thalach', 'exang'], dtype='object')

In [27]:
# Column-wise preprocessing for the models:
# - numeric measurements are passed through unchanged;
# - categorical columns are one-hot encoded, dropping the first level of each
#   so the indicator columns are not redundant.
# The transformer names are plain identifiers: they become prefixes of the output
# feature names and keys of `named_transformers_`, so stray quote characters in a
# name would leak into both.
numeric_features = ["age", "trestbps", "chol", "thalach"]
categorical_features = ["sex", "cp", "fbs", "exang"]

cf = ColumnTransformer(
    [
        ("numerical", "passthrough", numeric_features),
        ("categorical", OneHotEncoder(drop="first"), categorical_features),
    ]
)

In [28]:
# Encoder + estimator pipelines, one per model family.
# `condition` is a binary class label, so each pipeline ends in a classifier;
# a regressor would output continuous scores that the accuracy and
# classification-report metrics used in this notebook cannot consume.
lr_pipeline = make_pipeline(cf, LogisticRegression())
dt_pipeline = make_pipeline(cf, DecisionTreeClassifier())
rf_pipeline = make_pipeline(cf, RandomForestClassifier())

In [29]:
# Shuffled 6-fold splitter with a fixed seed so the folds are repeatable.
k_fold = KFold(
    n_splits=6,
    random_state=42,
    shuffle=True,
)

In [None]:
# Seeded hold-out split, stratified on the label to keep the class ratio in both parts.
split_parts = train_test_split(X, y, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Logistic Regression Model
# X_train / X_test still hold string categories (cp, fbs, exang), which the
# estimator cannot parse as numbers, so it is fitted through the `cf` encoder.
# The pipeline fits `lr` in place, so `lr` remains the fitted estimator.

lr = LogisticRegression()
lr_model = make_pipeline(cf, lr)
lr_model.fit(X_train, y_train)
y_test_pred_lr = lr_model.predict(X_test)
print(classification_report(y_test, y_test_pred_lr))

In [None]:
# Test-set accuracy of the logistic regression predictions.

accuracy_score(y_true=y_test, y_pred=y_test_pred_lr)

In [None]:
# Baseline: dummy classifier to compare the real models against.

dc = DummyClassifier().fit(X_train, y_train)
y_test_pred_dc = dc.predict(X_test)
print(classification_report(y_test, y_test_pred_dc))

In [None]:
# Test-set accuracy of the dummy baseline.

accuracy_score(y_true=y_test, y_pred=y_test_pred_dc)

In [None]:
# Decision Tree Model (unpruned)
# Fitted through the `cf` encoder because X_train / X_test contain string
# categories (cp, fbs, exang). The pipeline fits `dt` in place, so `dt` is
# still the fitted tree afterwards. The seed makes the tree repeatable.

dt = DecisionTreeClassifier(random_state=42)
dt_model = make_pipeline(cf, dt)
dt_model.fit(X_train, y_train)
y_test_pred_dt = dt_model.predict(X_test)
print(classification_report(y_test, y_test_pred_dt))

In [None]:
# Test-set accuracy of the unpruned decision tree.

accuracy_score(y_true=y_test, y_pred=y_test_pred_dt)

In [None]:
# Unpruned Tree
# Drawn with scikit-learn's plot_tree: the graphviz / pydotplus / StringIO / Image
# names are only present as commented-out imports, so the export_graphviz route
# raises NameError. The figure is still written to "tree.png".

fig, ax = plt.subplots(figsize=(24, 12))
plot_tree(dt, filled=True, ax=ax)
fig.savefig("tree.png", bbox_inches="tight")

In [None]:
# Decision tree with the gini criterion.
# No depth limit is set here, so nothing is pruned yet; this is the criterion
# comparison. Fitted through the `cf` encoder because X_train / X_test contain
# string categories; the pipeline fits `dt_p1` in place.

dt_p1 = DecisionTreeClassifier(criterion="gini", random_state=42)
dt_p1_model = make_pipeline(cf, dt_p1)
dt_p1_model.fit(X_train, y_train)
y_test_pred_dt_p1 = dt_p1_model.predict(X_test)
print("Criterion=gini", classification_report(y_test, y_test_pred_dt_p1))

In [None]:
# Test-set accuracy of the gini decision tree.

accuracy_score(y_true=y_test, y_pred=y_test_pred_dt_p1)

In [None]:
# Decision tree with the entropy criterion.
# No depth limit is set here, so nothing is pruned yet; this is the criterion
# comparison. Fitted through the `cf` encoder because X_train / X_test contain
# string categories; the pipeline fits `dt_p2` in place.

dt_p2 = DecisionTreeClassifier(criterion="entropy", random_state=42)
dt_p2_model = make_pipeline(cf, dt_p2)
dt_p2_model.fit(X_train, y_train)
y_test_pred_dt_p2 = dt_p2_model.predict(X_test)
print("Criterion=entropy", classification_report(y_test, y_test_pred_dt_p2))

In [None]:
# Test-set accuracy of the entropy decision tree.

accuracy_score(y_true=y_test, y_pred=y_test_pred_dt_p2)

In [None]:
# Prune the tree - loop to find best depth
# For max_depth 1..29, record the test accuracy of a gini tree and an entropy tree.
# Each tree is fitted through the `cf` encoder (X_train / X_test contain string
# categories) and seeded so the curve is repeatable.
# NOTE(review): the depth is selected on the test split, so the test accuracy of
# the chosen depth is optimistic; cross-validating on the training split with
# the already-defined `k_fold` would avoid that.

max_depth_dt = []
acc_gini_dt = []
acc_entropy_dt = []
for i in range(1, 30):
    for criterion, scores in (("gini", acc_gini_dt), ("entropy", acc_entropy_dt)):
        tree_dt = make_pipeline(
            cf,
            DecisionTreeClassifier(criterion=criterion, max_depth=i, random_state=42),
        )
        tree_dt.fit(X_train, y_train)
        scores.append(accuracy_score(y_test, tree_dt.predict(X_test)))

    max_depth_dt.append(i)

df_dt = pd.DataFrame(
    {
        "acc_gini_dt": acc_gini_dt,
        "acc_entropy_dt": acc_entropy_dt,
        "max_depth_dt": max_depth_dt,
    }
)

# visualizing changes in parameters
plt.plot("max_depth_dt", "acc_gini_dt", data=df_dt, label="gini")
plt.plot("max_depth_dt", "acc_entropy_dt", data=df_dt, label="entropy")
plt.xlabel("Depth")
plt.ylabel("Accuracy Score")
plt.legend();

In [None]:
# Prune tree using entropy at 7
# Fitted through the `cf` encoder because X_train / X_test contain string
# categories. The pipeline fits `dt_p_fin` in place, so it remains the fitted,
# depth-limited tree. The seed makes the result repeatable.

dt_p_fin = DecisionTreeClassifier(criterion="entropy", max_depth=7, random_state=42)
dt_p_fin_model = make_pipeline(cf, dt_p_fin)
dt_p_fin_model.fit(X_train, y_train)
y_test_pred_dt_p_fin = dt_p_fin_model.predict(X_test)
print("Criterion=entropy", classification_report(y_test, y_test_pred_dt_p_fin))

In [None]:
# Test-set accuracy of the entropy tree limited to depth 7.

accuracy_score(y_true=y_test, y_pred=y_test_pred_dt_p_fin)

In [None]:
# Graphic for decision tree model pruned at 7 using entropy
# Drawn with scikit-learn's plot_tree: the graphviz / pydotplus / StringIO / Image
# names are only present as commented-out imports, so the export_graphviz route
# raises NameError. The figure is written to "tree.png", the same file name the
# unpruned-tree graphic uses, so it replaces that image.

fig, ax = plt.subplots(figsize=(24, 12))
plot_tree(dt_p_fin, filled=True, ax=ax)
fig.savefig("tree.png", bbox_inches="tight")

In [None]:
# Random Forest Model
# Fitted through the `cf` encoder because X_train / X_test contain string
# categories (cp, fbs, exang). The pipeline fits `rf` in place. The forest is
# seeded so the reported scores are repeatable.

rf = RandomForestClassifier(random_state=42)
rf_model = make_pipeline(cf, rf)
rf_model.fit(X_train, y_train)
y_test_pred_rf = rf_model.predict(X_test)
print(classification_report(y_test, y_test_pred_rf))

In [None]:
# Test-set accuracy of the default random forest.

accuracy_score(y_true=y_test, y_pred=y_test_pred_rf)

In [None]:
# Random forest with the gini criterion.
# No depth limit is set here, so nothing is pruned yet; this is the criterion
# comparison. Fitted through the `cf` encoder because X_train / X_test contain
# string categories; the pipeline fits `rf_p1` in place. Seeded for repeatability.

rf_p1 = RandomForestClassifier(criterion="gini", random_state=42)
rf_p1_model = make_pipeline(cf, rf_p1)
rf_p1_model.fit(X_train, y_train)
y_test_pred_rf_p1 = rf_p1_model.predict(X_test)
print("Criterion=gini", classification_report(y_test, y_test_pred_rf_p1))

In [None]:
# Test-set accuracy of the gini random forest.

accuracy_score(y_true=y_test, y_pred=y_test_pred_rf_p1)

In [None]:
# Random forest with the entropy criterion.
# No depth limit is set here, so nothing is pruned yet; this is the criterion
# comparison. Fitted through the `cf` encoder because X_train / X_test contain
# string categories; the pipeline fits `rf_p2` in place. Seeded for repeatability.

rf_p2 = RandomForestClassifier(criterion="entropy", random_state=42)
rf_p2_model = make_pipeline(cf, rf_p2)
rf_p2_model.fit(X_train, y_train)
y_test_pred_rf_p2 = rf_p2_model.predict(X_test)
print("Criterion=entropy", classification_report(y_test, y_test_pred_rf_p2))

In [None]:
# Test-set accuracy of the entropy random forest.

accuracy_score(y_true=y_test, y_pred=y_test_pred_rf_p2)

In [None]:
# Prune the forest - loop to find best depth
# For max_depth 1..29, record the test accuracy of a gini forest and an entropy forest.
# Each forest is fitted through the `cf` encoder (X_train / X_test contain string
# categories) and seeded so the curve is repeatable.
# NOTE(review): the depth is selected on the test split, so the test accuracy of
# the chosen depth is optimistic; cross-validating on the training split with
# the already-defined `k_fold` would avoid that.

max_depth_rf = []
acc_gini_rf = []
acc_entropy_rf = []
for i in range(1, 30):
    for criterion, scores in (("gini", acc_gini_rf), ("entropy", acc_entropy_rf)):
        tree_rf = make_pipeline(
            cf,
            RandomForestClassifier(criterion=criterion, max_depth=i, random_state=42),
        )
        tree_rf.fit(X_train, y_train)
        scores.append(accuracy_score(y_test, tree_rf.predict(X_test)))

    max_depth_rf.append(i)

df_rf = pd.DataFrame(
    {
        "acc_gini_rf": acc_gini_rf,
        "acc_entropy_rf": acc_entropy_rf,
        "max_depth_rf": max_depth_rf,
    }
)

# visualizing changes in parameters
plt.plot("max_depth_rf", "acc_gini_rf", data=df_rf, label="gini")
plt.plot("max_depth_rf", "acc_entropy_rf", data=df_rf, label="entropy")
plt.xlabel("Depth")
plt.ylabel("Accuracy Score")
plt.legend();

In [None]:
# Prune forest using entropy at 9
# Fitted through the `cf` encoder because X_train / X_test contain string
# categories. The pipeline fits `rf_p_fin` in place. Seeded for repeatability.

rf_p_fin = RandomForestClassifier(criterion="entropy", max_depth=9, random_state=42)
rf_p_fin_model = make_pipeline(cf, rf_p_fin)
rf_p_fin_model.fit(X_train, y_train)
y_test_pred_rf_p_fin = rf_p_fin_model.predict(X_test)
print("Criterion=entropy", classification_report(y_test, y_test_pred_rf_p_fin))

In [None]:
# Test-set accuracy of the entropy forest limited to depth 9.

accuracy_score(y_true=y_test, y_pred=y_test_pred_rf_p_fin)

In [None]:
dt_p_fin.fit(X, y)

In [None]:
dt_p_fin.predict([[65, 0, 3, 110, 264, 1, 131, 1]])[0]

In [None]:
# joblib.dump(dt_p_fin, "clf.joblib")