In [1]:
# Load the `lab_black` IPython extension (presumably auto-formats code cells
# with Black — confirm it is installed in the kernel environment).
%load_ext lab_black

In [2]:
import pandas as pd

from matplotlib import pyplot as plt
import numpy as np

# from sklearn.tree import export_graphviz
# from sklearn.externals.six import StringIO
# from IPython.display import Image
# import pydotplus

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
import joblib

In [3]:
# Read the heart-disease dataset from a CSV file; the path is relative, so the
# file is looked up in the notebook's working directory.
df = pd.read_csv("heart_disease_data.csv")

In [4]:
# df["index"] = range(1, len(df) + 1)

In [5]:
# df.set_index("index")

In [6]:
# Show the dtype of every column (the alternative `df.columns` view is left
# disabled below).
# df.columns
df.dtypes

age            int64
sex            int64
cp             int64
trestbps       int64
chol           int64
fbs            int64
restecg        int64
thalach        int64
exang          int64
oldpeak      float64
slope          int64
ca             int64
thal           int64
condition      int64
dtype: object

In [7]:
# Preview the first rows of the data as loaded, before any columns are dropped
# or recoded.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


In [8]:
# Remove the features that are not used by the model, then display the
# remaining column names.
unused_columns = ["restecg", "oldpeak", "slope", "ca", "thal"]
df = df.drop(columns=unused_columns)
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'thalach', 'exang',
       'condition'],
      dtype='object')

In [9]:
# Count how many rows fall into each value of the `condition` column to check
# the class balance.
df["condition"].value_counts()

0    160
1    137
Name: condition, dtype: int64

In [10]:
# Recode exang (exercise induced angina) from its 0/1 flag to readable labels:
# 1 = yes; 0 = no. Values that are not keys of the mapping are left unchanged.
exang_labels = {0: "no", 1: "yes"}
df = df.assign(exang=df["exang"].replace(exang_labels))

In [11]:
# Recode sex from its 0/1 code to readable labels: 0 = female; 1 = male.
# Values that are not keys of the mapping are left unchanged.
sex_labels = {0: "female", 1: "male"}
df = df.assign(sex=df["sex"].replace(sex_labels))

In [12]:
# Recode fbs (fasting blood sugar > 120 mg/dl) from its 0/1 flag to readable
# labels: 1 = yes; 0 = no. Values that are not keys of the mapping are left
# unchanged.
fbs_labels = {0: "no", 1: "yes"}
df = df.assign(fbs=df["fbs"].replace(fbs_labels))

In [13]:
# Recode cp (chest pain type) from its integer code to a descriptive label.
# Values that are not keys of the mapping are left unchanged.
cp_labels = {
    0: "typical angina",
    1: "atypical angina",
    2: "non-anginal pain",
    3: "asymptomatic",
}
df = df.assign(cp=df["cp"].replace(cp_labels))

In [14]:
# Name of the label column the model predicts.
target = "condition"

In [15]:
# Separate the feature matrix (every column except the target) from the
# label vector.
X, y = df.drop(columns=[target]), df[target]

In [16]:
# Preprocessing step shared by the model pipeline:
#   * numerical columns are passed through untouched;
#   * categorical (string-labelled) columns are one-hot encoded, with
#     drop="first" removing one indicator per feature to avoid redundant,
#     perfectly collinear columns.
numerical_features = ["age", "trestbps", "chol", "thalach"]
categorical_features = ["sex", "cp", "fbs", "exang"]

# Transformer names are used as prefixes for nested parameter names and
# output feature names, so they must be plain identifiers. The previous name
# '"categorical"' carried literal double-quote characters inside the string.
cf = ColumnTransformer(
    [
        ("numerical", "passthrough", numerical_features),
        ("categorical", OneHotEncoder(drop="first"), categorical_features),
    ]
)

In [17]:
# Logistic Regression Model
# The preprocessing transformer feeds directly into the classifier, so raw
# feature frames can be passed to fit/predict. max_iter is raised to 1000
# for the solver.
classifier = LogisticRegression(max_iter=1000)
lr_pipeline = make_pipeline(cf, classifier)

In [18]:
# 6-fold splitter that shuffles rows with a fixed seed so the folds are
# reproducible.
# NOTE(review): `k_fold` is not referenced by any later cell shown here —
# confirm whether a cross-validated evaluation was intended.
k_fold = KFold(n_splits=6, shuffle=True, random_state=42)

In [19]:
# Fit the preprocessing + logistic-regression pipeline on all rows of X/y
# (no hold-out split is made in the visible cells). The call is the cell's
# last expression, so its return value is displayed as the output.
lr_pipeline.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('numerical', 'passthrough',
                                                  ['age', 'trestbps', 'chol',
                                                   'thalach']),
                                                 ('"categorical"',
                                                  OneHotEncoder(drop='first'),
                                                  ['sex', 'cp', 'fbs',
                                                   'exang'])])),
                ('logisticregression', LogisticRegression(max_iter=1000))])

In [20]:
# lr_pipeline.predict([[65, 0, 3, 110, 264, 1, 131, 1]])[0]

In [21]:
# joblib.dump(lr_pipeline, "clf.joblib")

In [25]:
# Sanity-check the fitted pipeline on one hand-written record. Categorical
# fields use the same string labels the training frame was recoded to.
sample_patient = pd.DataFrame(
    [
        {
            "age": 65,
            "sex": "male",
            "cp": "asymptomatic",
            "trestbps": 110,
            "chol": 264,
            "fbs": "yes",
            "thalach": 131,
            "exang": "yes",
        }
    ]
)
predicted_labels = lr_pipeline.predict(sample_patient)
predicted_labels[0]

1

In [None]:
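# Disabled: the same record expressed with the original integer codes.
# NOTE(review): the pipeline is fit on string-labelled categories (see the
# recoding cells), so this integer-coded input presumably no longer matches
# the encoder's known categories — verify before re-enabling.
# lr_pipeline.predict(pd.DataFrame({
#     "age": 65,
#     "sex": 0,
#     "cp": 3,
#     "trestbps": 110,
#     "chol": 264,
#     "fbs": 1,
#     "thalach": 131,
#     "exang": 1}, index = [0]))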