In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import pickle

In [19]:
# Load the raw heart-disease dataset (path is relative to this notebook)
raw_csv_path = '../../data/Heart/heart-disease.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [20]:
# Discard the columns that are not kept as model features
unused_columns = ["education", "currentSmoker", "prevalentHyp", "sysBP", "glucose"]
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,male,age,cigsPerDay,BPMeds,prevalentStroke,diabetes,totChol,diaBP,BMI,heartRate,TenYearCHD
0,1,39,0.0,0.0,0,0,195.0,70.0,26.97,80.0,0
1,0,46,0.0,0.0,0,0,250.0,81.0,28.73,95.0,0
2,1,48,20.0,0.0,0,0,245.0,80.0,25.34,75.0,0
3,0,61,30.0,0.0,0,0,225.0,95.0,28.58,65.0,1
4,0,46,23.0,0.0,0,0,285.0,84.0,23.1,85.0,0


In [21]:
# Map the original dataset headers to the short names used in the rest of the notebook
column_aliases = {
    "male": "sex",
    "cigsPerDay": "daily_cigs",
    "BPMeds": "bp_meds",
    "prevalentStroke": "stroke_risk",
    "diaBP": "bp",
    "totChol": "cholestrol",
    "BMI": "bmi",
    "heartRate": "heart_rate",
    "TenYearCHD": "has_disease",
}
df = df.rename(columns=column_aliases)
df.head()

Unnamed: 0,sex,age,daily_cigs,bp_meds,stroke_risk,diabetes,cholestrol,bp,bmi,heart_rate,has_disease
0,1,39,0.0,0.0,0,0,195.0,70.0,26.97,80.0,0
1,0,46,0.0,0.0,0,0,250.0,81.0,28.73,95.0,0
2,1,48,20.0,0.0,0,0,245.0,80.0,25.34,75.0,0
3,0,61,30.0,0.0,0,0,225.0,95.0,28.58,65.0,1
4,0,46,23.0,0.0,0,0,285.0,84.0,23.1,85.0,0


In [22]:
# Flip the sex encoding so that F = 1 and M = 0 (the source column is "male").
# NOTE: this cell is not idempotent - running it twice flips the encoding back.
df['sex'] = 1 - df['sex']

# Replace missing values with a fixed default per column
missing_value_defaults = {
    'daily_cigs': 0,
    'bp_meds': 0,
    'cholestrol': 220,
    'bmi': 22,
    'heart_rate': 65,
}
df = df.fillna(value=missing_value_defaults)

# Cast the count-like / flag-like columns to integers in a single pass
# (safe only after the fill above, since NaN cannot be cast to int)
integer_columns = ['daily_cigs', 'bp_meds', 'cholestrol', 'heart_rate']
df = df.astype({column: int for column in integer_columns})

df.head()

Unnamed: 0,sex,age,daily_cigs,bp_meds,stroke_risk,diabetes,cholestrol,bp,bmi,heart_rate,has_disease
0,0,39,0,0,0,0,195,70.0,26.97,80,0
1,1,46,0,0,0,0,250,81.0,28.73,95,0
2,0,48,20,0,0,0,245,80.0,25.34,75,0
3,1,61,30,0,0,0,225,95.0,28.58,65,1
4,1,46,23,0,0,0,285,84.0,23.1,85,0


In [23]:
# Persist the cleaned frame next to the raw data.
# The row index is written as an unnamed first column (pandas default).
processed_csv_path = '../../data/Heart/heart-disease-processed.csv'
df.to_csv(processed_csv_path)

In [31]:
# Build the feature matrix X (every column except the target) and the target vector y
predict_column = "has_disease"
# Use the explicit `columns=` keyword: passing the axis positionally
# (`df.drop(labels, 1)`) was deprecated in pandas 1.3 and raises a
# TypeError from pandas 2.0 onwards.
X = np.array(df.drop(columns=[predict_column]))
y = np.array(df[predict_column])

In [25]:
# Train Test Split
# Hold out 10% of the rows for evaluation.
# - random_state pins the shuffle so the split (and every metric computed
#   from it) is reproducible across kernel restarts.
# - stratify=y keeps the has_disease class ratio the same in both splits,
#   which matters for a small test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

In [27]:
# Fit a logistic-regression classifier on the training split.
# max_iter is raised to 5000 to give the solver more iterations to converge.
model = LogisticRegression(max_iter=5000)
model.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=5000)

In [28]:
# Mean accuracy of the fitted model on the held-out split
accuracy = model.score(X=X_test, y=y_test)
accuracy

0.8537735849056604

In [36]:
# Predict Custom Input
# One sample with 10 feature values; the order must match the columns of X.
# Presumably: sex, age, daily_cigs, bp_meds, stroke_risk, diabetes,
# cholestrol, bp, bmi, heart_rate - verify against the training frame.
custom_input = np.array([[0, 19, 0, 0, 0, 0, 200, 80, 21, 80]])
prediction = model.predict(custom_input)
probability = model.predict_proba(custom_input)

# predict_proba returns one column per entry of model.classes_, in that order.
# Report the probability of the class that was actually predicted; the
# previous code always printed column 0, i.e. P(class 0), which is wrong
# whenever the prediction is 1.
predicted_label = prediction[0]
predicted_class_index = list(model.classes_).index(predicted_label)
confidence = probability[0][predicted_class_index]

# Print Values
print("Prediction: ", predicted_label)
print("Confidence: ", confidence)

Prediction:  0
Confidence:  0.9872206326353129


In [37]:
# Serialize the trained model to disk with pickle.
# Only load this file back from a trusted location: unpickling runs arbitrary code.
model_path = "../Heart.bin"
with open(model_path, "wb") as model_file:
    pickle.dump(model, model_file)