In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [2]:
## import data
set_1_path = 'data/heart.csv'
heart_df = pd.read_csv(set_1_path)
heart_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
# there are four different chest pain types
heart_df['ChestPainType'].value_counts()

ASY    496
NAP    203
ATA    173
TA      46
Name: ChestPainType, dtype: int64

In [4]:
# three different resting ecg responses
heart_df['RestingECG'].value_counts()

Normal    552
LVH       188
ST        178
Name: RestingECG, dtype: int64

In [5]:
# three st_slope values
heart_df['ST_Slope'].value_counts()

Flat    460
Up      395
Down     63
Name: ST_Slope, dtype: int64

In [6]:
# logistic regression
# create a data frame from the csv data
heart_df_log = pd.read_csv(set_1_path)
# one hot encoding with get_dummies
heart_df_log = pd.get_dummies(heart_df_log)
heart_df_log.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,0,1,0,...,0,0,0,1,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,1,0,0,...,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,0,1,0,...,0,0,0,0,1,1,0,0,0,1
3,48,138,214,0,108,1.5,1,1,0,1,...,0,0,0,1,0,0,1,0,1,0
4,54,150,195,0,122,0.0,0,0,1,0,...,1,0,0,1,0,1,0,0,0,1


In [7]:
# split into training and testing data sets
X = heart_df_log.drop('HeartDisease', axis=1)
y = heart_df_log['HeartDisease']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [8]:
# Create scaler instance
X_scaler = StandardScaler(copy=False)

# Fit the scaler
X_scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# create the classifier
# un-scaled
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8719346049046321
Testing Data Score: 0.8641304347826086


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# create the classifier
classifier_2 = LogisticRegression()
classifier_2.fit(X_train_scaled, y_train)
print(f"Training Data Score: {classifier_2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier_2.score(X_test_scaled, y_test)}")

Training Data Score: 0.8732970027247956
Testing Data Score: 0.8695652173913043


In [11]:
# so it scaled all 20 columns. Is this ok since many of them are binary? It doesn't seem like those should be scaled.
X_test_scaled_df = pd.DataFrame(X_test_scaled)
X_test_scaled_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.374025,0.419253,0.60436,-0.546914,0.323294,0.088357,-0.519515,0.519515,-1.088358,2.078182,-0.532181,-0.227103,-0.5174,-1.230325,2.087502,-1.213005,1.213005,-0.270369,0.997279,-0.869472
1,-2.624009,-0.13452,0.042503,-0.546914,2.567895,-0.834171,-0.519515,0.519515,-1.088358,2.078182,-0.532181,-0.227103,1.93274,-1.230325,-0.479041,0.824399,-0.824399,-0.270369,-1.002729,1.150124
2,0.15988,0.031612,-0.132502,-0.546914,-0.031117,-0.649665,-0.519515,0.519515,-1.088358,-0.48119,1.879059,-0.227103,-0.5174,-1.230325,2.087502,0.824399,-0.824399,-0.270369,-1.002729,1.150124
3,0.374025,-0.13452,1.028056,1.828441,0.441431,1.010884,-0.519515,0.519515,0.918815,-0.48119,-0.532181,-0.227103,-0.5174,-1.230325,2.087502,-1.213005,1.213005,-0.270369,0.997279,-0.869472
4,-0.589629,-0.688293,0.779365,-0.546914,-0.66118,-0.834171,1.924871,-1.924871,-1.088358,2.078182,-0.532181,-0.227103,-0.5174,0.812794,-0.479041,0.824399,-0.824399,-0.270369,-1.002729,1.150124


In [12]:
X_test.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
70,57,140,265,0,145,1.0,0,1,0,1,0,0,0,0,1,0,1,0,1,0
829,29,130,204,0,202,0.0,0,1,0,1,0,0,1,0,0,1,0,0,0,1
597,55,133,185,0,136,0.2,0,1,0,0,1,0,0,0,1,1,0,0,0,1
478,57,130,311,1,148,2.0,0,1,1,0,0,0,0,0,1,0,1,0,1,0
9,48,120,284,0,120,0.0,1,0,0,1,0,0,0,1,0,1,0,0,0,1
