In [None]:
#First, we need to import all the libraries we need
import sklearn
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt


Logistic Regression; Linear Regression; Naïve Bayes Classifier; Support Vector Machines; K-nearest neighbor; Random Forest

In [3]:
#Next, we read in the Excel file
df = pd.read_excel ('C:/Users/hruss/Desktop/OneDriveDocs/OneDrive/Documents/GMU/Datasets/heart.xlsx')

In [4]:
#Let's take a look at the head of the dataset
#df.head()
#Or the tail of it
#df.tail()
#Or an overall view of types and missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   914 non-null    object 
 3   RestingBP       914 non-null    float64
 4   Cholesterol     746 non-null    float64
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 86.2+ KB


In [5]:
#Split train/test dataset 80/20, without regard to class size
#Before we change anything at all on the dataset, we need to split into test and training sets.
#You want to know the true error rate on new untouched data, so you need an untouched set to test that
def split_train_test(data, test_ratio): 
    np.random.seed(42)
    shuffled_indices = np.random.permutation(len( data)) 
    test_set_size = int(len( data) * test_ratio) 
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:] 
    return data.iloc[train_indices], data.iloc[test_indices]
train_set, test_set = split_train_test(df, 0.2) 
#len(train_set) 
len(test_set)

183

In [6]:
#Split train/test dataset 80/20, ensuring representative class sizes for a column
#When you have uneven classes: in this case we have many more Male observations than Female
#In this case, we want to ensure that the training set and test set are representative of the full population
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42) 
for train_index, test_index in split.split(df, df["Sex"]): 
    strat_train_set = df.loc[train_index] 
    strat_test_set = df.loc[test_index]
strat_test_set["Sex"].value_counts() / len(strat_test_set)

M    0.788043
F    0.211957
Name: Sex, dtype: float64

In [7]:
#Double check the proportion of test data split to this population split
df["Sex"].value_counts() / len(df)

M    0.78976
F    0.21024
Name: Sex, dtype: float64

In [8]:
#Let's check describe again
strat_train_set.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,734.0,730.0,591.0,734.0,734.0,734.0,734.0
mean,56.014986,132.650685,269.92555,0.241144,135.982289,0.910899,0.569482
std,35.790828,17.803836,310.772119,0.428069,25.576912,1.069363,0.495486
min,28.0,80.0,85.0,0.0,60.0,-2.6,0.0
25%,48.0,120.0,208.0,0.0,118.25,0.0,0.0
50%,55.0,130.0,239.0,0.0,137.0,0.7,1.0
75%,60.0,140.0,278.5,0.0,155.0,1.575,1.0
max,660.0,200.0,6003.0,1.0,202.0,6.2,1.0


In [9]:
#A few things are wrong with the training data. Let's fix those before we go further.
#First, the age values go up to 660. Based on life expetancy, we'll set the max to 77.
#Cholesterol also has some seriously high values. High risk is 600, let's cap at 800.
for x in strat_train_set.index:
    if strat_train_set.loc[x, "Cholesterol"] > 800:
        strat_train_set.loc[x, "Cholesterol"] = 800
    if strat_train_set.loc[x, "Age"] > 77:
        strat_train_set.loc[x, "Age"] = 77
    if strat_train_set.loc[x, "RestingBP"] < 50:
        strat_train_set.loc[x, "RestingBP"] = 50    
strat_train_set.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,734.0,730.0,591.0,734.0,734.0,734.0,734.0
mean,53.904632,132.650685,249.099831,0.241144,135.982289,0.910899,0.569482
std,9.345084,17.803836,75.645999,0.428069,25.576912,1.069363,0.495486
min,28.0,80.0,85.0,0.0,60.0,-2.6,0.0
25%,48.0,120.0,208.0,0.0,118.25,0.0,0.0
50%,55.0,130.0,239.0,0.0,137.0,0.7,1.0
75%,60.0,140.0,278.5,0.0,155.0,1.575,1.0
max,77.0,200.0,800.0,1.0,202.0,6.2,1.0


In [10]:
#Let's see a count of nulls
strat_train_set.isnull().sum()

Age                 0
Sex                 0
ChestPainType       1
RestingBP           4
Cholesterol       143
FastingBS           0
RestingECG          0
MaxHR               0
ExerciseAngina      0
Oldpeak             0
ST_Slope            0
HeartDisease        0
dtype: int64

In [11]:
#We impute missing values
strat_train_set['ChestPainType'].fillna(strat_train_set['ChestPainType'].value_counts().index[0], inplace=True)
strat_train_set['RestingBP'].fillna(strat_train_set['RestingBP'].mean(), inplace=True)
strat_train_set['Cholesterol'].fillna(strat_train_set['Cholesterol'].mean(), inplace=True)
strat_train_set.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [12]:
#Ordinal doesn't always work. In this case, we want 1 Hot encoding (1 is hot or true, 0 is not or false)
#You'll wind up with a sparse matrix because there will be a lot of zeros.
from sklearn.preprocessing import OneHotEncoder 
cat_encoder = OneHotEncoder() 
strat_train_1hot = cat_encoder.fit_transform(strat_train_set[['Sex', 'ChestPainType', 'ExerciseAngina', 'ST_Slope']]) 
enc_names = cat_encoder.get_feature_names_out()
strat_train_1hot_df = pd.DataFrame(strat_train_1hot.todense(), columns = enc_names)
train_tr = pd.merge(strat_train_1hot_df, strat_train_set.drop(['Sex', 'ChestPainType', 'ExerciseAngina', 'ST_Slope'], axis=1), left_index=True, right_index=True)
train_tr.head()

Unnamed: 0,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,Age,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,Oldpeak,HeartDisease
0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,62,160.0,164.0,0,LVH,145,6.2,1
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,55,140.0,217.0,0,Normal,111,5.6,1
4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,61,120.0,282.0,0,ST,135,4.0,1
5,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,64,134.0,273.0,0,Normal,102,4.0,1
6,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,74,150.0,258.0,1,ST,130,4.0,1


In [None]:
#Classify observations as 1 or 0 for Heart Disease
heart_disease = (train_tr == 1)