# Heart Data

### Attribute Description

1 age: age in years

2 sex: sex (1 = male; 0 = female)

3 cp: chest pain type
-- Value 1: typical angina
-- Value 2: atypical angina
-- Value 3: non-anginal pain
-- Value 4: asymptomatic

4 trestbps: resting blood pressure (in mm Hg on admission to the hospital)

5 chol: serum cholestoral in mg/dl

6 fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

7 restecg: resting electrocardiographic results
-- Value 0: normal
-- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
-- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

8 thalach: maximum heart rate achieved

9 exang: exercise induced angina (1 = yes; 0 = no)

10 oldpeak = ST depression induced by exercise relative to rest

11 slope: the slope of the peak exercise ST segment
-- Value 1: upsloping
-- Value 2: flat
-- Value 3: downsloping

12 ca: number of major vessels (0-3) colored by flourosopy

13 thal: 3 = normal; 6 = fixed defect; 7 = reversable defect

14 target: diagnosis of heart disease (angiographic disease status)
-- Value 0: < 50% diameter narrowing
-- Value 1: > 50% diameter narrowing

### Data Import

In [1]:
# importing pandas library

import pandas as pd

In [2]:
# loading a dataset into a pandas dataframe using read_csv() function

raw_data = pd.read_csv('Desktop/Data/heart.csv')

In [3]:
# viewing first five rows of dataset using head() function

raw_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [4]:
# a quick glance at the number of rows, number of attributes and their datatypes

raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null float64
sex         303 non-null float64
cp          303 non-null float64
trestbps    303 non-null float64
chol        303 non-null float64
fbs         303 non-null float64
restecg     303 non-null float64
thalach     303 non-null float64
exang       303 non-null float64
oldpeak     303 non-null float64
slope       303 non-null float64
ca          303 non-null object
thal        303 non-null object
target      303 non-null int64
dtypes: float64(11), int64(1), object(2)
memory usage: 33.2+ KB


In [5]:
# a short statistical summary of the dataset

raw_data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.458746
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.49912
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,1.0


In [6]:
# a short statistical summary of the dataset, including categorical variables

raw_data.describe(include='all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
unique,,,,,,,,,,,,5.0,4.0,
top,,,,,,,,,,,,0.0,3.0,
freq,,,,,,,,,,,,176.0,166.0,
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,,,0.458746
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,,,0.49912
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,,,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,,,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,,,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,,,1.0


### Dealing with Missing Values

In [7]:
# checking unique values of a columns along with the number of occurences of each value

raw_data.ca.value_counts()

0.0    176
1.0     65
2.0     38
3.0     20
?        4
Name: ca, dtype: int64

In [8]:
raw_data.thal.value_counts()

3.0    166
7.0    117
6.0     18
?        2
Name: thal, dtype: int64

In [9]:
# replacing missing values denoted by '?' with NumPy NaN value

import numpy as np
raw_data['ca'] = raw_data['ca'].replace({'?':np.nan})
raw_data['thal'] = raw_data['thal'].replace({'?':np.nan})

In [10]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null float64
sex         303 non-null float64
cp          303 non-null float64
trestbps    303 non-null float64
chol        303 non-null float64
fbs         303 non-null float64
restecg     303 non-null float64
thalach     303 non-null float64
exang       303 non-null float64
oldpeak     303 non-null float64
slope       303 non-null float64
ca          299 non-null object
thal        301 non-null object
target      303 non-null int64
dtypes: float64(11), int64(1), object(2)
memory usage: 33.2+ KB


In [11]:
raw_data.describe(include='all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,299.0,301.0,303.0
unique,,,,,,,,,,,,4.0,3.0,
top,,,,,,,,,,,,0.0,3.0,
freq,,,,,,,,,,,,176.0,166.0,
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,,,0.458746
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,,,0.49912
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,,,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,,,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,,,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,,,1.0


In [12]:
data = raw_data.copy()

In [13]:
data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
target      0
dtype: int64

In [14]:
data.ca = data.ca.fillna(round(data.ca.mean(), 0))

TypeError: must be str, not int

In [15]:
data['ca'] = data['ca'].astype(float)

In [16]:
data.ca = data.ca.fillna(round(data.ca.mean(), 0))

In [17]:
data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        2
target      0
dtype: int64

In [18]:
data.thal = data.thal.fillna('3.0')

In [19]:
data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

### Dealing with Categorical Variables

In [20]:
data_filled = data.copy()

In [21]:
pd.get_dummies(data_filled['sex'])

Unnamed: 0,0.0,1.0
0,0,1
1,0,1
2,0,1
3,0,1
4,1,0
5,0,1
6,1,0
7,1,0
8,0,1
9,0,1


In [22]:
data_dummies = pd.get_dummies(data_filled[['sex','cp','restecg','slope','thal']])

In [23]:
data_dummies.head()

Unnamed: 0,sex,cp,restecg,slope,thal_3.0,thal_6.0,thal_7.0
0,1.0,1.0,2.0,3.0,0,1,0
1,1.0,4.0,2.0,2.0,1,0,0
2,1.0,4.0,2.0,2.0,0,0,1
3,1.0,3.0,0.0,3.0,1,0,0
4,0.0,2.0,2.0,1.0,1,0,0


In [24]:
data_filled[['sex','cp','restecg','slope']] = data_filled[['sex','cp','restecg','slope']].astype(str)

In [25]:
data_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null float64
sex         303 non-null object
cp          303 non-null object
trestbps    303 non-null float64
chol        303 non-null float64
fbs         303 non-null float64
restecg     303 non-null object
thalach     303 non-null float64
exang       303 non-null float64
oldpeak     303 non-null float64
slope       303 non-null object
ca          303 non-null float64
thal        303 non-null object
target      303 non-null int64
dtypes: float64(8), int64(1), object(5)
memory usage: 33.2+ KB


In [26]:
data_dummies = pd.get_dummies(data_filled[['sex','cp','restecg','slope','thal']])

In [27]:
data_dummies.head()

Unnamed: 0,sex_0.0,sex_1.0,cp_1.0,cp_2.0,cp_3.0,cp_4.0,restecg_0.0,restecg_1.0,restecg_2.0,slope_1.0,slope_2.0,slope_3.0,thal_3.0,thal_6.0,thal_7.0
0,0,1,1,0,0,0,0,0,1,0,0,1,0,1,0
1,0,1,0,0,0,1,0,0,1,0,1,0,1,0,0
2,0,1,0,0,0,1,0,0,1,0,1,0,0,0,1
3,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0
4,1,0,0,1,0,0,0,0,1,1,0,0,1,0,0


In [28]:
data_dummies.columns

Index(['sex_0.0', 'sex_1.0', 'cp_1.0', 'cp_2.0', 'cp_3.0', 'cp_4.0',
       'restecg_0.0', 'restecg_1.0', 'restecg_2.0', 'slope_1.0', 'slope_2.0',
       'slope_3.0', 'thal_3.0', 'thal_6.0', 'thal_7.0'],
      dtype='object')

### Feature Scaling

In [29]:
data_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null float64
sex         303 non-null object
cp          303 non-null object
trestbps    303 non-null float64
chol        303 non-null float64
fbs         303 non-null float64
restecg     303 non-null object
thalach     303 non-null float64
exang       303 non-null float64
oldpeak     303 non-null float64
slope       303 non-null object
ca          303 non-null float64
thal        303 non-null object
target      303 non-null int64
dtypes: float64(8), int64(1), object(5)
memory usage: 33.2+ KB


In [30]:
# dropping categorical variables for which dummies have been created

data_filled.drop(['sex','cp','restecg','slope','thal'], axis=1, inplace=True)

In [31]:
data_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 9 columns):
age         303 non-null float64
trestbps    303 non-null float64
chol        303 non-null float64
fbs         303 non-null float64
thalach     303 non-null float64
exang       303 non-null float64
oldpeak     303 non-null float64
ca          303 non-null float64
target      303 non-null int64
dtypes: float64(8), int64(1)
memory usage: 21.4 KB


In [32]:
# checking for correlation between numerical attributes

data_filled[['age','trestbps','chol','thalach','oldpeak','ca']].corr()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,ca
age,1.0,0.284946,0.20895,-0.393806,0.203805,0.355751
trestbps,0.284946,1.0,0.13012,-0.045351,0.189171,0.099041
chol,0.20895,0.13012,1.0,-0.003432,0.046564,0.11569
thalach,-0.393806,-0.045351,-0.003432,1.0,-0.343085,-0.261638
oldpeak,0.203805,0.189171,0.046564,-0.343085,1.0,0.290651
ca,0.355751,0.099041,0.11569,-0.261638,0.290651,1.0


In [44]:
data_uncorr = data_filled.copy()

In [34]:
# importing StandardScaler class from Scikit-Learn

from sklearn.preprocessing import StandardScaler

In [35]:
# creating an object of the StandardScaler class

std_scaler = StandardScaler()

In [36]:
# standardizing 'age' column

import numpy as np
std_scaler.fit_transform(np.array(data_uncorr['age']).reshape(len(data_uncorr['age']),1))

array([[ 0.94872647],
       [ 1.39200191],
       [ 1.39200191],
       [-1.93256387],
       [-1.48928843],
       [ 0.17299446],
       [ 0.83790761],
       [ 0.28381332],
       [ 0.94872647],
       [-0.15946212],
       [ 0.28381332],
       [ 0.17299446],
       [ 0.17299446],
       [-1.15683185],
       [-0.27028098],
       [ 0.28381332],
       [-0.71355642],
       [-0.04864326],
       [-0.71355642],
       [-0.60273756],
       [ 1.05954533],
       [ 0.39463218],
       [ 0.39463218],
       [ 0.39463218],
       [ 0.61626989],
       [-0.4919187 ],
       [ 0.39463218],
       [ 1.28118305],
       [-1.26765071],
       [-1.60010729],
       [ 1.61363963],
       [ 0.61626989],
       [ 1.05954533],
       [ 0.50545103],
       [-1.15683185],
       [-1.37846957],
       [-1.26765071],
       [ 0.28381332],
       [ 0.0621756 ],
       [ 0.72708875],
       [ 1.17036419],
       [-1.60010729],
       [ 1.83527735],
       [ 0.50545103],
       [ 0.72708875],
       [ 0

In [37]:
data_uncorr.head()

Unnamed: 0,age,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target
0,63.0,145.0,233.0,1.0,150.0,0.0,2.3,0.0,0
1,67.0,160.0,286.0,0.0,108.0,1.0,1.5,3.0,1
2,67.0,120.0,229.0,0.0,129.0,1.0,2.6,2.0,1
3,37.0,130.0,250.0,0.0,187.0,0.0,3.5,0.0,0
4,41.0,130.0,204.0,0.0,172.0,0.0,1.4,0.0,0


In [38]:
# standardizing all numerical columns

data_uncorr[['age','trestbps','chol','thalach','oldpeak','ca']] = std_scaler.fit_transform(data_uncorr[['age','trestbps','chol','thalach','oldpeak','ca']])

In [39]:
data_uncorr.head()

Unnamed: 0,age,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target
0,0.948726,0.757525,-0.2649,1.0,0.017197,0.0,1.087338,-0.727161,0
1,1.392002,1.61122,0.760415,0.0,-1.821905,1.0,0.397182,2.497176,1
2,1.392002,-0.6653,-0.342283,0.0,-0.902354,1.0,1.346147,1.422397,1
3,-1.932564,-0.09617,0.063974,0.0,1.637359,0.0,2.122573,-0.727161,0
4,-1.489288,-0.09617,-0.825922,0.0,0.980537,0.0,0.310912,-0.727161,0


In [40]:
# using inverse_transform function to view the original values from the scaled values

std_scaler.inverse_transform(data_uncorr[['age','trestbps','chol','thalach','oldpeak','ca']].iloc[:5, :])

array([[ 63. , 145. , 233. , 150. ,   2.3,   0. ],
       [ 67. , 160. , 286. , 108. ,   1.5,   3. ],
       [ 67. , 120. , 229. , 129. ,   2.6,   2. ],
       [ 37. , 130. , 250. , 187. ,   3.5,   0. ],
       [ 41. , 130. , 204. , 172. ,   1.4,   0. ]])

In [41]:
# importing Normalizer class from Scikit-Learn

from sklearn.preprocessing import Normalizer

In [42]:
# creating an object of the Normalizer class

norm = Normalizer()

In [45]:
data_uncorr.head()

Unnamed: 0,age,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target
0,63.0,145.0,233.0,1.0,150.0,0.0,2.3,0.0,0
1,67.0,160.0,286.0,0.0,108.0,1.0,1.5,3.0,1
2,67.0,120.0,229.0,0.0,129.0,1.0,2.6,2.0,1
3,37.0,130.0,250.0,0.0,187.0,0.0,3.5,0.0,0
4,41.0,130.0,204.0,0.0,172.0,0.0,1.4,0.0,0


In [46]:
# standardizing all numerical columns

data_uncorr[['age','trestbps','chol','thalach','oldpeak','ca']] = norm.fit_transform(data_uncorr[['age','trestbps','chol','thalach','oldpeak','ca']])

In [47]:
data_uncorr.head()

Unnamed: 0,age,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target
0,0.197466,0.454485,0.73031,1.0,0.470156,0.0,0.007209,0.0,0
1,0.190605,0.455177,0.813629,0.0,0.307245,1.0,0.004267,0.008535,1
2,0.22588,0.404562,0.772039,0.0,0.434904,1.0,0.008766,0.006743,1
3,0.108753,0.382105,0.734817,0.0,0.549643,0.0,0.010287,0.0,0
4,0.136832,0.433857,0.680821,0.0,0.574026,0.0,0.004672,0.0,0


In [48]:
# checking the norm of each obervation (row)

s = 0

for i in range(6):
    s += data_uncorr[['age','trestbps','chol','thalach','oldpeak','ca']].iloc[0, i]**2

s

1.0

In [49]:
# merging numerical and categorical data

data_final = pd.concat([data_uncorr, data_dummies], axis=1)
data_final.head()

Unnamed: 0,age,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target,sex_0.0,...,cp_4.0,restecg_0.0,restecg_1.0,restecg_2.0,slope_1.0,slope_2.0,slope_3.0,thal_3.0,thal_6.0,thal_7.0
0,0.197466,0.454485,0.73031,1.0,0.470156,0.0,0.007209,0.0,0,0,...,0,0,0,1,0,0,1,0,1,0
1,0.190605,0.455177,0.813629,0.0,0.307245,1.0,0.004267,0.008535,1,0,...,1,0,0,1,0,1,0,1,0,0
2,0.22588,0.404562,0.772039,0.0,0.434904,1.0,0.008766,0.006743,1,0,...,1,0,0,1,0,1,0,0,0,1
3,0.108753,0.382105,0.734817,0.0,0.549643,0.0,0.010287,0.0,0,0,...,0,1,0,0,0,0,1,1,0,0
4,0.136832,0.433857,0.680821,0.0,0.574026,0.0,0.004672,0.0,0,1,...,0,0,0,1,1,0,0,1,0,0


In [50]:
data_final.columns

Index(['age', 'trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak', 'ca',
       'target', 'sex_0.0', 'sex_1.0', 'cp_1.0', 'cp_2.0', 'cp_3.0', 'cp_4.0',
       'restecg_0.0', 'restecg_1.0', 'restecg_2.0', 'slope_1.0', 'slope_2.0',
       'slope_3.0', 'thal_3.0', 'thal_6.0', 'thal_7.0'],
      dtype='object')

In [66]:
# exporting processed data as csv

data_final.to_csv('Desktop/Data/heart_processed.csv', index=False)

### Train-Test Split

In [52]:
# splitting input and target variables

X = data_final.drop('target', axis=1)
y = data_final['target']

In [61]:
y.value_counts()

0    164
1    139
Name: target, dtype: int64

In [62]:
# checking target variable distribution

y.value_counts()/y.value_counts().sum()

0    0.541254
1    0.458746
Name: target, dtype: float64

In [53]:
# import train_test_split class from Scikit-Learn

from sklearn.model_selection import train_test_split

In [56]:
# splitting data into train and test sets with 70:30 ratio

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [57]:
print(X_train.shape, y_train.shape)

(212, 23) (212,)


In [58]:
print(X_test.shape, y_test.shape)

(91, 23) (91,)


In [64]:
# checking target variable distribution in train set

y_train.value_counts()/y_train.value_counts().sum()

0    0.537736
1    0.462264
Name: target, dtype: float64

In [65]:
# checking target variable distribution in test set

y_test.value_counts()/y_test.value_counts().sum()

0    0.549451
1    0.450549
Name: target, dtype: float64