In [1]:
import pandas as pd


def load_data(url, col_names, na_values='?'):
    """Read a header-less CSV into a DataFrame with explicit column names.

    Args:
        url: Path, URL, or file-like object accepted by ``pd.read_csv``.
        col_names: Column names to assign, in the order the fields appear.
        na_values: Extra marker(s) to parse as missing values, in addition to
            the markers pandas recognises by default. Defaults to ``'?'``.

    Returns:
        pandas.DataFrame: One row per input line; the first line is treated
        as data, not as a header.
    """
    return pd.read_csv(url, header=None, names=col_names, na_values=na_values)


# Source file and the column names to assign to it (the file is read with
# header=None, so the names are supplied here).
DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
COLUMN_NAMES = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target',
]

df = load_data(DATA_URL, COLUMN_NAMES)

In [2]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [3]:
# Print per-column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        299 non-null    float64
 12  thal      301 non-null    float64
 13  target    303 non-null    int64  
dtypes: float64(13), int64(1)
memory usage: 33.3 KB


In [4]:
# Frequency of each distinct value in the 'target' column.
df['target'].value_counts()


target
0    164
1     55
2     36
3     35
4     13
Name: count, dtype: int64

In [5]:
# Collapse 'target' into a binary label: 0 stays 0, any value above 0 becomes 1.
# A vectorized comparison replaces the per-element Python lambda; the explicit
# int64 cast keeps the column's integer dtype.
df['target'] = (df['target'] > 0).astype('int64')

In [6]:
# Missing-value count per column, keeping only columns that have at least one.
df.isna().sum().loc[lambda counts: counts > 0]

ca      4
thal    2
dtype: int64

In [7]:
# Display every row that is missing a value in 'ca' or 'thal'.
rows_with_gaps = df[['ca', 'thal']].isna().any(axis=1)
df.loc[rows_with_gaps]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
87,53.0,0.0,3.0,128.0,216.0,0.0,2.0,115.0,0.0,0.0,1.0,0.0,,0
166,52.0,1.0,3.0,138.0,223.0,0.0,0.0,169.0,0.0,0.0,1.0,,3.0,0
192,43.0,1.0,4.0,132.0,247.0,1.0,2.0,143.0,1.0,0.1,2.0,,7.0,1
266,52.0,1.0,4.0,128.0,204.0,1.0,0.0,156.0,1.0,1.0,2.0,0.0,,1
287,58.0,1.0,2.0,125.0,220.0,0.0,0.0,144.0,0.0,0.4,2.0,,7.0,0
302,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,,3.0,0


In [8]:
# Fraction of rows that are missing a value in 'ca' or 'thal'.
incomplete_rows = df[['ca', 'thal']].isna().any(axis=1)
int(incomplete_rows.sum()) / len(df)

0.019801980198019802

Only about 2% of the rows (6 of 303) have a missing value in the 'ca' or 'thal' column. It is therefore reasonable to drop these rows without significantly impacting the dataset.


In [9]:
# Remove, in place, every row that contains a missing value, then show the
# resulting (rows, columns) shape.
df.dropna(axis='index', how='any', inplace=True)
df.shape

(297, 14)

In [10]:
# Count the distinct values in each column.
df.nunique()

age          41
sex           2
cp            4
trestbps     50
chol        152
fbs           2
restecg       3
thalach      91
exang         2
oldpeak      40
slope         3
ca            4
thal          3
target        2
dtype: int64

In [11]:
# Cast the listed columns to int first, then to pandas' category dtype.
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
df[categorical_cols] = df[categorical_cols].astype(int).astype('category')

In [12]:
# One-hot encode the multi-level categorical columns. With `columns=`,
# get_dummies removes the source columns and appends the indicator columns
# itself, so no separate concat/drop step is needed. drop_first=True omits the
# indicator for the first level of each encoded column.
df = pd.get_dummies(
    df,
    columns=["cp", "restecg", "slope", "thal"],
    drop_first=True,
)

In [13]:
# Preview the first five rows after the one-hot encoding step.
df.head(n=5)

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target,cp_2,cp_3,cp_4,restecg_1,restecg_2,slope_2,slope_3,thal_6,thal_7
0,63.0,1,145.0,233.0,1,150.0,0,2.3,0,0,False,False,False,False,True,False,True,True,False
1,67.0,1,160.0,286.0,0,108.0,1,1.5,3,1,False,False,True,False,True,True,False,False,False
2,67.0,1,120.0,229.0,0,129.0,1,2.6,2,1,False,False,True,False,True,True,False,False,True
3,37.0,1,130.0,250.0,0,187.0,0,3.5,0,0,False,True,False,False,False,False,True,False,False
4,41.0,0,130.0,204.0,0,172.0,0,1.4,0,0,True,False,False,False,True,False,False,False,False


In [14]:
# Print per-column dtypes and non-null counts after the encoding step.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 297 entries, 0 to 301
Data columns (total 19 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   age        297 non-null    float64 
 1   sex        297 non-null    category
 2   trestbps   297 non-null    float64 
 3   chol       297 non-null    float64 
 4   fbs        297 non-null    category
 5   thalach    297 non-null    float64 
 6   exang      297 non-null    category
 7   oldpeak    297 non-null    float64 
 8   ca         297 non-null    category
 9   target     297 non-null    int64   
 10  cp_2       297 non-null    bool    
 11  cp_3       297 non-null    bool    
 12  cp_4       297 non-null    bool    
 13  restecg_1  297 non-null    bool    
 14  restecg_2  297 non-null    bool    
 15  slope_2    297 non-null    bool    
 16  slope_3    297 non-null    bool    
 17  thal_6     297 non-null    bool    
 18  thal_7     297 non-null    bool    
dtypes: bool(9), category(4), float64(5