In [1]:
import pandas as pd
import numpy as np

In [2]:


# """
# % ======================================================================
# % John Gennari
# % 3/13/90
# %
# % This is Dr. Detrano's database modified to be a real MIXED dataset.
# %
# % Attributes: 8 symbolic, 6 numeric.
# %  Age; sex; chest pain type (angina, abnang, notang, asympt)
# %  Trestbps (resting blood pres); cholesteral; fasting blood sugar < 120
# %  (true or false); resting ecg (norm, abn, hyper); max heart rate; 
# %  exercise induced angina (true or false); oldpeak; slope (up, flat, down)
# %  number of vessels colored (???); thal (norm, fixed, rever). Finally, the
# %  class is either healthy (buff) or with heart-disease (sick).
# %
# % Original atts: 
# %   age; sex (1,0); cp (1-4); trestbps; chol; fbs (1,0); restecg (0,1,2); 
# %   thalach; exang (1,0); oldpeak; slope (1,2,3); ca; thal (3,6,7);
# %   class att: 0 is healthy, 1,2,3,4 is sick.
# % ======================================================================
# """

## Objective - Clean the Data...

In [3]:
# Load the dataset. The column labels are supplied explicitly through
# `names`, so the first line of the file is read as data, not as a header.
names = [
    "age",
    "sex",
    "chest_pain_type",
    "trestbps",
    "cholesteral",
    "fasting_blood_sugar",
    "resting_ecg",
    "max_heart_rate",
    "exercise_induced_angina",
    "oldpeak",
    "slope",
    "vessels_colored",
    "thal",
    "health",
    "X",
]
df = pd.read_csv("hypertension-dataset.csv", names=names)


In [4]:
# Preview the first five rows of the freshly loaded frame
df.head(n=5)

Unnamed: 0,age,sex,chest_pain_type,trestbps,cholesteral,fasting_blood_sugar,resting_ecg,max_heart_rate,exercise_induced_angina,oldpeak,slope,vessels_colored,thal,health,X
0,63.0,male,angina,145.0,233.0,true,hyp,150.0,fal,2.3,down,0.0,fix,buff,H
1,67.0,male,asympt,160.0,286.0,fal,hyp,108.0,true,1.5,flat,3.0,norm,sick,S2
2,67.0,male,asympt,120.0,229.0,fal,hyp,129.0,true,2.6,flat,2.0,rev,sick,S1
3,37.0,male,notang,130.0,250.0,fal,norm,187.0,fal,3.5,down,0.0,norm,buff,H
4,41.0,fem,abnang,130.0,204.0,fal,hyp,172.0,fal,1.4,up,0.0,norm,buff,H


In [5]:
# Drop the trailing "X" column.
# Rebinding `df` (instead of mutating in place) together with errors="ignore"
# keeps this cell safe to re-run: a plain in-place drop raises KeyError on a
# second execution because "X" is already gone.
df = df.drop(columns=["X"], errors="ignore")

In [6]:
# Report the frame's dimensions: column count first, then row count
print("Number of columns>", len(df.columns))
print("Number of rows>", len(df))

Number of columns> 14
Number of rows> 303


In [7]:
# Check if there are null values.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print(): doing so emits the label *after* the report
# followed by a stray "None".
print("Check non null infos>")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      303 non-null    float64
 1   sex                      303 non-null    object 
 2   chest_pain_type          303 non-null    object 
 3   trestbps                 303 non-null    float64
 4   cholesteral              303 non-null    float64
 5   fasting_blood_sugar      303 non-null    object 
 6   resting_ecg              303 non-null    object 
 7   max_heart_rate           303 non-null    float64
 8   exercise_induced_angina  303 non-null    object 
 9   oldpeak                  303 non-null    float64
 10  slope                    303 non-null    object 
 11  vessels_colored          303 non-null    object 
 12  thal                     303 non-null    object 
 13  health                   303 non-null    object 
dtypes: float64(5), object(9)
m

In [8]:
# info() reported 303 non-null entries for every column;
# next, list the dtype pandas inferred for each column

print(f"Check non null infos>\n {df.dtypes}")

Check non null infos>
 age                        float64
sex                         object
chest_pain_type             object
trestbps                   float64
cholesteral                float64
fasting_blood_sugar         object
resting_ecg                 object
max_heart_rate             float64
exercise_induced_angina     object
oldpeak                    float64
slope                       object
vessels_colored             object
thal                        object
health                      object
dtype: object


In [9]:
# Some vessels_colored entries hold the placeholder "?", which pd.to_numeric
# cannot parse; that is why the column was read with object dtype.
# "?" stands for a missing value, and the frame contains no NaN yet, so every
# "?" in the whole frame (not only in that column) is replaced with NaN.

df.replace(to_replace="?", value=np.nan, inplace=True)

In [10]:
# Column whose dtype needs to be changed from object to numeric:
# vessels_colored

df["vessels_colored"] = pd.to_numeric(df["vessels_colored"])

In [11]:
# List the columns that contain at least one missing value
print(
    "Check columns with empty values>\n",
    df.columns[df.isna().any().to_numpy()],
)

Check columns with empty values>
 Index(['vessels_colored', 'thal'], dtype='object')


In [12]:
# Show the rows where vessels_colored is missing
df.loc[df["vessels_colored"].isna()]

Unnamed: 0,age,sex,chest_pain_type,trestbps,cholesteral,fasting_blood_sugar,resting_ecg,max_heart_rate,exercise_induced_angina,oldpeak,slope,vessels_colored,thal,health
143,52.0,male,notang,138.0,223.0,fal,norm,169.0,fal,0.0,up,,norm,buff
157,38.0,male,notang,138.0,175.0,fal,norm,173.0,fal,0.0,up,,norm,buff
165,43.0,male,asympt,132.0,247.0,true,hyp,143.0,true,0.1,flat,,rev,sick
250,58.0,male,abnang,125.0,220.0,fal,norm,144.0,fal,0.4,flat,,rev,buff
262,38.0,male,notang,138.0,175.0,fal,norm,173.0,fal,0.0,up,,norm,buff


In [13]:
# Show the rows where thal is missing
df.loc[df["thal"].isna()]

Unnamed: 0,age,sex,chest_pain_type,trestbps,cholesteral,fasting_blood_sugar,resting_ecg,max_heart_rate,exercise_induced_angina,oldpeak,slope,vessels_colored,thal,health
82,53.0,fem,notang,128.0,216.0,fal,hyp,115.0,fal,0.0,up,0.0,,buff
198,52.0,male,asympt,128.0,204.0,true,norm,156.0,true,1.0,flat,0.0,,sick


In [14]:
# Only 7 rows contain NaN values, so enough data remains after dropping them
df.dropna(axis="index", how="any", inplace=True)

In [15]:
# Make sure vessels_colored is numeric (this is a no-op when the column has
# already been converted)
df["vessels_colored"] = pd.to_numeric(df["vessels_colored"])
# Then confirm through the dtype listing
df.dtypes

age                        float64
sex                         object
chest_pain_type             object
trestbps                   float64
cholesteral                float64
fasting_blood_sugar         object
resting_ecg                 object
max_heart_rate             float64
exercise_induced_angina     object
oldpeak                    float64
slope                       object
vessels_colored            float64
thal                        object
health                      object
dtype: object

In [16]:
# Next, encode the categorical columns as integers:
# sex, chest_pain_type, fasting_blood_sugar, resting_ecg,
# exercise_induced_angina, slope, thal, health

# source column -> (encoded column, {text label: integer code})
encodings = {
    "sex": ("_sex", {"male": 1, "fem": 0}),
    "chest_pain_type": ("_cpt", {"angina": 1, "abnang": 2, "notang": 3, "asympt": 4}),
    "fasting_blood_sugar": ("_fbs", {"true": 1, "fal": 0}),
    "resting_ecg": ("_r_ecg", {"norm": 1, "abn": 2, "hyp": 3}),
    "exercise_induced_angina": ("_eig", {"true": 1, "fal": 0}),
    "slope": ("_slope", {"up": 1, "flat": 2, "down": 3}),
    "thal": ("_thal", {"norm": 1, "fix": 2, "rev": 3}),
    "health": ("_health", {"buff": 1, "sick": 0}),
}

for source_col, (encoded_col, codes) in encodings.items():
    # Series.map() converts any label that is absent from the dict into NaN
    # without complaint, so unexpected labels are rejected up front instead of
    # silently corrupting the encoded column. Existing NaN are left alone.
    unknown = set(df[source_col].dropna().unique()) - set(codes)
    if unknown:
        raise ValueError(
            f"Unexpected label(s) in column {source_col!r}: "
            f"{sorted(unknown, key=str)}"
        )
    df[encoded_col] = df[source_col].map(codes)



In [17]:
# Remove the original text columns now that their integer-encoded
# counterparts exist.
# Rebinding `df` together with errors="ignore" keeps this cell safe to
# re-run: a plain in-place drop raises KeyError on a second execution because
# the columns are already gone.
df = df.drop(
    columns=[
        "sex",
        "chest_pain_type",
        "fasting_blood_sugar",
        "resting_ecg",
        "exercise_induced_angina",
        "slope",
        "thal",
        "health",
    ],
    errors="ignore",
)

In [18]:
# Preview the first five rows of the cleaned, integer-encoded frame
df.head(n=5)

Unnamed: 0,age,trestbps,cholesteral,max_heart_rate,oldpeak,vessels_colored,_sex,_cpt,_fbs,_r_ecg,_eig,_slope,_thal,_health
0,63.0,145.0,233.0,150.0,2.3,0.0,1,1,1,3,0,3,2,1
1,67.0,160.0,286.0,108.0,1.5,3.0,1,4,0,3,1,2,1,0
2,67.0,120.0,229.0,129.0,2.6,2.0,1,4,0,3,1,2,3,0
3,37.0,130.0,250.0,187.0,3.5,0.0,1,3,0,1,0,3,1,1
4,41.0,130.0,204.0,172.0,1.4,0.0,0,2,0,3,0,1,1,1


### Our data has been cleaned and is now ready for visualization, feature selection, and other processes