## Importing Dependancies

In [1]:
import pandas as pd 
import numpy as np
import altair as alt

## Loading Data

In [2]:
cleveland = pd.read_csv("./data/processed.cleveland.data.txt", header=None)
hungarian = pd.read_csv("./data/processed.hungarian.data.txt", header=None)
switzerland = pd.read_csv("./data/processed.switzerland.data.txt", header=None)
va = pd.read_csv("./data/processed.va.data.txt", header=None)
data = pd.concat([cleveland, va, switzerland, hungarian])

In [3]:
# Assigning Column names to daataframe
data.columns = [
    'age', 'sex', 'chest_pain', 'rest_bp', 'cholesterol', 'fasting_bs',
    'rest_ecg', 'max_heart_rate', 'exercise_angina', 'st_depression', 'slope',
    'fluoroscopy', 'defect', 'diagnosis'
]

# Mutating the columns to be of type int
for column in list(data.columns):
    if column in [
            "age", "rest_bp", "cholesterol", "max_heart_rate", "st_depression"
    ]:
        data[column] = pd.to_numeric(data[column], errors="coerce")
    else:
        data[column] = pd.to_numeric(data[column],
                                     errors="coerce").astype("category")
data.head()

Unnamed: 0,age,sex,chest_pain,rest_bp,cholesterol,fasting_bs,rest_ecg,max_heart_rate,exercise_angina,st_depression,slope,fluoroscopy,defect,diagnosis
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [4]:
data.describe()

Unnamed: 0,age,rest_bp,cholesterol,max_heart_rate,st_depression
count,920.0,861.0,890.0,865.0,858.0
mean,53.51087,132.132404,199.130337,137.545665,0.878788
std,9.424685,19.06607,110.78081,25.926276,1.091226
min,28.0,0.0,0.0,60.0,-2.6
25%,47.0,120.0,175.0,120.0,0.0
50%,54.0,130.0,223.0,140.0,0.5
75%,60.0,140.0,268.0,157.0,1.5
max,77.0,200.0,603.0,202.0,6.2


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 920 entries, 0 to 293
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   age              920 non-null    float64 
 1   sex              920 non-null    category
 2   chest_pain       920 non-null    category
 3   rest_bp          861 non-null    float64 
 4   cholesterol      890 non-null    float64 
 5   fasting_bs       830 non-null    category
 6   rest_ecg         918 non-null    category
 7   max_heart_rate   865 non-null    float64 
 8   exercise_angina  865 non-null    category
 9   st_depression    858 non-null    float64 
 10  slope            611 non-null    category
 11  fluoroscopy      309 non-null    category
 12  defect           434 non-null    category
 13  diagnosis        920 non-null    category
dtypes: category(9), float64(5)
memory usage: 52.6 KB


In [6]:
df = {}
for column in data.columns:
    df[column] = data[column].isnull().mean() * 100
pd.DataFrame.from_dict(
    df, orient="index").rename(columns={0: "Percentage of Missing Values"})

Unnamed: 0,Percentage of Missing Values
age,0.0
sex,0.0
chest_pain,0.0
rest_bp,6.413043
cholesterol,3.26087
fasting_bs,9.782609
rest_ecg,0.217391
max_heart_rate,5.978261
exercise_angina,5.978261
st_depression,6.73913


From the table above, we see that:
- `Flouroscopy` has 66% of its values missing
- `defect` has 52.8% of its values missing
- `Slope` has 33.5% of its values missing

Due to the large amounts of missing values in these columns, we decided to drop them

Since `rest_ecg` only has two missing values, We decided to 

In [7]:
final_df = data.copy().drop(["fluoroscopy", "defect", "slope"], axis=1)
final_df['rest_ecg'].fillna(final_df['rest_ecg'].mode().iloc[0], inplace=True)
final_df.head()

Unnamed: 0,age,sex,chest_pain,rest_bp,cholesterol,fasting_bs,rest_ecg,max_heart_rate,exercise_angina,st_depression,diagnosis
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,0


In [8]:
heart = final_df[final_df['rest_bp'].notnull()
                 & final_df['cholesterol'].notnull()
                 & final_df['fasting_bs'].notnull()
                 & final_df['max_heart_rate'].notnull()
                 & final_df['exercise_angina'].notnull()
                 & final_df['st_depression'].notnull()]
heart.head()

Unnamed: 0,age,sex,chest_pain,rest_bp,cholesterol,fasting_bs,rest_ecg,max_heart_rate,exercise_angina,st_depression,diagnosis
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,0


Checking if the final dataframe `heart` has any `NaN entries`

In [9]:
heart.isnull().any().reset_index().rename(columns={
    0: "has NaN entries?",
    "index": "Column"
})

Unnamed: 0,Column,has NaN entries?
0,age,False
1,sex,False
2,chest_pain,False
3,rest_bp,False
4,cholesterol,False
5,fasting_bs,False
6,rest_ecg,False
7,max_heart_rate,False
8,exercise_angina,False
9,st_depression,False


In [10]:
len(heart)/len(data)

0.8054347826086956

In [None]:
##