## Importing Dependancies

In [30]:
import pandas as pd 
import numpy as np
import altair as alt

## Loading Data
To begin with, we will load in the datasets to be analysed.

In [204]:
'''Reading Cleveland Data'''
cleveland = pd.read_csv("./data/processed.cleveland.data.txt", header=None)

'''Reading Hungarian Data'''
hungarian = pd.read_csv("./data/processed.hungarian.data.txt", header=None)

'''Reading Switzerland Data'''
switzerland = pd.read_csv("./data/processed.switzerland.data.txt", header=None)

'''Reading Va Data'''
va = pd.read_csv("./data/processed.va.data.txt", header=None)

'''Concatenating the data together'''
data = pd.concat([cleveland, va, switzerland, hungarian])

In [205]:
# Assigning Column names to daataframe
data.columns = [
    'age', 'sex', 'chest_pain', 'rest_bp', 'cholesterol', 'fasting_bs',
    'rest_ecg', 'max_heart_rate', 'exercise_angina', 'st_depression', 'slope',
    'fluoroscopy', 'defect', 'diagnosis'
]

# Mutating the columns to be of type integer
for column in list(data.columns):
    if column in ["age", "rest_bp", "cholesterol", "max_heart_rate", "st_depression"]:
        data[column] = pd.to_numeric(data[column], errors="coerce")
    else:
        data[column] = pd.to_numeric(data[column],
                                     errors="coerce").astype("category")
data.head()

Unnamed: 0,age,sex,chest_pain,rest_bp,cholesterol,fasting_bs,rest_ecg,max_heart_rate,exercise_angina,st_depression,slope,fluoroscopy,defect,diagnosis
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [206]:
data.describe()

Unnamed: 0,age,rest_bp,cholesterol,max_heart_rate,st_depression
count,920.0,861.0,890.0,865.0,858.0
mean,53.51087,132.132404,199.130337,137.545665,0.878788
std,9.424685,19.06607,110.78081,25.926276,1.091226
min,28.0,0.0,0.0,60.0,-2.6
25%,47.0,120.0,175.0,120.0,0.0
50%,54.0,130.0,223.0,140.0,0.5
75%,60.0,140.0,268.0,157.0,1.5
max,77.0,200.0,603.0,202.0,6.2


In [207]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 920 entries, 0 to 293
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   age              920 non-null    float64 
 1   sex              920 non-null    category
 2   chest_pain       920 non-null    category
 3   rest_bp          861 non-null    float64 
 4   cholesterol      890 non-null    float64 
 5   fasting_bs       830 non-null    category
 6   rest_ecg         918 non-null    category
 7   max_heart_rate   865 non-null    float64 
 8   exercise_angina  865 non-null    category
 9   st_depression    858 non-null    float64 
 10  slope            611 non-null    category
 11  fluoroscopy      309 non-null    category
 12  defect           434 non-null    category
 13  diagnosis        920 non-null    category
dtypes: category(9), float64(5)
memory usage: 52.6 KB


In [208]:
df = {}
for column in data.columns:
    df[column] = data[column].isnull().mean() * 100

pd.DataFrame.from_dict(
    df, orient="index").rename(columns={
        0: "Percentage of Missing Values"
    }).sort_values(
        by="Percentage of Missing Values",
        ascending=False).style.background_gradient(axis=0).set_caption(
            "A Table Showing The Proportions of missing Values in columns")

Unnamed: 0,Percentage of Missing Values
fluoroscopy,66.413043
defect,52.826087
slope,33.586957
fasting_bs,9.782609
st_depression,6.73913
rest_bp,6.413043
max_heart_rate,5.978261
exercise_angina,5.978261
cholesterol,3.26087
rest_ecg,0.217391


From the table above, we see that:
- `Flouroscopy` has 66% of its values missing
- `defect` has 52.8% of its values missing
- `Slope` has 33.5% of its values missing

Due to the large amounts of missing values in these columns, we decided to drop them

Since `rest_ecg` only has two missing values, We will replace them with the mode. This will not make a substantial impact on the outcome of out analysis.

In [209]:
'''Removing the columns with many missing values'''
final_df = data.copy().drop(["fluoroscopy", "defect", "slope"], axis=1)

'''Replacing the NaN value in the rest_ecg column with the mode of the colums'''
final_df['rest_ecg'].fillna(final_df['rest_ecg'].mode().iloc[0], inplace=True)

'''Looking at the head of the dataframe'''
final_df.head()

Unnamed: 0,age,sex,chest_pain,rest_bp,cholesterol,fasting_bs,rest_ecg,max_heart_rate,exercise_angina,st_depression,diagnosis
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,0


Removed dealt with the three rows, we will just remove all rows with NaN values for our final dataset

In [210]:
heart = final_df.copy().dropna()
heart.head()

Unnamed: 0,age,sex,chest_pain,rest_bp,cholesterol,fasting_bs,rest_ecg,max_heart_rate,exercise_angina,st_depression,diagnosis
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,0


Checking if the final dataframe `heart_data` has any `NaN entries`

In [211]:
heart.isnull().any().reset_index().rename(columns={
    0: "has NaN entries?",
    "index": "Column"
}).style.set_caption(
    "In the table below we see that all columns have no missing values")

Unnamed: 0,Column,has NaN entries?
0,age,False
1,sex,False
2,chest_pain,False
3,rest_bp,False
4,cholesterol,False
5,fasting_bs,False
6,rest_ecg,False
7,max_heart_rate,False
8,exercise_angina,False
9,st_depression,False


## Analysing Correlations and Trends

In [212]:
heart.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 741 entries, 0 to 293
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   age              741 non-null    float64 
 1   sex              741 non-null    category
 2   chest_pain       741 non-null    category
 3   rest_bp          741 non-null    float64 
 4   cholesterol      741 non-null    float64 
 5   fasting_bs       741 non-null    category
 6   rest_ecg         741 non-null    category
 7   max_heart_rate   741 non-null    float64 
 8   exercise_angina  741 non-null    category
 9   st_depression    741 non-null    float64 
 10  diagnosis        741 non-null    category
dtypes: category(6), float64(5)
memory usage: 40.0 KB


In [213]:
bar = alt.Chart(heart).mark_bar().encode(
    alt.X("age", bin=alt.Bin(step=3)), alt.Y("count()"),
    color="count()").properties(title="Distribution of Age")
bar

From the table above we see that `age` in the dataset follows a normal distribution

In [214]:
heart.columns

Index(['age', 'sex', 'chest_pain', 'rest_bp', 'cholesterol', 'fasting_bs',
       'rest_ecg', 'max_heart_rate', 'exercise_angina', 'st_depression',
       'diagnosis'],
      dtype='object')

In [215]:
from altair import X, Y, Chart, Color

In [216]:
alt.Chart(heart).mark_bar().encode(
    X("sex"), Y("count()"),
    color="sex").properties(title="Number of genders in the dataset").interactive()

This graph shows that there is a class imbalance. class `0` is so underrepresented

In [217]:
Chart(heart).mark_bar().encode(
    X("diagnosis"), Y("count()"), color="diagnosis",
    tooltip="count()").properties(title="Number of samples per diagnosis")

The graph above shows that diagnosis 4 is underrepresented. This is a form of sampling bias and it might affect the performance of our model. 

In [221]:
cholesterol_vs_angina = Chart(heart).mark_boxplot().encode(
    Y("cholesterol:Q"), X("exercise_angina:O"),
    color="exercise_angina").properties(
        title="Excercise Angina vs Cholesterol Levels", width=200, height=300)

cholesterol_vs_rest = Chart(heart).mark_boxplot().encode(
    Y("cholesterol:Q"), X("rest_ecg:O"),
    color="rest_ecg").properties(title="Cholesterol Levels Vs rest ecg",
                                 width=200,
                                 height=300)

cholesterol_vs_fasting = Chart(heart).mark_boxplot().encode(
    Y("cholesterol:Q"), X("fasting_bs:O"),
    color="fasting_bs").properties(title="Cholesterol Levels Vs rest ecg",
                                 width=200,
                                 height=300)

cholesterol_vs_angina | cholesterol_vs_rest|cholesterol_vs_fasting

In [223]:
heart_rate_vs_angina = Chart(heart).mark_boxplot().encode(
    Y("max_heart_rate:Q"), X("exercise_angina:O"),
    color="exercise_angina").properties(
        title="Excercise Angina vs Maximum Heart Rate", width=190, height=300)

heart_rate_vs_rest = Chart(heart).mark_boxplot().encode(
    Y("max_heart_rate:Q"), X("rest_ecg:O"),
    color="rest_ecg").properties(title="Max Heart Rate Vs rest ecg",
                                 width=190,
                                 height=300)

heart_rate_vs_fasting = Chart(heart).mark_boxplot().encode(
    Y("max_heart_rate:Q"), X("fasting_bs:O"),
    color="fasting_bs").properties(title="Max Heart Rate Levels Vs rest ecg",
                                   width=190,
                                   height=300)
heart_rate_vs_angina | heart_rate_vs_rest | heart_rate_vs_fasting

In [224]:
Chart(heart).mark_point().encode(
    X("age", scale=alt.Scale(zero=False)),
    Y("max_heart_rate"),
    color="max_heart_rate"
).properties(
    title= "Maximum Heart Rate vs Age"
)

## Machine learning models
- ####  Obtaining Dummies from categorical variables

In [226]:
'''Obtaining excercise angina dummy variables'''
angina_dummies = pd.get_dummies(heart["exercise_angina"],
                                prefix="exercise_angina")

'''Obtaining Rest ECG dummy variables'''
rest_dummies = pd.get_dummies(heart["rest_ecg"], prefix="rest_ecg")

'''Obtaining fasting_bs dummy variables'''
fasting_dummies = pd.get_dummies(heart["fasting_bs"], prefix="fasting")

'''Obtaining chest pain dummy variables'''
chest_pain_dummies = pd.get_dummies(heart["chest_pain"], prefix="chest_pain")

'''Obtaining sex dummy variables'''
sex_dummies = pd.get_dummies(heart["sex"], prefix="sex")

'''Concatenating the dummy columns to the dataset'''
heart_df = pd.concat([heart, angina_dummies, 
                      rest_dummies, fasting_dummies, 
                      chest_pain_dummies,sex_dummies], axis=1)

'''dropping the non_dummy categorical variables'''
heart = heart_df.drop(["sex", "chest_pain", "fasting_bs", "rest_ecg", "exercise_angina"], axis=1)
heart.head()

Unnamed: 0,age,rest_bp,cholesterol,max_heart_rate,st_depression,diagnosis,exercise_angina_0.0,exercise_angina_1.0,rest_ecg_0.0,rest_ecg_1.0,rest_ecg_2.0,fasting_0.0,fasting_1.0,chest_pain_1.0,chest_pain_2.0,chest_pain_3.0,chest_pain_4.0,sex_0.0,sex_1.0
0,63.0,145.0,233.0,150.0,2.3,0,1,0,0,0,1,0,1,1,0,0,0,0,1
1,67.0,160.0,286.0,108.0,1.5,2,0,1,0,0,1,1,0,0,0,0,1,0,1
2,67.0,120.0,229.0,129.0,2.6,1,0,1,0,0,1,1,0,0,0,0,1,0,1
3,37.0,130.0,250.0,187.0,3.5,0,1,0,1,0,0,1,0,0,0,1,0,0,1
4,41.0,130.0,204.0,172.0,1.4,0,1,0,0,0,1,1,0,0,1,0,0,1,0


- #### Spliting data into Train and Test Sets

In [288]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

predictors = heart.drop("diagnosis", axis=1)
y = heart["diagnosis"]

X_train, X_test, y_train, y_test = train_test_split(predictors,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y, random_state=42)
scaler = StandardScaler().fit(X_train)
scaler.transform(X_train)

StandardScaler()

In [290]:
#scaler.mean_

In [286]:
X_train

Unnamed: 0,age,rest_bp,cholesterol,max_heart_rate,st_depression,exercise_angina_0.0,exercise_angina_1.0,rest_ecg_0.0,rest_ecg_1.0,rest_ecg_2.0,fasting_0.0,fasting_1.0,chest_pain_1.0,chest_pain_2.0,chest_pain_3.0,chest_pain_4.0,sex_0.0,sex_1.0
253,44.0,150.0,288.0,150.0,3.0,0,1,1,0,0,1,0,0,1,0,0,0,1
105,61.0,120.0,282.0,135.0,4.0,0,1,0,1,0,1,0,0,0,0,1,0,1
270,61.0,140.0,207.0,138.0,1.9,0,1,0,0,1,1,0,0,0,0,1,0,1
106,49.0,140.0,187.0,172.0,0.0,1,0,1,0,0,1,0,0,0,1,0,0,1
140,59.0,140.0,221.0,164.0,0.0,0,1,1,0,0,1,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,54.0,120.0,273.0,150.0,1.5,1,0,1,0,0,1,0,0,1,0,0,1,0
55,56.0,120.0,0.0,97.0,0.0,1,0,1,0,0,1,0,0,0,1,0,0,1
121,51.0,130.0,224.0,150.0,0.0,1,0,1,0,0,1,0,0,1,0,0,0,1
193,62.0,138.0,294.0,106.0,1.9,1,0,1,0,0,0,1,0,0,0,1,1,0


In [282]:
len(scaler.transform(predictors)[0])

18

In [271]:
X_train.cholesterol.mean()

219.75

In [298]:
 """
    
    """

'\n   "svm": {\n       "model": svm.SVC(gamma="auto"),\n       "params": {\n           "C": list(range(21)),\n           "kernel": ["rbf", "linear"]\n       }\n   },\n   '

In [309]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

models = {
    "svm": {
        "model": svm.SVC(gamma="auto", random_state=42),
        "params": {
            "C": list(range(21)),
            "kernel": ["rbf", "linear"]
        }
    },
   
    "Random Forest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": list(range(1,20)),
            "max_depth": list(range(1,20))
        }
    },
    "Logistic Regression": {
        "model": LogisticRegression(solver="liblinear", multi_class="auto", random_state=42),
        "params": {
            "C": list(range(1,20))
        }
    }
}
#models

In [310]:
scores = []
for model_name, model_parameter in models.items():
    clf = GridSearchCV(model_parameter["model"],
                       model_parameter["params"],
                       cv=5,
                       return_train_score=False)
    clf.fit(StandardScaler().fit_transform(X_train), y_train)
    scores.append({
        "model": model_name,
        "best_score": clf.best_score_,
        "best_params": clf.best_params_
    })

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/svm/_base.py", line 226, in fit
    fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/svm/_base.py", line 277, in _dense_fit
    self._probB, self.fit_status_ = libsvm.fit(
  File "sklearn/svm/_libsvm.pyx", line 192, in sklearn.svm._libsvm.fit
ValueError: C <= 0

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/svm/_base.py", line 226, in fit
    fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
  File "/opt/anaconda3/lib/python3.

In [311]:
scores

[{'model': 'svm',
  'best_score': 0.5845463609172482,
  'best_params': {'C': 1, 'kernel': 'rbf'}},
 {'model': 'Random Forest',
  'best_score': 0.6048283720267768,
  'best_params': {'max_depth': 6, 'n_estimators': 17}},
 {'model': 'Logistic Regression',
  'best_score': 0.5727531690642358,
  'best_params': {'C': 1}}]

In [255]:
for model_name, model_parameter in models.items():
    print(model_parameter)

{'model': SVC(gamma='auto'), 'params': {'C': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 'kernel': ['rbf', 'linear']}}
{'model': RandomForestClassifier(), 'params': {'n_estimators': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], 'max_depth': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]}}
{'model': LogisticRegression(solver='liblinear'), 'params': {'C': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]}}


In [256]:
for model_name, model_parameter in models.items():
    print(model_name)

svm
Random Forest
Logistic Regression
