# Packages

1. Load required packages!

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Visualisation
import seaborn as sns

# Apply the jupyterthemes "monokai" plotting style to all following figures
# (ticks shown, grid hidden).
# NOTE(review): presumably needed so axis labels stay readable under a dark
# notebook theme - confirm.
from jupyterthemes import jtplot
jtplot.style(theme='monokai', context='notebook', ticks=True, grid=False)

# Data Preparation and Exploratory Data Analysis

## Data Import

We work on the [Heart Disease Dataset](https://archive.ics.uci.edu/ml/datasets/Heart+Disease) from UCI Machine Learning repository.

Only 14 attributes are used:
1. #3 (age)
2. #4 (sex) (1 = male; 0 = female)
3. #9 (cp) (1 = typical angina; 2 = atypical angina; 3 = non-anginal pain; 4 = asymptomatic)
4. #10 (trestbps) resting blood pressure (in mm Hg on admission to the hospital)
5. #12 (chol) serum cholesterol in mg/dl
6. #16 (fbs) fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
7. #19 (restecg) resting electrocardiographic results (0 = normal; 1 = having ST-T wave abnormality; 2 = showing probable or definite left ventricular hypertrophy)
8. #32 (thalach) maximum heart rate achieved
9. #38 (exang) exercise induced angina (1 = yes; 0 = no)
10. #40 (oldpeak) ST depression induced by exercise relative to rest
11. #41 (slope) the slope of the peak exercise ST segment (1 = upsloping; 2 = flat; 3 = downsloping)
12. #44 (ca) number of major vessels (0–3) colored by fluoroscopy
13. #51 (thal) (3 = normal; 6 = fixed defect; 7 = reversible defect)
14. #58 (num) (the predicted attribute) diagnosis of heart disease. It is an integer valued from 0 (no presence) to 4.

2. Import the dataset and store it in the object "heart" (the name the code cells below rely on).

3. Get familiar with the dataset and understand its shape and features.

4. Correct the column names of the dataset.

In [None]:
# Column labels in the order the 14 attributes appear in the dataset;
# the last one ('target') is the diagnosis to be predicted.
actual_column_names = [
    'age', 'sex', 'cp', 'trestbps', 'chol',
    'fbs', 'restecg', 'thalach', 'exang', 'oldpeak',
    'slope', 'ca', 'thal',
    'target',
]
# code here

In [None]:
# Show the column labels of `heart` to check the renaming step.
heart.columns

## Feature Types

5. Encode the non-numeric features as "category".

In [None]:
# Features that hold coded labels rather than measured quantities.
for column in ('sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal'):
    # Re-type the column in place as a pandas categorical.
    heart[column] = heart[column].astype('category')

## One-Hot-Encoding

6. Perform one-hot encoding on non-numeric features.

In [None]:
# Replace each categorical feature with one indicator column per level;
# the remaining columns are carried over unchanged.
heart_one_hot = pd.get_dummies(
    data=heart,
    columns=['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal'],
)

In [None]:
# Display summary statistics of the one-hot encoded frame.
heart_one_hot.describe()

In [None]:
# Pairwise correlation matrix of the encoded columns (pandas default method);
# used as input for the heatmap.
corr = heart_one_hot.corr()

In [None]:
# Boolean mask covering the upper triangle (main diagonal included) so that
# every feature pair is drawn only once in the heatmap.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Custom diverging colormap, so negative and positive correlations get
# distinct hues.
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Fix the colour scale to the full correlation range [-1, 1], centred on 0.
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0,
            linewidths=.5, cbar_kws={"shrink": .5})

## Train / Test Split

7. separate dependent and independent features

In [None]:
# code here

8. create train and test datasets

In [None]:
# code here

# Modeling

9. Build the random forest using sklearn: scale the data and create predictions.

In [None]:
# code here

## Model Evaluation

## Baseline Classifier

10. Calculate the baseline classifier accuracy

In [None]:
# code here

In [None]:
# code here

Our baseline classifier has an accuracy of 45 %.

## Random Forest Classifier

11. create a confusion matrix

In [None]:
# code here

12. calculate the accuracy

In [None]:
# code here

Our classifier has an accuracy of 49 %.