##### Lab Objectives - In this lab, we will:

        1. Work with a dataset related to cardiovascular disease
        2. Build three different models to estimate how likely a person is to develop cardiovascular disease
        3. Implement a Decision Tree model from scikit-learn
        4. Implement a Random Forest model from scikit-learn
        5. Implement an XGBoost model using the xgboost library
        6. Investigate how different parameters on the three models impact their performance

        In this notebook, we will:

            - Use Pandas to perform one-hot encoding of a dataset
            - Use scikit-learn to implement Decision Tree and Random Forest models, and the xgboost library to implement an XGBoost model


In [None]:
import numpy as numpy
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

# Fixed seed, passed as random_state to the split and the estimators so that
# repeated runs give the same results.
RANDOM_STATE = 55

# Apply the matplotlib style sheet located in the working directory.
plt.style.use("./deeplearning.mplstyle")

In [None]:
# Load the heart-disease dataset (heart.csv in the working directory) into a DataFrame.
df = pd.read_csv("heart.csv")

In [None]:
# Preview the first rows of the data as loaded.
df.head()

In [None]:
# Names of the categorical columns that get one-hot encoded.
cat_variables = [
    "Sex",
    "ChestPainType",
    "RestingECG",
    "ExerciseAngina",
    "ST_Slope",
]

In [None]:
# Replace each categorical column with one-hot indicator columns, using the
# original column name as the prefix of the new columns.
df = pd.get_dummies(df, columns=cat_variables, prefix=cat_variables)

In [None]:
# Preview the first rows again to inspect the one-hot encoded columns.
df.head()

In [None]:
# Every column except the target is used as a feature.
# The name is compared for equality: `x not in "HeartDisease"` would be a
# substring test against the target's name and would also drop any column
# whose name happens to be a fragment of it (e.g. "Heart" or "Disease").
features = [x for x in df.columns if x != "HeartDisease"]

In [None]:
# Number of feature columns (the target column is excluded from `features`).
print(len(features))

In [None]:
# Print the documentation of train_test_split to review its parameters.
help(train_test_split)

In [None]:
# Hold out 20% of the rows for validation (train_size=0.8). Shuffling is left
# at shuffle=True because the dataset has no time dependency.
X_train, X_val, y_train, y_val = train_test_split(
    df[features],
    df["HeartDisease"],
    random_state=RANDOM_STATE,
    train_size=0.8,
)

In [None]:
# Report the size of each split and the mean of the training target
# (the share of positive cases if the labels are 0/1 -- TODO confirm encoding).
print(
    f"train samples: {len(X_train)}",
    f"validation samples: {len(X_val)}",
    f"target proportion: {sum(y_train)/len(y_train):.4f}",
    sep="\n",
)

In [None]:
# Decision-tree hyperparameters investigated here:
#
# - min_samples_split: the minimum number of samples required to split an
#   internal node. A higher value can reduce the number of splits and may help
#   to reduce overfitting. Integer entries are the actual number of samples.
# - max_depth: the maximum depth of the tree. A lower value can reduce the
#   number of splits and may help to reduce overfitting. None means there is
#   no depth limit (the default).
min_samples_split_list = [2, 10, 30, 50, 100, 200, 300, 700]
max_depth_list = [1, 2, 3, 4, 8, 16, 32, 64, None]

In [None]:
# Sweep min_samples_split for a single decision tree, recording the accuracy
# on the training and validation sets for each value.
accuracy_list_train, accuracy_list_val = [], []
for min_samples_split in min_samples_split_list:
    # Create the estimator, then train it; fit() returns the fitted estimator
    # itself, so this is equivalent to chaining the two calls.
    model = DecisionTreeClassifier(
        random_state=RANDOM_STATE, min_samples_split=min_samples_split
    )
    model.fit(X_train, y_train)
    predictions_train, predictions_val = model.predict(X_train), model.predict(X_val)
    accuracy_train = accuracy_score(y_true=y_train, y_pred=predictions_train)
    accuracy_val = accuracy_score(y_true=y_val, y_pred=predictions_val)
    accuracy_list_train += [accuracy_train]
    accuracy_list_val += [accuracy_val]

# The curves are drawn against list position; the x ticks are relabelled with
# the hyperparameter values that were tried.
plt.title("Train x Validation metrics")
plt.xlabel("min_samples_split")
plt.ylabel("accuracy")
plt.xticks(labels=min_samples_split_list, ticks=range(len(min_samples_split_list)))
plt.plot(accuracy_list_train)
plt.plot(accuracy_list_val)
plt.legend(["Train", "Validation"])

In [None]:
# Sweep max_depth for a single decision tree, recording the accuracy on the
# training and validation sets for each value.
accuracy_list_train, accuracy_list_val = [], []
for max_depth in max_depth_list:
    # fit() returns the fitted estimator itself, so creating and then fitting
    # in two steps is equivalent to chaining the calls.
    model = DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=max_depth)
    model.fit(X_train, y_train)
    predictions_train, predictions_val = model.predict(X_train), model.predict(X_val)
    accuracy_train = accuracy_score(y_true=y_train, y_pred=predictions_train)
    accuracy_val = accuracy_score(y_true=y_val, y_pred=predictions_val)
    accuracy_list_train += [accuracy_train]
    accuracy_list_val += [accuracy_val]

# The curves are drawn against list position; the x ticks are relabelled with
# the depth values that were tried.
plt.title("Train x Validation metrics")
plt.xlabel("max_depth")
plt.ylabel("accuracy")
plt.xticks(labels=max_depth_list, ticks=range(len(max_depth_list)))
plt.plot(accuracy_list_train)
plt.plot(accuracy_list_val)
plt.legend(["Train", "Validation"])

In [None]:
# Reading the max_depth sweep: in general, reducing max_depth helps to reduce
# overfitting.
#
# - Reducing max_depth from 8 to 4 brings validation accuracy closer to
#   training accuracy, while significantly reducing training accuracy.
# - Validation accuracy is highest at max_depth=4.
# - When max_depth is smaller than 3, both training and validation accuracy
#   decrease: the tree cannot make enough splits to distinguish positives from
#   negatives (the model underfits the training set).
# - When max_depth is too high (>= 5), validation accuracy decreases while
#   training accuracy increases, i.e. the model overfits the training set.
#
# Values chosen for these two hyperparameters:
#   max_depth = 4
#   min_samples_split = 50
decision_tree_model = DecisionTreeClassifier(
    random_state=RANDOM_STATE, max_depth=4, min_samples_split=50
)
# fit() returns the fitted estimator, so the name still refers to the same model.
decision_tree_model = decision_tree_model.fit(X_train, y_train)

In [None]:
# Accuracy of the chosen decision tree on the training and validation splits.
train_accuracy = accuracy_score(y_train, decision_tree_model.predict(X_train))
val_accuracy = accuracy_score(y_val, decision_tree_model.predict(X_val))
print(f"Metrics train:\n\tAccuracy score: {train_accuracy:.4f}")
print(f"Metrics validation:\n\tAccuracy score: {val_accuracy:.4f}")

In [None]:
# Random Forest
#
# Parameters specific to the forest:
# - n_estimators: the number of decision trees that make up the forest
#   (default 100).
# - max_features: the number of random features used at a node in the
#   information-gain comparison.
# - n_jobs: the number of CPU cores used to train the trees in parallel.
#
# Candidate values for the sweeps. An integer min_samples_split is the actual
# number of samples; a float would be a percentage of the dataset.
min_samples_split_list = [2, 10, 30, 50, 100, 200, 300, 700]
max_depth_list = [2, 4, 8, 16, 32, 64, None]
n_estimators_list = [10, 50, 100, 500]

In [None]:
# Sweep min_samples_split for a random forest, recording the accuracy on the
# training and validation sets for each value.
accuracy_list_train, accuracy_list_val = [], []
for min_samples_split in min_samples_split_list:
    # fit() returns the fitted estimator itself, so creating and then fitting
    # in two steps is equivalent to chaining the calls.
    model = RandomForestClassifier(
        random_state=RANDOM_STATE, min_samples_split=min_samples_split
    )
    model.fit(X_train, y_train)
    predictions_train, predictions_val = model.predict(X_train), model.predict(X_val)
    accuracy_train = accuracy_score(y_true=y_train, y_pred=predictions_train)
    accuracy_val = accuracy_score(y_true=y_val, y_pred=predictions_val)
    accuracy_list_train += [accuracy_train]
    accuracy_list_val += [accuracy_val]

# The curves are drawn against list position; the x ticks are relabelled with
# the hyperparameter values that were tried.
plt.title("Train x Validation metrics")
plt.xlabel("min_samples_split")
plt.ylabel("accuracy")
plt.xticks(labels=min_samples_split_list, ticks=range(len(min_samples_split_list)))
plt.plot(accuracy_list_train)
plt.plot(accuracy_list_val)
plt.legend(["Train", "Validation"])

In [None]:
# Sweep max_depth for a random forest, recording the accuracy on the training
# and validation sets for each value.
accuracy_list_train, accuracy_list_val = [], []
for max_depth in max_depth_list:
    # fit() returns the fitted estimator itself, so creating and then fitting
    # in two steps is equivalent to chaining the calls.
    model = RandomForestClassifier(random_state=RANDOM_STATE, max_depth=max_depth)
    model.fit(X_train, y_train)
    predictions_train, predictions_val = model.predict(X_train), model.predict(X_val)
    accuracy_train = accuracy_score(y_true=y_train, y_pred=predictions_train)
    accuracy_val = accuracy_score(y_true=y_val, y_pred=predictions_val)
    accuracy_list_train += [accuracy_train]
    accuracy_list_val += [accuracy_val]

# The curves are drawn against list position; the x ticks are relabelled with
# the depth values that were tried.
plt.title("Train x Validation metrics")
plt.xlabel("max_depth")
plt.ylabel("accuracy")
plt.xticks(labels=max_depth_list, ticks=range(len(max_depth_list)))
plt.plot(accuracy_list_train)
plt.plot(accuracy_list_val)
plt.legend(["Train", "Validation"])

In [None]:
# Sweep n_estimators for a random forest, recording the accuracy on the
# training and validation sets for each value.
accuracy_list_train, accuracy_list_val = [], []
for n_estimators in n_estimators_list:
    # fit() returns the fitted estimator itself, so creating and then fitting
    # in two steps is equivalent to chaining the calls.
    model = RandomForestClassifier(
        random_state=RANDOM_STATE, n_estimators=n_estimators
    )
    model.fit(X_train, y_train)
    predictions_train, predictions_val = model.predict(X_train), model.predict(X_val)
    accuracy_train = accuracy_score(y_true=y_train, y_pred=predictions_train)
    accuracy_val = accuracy_score(y_true=y_val, y_pred=predictions_val)
    accuracy_list_train += [accuracy_train]
    accuracy_list_val += [accuracy_val]

# The curves are drawn against list position; the x ticks are relabelled with
# the forest sizes that were tried.
plt.title("Train x Validation metrics")
plt.xlabel("n_estimators")
plt.ylabel("accuracy")
plt.xticks(labels=n_estimators_list, ticks=range(len(n_estimators_list)))
plt.plot(accuracy_list_train)
plt.plot(accuracy_list_val)
plt.legend(["Train", "Validation"])

In [None]:
# Fit the final random forest with the following parameters:
#
#   max_depth: 16
#   min_samples_split: 10
#   n_estimators: 100
#
# random_state is passed here as it is to the other estimators in this file;
# without it the forest is seeded differently on every run and the accuracy
# reported for this model is not reproducible.
random_forest_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=16,
    min_samples_split=10,
    random_state=RANDOM_STATE,
).fit(X_train, y_train)

In [None]:
# Accuracy of the fitted random forest on the training and validation splits.
# The second score is computed on X_val / y_val, so it is labelled
# "validation" (as in the decision-tree report) rather than "test".
print(
    f"Metrics train:\n\tAccuracy score: {accuracy_score(y_train, random_forest_model.predict(X_train)):.4f}\n"
    f"Metrics validation:\n\tAccuracy score: {accuracy_score(y_val, random_forest_model.predict(X_val)):.4f}"
)

In [None]:
# XGBoost
# Divide the training rows by position: the first 80% are used for fitting and
# the remaining 20% are kept aside as an evaluation set.
n = int(len(X_train) * 0.8)
X_train_fit, y_train_fit = X_train.iloc[:n], y_train.iloc[:n]
X_train_eval, y_train_eval = X_train.iloc[n:], y_train.iloc[n:]

In [None]:
# Gradient-boosted classifier: up to 500 estimators at learning_rate=0.1, with
# early_stopping_rounds=10 evaluated against the eval_set given to fit().
xgb_model = XGBClassifier(
    random_state=RANDOM_STATE,
    n_estimators=500,
    learning_rate=0.1,
    early_stopping_rounds=10,
    verbosity=1,
)
# Train on the fitting portion; the held-out portion is the evaluation set.
xgb_model.fit(X_train_fit, y_train_fit, eval_set=[(X_train_eval, y_train_eval)])

In [None]:
# Display the best boosting iteration recorded on the fitted model
# (presumably the round picked by early stopping -- see the XGBoost docs).
xgb_model.best_iteration

In [None]:
# Accuracy of the fitted XGBoost model on the training and validation splits.
# The second score is computed on X_val / y_val, so it is labelled
# "validation" (as in the decision-tree report) rather than "test".
print(
    f"Metrics train:\n\tAccuracy score: {accuracy_score(y_train, xgb_model.predict(X_train)):.4f}\n"
    f"Metrics validation:\n\tAccuracy score: {accuracy_score(y_val, xgb_model.predict(X_val)):.4f}"
)