Apply boosting, bagging, random forests, and BART to a data set
of your choice. Be sure to fit the models on a training set and to
evaluate their performance on a test set. How accurate are the results
compared to simple methods like linear or logistic regression? Which
of these approaches yields the best performance?

In [0]:
%pip install --quiet mlxtend

### Preprocessing

In [0]:
# import relevant statistical packages
import numpy as np
import pandas as pd

In [0]:
# import relevant data visualisation packages
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [0]:
# import custom packages
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score as r2, mean_squared_error
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from mlxtend.plotting import plot_linear_regression as PLS
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import confusion_matrix, classification_report

In [0]:
# Load the Heart data set through Spark, hand it to pandas, and use the
# "_c0" column as the row index.
url = "abfss://training@sa8451learningdev.dfs.core.windows.net/interpretable_machine_learning/eml_data/Heart.csv"
df = spark.read.option("header", "true").csv(url).toPandas().set_index("_c0")

# Column groups by target dtype; whatever is neither text nor float is
# treated as an integer column.
str_cols = ["ChestPain", "Thal", "AHD"]
float_cols = ["Ca", "Oldpeak"]
int_cols = list(set(df.columns).difference(str_cols, float_cols))

# Text columns.
df[str_cols] = df[str_cols].astype(str)

# "Ca" marks missing entries with the literal string "NA"; turn those into
# NaN so the float cast below succeeds.
# NOTE(review): only "Ca" gets this treatment - if "Thal" also contains "NA"
# it is kept as an ordinary string category. Confirm against the raw file.
ca_is_missing = df["Ca"] == "NA"
df["Ca"] = np.where(ca_is_missing, np.nan, df["Ca"])

# Numeric columns.
df[float_cols] = df[float_cols].astype(float)
df[int_cols] = df[int_cols].astype(int)

In [0]:
# how often each ChestPain category occurs
df["ChestPain"].value_counts()

In [0]:
# how often each Thal category occurs
df["Thal"].value_counts()

In [0]:
# class balance of the AHD response
df["AHD"].value_counts()

In [0]:
# encode the AHD response as 0 ('No') / 1 ('Yes')
df["AHD"] = df["AHD"].map({'No': 0, 'Yes': 1})

In [0]:
# replace the Thal labels with the integer codes from pd.factorize
# (element [0] of its result; the unique labels in [1] are discarded)
df["Thal"] = pd.factorize(df["Thal"])[0]

In [0]:
# replace the ChestPain labels with the integer codes from pd.factorize
# (element [0] of its result; the unique labels in [1] are discarded)
df["ChestPain"] = pd.factorize(df["ChestPain"])[0]

In [0]:
# show the first 25 rows after encoding
df.head(n=25)

In [0]:
# Heatmap of df.isna(): one cell per (row, feature), highlighting NaN entries.
plt.xkcd()  # switch matplotlib to its xkcd sketch style
plt.figure(figsize=(25, 10))
sns.heatmap(
    df.isna(),
    cmap='viridis',
    cbar=False,
    yticklabels=False,  # suppress the per-row tick labels
)
plt.title(label='missing values in df', color='m', fontsize=30)
plt.xlabel(xlabel='features', color='c', fontsize=20)

In [0]:
# Remove the "Ca" feature from df in place. "Ca" is the only column in which
# the loading cell converts "NA" markers to NaN, so this drops the column
# rather than the affected rows.
# The former `axis=0` argument was removed: pandas ignores `axis` when
# `columns=` is given, and it wrongly suggested that rows were being dropped.
df.drop(columns="Ca", inplace=True)

In [0]:
# Redraw the df.isna() heatmap to inspect the frame in its current state.
plt.xkcd()  # switch matplotlib to its xkcd sketch style
plt.figure(figsize=(25, 10))
sns.heatmap(
    df.isna(),
    cmap='viridis',
    cbar=False,
    yticklabels=False,  # suppress the per-row tick labels
)
plt.title(label='missing values in df', color='m', fontsize=30)
plt.xlabel(xlabel='features', color='c', fontsize=20)

After dropping the `Ca` column, the heatmap shows no remaining missing values in `df`.

In [0]:
# TODO: your response here