In [None]:
# Install the palmerpenguins package, which provides the load_penguins() loader imported below.
# The %pip magic (rather than !pip) targets the environment of the running kernel.
%pip install palmerpenguins

In [None]:
# Import packages and functions
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn import tree, metrics
from palmerpenguins import load_penguins

import matplotlib_inline.backend_inline

# Ask the inline backend to render matplotlib figures as SVG
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
# Read the penguins data from the palmerpenguins package and discard every
# row that contains a missing value
penguins = load_penguins().dropna()

# Keep an independent copy holding only the Gentoo rows
gentoo = penguins.loc[penguins['species'].eq('Gentoo')].copy()

# Summary statistics for every column via .describe()
gentoo.describe(include='all')

In [None]:
# Input features for the model: sex, flipper length, and bill length
X = gentoo.loc[:, ['sex', 'flipper_length_mm', 'bill_length_mm']]
X

`DecisionTreeRegressor` only accepts numerical features, which means categorical features like `sex` and `island` cannot be passed to the model directly.

Thus, categorical features must be encoded as dummy variables that indicate which category the individual falls into.  This encoding is done in `pandas` with `get_dummies`.

In [None]:
# Use pd.get_dummies to convert sex to a binary (0/1) dummy variable.
# dtype=int requests integer 0/1 columns: pandas 2.0+ would otherwise emit
# boolean True/False dummies, which would not match the 0/1 coding described
# in this notebook.
X_dummies = pd.get_dummies(X, drop_first=True, dtype=int)
X_dummies

The `drop_first=True` option creates a single dummy variable instead of two. Because penguins with missing values were dropped, `sex` takes only two values in this dataset, so one dummy variable describes it completely:

- `sex_male=0`: female
- `sex_male=1`: male

In [None]:
# Output feature: the body_mass_g column
y = gentoo['body_mass_g']

# Fit a shallow regression tree: depth of at most 2 and at least 2 samples in
# every leaf. random_state is fixed because scikit-learn permutes the features
# at each split, so equally good candidate splits could otherwise be resolved
# differently from one run to the next.
regtreeModel = DecisionTreeRegressor(max_depth=2, min_samples_leaf=2, random_state=0)
regtreeModel.fit(X_dummies, y)

In [None]:
# The print() statement outputs a text version of the regression tree.
# Feature names are taken from X_dummies, the frame the model was fitted on:
# after dummy encoding its columns (and their order) differ from those of X,
# so using X.columns would attach the wrong name to each split.
print(export_text(regtreeModel, feature_names=X_dummies.columns.to_list()))

In [None]:
# Using tree.plot_tree() makes a cleaner figure

# Resize the plotting window
plt.figure(figsize=[12, 8])

# Label the nodes with the columns of X_dummies (the frame the model was
# fitted on) so each split shows the feature it actually uses; X.columns has
# different names in a different order. class_names is omitted because it
# only applies to classification trees.
p = tree.plot_tree(
    regtreeModel,
    feature_names=X_dummies.columns.to_list(),
    filled=False,
    fontsize=10,
)

In [None]:
# Predict body mass for every Gentoo penguin with the fitted tree, then store
# the predictions on the original data set as a 'pred' column
predicted_mass = regtreeModel.predict(X_dummies)
gentoo['pred'] = predicted_mass
gentoo

In [None]:
# Plot observed vs. predictions, coloured by sex
p = sns.scatterplot(data=gentoo, x='body_mass_g', y='pred', hue='sex')
p.set_xlabel('Observed body mass', fontsize=14)
# The trailing semicolon keeps the returned Text object from being echoed
# beneath the figure
p.set_ylabel('Predicted body mass', fontsize=14);

In [None]:
# Calculate MSE between the observed body mass and the tree's predictions.
# Arguments follow scikit-learn's documented (y_true, y_pred) order; the MSE
# value itself does not depend on the order, but other metrics do.
metrics.mean_squared_error(y, gentoo['pred'])