# Lecture 23 – Data 100, Summer 2024

Data 100, Summer 2024

[Acknowledgments Page](https://ds100.org/su24/acks/)

In [None]:
import seaborn as sns
import pandas as pd
sns.set(font_scale=1.5)
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Pin NumPy's global RNG so every shuffle/sample further down is repeatable.
np.random.seed(seed=21)

# Linear Classification

In [None]:
from sklearn import datasets

iris_data = datasets.load_iris()
# Translate each integer target code into its species name via array indexing.
iris_labels = iris_data["target_names"][iris_data["target"]]
# Build the frame with snake_case column names in place of the dataset's own feature names.
iris_data = pd.DataFrame(
    data=iris_data["data"],
    columns=["sepal_length", "sepal_width", "petal_length", "petal_width"],
)
iris_data["species"] = iris_labels

In [None]:
# Peek at five randomly chosen rows.
iris_data.sample(n=5)

In [None]:
# Petal length vs. petal width, coloured by species.
sns.scatterplot(
    data=iris_data,
    x="petal_length",
    y="petal_width",
    hue="species",
)

In [None]:
from sklearn.linear_model import LogisticRegression

# One-vs-rest logistic regression on the two petal measurements; `fit` returns the estimator itself.
logistic_regression_model = LogisticRegression(multi_class="ovr").fit(
    iris_data[["petal_length", "petal_width"]],
    iris_data["species"],
)

In [None]:
from matplotlib.colors import ListedColormap

# Colormap built from the first three colours of the active seaborn palette.
sns_cmap = ListedColormap(np.array(sns.color_palette())[:3])

# Grid of (petal_length, petal_width) points at 0.02 spacing.
xx, yy = np.meshgrid(
    np.arange(0, 7, 0.02),
    np.arange(0, 2.8, 0.02),
)

# Classify every grid point, then convert the string labels to integer codes for contourf.
Z_string = logistic_regression_model.predict(np.c_[xx.ravel(), yy.ravel()])
categories, Z_int = np.unique(Z_string, return_inverse=True)
Z_int = Z_int.reshape(xx.shape)

cs = plt.contourf(xx, yy, Z_int, cmap=sns_cmap)
sns.scatterplot(data=iris_data, x="petal_length", y="petal_width", hue="species")
plt.xlim(0, 7)
plt.ylim(0, 2.8)

In [None]:
# The two feature columns the classifiers above are trained on.
iris_data.loc[:, ["petal_length", "petal_width"]]

In [None]:
# Predicted class for a single flower with petal_length=1.4 and petal_width=0.2.
logistic_regression_model.predict(
    [[1.4, 0.2]]
)

In [None]:
# Per-class probabilities for the same single flower (petal_length=1.4, petal_width=0.2).
logistic_regression_model.predict_proba(
    [[1.4, 0.2]]
)

In [None]:
# Class labels known to the fitted model (useful for reading the predict_proba columns).
logistic_regression_model.classes_

# Decision Tree Classification

In [None]:
from sklearn import tree

# Entropy-criterion decision tree trained on the two petal measurements of all rows.
decision_tree_model = tree.DecisionTreeClassifier(criterion="entropy").fit(
    iris_data[["petal_length", "petal_width"]],
    iris_data["species"],
)

In [None]:
# Draw four rows at random to try the tree on.
four_random_rows = iris_data.sample(n=4)
four_random_rows

In [None]:
# Tree predictions for the four sampled rows (petal columns only).
decision_tree_model.predict(
    four_random_rows.loc[:, ["petal_length", "petal_width"]]
)

In [None]:
# Draw the fitted tree's split rules with readable feature and class names.
tree.plot_tree(
    decision_tree_model,
    feature_names=["petal_length", "petal_width"],
    class_names=["setosa", "versicolor", "virginica"],
    rounded=True,
    filled=True,
)

In [None]:
# Colormap built from the first three colours of the active seaborn palette.
sns_cmap = ListedColormap(np.array(sns.color_palette())[:3])

# Grid of (petal_length, petal_width) points at 0.02 spacing.
xx, yy = np.meshgrid(
    np.arange(0, 7, 0.02),
    np.arange(0, 2.8, 0.02),
)

# Classify every grid point, then convert the string labels to integer codes for contourf.
Z_string = decision_tree_model.predict(np.c_[xx.ravel(), yy.ravel()])
categories, Z_int = np.unique(Z_string, return_inverse=True)
Z_int = Z_int.reshape(xx.shape)

cs = plt.contourf(xx, yy, Z_int, cmap=sns_cmap)
sns.scatterplot(data=iris_data, x="petal_length", y="petal_width", hue="species")

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the petal tree on the full dataset.
predictions = decision_tree_model.predict(
    iris_data.loc[:, ["petal_length", "petal_width"]]
)
accuracy_score(predictions, iris_data["species"])

In [None]:
# Rows in the region petal_length in (2.45, 4.85] with petal_width > 1.75.
iris_data[
    (iris_data["petal_length"] > 2.45)
    & (iris_data["petal_width"] > 1.75)
    & (iris_data["petal_length"] <= 4.85)
]

# Overfitting
Instead of the petal measurements, let's use the sepal measurements to train the decision tree.

In [None]:
# Sepal length vs. sepal width, coloured by species (legend hidden).
sns.scatterplot(
    data=iris_data,
    x="sepal_length",
    y="sepal_width",
    hue="species",
    legend=False,
)

In [None]:
# Train a separate tree on the sepal measurements of all rows. The new estimator is
# fit here (not `decision_tree_model`), so the petal-based model is left untouched.
sepal_decision_tree_model = tree.DecisionTreeClassifier(criterion="entropy")
sepal_decision_tree_model = sepal_decision_tree_model.fit(iris_data[["sepal_length", "sepal_width"]], iris_data["species"])

# Colormap built from the first three colours of the active seaborn palette.
sns_cmap = ListedColormap(np.array(sns.color_palette())[0:3, :])

# Grid of (sepal_length, sepal_width) points at 0.02 spacing.
xx, yy = np.meshgrid(np.arange(4, 8, 0.02),
                     np.arange(1.9, 4.5, 0.02))

# Classify every grid point, then convert the string labels to integer codes for contourf.
Z_string = sepal_decision_tree_model.predict(np.c_[xx.ravel(), yy.ravel()])
categories, Z_int = np.unique(Z_string, return_inverse=True)
Z_int = Z_int.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z_int, cmap=sns_cmap)
sns.scatterplot(data = iris_data, x = "sepal_length", y="sepal_width", hue="species", legend=False)
fig = plt.gcf()
# fig.savefig("iris_sepal_decision_boundaries_all_150_points.png", dpi=300, bbox_inches = "tight")

Let's split the dataset into a training set with 110 observations, and a validation set with 40 observations.

In [None]:
# Shuffle all rows once, then take the first 110 rows for training and the rest for
# validation. Positional slicing yields the same partition as np.split(..., [110])
# but avoids passing a DataFrame through NumPy's splitter, which relies on the
# deprecated DataFrame.swapaxes.
shuffled_iris_data = iris_data.sample(frac=1)
train_iris_data = shuffled_iris_data.iloc[:110]
valid_iris_data = shuffled_iris_data.iloc[110:]
print(train_iris_data.shape, valid_iris_data.shape)

In [None]:
# Order both splits by species so the hue colours line up with the earlier plots.
train_iris_data = train_iris_data.sort_values("species")
valid_iris_data = valid_iris_data.sort_values("species")

In [None]:
# First five rows of the (species-sorted) training split.
train_iris_data.head(n=5)

We use the training data to fit our old model (using the petal measurements `petal_length` and `petal_width`):

In [None]:
# Refit the petal tree, this time on the training split only.
decision_tree_model = tree.DecisionTreeClassifier(criterion="entropy").fit(
    train_iris_data[["petal_length", "petal_width"]],
    train_iris_data["species"],
)

In [None]:
# Split rules of the petal tree fit on the training split.
tree.plot_tree(
    decision_tree_model,
    feature_names=["petal_length", "petal_width"],
    class_names=["setosa", "versicolor", "virginica"],
    rounded=True,
    filled=True,
);

Decision boundary on the training data:

In [None]:
# Colormap built from the first three colours of the active seaborn palette.
sns_cmap = ListedColormap(np.array(sns.color_palette())[:3])

# Grid of (petal_length, petal_width) points at 0.02 spacing.
xx, yy = np.meshgrid(
    np.arange(0, 7, 0.02),
    np.arange(0, 2.8, 0.02),
)

# Classify every grid point, then convert the string labels to integer codes for contourf.
Z_string = decision_tree_model.predict(np.c_[xx.ravel(), yy.ravel()])
categories, Z_int = np.unique(Z_string, return_inverse=True)
Z_int = Z_int.reshape(xx.shape)

cs = plt.contourf(xx, yy, Z_int, cmap=sns_cmap)
# Overlay the training points on the predicted regions.
sns.scatterplot(data=train_iris_data, x="petal_length", y="petal_width", hue="species");

Decision boundary and the validation data:

In [None]:
# Colormap built from the first three colours of the active seaborn palette.
sns_cmap = ListedColormap(np.array(sns.color_palette())[:3])

# Grid of (petal_length, petal_width) points at 0.02 spacing.
xx, yy = np.meshgrid(
    np.arange(0, 7, 0.02),
    np.arange(0, 2.8, 0.02),
)

# Classify every grid point, then convert the string labels to integer codes for contourf.
Z_string = decision_tree_model.predict(np.c_[xx.ravel(), yy.ravel()])
categories, Z_int = np.unique(Z_string, return_inverse=True)
Z_int = Z_int.reshape(xx.shape)

cs = plt.contourf(xx, yy, Z_int, cmap=sns_cmap)
# Overlay the held-out validation points on the predicted regions.
sns.scatterplot(data=valid_iris_data, x="petal_length", y="petal_width", hue="species")

Accuracy on the training data:

In [None]:
# Accuracy of the petal tree on the rows it was trained on.
accuracy_score(
    decision_tree_model.predict(train_iris_data.loc[:, ["petal_length", "petal_width"]]),
    train_iris_data["species"],
)

Accuracy on the validation data:

In [None]:
# Accuracy of the petal tree on the held-out validation rows.
predictions = decision_tree_model.predict(
    valid_iris_data.loc[:, ["petal_length", "petal_width"]]
)
accuracy_score(predictions, valid_iris_data["species"])

Let's now use the sepal measurements (`sepal_length` and `sepal_width`) to train the decision trees.

In [None]:
# Entropy-criterion tree trained on the sepal measurements of the training split.
sepal_decision_tree_model = tree.DecisionTreeClassifier(criterion="entropy").fit(
    train_iris_data[["sepal_length", "sepal_width"]],
    train_iris_data["species"],
)

In [None]:
# Split rules of the sepal tree.
tree.plot_tree(
    sepal_decision_tree_model,
    feature_names=["sepal_length", "sepal_width"],
    class_names=["setosa", "versicolor", "virginica"],
    rounded=True,
    filled=True,
);

Decision boundary and training data:

In [None]:
# Colormap built from the first three colours of the active seaborn palette.
sns_cmap = ListedColormap(np.array(sns.color_palette())[:3])

# Grid of (sepal_length, sepal_width) points at 0.02 spacing.
xx, yy = np.meshgrid(
    np.arange(4, 8, 0.02),
    np.arange(1.9, 4.5, 0.02),
)

# Classify every grid point, then convert the string labels to integer codes for contourf.
Z_string = sepal_decision_tree_model.predict(np.c_[xx.ravel(), yy.ravel()])
categories, Z_int = np.unique(Z_string, return_inverse=True)
Z_int = Z_int.reshape(xx.shape)

cs = plt.contourf(xx, yy, Z_int, cmap=sns_cmap)
# Overlay the training points on the predicted regions.
sns.scatterplot(data=train_iris_data, x="sepal_length", y="sepal_width", hue="species", legend=False)

Decision boundary and validation data:

In [None]:
# Colormap built from the first three colours of the active seaborn palette.
sns_cmap = ListedColormap(np.array(sns.color_palette())[:3])

# Grid of (sepal_length, sepal_width) points at 0.02 spacing.
xx, yy = np.meshgrid(
    np.arange(4, 8, 0.02),
    np.arange(1.9, 4.5, 0.02),
)

# Classify every grid point, then convert the string labels to integer codes for contourf.
Z_string = sepal_decision_tree_model.predict(np.c_[xx.ravel(), yy.ravel()])
categories, Z_int = np.unique(Z_string, return_inverse=True)
Z_int = Z_int.reshape(xx.shape)

cs = plt.contourf(xx, yy, Z_int, cmap=sns_cmap)
# Overlay the held-out validation points on the predicted regions.
sns.scatterplot(data=valid_iris_data, x="sepal_length", y="sepal_width", hue="species", legend=False)

Accuracy on the training data:

In [None]:
# Accuracy of the sepal tree on the training split.
accuracy_score(
    sepal_decision_tree_model.predict(train_iris_data.loc[:, ["sepal_length", "sepal_width"]]),
    train_iris_data["species"],
)

Accuracy on the validation data:

In [None]:
# Accuracy of the sepal tree on the validation split.
accuracy_score(
    sepal_decision_tree_model.predict(valid_iris_data.loc[:, ["sepal_length", "sepal_width"]]),
    valid_iris_data["species"],
)

<br>
<hr>
<br>
Naturally, we can include even more features. For example, if we want to use the petal AND sepal measurements, we simply train the decision tree on all four columns of the data.

In [None]:
# Entropy-criterion tree trained on all four measurements of the training split.
decision_tree_model_4d = tree.DecisionTreeClassifier(criterion="entropy").fit(
    train_iris_data[["petal_length", "petal_width", "sepal_length", "sepal_width"]],
    train_iris_data["species"],
)

In [None]:
# Training accuracy of the four-feature tree.
predictions = decision_tree_model_4d.predict(
    train_iris_data.loc[:, ["petal_length", "petal_width", "sepal_length", "sepal_width"]]
)
accuracy_score(predictions, train_iris_data["species"])

In [None]:
# Validation accuracy of the four-feature tree.
predictions = decision_tree_model_4d.predict(
    valid_iris_data.loc[:, ["petal_length", "petal_width", "sepal_length", "sepal_width"]]
)
accuracy_score(predictions, valid_iris_data["species"])

In [None]:
# 6x6 inch canvas for the four-feature tree diagram.
plt.figure(figsize=(6, 6))
tree.plot_tree(
    decision_tree_model_4d,
    feature_names=["petal_length", "petal_width", "sepal_length", "sepal_width"],
    class_names=["setosa", "versicolor", "virginica"],
    rounded=True,
    filled=True,
    fontsize=4,
)
plt.show()

In [None]:
# 12x12 inch canvas for the sepal-only tree diagram.
plt.figure(figsize=(12, 12))
tree.plot_tree(
    sepal_decision_tree_model,
    feature_names=["sepal_length", "sepal_width"],
    class_names=["setosa", "versicolor", "virginica"],
    rounded=True,
    filled=True,
    fontsize=4,
)
plt.show()

Comparing our decision tree diagram with four features (petal and sepal) versus our decision tree diagram with just sepal features, we can see that the 4d-model does not overfit, even though it has access to all of these features. Instead, these two models are extremely similar. In fact, our 4d-model only makes use of the sepal features exactly once, and that's to resolve the tricky case we had before where there were overlapping virginica and versicolor flowers. 

This showcases that for decision tree models, having more features doesn’t necessarily lead to overfitting, especially if a small subset of the features does a good job of resolving the difference between the classes.

<br>
<hr>
<br>

Say we want to just use sepal features. Looking at the corresponding decision tree rules and train/validation accuracy (94.5% versus 75%), it seems like we may be overfitting! 

Let's try to prevent growth using two `sklearn` hyperparameters: `min_samples_split` and `max_depth`. You can read more about these hyperparameters (and others) in the [DecisionTreeClassifier documentation](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html).

### `min_samples_split`

In [None]:
# Sepal tree that only splits nodes holding at least 10 samples.
sepal_decision_tree_model_minsamples = tree.DecisionTreeClassifier(
    criterion="entropy",
    min_samples_split=10,
).fit(
    train_iris_data[["sepal_length", "sepal_width"]],
    train_iris_data["species"],
)

Let's compare the resulting decision tree rules with `min_samples_split = 10` versus before (default `min_samples_split = 2`).

In [None]:
# Tree diagram for the min_samples_split=10 model on a 12x12 inch canvas.
plt.figure(figsize=(12, 12))
tree.plot_tree(
    sepal_decision_tree_model_minsamples,
    feature_names=["sepal_length", "sepal_width"],
    class_names=["setosa", "versicolor", "virginica"],
    rounded=True,
    filled=True,
    fontsize=6,
)
plt.show()

In [None]:
# Tree diagram of the sepal model fit without a min_samples_split argument, for comparison.
plt.figure(figsize=(12, 12))
tree.plot_tree(
    sepal_decision_tree_model,
    feature_names=["sepal_length", "sepal_width"],
    class_names=["setosa", "versicolor", "virginica"],
    rounded=True,
    filled=True,
    fontsize=6,
)
plt.show()

In [None]:
# Training accuracy of the min_samples_split=10 tree.
print(
    'train accuracy:',
    accuracy_score(
        sepal_decision_tree_model_minsamples.predict(train_iris_data[["sepal_length", "sepal_width"]]),
        train_iris_data["species"],
    ),
)

In [None]:
# Validation accuracy of the min_samples_split=10 tree.
print(
    'validation accuracy:',
    accuracy_score(
        sepal_decision_tree_model_minsamples.predict(valid_iris_data[["sepal_length", "sepal_width"]]),
        valid_iris_data["species"],
    ),
)

### `max_depth`

In [None]:
# Sepal tree whose depth is capped at 5.
sepal_decision_tree_model_maxdepth = tree.DecisionTreeClassifier(
    criterion="entropy",
    max_depth=5,
).fit(
    train_iris_data[["sepal_length", "sepal_width"]],
    train_iris_data["species"],
)

Let's compare the resulting decision tree rules with `max_depth = 5` versus before (default `max_depth = None` or no max depth).

In [None]:
# Tree diagram for the max_depth=5 model on a 12x12 inch canvas.
plt.figure(figsize=(12, 12))
tree.plot_tree(
    sepal_decision_tree_model_maxdepth,
    feature_names=["sepal_length", "sepal_width"],
    class_names=["setosa", "versicolor", "virginica"],
    rounded=True,
    filled=True,
    fontsize=6,
)
plt.show()

In [None]:
# Tree diagram of the sepal model fit without a max_depth argument, for comparison.
plt.figure(figsize=(12, 12))
tree.plot_tree(
    sepal_decision_tree_model,
    feature_names=["sepal_length", "sepal_width"],
    class_names=["setosa", "versicolor", "virginica"],
    rounded=True,
    filled=True,
    fontsize=6,
)
plt.show()

In [None]:
# Training accuracy of the max_depth=5 tree.
print(
    'train accuracy:',
    accuracy_score(
        sepal_decision_tree_model_maxdepth.predict(train_iris_data[["sepal_length", "sepal_width"]]),
        train_iris_data["species"],
    ),
)

In [None]:
# Validation accuracy of the max_depth=5 tree. The score is computed on
# valid_iris_data, so it is labelled as validation (not train) accuracy.
print('validation accuracy:', accuracy_score(sepal_decision_tree_model_maxdepth.predict(valid_iris_data[["sepal_length", "sepal_width"]]), 
                     valid_iris_data["species"]))

We can also use these hyperparameters together and/or with other hyperparameters.

<br>
<hr>
<br>
Let's see how different the decision trees could have been if the data had been slightly different.

The code below generates new train-validation splits (by shuffling the data) and then fits Decision Trees on the training data. It does this 10 times.

In [None]:
# Fit ten sepal trees, each on a fresh random 110-row training split, keeping
# every model together with the training rows it saw.
ten_decision_tree_models = []
ten_training_sets = []
for i in range(10):
    current_model = tree.DecisionTreeClassifier(criterion="entropy")
    # Shuffle once and slice by position: the same 110/40 partition np.split(..., [110])
    # would give, without relying on the deprecated DataFrame.swapaxes.
    shuffled_iris_data = iris_data.sample(frac=1)
    temp_iris_training_data = shuffled_iris_data.iloc[:110]
    temp_iris_test_data = shuffled_iris_data.iloc[110:]
    # Sort by species so plot colours match the earlier figures.
    temp_iris_training_data = temp_iris_training_data.sort_values("species")
    current_model.fit(temp_iris_training_data[["sepal_length", "sepal_width"]], temp_iris_training_data["species"])
    ten_decision_tree_models.append(current_model)
    ten_training_sets.append(temp_iris_training_data)

In [None]:
def plot_decision_tree(decision_tree_model, data = None, disable_axes = False,
                       x = "sepal_length", y = "sepal_width",
                       x_range = (4, 8), y_range = (1.9, 4.5), step = 0.02):
    """Draw the decision regions of a fitted two-feature classifier on the current axes.

    Parameters
    ----------
    decision_tree_model : fitted estimator
        Must expose ``predict`` and accept a two-column array of (x, y) points.
    data : DataFrame or None
        If given, its rows are scattered on top of the regions, coloured by
        the ``species`` column.
    disable_axes : bool
        If True, hide the axes (useful for small multiples).
    x, y : str
        Column names in ``data`` used for the scatter overlay.
    x_range, y_range : (start, stop)
        Extent of the prediction grid along each feature.
    step : float
        Grid spacing along both features.

    Returns
    -------
    None
    """
    from matplotlib.colors import ListedColormap
    # Colormap built from the first three colours of the active seaborn palette.
    sns_cmap = ListedColormap(np.array(sns.color_palette())[0:3, :])

    xx, yy = np.meshgrid(np.arange(x_range[0], x_range[1], step),
                         np.arange(y_range[0], y_range[1], step))

    # Predict a label for every grid point, then turn labels into integer codes
    # because contourf needs numeric values.
    Z_string = decision_tree_model.predict(np.c_[xx.ravel(), yy.ravel()])
    _, Z_int = np.unique(Z_string, return_inverse=True)
    Z_int = Z_int.reshape(xx.shape)
    plt.contourf(xx, yy, Z_int, cmap=sns_cmap)
    if data is not None:
        sns.scatterplot(data = data, x = x, y = y, hue="species", legend=False)

    if disable_axes:
        plt.axis("off")

Let's visualize the decision boundaries for these ten Decision Trees.

In [None]:
# Decision regions of the first resampled tree, with its own training rows overlaid.
m_num = 0
plot_decision_tree(
    ten_decision_tree_models[m_num],
    ten_training_sets[m_num],
)

In [None]:
# Decision regions of the resampled tree at index 7, with its own training rows overlaid.
m_num = 7
plot_decision_tree(
    ten_decision_tree_models[m_num],
    ten_training_sets[m_num],
)

In [None]:
# 3x3 grid of decision regions for the first nine resampled trees (no points, no axes).
import matplotlib.gridspec as gridspec

gs1 = gridspec.GridSpec(3, 3)
# Nearly touching panels.
gs1.update(wspace=0.025, hspace=0.025)

for i in range(9):
    plt.subplot(gs1[i])
    plot_decision_tree(ten_decision_tree_models[i], None, True)

What do we do?
* Idea 1: Bagging or Bootstrap Aggregating (Leo Breiman).
* Idea 2: Only use a random subset of m features at each split.

# Random Forests 

Important arguments ([sklearn RandomForestClassifier documentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)):
* `n_estimators, default = 100`
* `bootstrap, default = True`
* `max_features, default = 'sqrt'`
* `max_depth, default = None` (same as `DecisionTreeClassifier`)
* `min_samples_split, default = 2` (same as `DecisionTreeClassifier`)

In [None]:
from sklearn import ensemble

# Random forest on the sepal features, using the same depth cap and minimum split size tried above.
sepal_random_forest_model = ensemble.RandomForestClassifier(
    criterion="entropy",
    max_depth=5,
    min_samples_split=10,
).fit(
    train_iris_data[["sepal_length", "sepal_width"]],
    train_iris_data["species"],
)
sepal_random_forest_model

In [None]:
# Training accuracy of the random forest.
print(
    'train accuracy:',
    accuracy_score(
        sepal_random_forest_model.predict(train_iris_data[["sepal_length", "sepal_width"]]),
        train_iris_data["species"],
    ),
)

In [None]:
# Validation accuracy of the random forest. The forest itself is scored here,
# not the earlier min_samples_split decision tree.
print('validation accuracy:', accuracy_score(sepal_random_forest_model.predict(valid_iris_data[["sepal_length", "sepal_width"]]),
                                             valid_iris_data["species"]))