# Exercise 8.9 {.unnumbered}
> This problem involves the `OJ` data set which is part of the ISLP
package.


In [None]:
#| echo: False
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the OJ purchase records from the local CSV copy and preview the first rows.
oj = pd.read_csv(filepath_or_buffer="data/OJ.csv")
oj.head(n=5)

## (a)
> Create a training set containing a random sample of 800 observations, and a test set containing the remaining observations.


In [None]:
from sklearn.model_selection import train_test_split

# Predictors: every column except the response, with the Yes/No Store7 flag
# recoded to 1/0 so the tree receives numeric input only.
X = oj.drop(columns="Purchase").replace({"Store7": {"Yes": 1, "No": 0}})
# Response: 1 = CH, 0 = MM.
y = oj["Purchase"].replace({"CH": 1, "MM": 0})

# The exercise asks for exactly 800 training observations. An integer
# train_size is an absolute number of rows, whereas a float such as 0.8 is
# interpreted as a proportion of the data set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, train_size=800
)

## (b)
> Fit a tree to the training data, with Purchase as the response and the other variables as predictors. What is the training error rate?


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seeded classification tree limited to at most five terminal nodes.
clf = DecisionTreeClassifier(random_state=0, max_leaf_nodes=5)
clf.fit(X_train, y_train)

# The question asks for the *training error rate*. `score` returns mean
# accuracy, so evaluate it on the training split and take the complement.
train_error_rate = 1 - clf.score(X_train, y_train)
train_error_rate

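## (c)
> Create a plot of the tree, and interpret the results. How many terminal nodes does the tree have?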

In [None]:
from sklearn.tree import plot_tree

# Draw on explicitly sized axes and label splits and leaves with the real
# column / class names; without them the plot only shows generic x[i]
# placeholders and cannot be interpreted.
_, ax = plt.subplots(figsize=(12, 6))
plot_tree(
    clf,
    feature_names=list(X_train.columns),
    class_names=["MM", "CH"],  # ascending label order: 0 = MM, 1 = CH
    filled=True,
    ax=ax,
)

The tree has $5$ terminal nodes (`clf.get_n_leaves()`). 


In [None]:
# Number of terminal nodes (leaves) of the fitted tree.
n_terminal_nodes = clf.get_n_leaves()
n_terminal_nodes

## (e)
> Predict the response on the test data, and produce a confusion matrix comparing the test labels to the predicted test labels. What is the test error rate?


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of test labels versus the tree's predictions on the test split.
ConfusionMatrixDisplay.from_estimator(
    clf, X_test, y_test, display_labels=["MM", "CH"]  # 0 = MM, 1 = CH
)

# The question also asks for the test error rate: the complement of the
# mean accuracy that `score` reports on the test split.
test_error_rate = 1 - clf.score(X_test, y_test)
test_error_rate

In [None]:
from sklearn.metrics import RocCurveDisplay

# ROC curve of the single tree, evaluated on the test split.
RocCurveDisplay.from_estimator(estimator=clf, X=X_test, y=y_test)

In [None]:
# NOTE(review): this cell repeats the ROC cell directly above it verbatim
# (same import, same estimator, same data) and draws an identical figure;
# it looks like a leftover copy and can probably be deleted.
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_estimator(clf, X_test, y_test)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Fit three reference models on the same training split. The ensembles are
# seeded (like the tree and the split) so that a rerun reproduces the table.
clf_boost = GradientBoostingClassifier(random_state=0)
clf_boost.fit(X_train, y_train)
clf_rf = RandomForestClassifier(random_state=0)
clf_rf.fit(X_train, y_train)
clf_logit = LogisticRegression()
clf_logit.fit(X_train, y_train)

# Overlay the test-set ROC curves of all four models on shared axes.
_, ax = plt.subplots()
RocCurveDisplay.from_estimator(clf_boost, X_test, y_test, ax=ax)
RocCurveDisplay.from_estimator(clf, X_test, y_test, ax=ax)
RocCurveDisplay.from_estimator(clf_rf, X_test, y_test, ax=ax)
RocCurveDisplay.from_estimator(clf_logit, X_test, y_test, ax=ax)

from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score


def loss(model):
    """Return the negated test-set log loss of `model` (higher is better)."""
    return -log_loss(y_test, model.predict_proba(X_test))


def aucl(model):
    """Return the test-set ROC AUC of `model`, using the second `predict_proba` column."""
    return roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])


def accl(model):
    """Return the test-set accuracy of `model`."""
    return accuracy_score(y_test, model.predict(X_test))


# Row label -> fitted model, in the order the rows appear in the table.
models = {
    "boost": clf_boost,
    "random forest": clf_rf,
    "logit": clf_logit,
    "tree": clf,
}

# The first column holds the *negated* log loss, so it is labelled as such.
pd.DataFrame(
    {
        "neg_log_loss": [loss(model) for model in models.values()],
        "AUC": [aucl(model) for model in models.values()],
        "accuracy": [accl(model) for model in models.values()],
    },
    index=list(models),
)

## (f)
> Use cross-validation on the training set in order to determine the optimal tree size.


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate tree sizes: 2 to 19 terminal nodes.
leaf_grid = {"max_leaf_nodes": range(2, 20)}

# Five-fold search that records both metrics and refits the final tree on
# the size with the best negative log loss.
clf_cv = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=0),
    param_grid=[leaf_grid],
    scoring=["neg_log_loss", "accuracy"],
    refit="neg_log_loss",
    cv=5,
)

clf_cv.fit(X_train, y_train)
clf_cv.score(X_test, y_test)

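***Note:*** With multi-metric scoring, `clf_cv.score` uses the scorer named by `refit` (here `neg_log_loss`), so the value above is a negative log loss, not an accuracy. To get the test accuracy we need to compute it explicitly: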


In [None]:
from sklearn.metrics import accuracy_score

# accuracy_score takes (y_true, y_pred): pass the true test labels first
# and the predictions of the refitted best tree second.
accuracy_score(y_test, clf_cv.predict(X_test))

## (g)
> Produce a plot with tree size on the $x$-axis and cross-validated log loss on the $y$-axis.


In [None]:
# Use dedicated names rather than `x` / `y`: `y` holds the response vector
# that the train/test split is built from, and rebinding it here would
# silently change what a re-run of the split cell sees.
leaf_counts = [params["max_leaf_nodes"] for params in clf_cv.cv_results_["params"]]
cv_neg_log_loss = clf_cv.cv_results_["mean_test_neg_log_loss"]
plt.plot(leaf_counts, cv_neg_log_loss)
plt.xlabel("Maximal number of leaf nodes")
plt.ylabel("Cross-validated negative log loss")

## (h)

In [None]:
# Mean cross-validated accuracy for each candidate tree size.
cv_table = clf_cv.cv_results_
x_acc = [grid_point["max_leaf_nodes"] for grid_point in cv_table["params"]]
y_acc = cv_table["mean_test_accuracy"]
plt.plot(x_acc, y_acc)
plt.xlabel("Maximal number of leaf nodes")
plt.ylabel("Cross-validated accuracy score")

## (i)

In [None]:
# NOTE(review): this cell is a verbatim copy of the accuracy plot under
# section (h); it redraws the same curve and adds no new result for (i).
# Tree size (the single grid parameter) for each cross-validation candidate.
x_acc = [list(x.values())[0] for x in clf_cv.cv_results_["params"]]
# Mean accuracy across the CV folds for each candidate.
y_acc = clf_cv.cv_results_["mean_test_accuracy"]
plt.plot(x_acc,y_acc)
plt.xlabel("Maximal number of leaf nodes")
plt.ylabel("Cross-validated accuracy score")

## (j)
> Produce a pruned tree corresponding to the optimal tree size obtained using cross-validation. If cross-validation does not lead to selection of a pruned tree, then create a pruned tree with five terminal nodes.

[Cost-complexity pruning](https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html)


In [None]:
# Cost-complexity pruning path computed on the training split; keep the
# alpha values it reports.
path = clf.cost_complexity_pruning_path(X=X_train, y=y_train)
ccp_alphas = path["ccp_alphas"]

## (with pipelines)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import TargetEncoder
from sklearn.compose import ColumnTransformer

# Start again from the raw frame: here the Store7 encoding happens inside
# the pipeline, and the response keeps its original CH / MM labels.
X = oj.drop(columns="Purchase")
y = oj["Purchase"]

# The exercise asks for exactly 800 training observations. An integer
# train_size is an absolute number of rows, whereas a float such as 0.8 is
# interpreted as a proportion of the data set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, train_size=800
)

# NOTE: `clf` is rebound here from the bare tree to a Pipeline
# (one-hot encode Store7, pass the other columns through, then fit a tree).
clf = make_pipeline(
    ColumnTransformer(
        [("Store7", OneHotEncoder(), ["Store7"])],
        remainder="passthrough",
    ),
    DecisionTreeClassifier(random_state=0),
)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

## (c)
> Create a plot of the tree, and interpret the results. How many terminal nodes does the tree have?


In [None]:
# Pull the fitted tree out of the pipeline by its step name and plot it.
tree_step = clf.named_steps["decisiontreeclassifier"]
plot_tree(tree_step)

In [None]:
# `clf` is a Pipeline at this point and has no `get_n_leaves` method, so
# ask the fitted tree step itself for its number of terminal nodes.
clf.named_steps["decisiontreeclassifier"].get_n_leaves()

## (e)
> Predict the response on the test data, and produce a confusion matrix comparing the test labels to the predicted test labels. What is the test error rate?
