In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import sklearn.tree
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from dtreeviz.trees import dtreeviz

Using make_blobs to generate some data for classification:
https://docs.w3cub.com/scikit_learn/modules/generated/sklearn.datasets.make_blobs

In [None]:
# Generate 300 synthetic, labelled points for the classification demo.
# cluster_std=2 widens each blob relative to make_blobs' default of 1.0;
# random_state pins the draw so the notebook is reproducible.
x, y = make_blobs(
    n_samples=300,
    cluster_std=2,
    random_state=0,
)

In [None]:
# Collect the two feature columns and the class label in a single frame so
# the plotting and splitting cells can refer to columns by name.
df = pd.DataFrame(x, columns=['x0', 'x1']).assign(y=y)

In [None]:
# Scatter of the raw points, one colour per class label.
sns.scatterplot(
    x='x0', y='x1', hue='y',
    palette=['red', 'green', 'blue'],
    data=df,
)

## Train/test split

In [None]:
# Hold out part of the data for evaluation. No test_size is given, so
# scikit-learn's documented default (25% test) applies; random_state fixes
# which rows land in each part.
x_train, x_test, y_train, y_test = train_test_split(
    df[['x0', 'x1']],
    df['y'],
    random_state=0,
)

## First, Logistic Regression

In [None]:
# Create the classifier with default settings and fit it on the training
# split; fit() returns the estimator itself, so the two steps can be chained.
logreg_clf = LogisticRegression().fit(x_train, y_train)
logreg_clf

In [None]:
# Shade the feature plane by the class the logistic-regression model
# predicts, then overlay the labelled points on top of it.
# The frame passed as X determines the extent of the shaded grid.
logreg_boundary = DecisionBoundaryDisplay.from_estimator(
    logreg_clf,
    df[['x0', 'x1']],
    response_method="predict",
    cmap="RdBu",
    alpha=0.5,
)

# Draw on the display's own axes instead of relying on pyplot's implicit
# "current axes", so the overlay cannot end up on a different figure.
sns.scatterplot(
    data=df,
    x='x0',
    y='x1',
    hue='y',
    palette=['red', 'green', 'blue'],
    ax=logreg_boundary.ax_,
)

logreg_boundary.ax_.set_title("Logistic regression: decision regions")
# Place the legend outside the plot area; the trailing ';' hides the repr.
logreg_boundary.ax_.legend(bbox_to_anchor=(1.05, 1), loc='upper left');

In [None]:
# score() reports the mean accuracy of the classifier on the held-out split.
test_score = logreg_clf.score(X=x_test, y=y_test)
print(f"Accuracy of Logistic Regression: {test_score:.2f}")

## Decision Tree

In [None]:
# A shallow tree (at most two levels of splits) keeps the model easy to
# read and visualise. Drop max_depth to grow an unrestricted tree instead.
# random_state is fixed because scikit-learn permutes the features at every
# split, so an unseeded tree is not guaranteed to be identical across runs.
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=0)

In [None]:
# Fit the decision tree on the training split only; the test split stays
# unseen until the accuracy check.
tree_clf.fit(X=x_train, y=y_train)

In [None]:
# Shade the feature plane by the class the decision tree predicts, then
# overlay the labelled points on top of it.
# The frame passed as X determines the extent of the shaded grid.
tree_boundary = DecisionBoundaryDisplay.from_estimator(
    tree_clf,
    df[['x0', 'x1']],
    response_method="predict",
    cmap="RdBu",
    alpha=0.5,
)

# Draw on the display's own axes instead of relying on pyplot's implicit
# "current axes", so the overlay cannot end up on a different figure.
sns.scatterplot(
    data=df,
    x='x0',
    y='x1',
    hue='y',
    palette=['red', 'green', 'blue'],
    ax=tree_boundary.ax_,
)

tree_boundary.ax_.set_title("Decision tree: decision regions")
# Place the legend outside the plot area; the trailing ';' hides the repr.
tree_boundary.ax_.legend(bbox_to_anchor=(1.05, 1), loc='upper left');

In [None]:
# Print the fitted tree as indented if/else rules. Passing feature_names
# labels the splits "x0"/"x1" (instead of the generic feature_0/feature_1),
# matching the names used in the tree plots.
text_representation = sklearn.tree.export_text(
    tree_clf,
    feature_names=['x0', 'x1'],
)
print(text_representation)

In [None]:
# Show the class labels the fitted tree knows about; the same attribute is
# used to build class_names for the tree plots and to index the
# predict_proba bar chart.
tree_clf.classes_

In [None]:
# Draw the fitted tree with matplotlib on a large canvas. Class labels are
# converted to strings for display, and filled=True colours the nodes.
plt.figure(figsize=(12, 8))
sklearn.tree.plot_tree(
    tree_clf,
    filled=True,
    feature_names=['x0', 'x1'],
    class_names=list(map(str, tree_clf.classes_)),
);

In [None]:
# Visualise the tree with dtreeviz, showing how the data is distributed at
# each split. No plt.figure() call is made here: dtreeviz returns its own
# rendered object rather than drawing on the current matplotlib figure, so
# a pre-created figure would only show up as an empty plot.
# NOTE(review): this is the legacy `dtreeviz.trees.dtreeviz` entry point;
# newer dtreeviz releases favour `dtreeviz.model(...).view()` - confirm
# against the installed version.
dtreeviz(
    tree_clf,
    df[['x0', 'x1']],
    df['y'],
    target_name="y",
    feature_names=['x0', 'x1'],
    # String labels, consistent with the plot_tree cell.
    class_names=[str(label) for label in tree_clf.classes_],
)

In [None]:
# Mean accuracy of the decision tree on the held-out split. This reuses
# (overwrites) the test_score name from the logistic-regression section.
test_score = tree_clf.score(X=x_test, y=y_test)
print(f"Accuracy of Decision Tree: {test_score:.2f}")

### How to make predictions?

In [None]:
# A single new observation, using the same column names as the training
# features so the fitted tree can consume it directly.
sample = pd.DataFrame({"x0": [0.], "x1": [1]})

# predict() returns an array with one class label per input row.
predicted_label = tree_clf.predict(sample)
print('Predicted Class:', predicted_label)

In [None]:
# One row per sample, one column per class (ordered as tree_clf.classes_).
# For a decision tree these are the class fractions in the leaf that the
# sample falls into.
y_pred_proba = tree_clf.predict_proba(X=sample)
print(y_pred_proba)

In [None]:
# y_pred_proba[0] holds the per-class probabilities for the single sample
# row; index them by class label so the bars are labelled correctly.
sample_proba = pd.Series(y_pred_proba[0], index=tree_clf.classes_)

# NOTE: despite its name, y_proba_class_0 is the matplotlib Axes returned by
# plot.bar(), not a set of probabilities. The name is kept so that any later
# reference to it keeps working.
y_proba_class_0 = sample_proba.plot.bar()

# Label the chart so it can be read on its own; ';' hides the repr.
y_proba_class_0.set(
    xlabel="class",
    ylabel="predicted probability",
    title="Decision tree class probabilities for the sample",
);

In [None]:
# TODO: print the class percentages for the max_depth=2 tree as a check of the probabilities above


## Exercise

Adapt the above to do Logistic Regression and Decision Tree Classification on last week's classification dataset on breast cancer.
* Use the feature variables "mean radius" and "mean texture"
* Predict the 0 or 1 cancer target variable
* I have included a couple of cells to get you started.

In [None]:
import sklearn.datasets
import sklearn.model_selection

# Load the breast-cancer data as pandas objects (as_frame=True) and as a
# separate (features, target) pair (return_X_y=True).
# Note: this rebinds x and y, which held the make_blobs arrays above.
x, y = sklearn.datasets.load_breast_cancer(
    as_frame=True,
    return_X_y=True,
)

In [None]:
# Attach the label to the feature frame (in place) so correlations and
# hue-coloured plots can be computed from a single DataFrame.
# From here on x contains the target too, so select only the feature
# columns when training a model.
x['target'] = y

In [None]:
# Correlation of every column with the label (pandas' default is Pearson).
# Relies on the 'target' column having been added to x beforehand.
x.corr().loc[:, 'target']

In [None]:
# The two features used in the exercise, coloured by the 0/1 target.
sns.scatterplot(
    x='mean radius',
    y='mean texture',
    hue='target',
    data=x,
)

In [None]:
# split into training/test sets



In [None]:
# initialize the logistic regression model


In [None]:
# train the logistic regression model


In [None]:
# plot the decision boundary with the data


In [None]:
# print the accuracy of the model


## Decision Tree

In [None]:
# initialize the decision tree classification model


In [None]:
# train the decision tree classification model


In [None]:
# plot the decision boundary with the data


In [None]:
# print out a text representation of the tree


In [None]:
# plot a visualization of the tree


In [None]:
# print the accuracy of the model


In [None]:
# make a sample data point
# predict its class
# and print the calculated prediction probabilities for each class

