In [None]:
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

import seaborn as sns

# Add a Dartmouth-y color theme
import matplotlib as mpl
dartmouth_colors = ["#00693E", "#12312B", "#C3DD88", "#6EAA8D", "#797979", "#EBF3EF"]

# Apply the theme one rc group at a time; each call sets the matching
# "<group>.<key>" entries of matplotlib's global rcParams.
mpl.rc('figure', facecolor="#EBF3EF", figsize=[7.50, 3.50])
mpl.rc('axes',
       prop_cycle=mpl.cycler(color=dartmouth_colors),
       facecolor="#FFFFFF",
       labelcolor='#12312B')
mpl.rc('text', color='#12312B')

%config InlineBackend.figure_formats = ['svg']


In [None]:
# Load the seaborn "penguins" dataset and drop every row with a missing value.
penguins = (
    sns.load_dataset('penguins')
    .dropna()
)
penguins.head()

In [None]:
# Pairwise relationships between all numeric columns, coloured by species.
sns.pairplot(data=penguins, hue='species')

In [None]:
# Body mass against bill length, one colour per species.
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(
    data=penguins,
    x='body_mass_g',
    y='bill_length_mm',
    hue='species',
    ax=ax,
)
ax.set_xlabel('Body mass [g]')
ax.set_ylabel('Bill length [mm]')

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Fit a depth-1 tree on the two features used in the scatter plot.
tree_features = ['body_mass_g', 'bill_length_mm']
tree = DecisionTreeClassifier(max_depth=1).fit(X=penguins[tree_features], y=penguins['species'])

# plot_tree expects class_names in the estimator's own (ascending) class order.
# Series.unique() returns labels in order of first appearance, which is not
# guaranteed to match, so take the labels from the fitted tree instead.
plot_tree(tree, feature_names=tree_features, class_names=list(tree.classes_));


In [None]:
from sklearn.inspection import DecisionBoundaryDisplay
import matplotlib.colors

# Two-colour map built from the first and third theme colours
# (presumably because the tree fitted above only predicts two classes -- confirm).
cmap = matplotlib.colors.ListedColormap([dartmouth_colors[i] for i in (0, 2)])

fig, ax = plt.subplots(figsize=(5, 5))

# Shade the plane by the class the current `tree` predicts.
dbd = DecisionBoundaryDisplay.from_estimator(
    tree,
    penguins[['body_mass_g', 'bill_length_mm']],
    response_method="predict",
    cmap=cmap,
    xlabel='body_mass_g',
    ylabel='bill_length_mm',
    ax=ax,
)

# Overlay the observations on the same axes.
sns.scatterplot(
    data=penguins,
    x='body_mass_g',
    y='bill_length_mm',
    hue='species',
    ax=dbd.ax_,
)
# These labels replace the raw column names passed to the display above.
dbd.ax_.set_xlabel('Body mass [g]')
dbd.ax_.set_ylabel('Bill length [mm]')

In [None]:
# Refit with two levels of splits on the same two features.
tree_features = ['body_mass_g', 'bill_length_mm']
tree = DecisionTreeClassifier(max_depth=2).fit(X=penguins[tree_features], y=penguins['species'])

# plot_tree expects class_names in the estimator's own (ascending) class order.
# Series.unique() returns labels in order of first appearance, which is not
# guaranteed to match, so take the labels from the fitted tree instead.
plot_tree(tree, feature_names=tree_features, class_names=list(tree.classes_));


In [None]:
# Three-colour map built from the first three theme colours.
cmap = matplotlib.colors.ListedColormap(dartmouth_colors[:3])

fig, ax = plt.subplots(figsize=(5, 5))

# Shade the plane by the class the current `tree` predicts.
dbd = DecisionBoundaryDisplay.from_estimator(
    tree,
    penguins[['body_mass_g', 'bill_length_mm']],
    response_method="predict",
    cmap=cmap,
    xlabel='body_mass_g',
    ylabel='bill_length_mm',
    ax=ax,
)

# Overlay the observations on the same axes.
sns.scatterplot(
    data=penguins,
    x='body_mass_g',
    y='bill_length_mm',
    hue='species',
    ax=dbd.ax_,
)
# These labels replace the raw column names passed to the display above.
dbd.ax_.set_xlabel('Body mass [g]')
dbd.ax_.set_ylabel('Bill length [mm]')

In [None]:
# Refit with three levels of splits on the same two features.
tree_features = ['body_mass_g', 'bill_length_mm']
tree = DecisionTreeClassifier(max_depth=3).fit(X=penguins[tree_features], y=penguins['species'])

# plot_tree expects class_names in the estimator's own (ascending) class order.
# Series.unique() returns labels in order of first appearance, which is not
# guaranteed to match, so take the labels from the fitted tree instead.
plot_tree(tree, feature_names=tree_features, class_names=list(tree.classes_));


In [None]:
# Three-colour map built from the first three theme colours.
cmap = matplotlib.colors.ListedColormap(dartmouth_colors[:3])

fig, ax = plt.subplots(figsize=(5, 5))

# Shade the plane by the class the current `tree` predicts.
dbd = DecisionBoundaryDisplay.from_estimator(
    tree,
    penguins[['body_mass_g', 'bill_length_mm']],
    response_method="predict",
    cmap=cmap,
    xlabel='body_mass_g',
    ylabel='bill_length_mm',
    ax=ax,
)

# Overlay the observations on the same axes.
sns.scatterplot(
    data=penguins,
    x='body_mass_g',
    y='bill_length_mm',
    hue='species',
    ax=dbd.ax_,
)
# These labels replace the raw column names passed to the display above.
dbd.ax_.set_xlabel('Body mass [g]')
dbd.ax_.set_ylabel('Bill length [mm]')

In [None]:
# Refit with four levels of splits on the same two features.
tree_features = ['body_mass_g', 'bill_length_mm']
tree = DecisionTreeClassifier(max_depth=4).fit(X=penguins[tree_features], y=penguins['species'])

# plot_tree expects class_names in the estimator's own (ascending) class order.
# Series.unique() returns labels in order of first appearance, which is not
# guaranteed to match, so take the labels from the fitted tree instead.
plot_tree(tree, feature_names=tree_features, class_names=list(tree.classes_));

In [None]:
# Three-colour map built from the first three theme colours.
cmap = matplotlib.colors.ListedColormap(dartmouth_colors[:3])

fig, ax = plt.subplots(figsize=(5, 5))

# Shade the plane by the class the current `tree` predicts.
dbd = DecisionBoundaryDisplay.from_estimator(
    tree,
    penguins[['body_mass_g', 'bill_length_mm']],
    response_method="predict",
    cmap=cmap,
    xlabel='body_mass_g',
    ylabel='bill_length_mm',
    ax=ax,
)

# Overlay the observations on the same axes.
sns.scatterplot(
    data=penguins,
    x='body_mass_g',
    y='bill_length_mm',
    hue='species',
    ax=dbd.ax_,
)
# These labels replace the raw column names passed to the display above.
dbd.ax_.set_xlabel('Body mass [g]')
dbd.ax_.set_ylabel('Bill length [mm]')

In [None]:
# Refit with a depth limit of 20 on the same two features.
tree_features = ['body_mass_g', 'bill_length_mm']
tree = DecisionTreeClassifier(max_depth=20).fit(X=penguins[tree_features], y=penguins['species'])

# plot_tree expects class_names in the estimator's own (ascending) class order.
# Series.unique() returns labels in order of first appearance, which is not
# guaranteed to match, so take the labels from the fitted tree instead.
plot_tree(tree, feature_names=tree_features, class_names=list(tree.classes_));

In [None]:
# Three-colour map built from the first three theme colours.
cmap = matplotlib.colors.ListedColormap(dartmouth_colors[:3])

fig, ax = plt.subplots(figsize=(5, 5))

# Shade the plane by the class the current `tree` predicts.
dbd = DecisionBoundaryDisplay.from_estimator(
    tree,
    penguins[['body_mass_g', 'bill_length_mm']],
    response_method="predict",
    cmap=cmap,
    xlabel='body_mass_g',
    ylabel='bill_length_mm',
    ax=ax,
)

# Overlay the observations on the same axes.
sns.scatterplot(
    data=penguins,
    x='body_mass_g',
    y='bill_length_mm',
    hue='species',
    ax=dbd.ax_,
)
# These labels replace the raw column names passed to the display above.
dbd.ax_.set_xlabel('Body mass [g]')
dbd.ax_.set_ylabel('Bill length [mm]')

In [None]:
# Three reproducibly sampled penguins, restricted to the two plotted features plus the label.
p = penguins[['body_mass_g', 'bill_length_mm', 'species']].sample(3, random_state=1769)

In [None]:
# Display the sampled rows currently held in `p`.
p

In [None]:
# Plot the sampled points without species colouring.
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(data=p, x='body_mass_g', y='bill_length_mm', ax=ax)

# Pin the axes to the range of the full dataset rather than of the sample.
ax.set_xlim(penguins['body_mass_g'].min(), penguins['body_mass_g'].max())
ax.set_ylim(penguins['bill_length_mm'].min(), penguins['bill_length_mm'].max())
ax.set_xlabel('Body mass [g]')
ax.set_ylabel('Bill length [mm]')

In [None]:
# Same sampled points, now coloured by species.
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(data=p, x='body_mass_g', y='bill_length_mm', hue='species', ax=ax)

# Pin the axes to the range of the full dataset rather than of the sample.
ax.set_xlim(penguins['body_mass_g'].min(), penguins['body_mass_g'].max())
ax.set_ylim(penguins['bill_length_mm'].min(), penguins['bill_length_mm'].max())
ax.set_xlabel('Body mass [g]')
ax.set_ylabel('Bill length [mm]')

In [None]:
# Draw a larger reproducible sample (20 rows) and plot it coloured by species.
p = penguins[['body_mass_g', 'bill_length_mm', 'species']].sample(20, random_state=1769)

fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(data=p, x='body_mass_g', y='bill_length_mm', hue='species', ax=ax)

# Pin the axes to the range of the full dataset rather than of the sample.
ax.set_xlim(penguins['body_mass_g'].min(), penguins['body_mass_g'].max())
ax.set_ylim(penguins['bill_length_mm'].min(), penguins['bill_length_mm'].max())
ax.set_xlabel('Body mass [g]')
ax.set_ylabel('Bill length [mm]')

In [None]:
# Hard-threshold (step) activation drawn on a default-sized figure.
fig, ax = plt.subplots()
ax.step([0, 0.5, 1], [0, 0, 1])
ax.set_xlabel('Pseudo-feature')
ax.set_ylabel('Activation')
         

In [None]:
# Logistic sigmoid 1 / (1 + e^-x) evaluated at 30 evenly spaced points on [-6, 6].
x = np.linspace(-6, 6, num=30)
y = 1 / (1 + np.exp(-x))

fig, ax = plt.subplots(figsize=(3, 3))
ax.plot(x, y)
ax.set_xlabel('Pseudo-feature')
ax.set_ylabel('Activation')
         

In [None]:
# Encode categorical variables as their integer category codes
for column in ('sex', 'island'):
    penguins[column] = penguins[column].astype('category').cat.codes

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing. Both frames keep every column
# (features and target); later cells select columns by name.
# The split is seeded so scores, fitted trees and confusion matrices are
# reproducible across kernel restarts; the seed matches the one used for
# sampling elsewhere in this notebook.
X_train, X_test = train_test_split(penguins, test_size=0.1, random_state=1769)




In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

features = ['island', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex']
target = 'species'

# Standardise with statistics learned from the training rows only, then apply
# the same transform to both splits.
# NOTE: the scaled values overwrite the feature columns in place, so running
# this cell a second time would fit the scaler on already-scaled data.
scaler = StandardScaler()
scaler.fit(X_train[features])
for split in (X_train, X_test):
    split[features] = scaler.transform(split[features])

# k-nearest-neighbours classifier with the library's default settings.
knn = KNeighborsClassifier().fit(X=X_train[features], y=X_train[target])

In [None]:
# Score the fitted classifier on the held-out rows.
knn.score(X=X_test[features], y=X_test[target])

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Search grid: neighbourhood sizes 1..10 crossed with both weighting schemes.
candidates = {
    'n_neighbors': list(range(1, 11)),
    'weights': ['uniform', 'distance'],
}

# `knn` is rebound from the plain classifier to the fitted grid search.
knn = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=candidates, verbose=1)
knn.fit(X_train[features], X_train[target])
print(knn.best_params_)

test_predictions = knn.predict(X_test[features])
print(classification_report(X_test[target], test_predictions))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of the tuned KNN model on the held-out rows.
ConfusionMatrixDisplay.from_estimator(estimator=knn, X=X_test[features], y=X_test[target])

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited decision tree trained on the same (scaled) feature columns.
dt = DecisionTreeClassifier(max_depth=5)
dt.fit(X_train[features], X_train[target])

dt_predictions = dt.predict(X_test[features])
print(classification_report(y_true=X_test[target], y_pred=dt_predictions))

In [None]:
# Confusion matrix of the decision tree on the held-out rows.
ConfusionMatrixDisplay.from_estimator(estimator=dt, X=X_test[features], y=X_test[target])

In [None]:
from sklearn.tree import plot_tree

# plot_tree expects class_names in the estimator's own (ascending) class order.
# Series.unique() returns labels in order of first appearance, which is not
# guaranteed to match, so take the labels from the fitted tree instead.
plot_tree(dt, feature_names=features, class_names=list(dt.classes_), filled=True);

In [None]:
feature = 'flipper_length_mm'
# Split threshold in standardised units -- presumably read off the plotted
# tree; it must be re-read whenever the split or the fitted tree changes.
threshold = 0.394

# Take the training rows at or below the threshold, map them back to the
# original units, and report the largest value of the chosen feature.
below_split = X_train.loc[X_train[feature] <= threshold, features]
original_units = scaler.inverse_transform(below_split)
original_units[:, features.index(feature)].max()