In [2]:
%matplotlib inline

import os # locating directories

import numpy as np   # Arrays
import pandas as pd  # DataFrames

# Plotting
import matplotlib
import matplotlib.pyplot as plt
# Raise the size cap for animations embedded in the notebook.
# NOTE(review): matplotlib documents animation.embed_limit in MB (default 20),
# so 30000000.0 effectively removes the cap — confirm this was not meant as bytes.
matplotlib.rcParams['animation.embed_limit'] = 30000000.0
# Render inline figures at 120 dots per inch.
plt.rcParams['figure.dpi'] = 120
# Import seaborn and immediately call sns.set() with its default arguments.
import seaborn as sns; sns.set()

from IPython.display import Image                # displaying .png images
from sklearn.preprocessing import StandardScaler # scaling features
from sklearn.preprocessing import LabelEncoder   # binary encoding
import random

# Figures for the exercises are read from an "Images" folder that sits in the
# directory the notebook is run from.
working_dir = os.getcwd()
image_dir = os.path.join(working_dir, "Images")

In [3]:
# Per-species plotting styles, keyed by species name.
species_names = ("Adelie", "Chinstrap", "Gentoo")

# Hex colour for each species.
col_dict = dict(zip(species_names, ("#ff7600", "#c65dcb", "#057576")))

# Single-character shape code for each species
# (presumably matplotlib marker codes — confirm against the plotting code).
shape_dict = dict(zip(species_names, ("o", "s", "X")))

# Registry of every dataset built in this cell, keyed by a short name.
datasets = dict()

# Load seaborn's "penguins" example data ...
penguins = sns.load_dataset("penguins")

# ... and keep only complete rows: any row containing a missing value is dropped.
penguins_rm = penguins.dropna(axis=0, how="any")

# Categorical-only binary dataset (Adelie vs. Gentoo): island and sex are the
# features, species is the target.
penguins_cat = penguins_rm[["island", "sex", "species"]]

# Dedicated names are used for this subset so it is not confused with the
# continuous-feature `penguins_bin` / `y_bin` / `penguins_class_feat` that are
# assigned later in this cell.
penguins_cat_bin = penguins_cat[penguins_cat["species"] != "Chinstrap"]

# Encode the target with Series.map rather than DataFrame.replace: swapping
# strings for ints through replace() depends on pandas' deprecated silent
# downcasting to end up with an integer array. The explicit int64 cast also
# fails loudly if a species is ever missing from the mapping.
y_cat_bin = (penguins_cat_bin["species"]
             .map({'Adelie': 0, 'Gentoo': 1})
             .astype("int64")
             .to_numpy())

# species is the target, so it is left out of the feature frame
penguins_cat_feat = penguins_cat_bin.drop("species", axis=1)

datasets['cat'] = {"df": penguins_cat_bin,
                   "X": penguins_cat_feat[["island", "sex"]].values,
                   "y": y_cat_bin,
                   "feats": ["island", "sex"],
                   "class": ['Adelie', 'Gentoo']}


# Keep only the continuous measurements by dropping the categorical columns.
penguins_cont = penguins_rm.drop(columns=["island", "sex", "species"])

# body_mass_g is the regression target, so it is removed from the features.
penguins_reg_feat = penguins_cont.drop(columns=["body_mass_g"])

# Single-column frame holding the target; flattened to 1-D per dataset below.
body_mass_frame = penguins_rm[["body_mass_g"]]

# Regression on every continuous feature.
datasets['reg_full'] = dict(
    df=penguins_cont,
    X=penguins_reg_feat.values,
    y=body_mass_frame.values.flatten(),
    feats=penguins_reg_feat.columns.tolist(),
)

# Regression dataset that only compares flipper length to body mass.
datasets['flbm'] = dict(
    df=penguins_cont,
    X=penguins_reg_feat[["flipper_length_mm"]].values,
    y=body_mass_frame.values.flatten(),
    feats=["Flipper Length (mm)"],
)

# Multiclass classification dataset: the continuous measurements are the
# features and all three species form the target.
penguins_class = penguins_rm.drop(["island", "sex"], axis=1)

# species is the target, so it is left out of the feature frame. A dedicated
# name is used because `penguins_class_feat` is assigned elsewhere in this cell
# for the two-species data.
penguins_multi_feat = penguins_class.drop("species", axis=1)

# Encode the target with Series.map rather than DataFrame.replace: swapping
# strings for ints through replace() depends on pandas' deprecated silent
# downcasting to end up with an integer array. The explicit int64 cast also
# fails loudly if a species is ever missing from the mapping.
y_multi = (penguins_class["species"]
           .map({'Adelie': 0, 'Gentoo': 1, "Chinstrap": 2})
           .astype("int64")
           .to_numpy())

# The "class" list is ordered so that position i names the species encoded as i.
datasets['multi'] = {"df": penguins_class,
                     "X": penguins_multi_feat.values,
                     "y": y_multi,
                     "feats": list(penguins_multi_feat.columns),
                     "class": ['Adelie', 'Gentoo', "Chinstrap"]}


# Binary classification dataset (Adelie vs. Gentoo) on the continuous
# measurements: Chinstrap rows are filtered out of the multiclass frame.
penguins_bin = penguins_class[penguins_class["species"] != "Chinstrap"]

# Encode the target with Series.map rather than DataFrame.replace: swapping
# strings for ints through replace() depends on pandas' deprecated silent
# downcasting to end up with an integer array. The explicit int64 cast also
# fails loudly if a species is ever missing from the mapping.
y_bin = (penguins_bin["species"]
         .map({'Adelie': 0, 'Gentoo': 1})
         .astype("int64")
         .to_numpy())

# species is the target, so it is left out of the feature frame
penguins_class_feat = penguins_bin.drop("species", axis=1)

# The "class" list is ordered so that position i names the species encoded as i.
datasets['bin'] = {"df": penguins_bin,
                   "X": penguins_class_feat.values,
                   "y": y_bin,
                   "feats": list(penguins_class_feat.columns),
                   "class": ['Adelie', 'Gentoo']}

# Two-feature views of the Adelie/Gentoo data. Each entry shares the frame and
# target of the binary dataset and differs only in the pair of columns used.
for pair_key, pair_cols in (
    ("flbl", ["flipper_length_mm", "bill_length_mm"]),
    ("blbd", ["bill_length_mm", "bill_depth_mm"]),
):
    datasets[pair_key] = {
        "df": penguins_bin,
        "X": penguins_class_feat[pair_cols].values,
        "y": y_bin,
        "feats": pair_cols,
        "class": ['Adelie', 'Gentoo'],
    }

1. Using Figure X, classify the following penguins as either an Adelie or a Gentoo penguin.

| species | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g |
|---------|----------------|---------------|-------------------|-------------|
| ?       | 49.1           | 14.8          | 220.0             | 5150.0      |
| ?       | 37.7           | 19.8          | 198.0             | 3500.0      |

In [None]:
# Show the image file used as "Figure X" in exercise 1.
figure_x_path = os.path.join(image_dir, "Figurex.png")
display(Image(figure_x_path))

In [4]:
# 1
# Draw two rows (fixed seed, so the same rows every run) from the binary
# dataset's frame and display them.
bin_sample = datasets['bin']['df'].sample(n=2, random_state=42)
display(bin_sample)

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
254,Gentoo,49.1,14.8,220.0,5150.0
121,Adelie,37.7,19.8,198.0,3500.0


2. Using Figure Y, classify the following penguins as either an Adelie, Gentoo, or Chinstrap penguin.

species | bill_length_mm |	bill_depth_mm |	flipper_length_mm |	body_mass_g |
-       |-               |-               | -                 | -           |
?       |	35.2	     |15.9	          |186.0	          |3050.0       |
?       |   51.3	     |18.2	          |197.0	          |3750.0       |

In [None]:
# Show the image file used as "Figure Y" in exercise 2.
figure_y_path = os.path.join(image_dir, "Figurey.png")
display(Image(figure_y_path))

In [5]:
# Draw two rows (fixed seed, so the same rows every run) from the multiclass
# dataset's frame and display them.
multi_sample = datasets['multi']['df'].sample(n=2, random_state=3)
display(multi_sample)

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
124,Adelie,35.2,15.9,186.0,3050.0
159,Chinstrap,51.3,18.2,197.0,3750.0


__TODO__
- write more exercises

In [1]:
# Convert to pdf without code showing
# NOTE(review): the saved output of this cell reports converting ./Exercises.ipynb,
# not ./Trees_Exercise_Answers.ipynb — confirm the filename below is still current.
!jupyter nbconvert ./Trees_Exercise_Answers.ipynb --to=pdf --TemplateExporter.exclude_input=True

[NbConvertApp] Converting notebook ./Exercises.ipynb to pdf
[NbConvertApp] Support files will be in Exercises_files\
[NbConvertApp] Making directory .\Exercises_files
[NbConvertApp] Making directory .\Exercises_files
[NbConvertApp] Writing 46613 bytes to notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', 'notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', 'notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 89517 bytes to Exercises.pdf
