# Palmer Penguins Modeling

Import the Palmer Penguins dataset and print out the first few rows.

Suppose we want to predict `species` using the other variables in the dataset.

**Dummify** all variables that require this.

In [1]:
pip install palmerpenguins

Collecting palmerpenguins
  Downloading palmerpenguins-0.1.4-py3-none-any.whl.metadata (2.0 kB)
Downloading palmerpenguins-0.1.4-py3-none-any.whl (17 kB)
Installing collected packages: palmerpenguins
Successfully installed palmerpenguins-0.1.4


In [2]:
import pandas as pd
from palmerpenguins import load_penguins

penguins = load_penguins()
penguins = penguins.dropna()
display(penguins.head())

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007


In [3]:
penguins_dummified = pd.get_dummies(penguins, columns=['species', 'island', 'sex'], drop_first=True)
display(penguins_dummified.head())

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year,species_Chinstrap,species_Gentoo,island_Dream,island_Torgersen,sex_male
0,39.1,18.7,181.0,3750.0,2007,False,False,False,True,True
1,39.5,17.4,186.0,3800.0,2007,False,False,False,True,False
2,40.3,18.0,195.0,3250.0,2007,False,False,False,True,False
4,36.7,19.3,193.0,3450.0,2007,False,False,False,True,False
5,39.3,20.6,190.0,3650.0,2007,False,False,False,True,True


Let's use the other variables to predict `species`. Prepare your data and fit the following models on the entire dataset:

* Two kNN models (for different values of K)
* Two decision tree models (for different complexities of trees)

Compute the following, for each of your models, on test data. Keep in mind that you may need to stratify your creation of the training and test data.

* Confusion matrix
* Overall Accuracy
* Precision, Recall, AUC, and F1-score for each species

Create one ROC plot for the species of your choice.

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X = penguins_dummified.drop(columns=['species_Chinstrap', 'species_Gentoo'])
y = penguins['species']

knn3_scaled = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=3)
)

cv_scores_3_scaled = cross_val_score(knn3_scaled, X, y, cv=5, scoring="accuracy")
np.mean(cv_scores_3_scaled)


np.float64(0.9849841700587969)

In [21]:
precision_macro = cross_val_score(knn3_scaled, X, y,cv=5, scoring="precision_macro").mean()

recall_macro = cross_val_score(knn3_scaled, X, y, cv=5, scoring="recall_macro").mean()

f1score_macro = cross_val_score(knn3_scaled, X, y,cv=5, scoring="f1_macro").mean()

precision_macro, recall_macro, f1score_macro


(np.float64(0.9834432234432235),
 np.float64(0.98075028419856),
 np.float64(0.9817674254688825))

In [20]:
knn10_scaled = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=10)
)

cv_scores_10_scaled = cross_val_score(knn10_scaled, X, y, cv=5, scoring="accuracy")
np.mean(cv_scores_10_scaled)

np.float64(0.9909995477159657)

In [22]:
precision_macro = cross_val_score(knn10_scaled, X, y, cv=5, scoring="precision_macro").mean()

recall_macro    = cross_val_score(knn10_scaled, X, y, cv=5, scoring="recall_macro").mean()

f1score_macro   = cross_val_score(knn10_scaled, X, y, cv=5, scoring="f1_macro").mean()

precision_macro, recall_macro, f1score_macro


(np.float64(0.9911111111111112),
 np.float64(0.9878110395351776),
 np.float64(0.989135869307227))

In [24]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd
from plotnine import ggplot, aes, geom_line, geom_point, labs

tree_model_4 = DecisionTreeClassifier(max_depth=4, random_state=0)

cv_scores_tree = cross_val_score(tree_model, X, y, cv=5, scoring="accuracy")

np.mean(cv_scores_tree)

np.float64(0.9728629579375848)

In [25]:
precision_macro = cross_val_score(tree_model_4, X, y, cv=5, scoring="precision_macro").mean()

recall_macro    = cross_val_score(tree_model_4, X, y, cv=5, scoring="recall_macro").mean()

f1score_macro   = cross_val_score(tree_model_4, X, y, cv=5, scoring="f1_macro").mean()

precision_macro, recall_macro, f1score_macro


(np.float64(0.9629494649461279),
 np.float64(0.9633341754031409),
 np.float64(0.9617784829863577))

In [27]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd
from plotnine import ggplot, aes, geom_line, geom_point, labs

tree_model_8 = DecisionTreeClassifier(max_depth=8, random_state=0)

cv_scores_tree_8 = cross_val_score(tree_model, X, y, cv=5, scoring="accuracy")

np.mean(cv_scores_tree_8)

np.float64(0.9728629579375848)

In [28]:
precision_macro = cross_val_score(tree_model_8, X, y, cv=5, scoring="precision_macro").mean()

recall_macro    = cross_val_score(tree_model_8, X, y, cv=5, scoring="recall_macro").mean()

f1score_macro   = cross_val_score(tree_model_8, X, y, cv=5, scoring="f1_macro").mean()

precision_macro, recall_macro, f1score_macro


(np.float64(0.9703765206490458),
 np.float64(0.970709654330344),
 np.float64(0.9695501685247603))