# Palmer Penguins Modeling

Import the Palmer Penguins dataset and print out the first few rows.

Suppose we want to predict `species` using the other variables in the dataset.

**Dummify** all variables that require this.

In [1]:
from palmerpenguins import load_penguins

dat = load_penguins()

from pandas import get_dummies

df = dat.dropna()

df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007


In [20]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score
import numpy as np
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor


In [15]:
# K-means with 9 nearest neighbors 
X = df.drop(["species"], axis = 1)
y = df["species"]

ct = ColumnTransformer(
  [
    ("dummify", 
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

pipeline_k_means = Pipeline(
  [("preprocessing", ct),
  ("k_means", KNeighborsClassifier(n_neighbors = 9))]
)

scores = cross_val_score(pipeline_k_means, X, y, cv = 5, scoring = 'f1_macro')

mean_RSME = -scores.mean()
mean_RSME

np.float64(-0.9821401872540736)

Let's use the other variables to predict `species`. Prepare your data and fit the following models on the entire dataset:

* Two kNN models (for different values of K)
* Two decision tree models (for different complexities of trees)

Compute the following, for each of your models, on test data. Keep in mind that you may need to stratify your creation of the training and test data.

* Confusion matrix
* Overall Accuracy
* Precision, Recall, AUC, and F1-score for each species

Create one ROC plot for the species of your choice.

In [13]:
# Models
# K-means with 9 nearest neighbors 
X = df.drop(["species"], axis = 1)
y = df["species"]

ct_kmeans = ColumnTransformer(
  [
    ("dummify", 
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

pipeline_k_means5 = Pipeline(
  [("preprocessing", ct_kmeans),
  ("k_means", KNeighborsClassifier(n_neighbors = 9))]
)
pipeline_k_means10 = Pipeline(
  [("preprocessing", ct_kmeans),
  ("k_means", KNeighborsClassifier(n_neighbors = 10))]
)

ct_tree = ColumnTransformer(
  [
    ("dummify", 
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object))
  ],
  remainder = "passthrough"
)

pipeline_tree2 = Pipeline(
  [("preprocessing", ct_tree),
  ("decision_tree", DecisionTreeClassifier(max_depth=2))]
)
pipeline_tree4 = Pipeline(
  [("preprocessing", ct_tree),
  ("decision_tree", DecisionTreeClassifier(max_depth=2))]
)

In [15]:
# Fitting on entire dataset
kmeans5 = pipeline_k_means5.fit(X,y)
kmeans10 = pipeline_k_means10.fit(X,y)

tree2 = pipeline_tree2.fit(X,y)
tree4 = pipeline_tree4.fit(X,y)



In [None]:
# Confusion matrix
y_train_ = pd.Series(kmeans5.predict(X), name = "Predicted")

pd.crosstab(y, y_train_, margins=True)


Predicted,Adelie,Chinstrap,Gentoo,All
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Adelie,139,1,6,146
Chinstrap,0,57,0,57
Gentoo,0,9,110,119
All,139,67,116,322


In [23]:
cv_scores = cross_val_score(pipeline_k_means5, X, y,
                            cv=10, scoring="accuracy")
cv_scores.mean()

np.float64(0.9880570409982173)

In [24]:
precision_0 = cross_val_score(pipeline_k_means5, X, y,
                              cv=10, scoring="precision").mean()

recall_0 = cross_val_score(pipeline_k_means5, X, y,
                           cv=10, scoring="recall").mean()

f1score_0 = cross_val_score(pipeline_k_means5, X, y,
                            cv=10, scoring="f1").mean()

precision_0, recall_0, f1score_0

Traceback (most recent call last):
  File "C:\Users\Eddie\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 140, in __call__
    score = scorer._score(
        cached_call, estimator, *args, **routed_params.get(name).score
    )
  File "C:\Users\Eddie\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "C:\Users\Eddie\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
                ~~~~~~~~~~~~~~~~~~~~^
        estimator, *args, response_method=response_method, **kwargs
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "C:\Users\Eddie\anaconda3\Lib\site-packages\sklearn\utils\_response.py", line 207, in _get_response_values
    raise ValueError(
    ...<2 lines>...
    )
ValueError: pos_label=1 is not a valid label: It should be one of 

(np.float64(nan), np.float64(nan), np.float64(nan))