# Palmer Penguins Modeling

Import the Palmer Penguins dataset and print out the first few rows.

Suppose we want to predict `species` using the other variables in the dataset.

**Dummify** all variables that require this.

In [2]:
!pip install palmerpenguins


Collecting palmerpenguins
  Downloading palmerpenguins-0.1.4-py3-none-any.whl.metadata (2.0 kB)
Downloading palmerpenguins-0.1.4-py3-none-any.whl (17 kB)
Installing collected packages: palmerpenguins
Successfully installed palmerpenguins-0.1.4


In [3]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from palmerpenguins import load_penguins
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score

In [4]:
penguins = load_penguins().dropna()

penguins['gentoo'] = penguins['species'] == "Gentoo"

features = penguins.drop(columns=['gentoo'])
target = penguins['gentoo']

categorical_features = ['species', 'island', 'sex']
numeric_features = features.drop(columns=categorical_features).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ]
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('logistic_regression', LogisticRegression(max_iter=200))
])

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

y_train_pred = pipeline.predict(X_train)
train_accuracy1 = accuracy_score(y_train, y_train_pred)

print("Training Accuracy:", train_accuracy1)


Training Accuracy: 1.0


Let's use the other variables to predict `species`. Prepare your data and fit the following models on the entire dataset:

* Two kNN models (for different values of K)
* Two decision tree models (for different complexities of trees)

Compute the following, for each of your models, on test data. Keep in mind that you may need to stratify your creation of the training and test data.

* Confusion matrix
* Overall Accuracy
* Precision, Recall, AUC, and F1-score for each species

Create one ROC plot for the species of your choice.

In [None]:
penguins = load_penguins().dropna()

features = penguins.drop(columns=['species'])
target = penguins['species']


categorical_features = ['island', 'sex']
numeric_features = features.drop(columns=categorical_features).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ]
)

pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('knn', KNeighborsClassifier(n_neighbors= 3))
    ])

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

y_train_pred = pipeline.predict(X_train)
train_accuracy2 = accuracy_score(y_train, y_train_pred)

print("Training Accuracy:", train_accuracy2)


Training Accuracy: 0.9887218045112782


In [None]:
penguins = load_penguins().dropna()

features = penguins.drop(columns=['species'])
target = penguins['species']


categorical_features = ['island', 'sex']
numeric_features = features.drop(columns=categorical_features).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ]
)



pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('knn', KNeighborsClassifier(n_neighbors= 15))
    ])

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

y_train_pred = pipeline.predict(X_train)
train_accuracy3 = accuracy_score(y_train, y_train_pred)

print("Training Accuracy:", train_accuracy3)

Training Accuracy: 0.9962406015037594


In [None]:
from palmerpenguins import load_penguins
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

penguins = load_penguins().dropna()

features = penguins.drop(columns=['species'])
target = penguins['species']

categorical_features = ['island', 'sex']
numeric_features = features.drop(columns=categorical_features).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ]
)

tree_depths = [3, 5]

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

for depth in tree_depths:
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('decision_tree', DecisionTreeClassifier(max_depth=depth, random_state=42))
    ])

    pipeline.fit(X_train, y_train)

    y_train_pred = pipeline.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)

    y_test_pred = pipeline.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    print(f"Decision Tree Model with max_depth={depth}")
    print(f"Training Accuracy: {train_accuracy}")
    print(f"Test Accuracy: {test_accuracy}\n")


Decision Tree Model with max_depth=3
Training Accuracy: 0.981203007518797
Test Accuracy: 0.9701492537313433

Decision Tree Model with max_depth=5
Training Accuracy: 1.0
Test Accuracy: 0.9850746268656716

