# Palmer Penguins Modeling

Import the Palmer Penguins dataset and print out the first few rows.

Suppose we want to predict `bill_depth_mm` using the other variables in the dataset.

**Dummify** all variables that require this.

In [65]:
from palmerpenguins import load_penguins

dat = load_penguins()

from pandas import get_dummies

df = dat.dropna()

Let's use the other variables to predict `bill_depth_mm`. Prepare your data and fit the following models on the entire dataset:

* Your best multiple linear regression model from before
* Two kNN models (for different values of K)
* A decision tree model

Create a plot like the right plot of Fig 1. in our `Model Validation` chapter with the training and test error plotted for each of your four models.

Which of your models was best?

In [56]:
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [57]:
y4 = df['bill_depth_mm']
X4 = df[['bill_length_mm', 'flipper_length_mm', 'body_mass_g','species_Chinstrap', 'species_Gentoo', 'sex_female', 'island_Biscoe', 'island_Dream']]

X_train4, X_test4, y_train4, y_test4 = train_test_split(X4, y4, test_size=0.25)

lr = LinearRegression()

lr_fit = lr.fit(X_train4, y_train4)

y_pred_lrfit4_train = lr_fit.predict(X_train4)
mse_train_lrfit4 = mean_squared_error(y_train4, y_pred_lrfit4_train)

y_pred_lrfit4_test = lr_fit.predict(X_test4)
mse_test_lrfit4 = mean_squared_error(y_test4, y_pred_lrfit4_test)

print(mse_train_lrfit4)
print(mse_test_lrfit4)

0.6188735719540988
0.6433718121413136


In [58]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
import numpy as np


In [81]:
# K-means with 9 nearest neighbors 
X = df.drop(["bill_depth_mm"], axis = 1)
y = df["bill_depth_mm"]

ct = ColumnTransformer(
  [
    ("dummify", 
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

pipeline_k_means = Pipeline(
  [("preprocessing", ct),
  ("k_means", KNeighborsRegressor(n_neighbors = 9))]
)

scores = cross_val_score(pipeline_k_means, X, y, cv = 5, scoring = 'neg_root_mean_squared_error')

mean_RSME = -scores.mean()
mean_RSME

np.float64(1.041958096702818)

In [82]:
# K-means with 5 nearest neighbors 
X = df.drop(["bill_depth_mm"], axis = 1)
y = df["bill_depth_mm"]

ct = ColumnTransformer(
  [
    ("dummify", 
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

pipeline_k_means = Pipeline(
  [("preprocessing", ct),
  ("k_means", KNeighborsRegressor(n_neighbors = 5))]
)

scores = cross_val_score(pipeline_k_means, X, y, cv = 5, scoring = 'neg_root_mean_squared_error')

mean_RSME = -scores.mean()
mean_RSME

np.float64(1.044573445081833)

In [60]:
degrees = {'k_means__n_neighbors': [2,3,4,5,6,7,8,9,10,11]}

gscv_ridge = GridSearchCV(pipeline_k_means, degrees, cv = 5, scoring='neg_root_mean_squared_error')

gscv_ridge_fitted = gscv_ridge.fit(X,y)

pd.DataFrame(data = {"degrees": [2,3,4,5,6,7,8,9,10,11], "scores": gscv_ridge_fitted.cv_results_['mean_test_score']})

Unnamed: 0,degrees,scores
0,2,-1.166097
1,3,-1.163471
2,4,-1.125543
3,5,-1.097845
4,6,-1.086308
5,7,-1.083959
6,8,-1.07783
7,9,-1.076262
8,10,-1.082706
9,11,-1.076602


In [67]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression

In [79]:
# Dont standardize for decision trees
X = df.drop(["bill_depth_mm"], axis = 1)
y = df["bill_depth_mm"]

ct = ColumnTransformer(
  [
    ("dummify", 
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object))
  ],
  remainder = "passthrough"
)

pipeline_tree = Pipeline(
  [("preprocessing", ct),
  ("decision_tree", DecisionTreeRegressor(max_depth=2))]
)

scores = cross_val_score(pipeline_tree, X, y, cv = 5, scoring = 'neg_root_mean_squared_error')

mean_RSME = -scores.mean()
mean_RSME

np.float64(0.8655602547084573)

In [78]:
degrees = {'decision_tree__max_depth': [1,2,3,4]}

gscv_ridge = GridSearchCV(pipeline_tree, degrees, cv = 5, scoring='neg_root_mean_squared_error')

gscv_ridge_fitted = gscv_ridge.fit(X,y)

pd.DataFrame(data = {"degrees": [1,2,3,4], "scores": gscv_ridge_fitted.cv_results_['mean_test_score']})

Unnamed: 0,degrees,scores
0,1,-1.148592
1,2,-0.86556
2,3,-0.888846
3,4,-0.98147
