# Palmer Penguins Modeling

Import the Palmer Penguins dataset and print out the first few rows.

Suppose we want to predict `bill_depth_mm` using the other variables in the dataset.

**Dummify** all variables that require this.

In [4]:
!pip install plotnine
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

# Load the Palmer Penguins data bundled with seaborn and discard every row
# that has at least one missing value, then preview the first rows.
penguin = sns.load_dataset("penguins").dropna()
print(penguin.head())

Collecting plotnine
  Obtaining dependency information for plotnine from https://files.pythonhosted.org/packages/eb/c1/fcc5985eee6511aa321e68c8f813d9fdbe1b506713a95d4f612a5f963270/plotnine-0.12.3-py3-none-any.whl.metadata
  Downloading plotnine-0.12.3-py3-none-any.whl.metadata (8.9 kB)
Collecting mizani<0.10.0,>0.9.0 (from plotnine)
  Obtaining dependency information for mizani<0.10.0,>0.9.0 from https://files.pythonhosted.org/packages/e2/95/d4e33d3f5bc9fee5512637661208b6b595bda58e9b6a66fa867137761dd7/mizani-0.9.3-py3-none-any.whl.metadata
  Downloading mizani-0.9.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tzdata (from mizani<0.10.0,>0.9.0->plotnine)
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
     ---------------------------------------- 0.0/341.8 kB ? eta -:--:--
     --------------- ---------------------- 143.4/341.8 kB 4.2 MB/s eta 0:00:01
     -------------------------------------- 341.8/341.8 kB 4.3 MB/s eta 0:00:00
Downloading plotnine-0.12.3-py3-none-any.whl 

In [None]:
# Dummify the categorical predictors (dropping the first level of each).
# Only columns that are still present get encoded, so re-running this cell
# after `penguin` has already been dummified leaves it unchanged instead of
# raising a KeyError for the missing 'species' / 'island' / 'sex' columns.
categorical_cols = [col for col in ('species', 'island', 'sex') if col in penguin.columns]
if categorical_cols:
    penguin = pd.get_dummies(penguin, columns=categorical_cols, drop_first=True)

Let's use the other variables to predict `bill_depth_mm`. Prepare your data and fit the following models on a training subset of the dataset:

* Four different models, each containing a different set of predictor variables

Create a plot like the right plot of Fig 1. in our `Model Validation` chapter with the training and test error plotted for each of your four models.

Which of your models was best?

In [None]:
# Response variable and four progressively larger predictor sets,
# all taken from the full (dummified) data frame.
target_col = 'bill_depth_mm'
numeric_cols = ['bill_length_mm', 'flipper_length_mm', 'body_mass_g']
species_cols = ['species_Chinstrap', 'species_Gentoo']

y = penguin[target_col]

X1 = penguin[numeric_cols[:1]]              # bill length only
X2 = penguin[numeric_cols]                  # all numeric measurements
X3 = penguin[numeric_cols + species_cols]   # numeric measurements + species dummies
X4 = penguin.drop(columns=[target_col])     # every remaining column

In [None]:
# Hold out 25% of the rows as a test set. The split is seeded so that the
# training/test errors, and the model comparison drawn from them, come out
# the same on every Restart & Run All.
train_data, test_data = train_test_split(penguin, test_size=0.25, random_state=42)
y_train, y_test = train_data['bill_depth_mm'], test_data['bill_depth_mm']

In [None]:
def _predictor_sets(frame):
    """Return the four predictor frames (models 1-4) selected from `frame`."""
    bill_only = ['bill_length_mm']
    measurements = bill_only + ['flipper_length_mm', 'body_mass_g']
    with_species = measurements + ['species_Chinstrap', 'species_Gentoo']
    return (
        frame[bill_only],
        frame[measurements],
        frame[with_species],
        frame.drop(columns=['bill_depth_mm']),
    )


# Apply the same four column selections to the training and the test rows.
X1_train, X2_train, X3_train, X4_train = _predictor_sets(train_data)
X1_test, X2_test, X3_test, X4_test = _predictor_sets(test_data)


In [None]:
# Fit one linear regression per predictor set and record its mean squared
# error on both the training rows and the held-out test rows.
X_train_list = [X1_train, X2_train, X3_train, X4_train]
X_test_list = [X1_test, X2_test, X3_test, X4_test]
models = [LinearRegression() for _ in X_train_list]
train_errors, test_errors = [], []

for model, X_fit, X_holdout in zip(models, X_train_list, X_test_list):
    model.fit(X_fit, y_train)
    fitted_mse = mean_squared_error(y_train, model.predict(X_fit))
    holdout_mse = mean_squared_error(y_test, model.predict(X_holdout))
    train_errors.append(fitted_mse)
    test_errors.append(holdout_mse)

In [None]:
# Long-format table for plotting: one row per (model, error type) pair,
# training errors first, then test errors.
model_labels = ['Model 1', 'Model 2', 'Model 3', 'Model 4']
error_kinds = ['Training Error'] * len(model_labels) + ['Test Error'] * len(model_labels)

error_data = pd.DataFrame({
    'Model': model_labels + model_labels,
    'Error Type': error_kinds,
    'Error': [*train_errors, *test_errors],
})


In [None]:
from plotnine import ggplot, aes, geom_point, geom_line, labs, theme_light

# Training vs. test mean squared error for each model, one line per error type.
plot = (
    ggplot(error_data, aes(x='Model', y='Error', color='Error Type', group='Error Type'))
    + geom_point(size=3)
    + geom_line(size=1.5)
    + labs(title='Training and Test Error', y='Mean Squared Error')
    + theme_light()
)

# Show the figure through the notebook's rich display (last expression of the
# cell) rather than print(plot): plotnine is installed without a version pin,
# and recent plotnine releases only print a text summary for print(plot)
# instead of rendering the figure.
plot

Models 3 and 4 had almost equal test errors, but Model 4's test error appears to be just slightly lower, making it the best model.