# Palmer Penguins Modeling

Import the Palmer Penguins dataset and print out the first few rows.

Suppose we want to predict `bill_depth_mm` using the other variables in the dataset.

Which variables would we need to **dummify**?

In [12]:
# input dataset/libraries
#!pip install palmerpenguins
from palmerpenguins import load_penguins
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
penguins = load_penguins()
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [13]:
#Pandas
#double brackets to get dataframe on species variable
#T/F same thing as 1/0
pd.get_dummies(penguins[['species']])

#does for entire dataset, but not good b/c it will do every col that is categorical (specifically bad for Name variable)
pd.get_dummies(penguins)



Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year,species_Adelie,species_Chinstrap,species_Gentoo,island_Biscoe,island_Dream,island_Torgersen,sex_female,sex_male
0,39.1,18.7,181.0,3750.0,2007,True,False,False,False,False,True,False,True
1,39.5,17.4,186.0,3800.0,2007,True,False,False,False,False,True,True,False
2,40.3,18.0,195.0,3250.0,2007,True,False,False,False,False,True,True,False
3,,,,,2007,True,False,False,False,False,True,False,False
4,36.7,19.3,193.0,3450.0,2007,True,False,False,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,55.8,19.8,207.0,4000.0,2009,False,True,False,False,True,False,False,True
340,43.5,18.1,202.0,3400.0,2009,False,True,False,False,True,False,True,False
341,49.6,18.2,193.0,3775.0,2009,False,True,False,False,True,False,False,True
342,50.8,19.0,210.0,4100.0,2009,False,True,False,False,True,False,False,True


In [19]:
#OneHotEncoder

#making new function thats job is to onehotencode
enc = OneHotEncoder(handle_unknown='ignore')

# give function dataset (learns categories of species variables)
enc.fit(penguins[['species']])

#trigger change (use to toarray to make intermediate steps viewable)
enc.transform(penguins[['species']]).toarray()

#shows categories and order of categories
enc.categories_

#you can put whole dataset in but its very confusing

[array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)]

In [20]:
#standardizing quantitative variables
from sklearn.preprocessing import StandardScaler
#only good for numeric variable
scaler = StandardScaler()
scaler.fit_transform(penguins[['bill_length_mm']])

array([[-0.88449874],
       [-0.81112573],
       [-0.66437972],
       [        nan],
       [-1.32473679],
       [-0.84781224],
       [-0.92118525],
       [-0.86615549],
       [-1.80166135],
       [-0.35254443],
       [-1.12296102],
       [-1.12296102],
       [-0.5176337 ],
       [-0.976215  ],
       [-1.70994508],
       [-1.34308004],
       [-0.95787175],
       [-0.26082817],
       [-1.74663159],
       [ 0.38118565],
       [-1.12296102],
       [-1.14130427],
       [-1.47148281],
       [-1.04958801],
       [-0.9395285 ],
       [-1.58154232],
       [-0.60934996],
       [-0.62769321],
       [-1.10461777],
       [-0.62769321],
       [-0.81112573],
       [-1.23302053],
       [-0.81112573],
       [-0.5543202 ],
       [-1.37976655],
       [-0.86615549],
       [-0.9395285 ],
       [-0.31585793],
       [-1.15964752],
       [-0.75609598],
       [-1.3614233 ],
       [-0.57266346],
       [-1.45313956],
       [ 0.03266386],
       [-1.26970704],
       [-0

Let's use `bill_length_mm` to predict `bill_depth_mm`. Prepare your data and fit the following models on the entire dataset:

* Simple linear regression (e.g. straight-line) model
* Quadratic (degree 2 polynomial) model
* Cubic (degree 3 polynomial) model
* Degree 10 polynomial model

Make predictions for each model and plot your fitted models on the scatterplot.

In [60]:
#drop NA for all dataset to save time
penguins = penguins.dropna()

# Code Here
#Modelling with sklearn
import sklearn
import numpy as np

#choose our predictor and response
X = penguins[['bill_length_mm']]
y = penguins['bill_depth_mm']

#use linear regression
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

#train/split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

#fit models using training sets
lr_fit = lr.fit(X_train, y_train)

#use the fitted models to get predicted values for the test data
y_pred_lr = lr_fit.predict(X_test)

#use MSE and compute for test data predictions to access accuracy
from sklearn.metrics import r2_score, mean_squared_error

#train MSE
#need to use fitted model to predict vals of train dataset
y_pred_lr_train = lr_fit.predict(X_train)
MSE_train = mean_squared_error(y_train, y_pred_lr_train)

#test MSE
MSE_test = mean_squared_error(y_test, y_pred_lr)

#train R^2
R_train = r2_score(y_train, y_pred_lr_train)

#test R^2
R_test = r2_score(y_test, y_pred_lr)

print(MSE_train, MSE_test, R_train, R_test)



3.752182489894035 3.4077617094702437 0.04849914470952921 0.06286875062028574


Repeat the steps above but for quadratic, cubic, and degree 10

In [61]:
# Code Here
#Modelling with sklearn
import sklearn
import numpy as np

#create squ x variable
penguins['bill_length_mm_squ'] = penguins['bill_length_mm']**2

#choose our predictor and response
X = penguins[['bill_length_mm', 'bill_length_mm_squ']]
y = penguins['bill_depth_mm']

#use linear regression
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

#train/split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

#fit models using training sets
lr_fit = lr.fit(X_train, y_train)

#use the fitted models to get predicted values for the test data
y_pred_lr = lr_fit.predict(X_test)

#use MSE and compute for test data predictions to access accuracy
from sklearn.metrics import r2_score, mean_squared_error

#train MSE
#need to use fitted model to predict vals of train dataset
y_pred_lr_train = lr_fit.predict(X_train)
MSE_train = mean_squared_error(y_train, y_pred_lr_train)

#test MSE
MSE_test = mean_squared_error(y_test, y_pred_lr)

#train R^2
R_train = r2_score(y_train, y_pred_lr_train)

#test R^2
R_test = r2_score(y_test, y_pred_lr)

print(MSE_train, MSE_test, R_train, R_test)


3.226917838998579 4.1406309562065235 0.15266039465118764 -0.029820362259139976


In [62]:
# Code Here
#Modelling with sklearn
import sklearn
import numpy as np

#create square and cube var of X
penguins['bill_length_mm_squ'] = penguins['bill_length_mm']**2
penguins['bill_length_mm_cube'] = penguins['bill_length_mm']**3

#choose our predictor and response
X = penguins[['bill_length_mm', 'bill_length_mm_squ', 'bill_length_mm_cube']]
y = penguins['bill_depth_mm']

#drop na's
X.dropna(inplace=True)
y.dropna(inplace=True)

#use linear regression
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

#train/split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

#fit models using training sets
lr_fit = lr.fit(X_train, y_train)

#use the fitted models to get predicted values for the test data
y_pred_lr = lr_fit.predict(X_test)

#use MSE and compute for test data predictions to access accuracy
from sklearn.metrics import r2_score, mean_squared_error

#train MSE
#need to use fitted model to predict vals of train dataset
y_pred_lr_train = lr_fit.predict(X_train)
MSE_train = mean_squared_error(y_train, y_pred_lr_train)

#test MSE
MSE_test = mean_squared_error(y_test, y_pred_lr)

#train R^2
R_train = r2_score(y_train, y_pred_lr_train)

#test R^2
R_test = r2_score(y_test, y_pred_lr)

print(MSE_train, MSE_test, R_train, R_test)


3.1549013948612536 4.166566001849939 0.1496182358813556 0.004795724003107238


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.dropna(inplace=True)


In [64]:
# Code Here
#Modelling with sklearn
import sklearn
import numpy as np

#create square and cube and tenth^ var of X
penguins['bill_length_mm_squ'] = penguins['bill_length_mm']**2
penguins['bill_length_mm_cube'] = penguins['bill_length_mm']**3
penguins['bill_length_mm_4'] = penguins['bill_length_mm']**4
penguins['bill_length_mm_5'] = penguins['bill_length_mm']**5
penguins['bill_length_mm_6'] = penguins['bill_length_mm']**6
penguins['bill_length_mm_7'] = penguins['bill_length_mm']**7
penguins['bill_length_mm_8'] = penguins['bill_length_mm']**8
penguins['bill_length_mm_9'] = penguins['bill_length_mm']**9
penguins['bill_length_mm_10'] = penguins['bill_length_mm']**10


#choose our predictor and response
X = penguins[['bill_length_mm', 'bill_length_mm_squ', 'bill_length_mm_cube', 'bill_length_mm_4', 'bill_length_mm_5', 'bill_length_mm_6', 'bill_length_mm_7', 'bill_length_mm_8', 'bill_length_mm_9', 'bill_length_mm_10']]
y = penguins['bill_depth_mm']

#use linear regression
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

#train/split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

#fit models using training sets
lr_fit = lr.fit(X_train, y_train)

#use the fitted models to get predicted values for the test data
y_pred_lr = lr_fit.predict(X_test)

#use MSE and compute for test data predictions to access accuracy
from sklearn.metrics import r2_score, mean_squared_error

#train MSE
#need to use fitted model to predict vals of train dataset
y_pred_lr_train = lr_fit.predict(X_train)
MSE_train = mean_squared_error(y_train, y_pred_lr_train)

#test MSE
MSE_test = mean_squared_error(y_test, y_pred_lr)

#train R^2
R_train = r2_score(y_train, y_pred_lr_train)

#test R^2
R_test = r2_score(y_test, y_pred_lr)

print(MSE_train, MSE_test, R_train, R_test)


2.762044857462702 5.3702228790181445 0.25801211184161665 -0.2516269549699872


In [None]:
#overfitting = way better MSE for training when compared to test MSE

#underfitting = training MSE and test MSE is bad

* Are any of the models above underfitting the data? If so, which ones and how can you tell?
* Are any of thhe models above overfitting the data? If so, which ones and how can you tell?
* Which of the above models do you think fits the data best and why?

In [None]:
# Because MSE for train is better when compared to MSE test for the quadratic and cubic model, these models are overfitting

# Because the MSE for the simple linear model contains the same value for the MSE test and train , that means they are underfitting or they are good models!

# for the 10th degree model, the MSE for the train is significantly better than the MSE of test so that means that this model is also overfitting by a lot!

#I would say the best model above is the simple linear model because it is guarateed to not be overfitting