In [1]:
#importing relevant packages & libraries
import pandas as pd
import seaborn as sns

In [2]:
# Load seaborn's "penguins" example dataset and preview its first rows.
# The preview already shows a row with missing measurements, so some
# cleaning is needed before modelling.
penguins = sns.load_dataset("penguins")
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [3]:
# Clean the data for modelling in a single non-mutating chain:
#   1. rename "sex" -> "gender" by label (rename ignores labels that are not
#      present, so re-running this cell after the rename is a harmless no-op),
#   2. keep only the response and the three predictors used below,
#   3. drop rows with any missing value,
#   4. rebuild a clean 0..n-1 index.
# Avoiding inplace=True on a column subset prevents SettingWithCopyWarning,
# and selecting by the *new* column names keeps the cell re-runnable.
penguins = (
    penguins
    .rename(columns={"sex": "gender"})
    .loc[:, ["body_mass_g", "bill_length_mm", "gender", "species"]]
    .dropna()
    .reset_index(drop=True)
)
# Preview the cleaned data.
penguins.head()

Unnamed: 0,body_mass_g,bill_length_mm,gender,species
0,3750.0,39.1,Male,Adelie
1,3800.0,39.5,Female,Adelie
2,3250.0,40.3,Female,Adelie
3,3450.0,36.7,Female,Adelie
4,3650.0,39.3,Male,Adelie


In [4]:
# The cleaned frame is ready, so separate it into the response (y) and the
# predictors (X). Keeping them apart lets a holdout sample be split off
# before the model is fitted, giving a fairer test of the model's results.
penguins_y = penguins.loc[:, ["body_mass_g"]]
penguins_X = penguins.loc[:, ["bill_length_mm", "gender", "species"]]

In [5]:
#importing the train/test split helper from scikit-learn's model_selection module
from sklearn.model_selection import train_test_split

In [6]:
# Set aside 30% of the rows as a holdout (test) set; the remaining rows form
# the training set. Fixing random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    penguins_X,
    penguins_y,
    test_size=0.3,
    random_state=42,
)

In [7]:
# From an earlier look at this data: bill length and flipper length are
# linearly correlated, and each of them is also linearly correlated with
# body mass. The previous simple-linear-regression case study targeted a
# different variable, so this one models body mass.
# Formula for the OLS model: body mass as the response, bill length as a
# numeric predictor, gender and species wrapped in C() as categoricals.
ols_formula = (
    "body_mass_g ~ "
    "bill_length_mm + C(gender) + C(species)"
)

In [8]:
#importing OLS functionality from statsmodels api like before
from statsmodels.formula.api import ols

In [9]:
# Put the training predictors and the training response side by side in one
# frame, because the formula refers to all of them by column name.
ols_data = pd.concat(objs=[X_train, y_train], axis="columns")

In [10]:
# Build the OLS specification from the formula and the training frame,
# then estimate it to obtain the fitted model.
OLS = ols(ols_formula, data=ols_data)
model = OLS.fit()

In [11]:
# Display the fitted model's summary tables: overall fit statistics,
# coefficient estimates with confidence intervals, and residual diagnostics.
model.summary()

0,1,2,3
Dep. Variable:,body_mass_g,R-squared:,0.85
Model:,OLS,Adj. R-squared:,0.847
Method:,Least Squares,F-statistic:,322.6
Date:,"Mon, 10 Apr 2023",Prob (F-statistic):,1.31e-92
Time:,04:46:51,Log-Likelihood:,-1671.7
No. Observations:,233,AIC:,3353.0
Df Residuals:,228,BIC:,3371.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2032.2111,354.087,5.739,0.000,1334.510,2729.913
C(gender)[T.Male],528.9508,55.105,9.599,0.000,420.371,637.531
C(species)[T.Chinstrap],-285.3865,106.339,-2.684,0.008,-494.920,-75.853
C(species)[T.Gentoo],1081.6246,94.953,11.391,0.000,894.526,1268.723
bill_length_mm,35.5505,9.493,3.745,0.000,16.845,54.256

0,1,2,3
Omnibus:,0.339,Durbin-Watson:,1.948
Prob(Omnibus):,0.844,Jarque-Bera (JB):,0.436
Skew:,0.084,Prob(JB):,0.804
Kurtosis:,2.871,Cond. No.,798.0


In [None]:
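#interpretation of the fitted model (each coefficient holds the other predictors constant)
#male penguins are estimated to be about 529 grams heavier than female penguins
#Chinstrap penguins are estimated to be about 285 grams lighter than Adelie penguins (the baseline species)
#Gentoo penguins are estimated to be about 1,082 grams heavier than Adelie penguins
#each additional millimetre of bill length is associated with about 36 grams more body mass
#the model explains about 85% of the variation in body mass in the training data (R-squared = 0.85)
#all predictors in the model are statistically significant at the 5% level (every p-value is below 0.05)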