In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [3]:
# Load and split data first
df = sns.load_dataset("penguins")

# Fixed seed so that Restart & Run All reproduces the same split, and with it
# the same imputation values, fitted coefficients and saved model.
SEED = 42

# Split the data: 20% held out for testing. train_size is left at its default,
# which is the complement of test_size (i.e. the remaining 80%).
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=SEED,
)

In [4]:
# Peek at the first five rows of the training split
train.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
211,Chinstrap,Dream,45.6,19.4,194.0,3525.0,Female
19,Adelie,Torgersen,46.0,21.5,194.0,4200.0,Male
48,Adelie,Dream,36.0,17.9,190.0,3450.0,Female
26,Adelie,Biscoe,40.6,18.6,183.0,3550.0,Male
181,Chinstrap,Dream,52.8,20.0,205.0,4550.0,Male


In [5]:
# Column dtypes and non-null counts for the full dataset (before the split)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [6]:
# Number of missing values per column in the training split
train.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

In [7]:
# Number of missing values per column in the test split
test.isna().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  1
dtype: int64

In [8]:
# Summary statistics for the numeric columns of the training split
train.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,273.0,273.0,273.0,273.0
mean,43.78315,17.202198,200.908425,4199.084249
std,5.336294,2.028237,13.794808,761.796978
min,33.5,13.1,172.0,2700.0
25%,39.1,15.6,190.0,3600.0
50%,44.1,17.3,197.0,4000.0
75%,47.8,18.8,214.0,4725.0
max,59.6,21.5,230.0,6050.0


In [9]:
# Handle missing values
#
# Mean-impute the two numeric columns used by the model.

flip_mean_train = train["flipper_length_mm"].mean()
body_mean_train = train["body_mass_g"].mean()

# Still computed so that any cell referring to these names keeps working, but
# deliberately NOT used for imputation below.
flip_mean_test = test["flipper_length_mm"].mean()
body_mean_test = test["body_mass_g"].mean()

# Imputation values are learned from the training split only and applied to
# both splits, so no statistic of the held-out data is used to preprocess it.
train_means = {
    "flipper_length_mm": flip_mean_train,
    "body_mass_g": body_mean_train,
}

# DataFrame.fillna with a dict fills each listed column with its own value and
# returns a new frame. Rebinding the result avoids calling
# `frame[col].fillna(..., inplace=True)`, which works on a temporary Series:
# pandas warns about that pattern and, with Copy-on-Write enabled, it leaves
# the parent frame unchanged.
train = train.fillna(value=train_means)
test = test.fillna(value=train_means)

# Fail loudly here rather than later inside the model fit.
assert not train[list(train_means)].isna().any().any(), "train still has NaNs"
assert not test[list(train_means)].isna().any().any(), "test still has NaNs"

In [10]:
# Build the feature matrix (X) and target (y) for each split

target = "flipper_length_mm"
features = "body_mass_g"

# Double brackets select a list of columns, so X and y are both
# single-column DataFrames rather than Series.
X_train, y_train = train[[features]], train[[target]]
X_test, y_test = test[[features]], test[[target]]

In [11]:
# Let's predict flipper_length_mm from body_mass_g

# Instantiate the estimator with its default settings
lr = LinearRegression()

# Fit on the training split; as the last expression of the cell, the fitted
# estimator is also what gets displayed
lr.fit(X=X_train, y=y_train)

In [None]:
# Save the model
import pickle

filename = 'model.pkl'

# The context manager closes the file handle as soon as the dump finishes,
# so the pickle is fully flushed to disk and no open handle is left behind
# in the kernel.
with open(filename, 'wb') as model_file:
    pickle.dump(lr, model_file)