# Overfit and regularization exercises


In [51]:
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression


## 0. Tips data EDA (*)

In [3]:
# Load seaborn's "tips" dataset into a DataFrame and display it.
df = sns.load_dataset("tips")
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [6]:
# Summarize column dtypes and non-null counts to see which columns are numeric.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [34]:
# Drop the four categorical columns so that only numeric columns remain.
df_cleaned = df.drop(columns=["sex", "smoker", "day", "time"])
df_cleaned

Unnamed: 0,total_bill,tip,size
0,16.99,1.01,2
1,10.34,1.66,3
2,21.01,3.50,3
3,23.68,3.31,2
4,24.59,3.61,4
...,...,...,...
239,29.03,5.92,3
240,27.18,2.00,2
241,22.67,2.00,2
242,17.82,1.75,2


## 1. Train|test split (*)

In [35]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
# "tip" is the target, every other remaining column is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    df_cleaned.drop(columns="tip"),
    df_cleaned["tip"],
    test_size=0.3,
    random_state=42,
)

# Shapes of the four resulting pieces.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((170, 2), (74, 2), (170,), (74,))

## 2. Feature standardization (*)

In [36]:
# Mean tip over the full dataset.
# Select the column before reducing: df.mean() would also try to average the
# categorical columns (sex, smoker, day, time), which warns on older pandas
# and raises TypeError on pandas >= 2.0.
df["tip"].mean()

  df.mean()["tip"]


2.99827868852459

In [41]:
# Standardize both splits using statistics taken from the training split only,
# so no information from the test rows enters the scaling.
train_mean = X_train.mean()
train_std = X_train.std()

X_train_standard = (X_train - train_mean) / train_std
X_test_standard = (X_test - train_mean) / train_std


In [42]:
# Sanity check: per-column means of the standardized train and test splits.
# The test split was scaled with the training mean/std, so its means need not be zero.
X_train_standard.mean(), X_test_standard.mean()

(total_bill    6.791953e-17
 size         -1.985340e-16
 dtype: float64,
 total_bill   -0.191380
 size         -0.044779
 dtype: float64)

In [43]:
# Sanity check: per-column standard deviations of the standardized train and test splits.
# The test split was scaled with the training mean/std, so its stds need not be one.
X_train_standard.std(), X_test_standard.std()

(total_bill    1.0
 size          1.0
 dtype: float64,
 total_bill    0.924957
 size          1.066389
 dtype: float64)

## 3. Polynomial features (*)

In [50]:
# Expand the standardized training features into polynomial terms up to degree 2.
poly = PolynomialFeatures(degree=2)
poly_features = poly.fit_transform(X_train_standard)

# Display the expanded training matrix.
poly_features

array([[ 1.00000000e+00, -5.26512199e-01, -6.23675796e-01,
         2.77215096e-01,  3.28372915e-01,  3.88971498e-01],
       [ 1.00000000e+00,  1.50543056e-02,  1.51824108e+00,
         2.26632118e-04,  2.28560652e-02,  2.30505597e+00],
       [ 1.00000000e+00,  1.57811210e+00,  1.51824108e+00,
         2.49043781e+00,  2.39595462e+00,  2.30505597e+00],
       ...,
       [ 1.00000000e+00, -1.60304074e+00, -6.23675796e-01,
         2.56973961e+00,  9.99777708e-01,  3.88971498e-01],
       [ 1.00000000e+00,  1.57591061e+00, -6.23675796e-01,
         2.48349426e+00, -9.82857306e-01,  3.88971498e-01],
       [ 1.00000000e+00,  2.64033096e+00,  4.47282641e-01,
         6.97134757e+00,  1.18097421e+00,  2.00061761e-01]])

In [56]:
# Fit a linear regression on the polynomial training features
# (fit returns the estimator itself, so the call can be chained).
model = LinearRegression().fit(poly_features, y_train)

# Show the learned coefficients and the intercept.
model.coef_, model.intercept_

(array([ 0.        ,  0.82772287,  0.23836995, -0.02489396,  0.17563373,
        -0.03058172]),
 3.046024962289253)

In [62]:

# Apply the already-fitted polynomial expansion to the standardized test features
# (transform only, no refit), then predict tips with the trained model.
x_test_poly_features = poly.transform(X_test_standard)
pred = model.predict(x_test_poly_features)

In [63]:
# Display the predicted tips for the test split.
pred

array([2.84639848, 1.93273718, 3.84239468, 3.99984872, 2.29119315,
       2.6921269 , 3.50799483, 2.24770953, 2.43880605, 2.48813249,
       2.72428488, 2.10356516, 2.09338352, 2.41406139, 1.9249191 ,
       2.97876173, 2.985814  , 2.9851225 , 2.57555899, 6.27385736,
       3.38277884, 3.18008637, 2.40001532, 1.99964712, 3.20495633,
       2.18133574, 2.06108438, 3.27066479, 2.8724681 , 7.78397729,
       4.8228087 , 1.84381943, 3.03399813, 2.88362698, 2.80963834,
       3.78691167, 2.20994782, 5.92685024, 2.3045399 , 3.07009981,
       2.08149397, 2.41736382, 3.16897466, 2.12055246, 2.06704077,
       1.45844072, 2.08828945, 2.98379937, 1.82821472, 2.31453984,
       3.19353895, 3.42157345, 4.67114716, 2.53724176, 2.8201797 ,
       2.29369684, 1.80129411, 2.68648933, 2.83912779, 2.6788719 ,
       4.77704366, 2.64532639, 3.12120558, 2.56090005, 2.87888182,
       2.93331997, 2.45774019, 1.74055039, 3.76080211, 3.56084776,
       3.27190667, 4.54596717, 2.64370896, 2.83290004])

## 4. Polynomial regression (*)