### for practice:
1. set baseline predictions (mean, median)
2. evaluate the baseline (compare y, the actual values, to the predicted values, which are all the same constant, e.g. the mean of y)
    - y: 19, 18, 12, 8, 5
    - y_pred: 11, 11, 11, 11, 11
3. LinearRegression()
4. LassoLars()
5. PolynomialFeatures(degree=2) ... then LinearRegression()

For each model, evaluate first on the training predictions and then on the validate predictions.

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import math
import warnings
warnings.filterwarnings("ignore")

from pydataset import data

from sklearn.linear_model import LinearRegression, LassoLars
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

In [11]:
# Load the "tips" dataset that ships with pydataset into a DataFrame.
tips = data("tips")

In [12]:
# Feature engineering: derive two ratio features from the raw columns.
# NOTE(review): tip_percentage is computed from `tip`, which is the regression
# target in the later cells, and it is kept as a predictor (only `tip` is
# dropped from X). That looks like target leakage -- confirm it is intended.
tips['tip_percentage'] = tips.tip / tips.total_bill
tips['price_per_person'] = tips.total_bill / tips['size']

# total_bill and size are now represented by price_per_person, so drop them.
tips = tips.drop(columns=['total_bill', 'size'])

# One-hot encode the categorical columns (sex, smoker, day, time).
# drop_first=True omits the first level of each variable.
dummy_vars = tips[['sex', 'smoker', 'day', 'time']]
dummy_df = pd.get_dummies(dummy_vars, dummy_na=False, drop_first=True)

# Append the dummy columns, then remove the original categorical columns.
# Pass the column labels explicitly instead of handing the whole DataFrame
# to `columns=`, which only works through implicit iteration over the frame.
df = pd.concat([tips, dummy_df], axis=1)
df = df.drop(columns=dummy_vars.columns)

# Split in two stages: hold out 20% as test, then take 30% of the remainder
# as validate. The fixed random_state keeps the split reproducible.
train_validate, test = train_test_split(df, test_size=.2,
                                        random_state=123)

train, validate = train_test_split(train_validate,
                                   test_size=.3, random_state=123)

# Display the (rows, columns) of each split.
train.shape, validate.shape, test.shape

((136, 9), (59, 9), (49, 9))

In [13]:
# Separate the predictors from the target column (`tip`) for every split.
X_train = train.drop(columns=['tip'])
X_validate = validate.drop(columns=['tip'])
X_test = test.drop(columns=['tip'])

# Double brackets keep each target as a one-column DataFrame.
y_train = train[['tip']]
y_validate = validate[['tip']]
y_test = test[['tip']]

# Fit the scaler on the training predictors only; validate and test are then
# transformed with the parameters learned from train.
scaler = MinMaxScaler(copy=True).fit(X_train)


def _scale_to_frame(features):
    """Return `features` transformed by the fitted `scaler` as a DataFrame.

    `scaler.transform` returns a plain array, so the result is wrapped back
    into a DataFrame that carries the same column labels and row index as
    `features`, keeping the rows aligned with the matching y frame.
    """
    return pd.DataFrame(
        scaler.transform(features),
        columns=features.columns,
        index=features.index,
    )


X_train_scaled = _scale_to_frame(X_train)
X_validate_scaled = _scale_to_frame(X_validate)
X_test_scaled = _scale_to_frame(X_test)

In [14]:
# Preview the first five rows of the scaled training predictors.
X_train_scaled.head(n=5)

Unnamed: 0,tip_percentage,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
19,0.252863,0.150344,0.0,0.0,0.0,1.0,0.0,0.0
173,1.0,0.032258,1.0,1.0,0.0,1.0,0.0,0.0
119,0.161808,0.182796,0.0,0.0,0.0,0.0,1.0,1.0
29,0.240873,0.452194,1.0,0.0,1.0,0.0,0.0,0.0
238,0.0,0.775647,1.0,1.0,1.0,0.0,0.0,0.0


In [17]:
# Preview the first five rows of the training target.
y_train.head(n=5)

Unnamed: 0,tip
19,3.5
173,5.15
119,1.8
29,4.3
238,1.17


### Baseline

In [46]:
# Baseline prediction: the column-wise mean of the training target
# (a one-entry Series keyed by 'tip').
baseline = y_train.mean(axis=0)
baseline

tip    2.946985
dtype: float64

In [47]:
# Baseline RMSE: predict the mean training tip for every training row and
# score that constant prediction against the actual tips.
# The mean is taken from the 'tip' column so it is always a plain scalar;
# np.mean() on the whole DataFrame returns a Series or a scalar depending on
# the pandas version, which made the fill value rely on implicit broadcasting.
baseline_pred = np.full(len(y_train), y_train['tip'].mean())
baseline_rmse = math.sqrt(mean_squared_error(y_train, baseline_pred))
baseline_rmse

1.4512460770849047

#### Linear Regression

In [48]:
# Fit ordinary least squares on the scaled training predictors.
# The `normalize` argument is not passed: it was deprecated in scikit-learn
# 1.0 and removed in 1.2, so passing it raises a TypeError on current
# versions. The predictors are already min-max scaled, and rescaling columns
# should not change ordinary-least-squares predictions.
lm = LinearRegression()
lm.fit(X_train_scaled, y_train)

# Predict the training observations.
lm_pred = lm.predict(X_train_scaled)

# Evaluate on train: root mean squared error.
lm_rmse = math.sqrt(mean_squared_error(y_train, lm_pred))
lm_rmse

1.1552132256162309

In [49]:
# Linear regression, out-of-sample check.

# Predict the validate observations with the model fitted on train.
lm_pred_v = lm.predict(X_validate_scaled)

# Root mean squared error on the validate split.
lm_rmse_v = math.sqrt(
    mean_squared_error(y_true=y_validate, y_pred=lm_pred_v)
)
lm_rmse_v

1.149558220540037

#### LassoLars

In [50]:
# Fit a LassoLars model (L1 penalty, alpha=0.1) on the scaled training data.
# NOTE(review): the recorded training RMSE for this cell is identical to the
# baseline RMSE, which suggests this alpha shrinks every coefficient to zero
# so the model predicts a constant -- consider trying a smaller alpha.
lars = LassoLars(alpha=0.1)
lars.fit(X_train_scaled, y_train)

# Predict the training observations.
lars_pred = lars.predict(X_train_scaled)

# Root mean squared error on the training split.
lars_rmse = math.sqrt(
    mean_squared_error(y_true=y_train, y_pred=lars_pred)
)
lars_rmse

1.4512460770849047

In [51]:
# LassoLars, out-of-sample check: predict the validate observations.
lars_pred_v = lars.predict(X_validate_scaled)

# Root mean squared error on the validate split.
lars_rmse_v = math.sqrt(
    mean_squared_error(y_true=y_validate, y_pred=lars_pred_v)
)
lars_rmse_v

1.5173853468530591

#### PolynomialFeatures + LinearRegression (squared)

In [52]:
# Build the degree-2 polynomial feature generator.
pf = PolynomialFeatures(degree=2)

# Learn the expansion from the training predictors, then apply it to train
# and validate. The expanded set holds a bias column, the original features,
# their squares and their pairwise products.
pf.fit(X_train_scaled)
X_train_squared = pf.transform(X_train_scaled)
X_validate_squared = pf.transform(X_validate_scaled)

# Fit a plain linear regression on the expanded training features.
lm_squared = LinearRegression().fit(X_train_squared, y_train)

# Predict the training observations.
lm_squared_pred = lm_squared.predict(X_train_squared)

# Root mean squared error on the training split.
lm_squared_rmse = math.sqrt(
    mean_squared_error(y_true=y_train, y_pred=lm_squared_pred)
)
lm_squared_rmse

0.9426603906654986

In [53]:
# Degree-2 polynomial model, out-of-sample check.

# Predict the validate observations from the expanded validate features.
lm_squared_pred_v = lm_squared.predict(X_validate_squared)

# Root mean squared error on the validate split.
lm_squared_rmse_v = math.sqrt(
    mean_squared_error(y_true=y_validate, y_pred=lm_squared_pred_v)
)
lm_squared_rmse_v

1.0719652305239187

#### PolynomialFeatures + LinearRegression (cubed)

In [60]:
# Build the degree-3 polynomial feature generator.
pf_c = PolynomialFeatures(degree=3)

# Learn the expansion from the training predictors, then apply it to train
# and validate. The expanded set holds every polynomial term of the original
# features up to degree 3.
pf_c.fit(X_train_scaled)
X_train_cubed = pf_c.transform(X_train_scaled)
X_validate_cubed = pf_c.transform(X_validate_scaled)

# Fit a plain linear regression on the expanded training features.
lm_cubed = LinearRegression().fit(X_train_cubed, y_train)

# Predict the training observations.
lm_cubed_pred = lm_cubed.predict(X_train_cubed)

# Root mean squared error on the training split.
lm_cubed_rmse = math.sqrt(
    mean_squared_error(y_true=y_train, y_pred=lm_cubed_pred)
)
lm_cubed_rmse

0.7814657036779133

In [61]:
# Degree-3 polynomial model, out-of-sample check.
# NOTE(review): the recorded validate RMSE for this cell is about 3.2e10,
# against about 0.78 on train, so this model does not generalize even though
# its training score is the lowest of all models.

# Predict the validate observations from the expanded validate features.
lm_cubed_pred_v = lm_cubed.predict(X_validate_cubed)

# Root mean squared error on the validate split.
lm_cubed_rmse_v = math.sqrt(
    mean_squared_error(y_true=y_validate, y_pred=lm_cubed_pred_v)
)
lm_cubed_rmse_v

32086485508.008778

### Evaluate and Compare

In [62]:
# Recompute every model's RMSE on the training split from its stored
# training predictions.
lm_rmse, lars_rmse, lm_squared_rmse, lm_cubed_rmse = (
    math.sqrt(mean_squared_error(y_train, train_pred))
    for train_pred in (lm_pred, lars_pred, lm_squared_pred, lm_cubed_pred)
)

# Print one "<label> <rmse>" line per model, baseline first.
# NOTE: these are training scores only; the validate scores computed in the
# earlier cells (the *_rmse_v variables) are not part of this summary.
print(
    *(
        f"{label} {train_rmse}"
        for label, train_rmse in (
            ("Baseline, Mean: ", baseline_rmse),
            ("Linear Model: ", lm_rmse),
            ("LassoLars: ", lars_rmse),
            ("Polynomial, squared: ", lm_squared_rmse),
            ("Polynomial, cubed: ", lm_cubed_rmse),
        )
    ),
    sep="\n",
)

Baseline, Mean:  1.4512460770849047
Linear Model:  1.1552132256162309
LassoLars:  1.4512460770849047
Polynomial, squared:  0.9426603906654986
Polynomial, cubed:  0.7814657036779133
