# Modeling Exercises

##### Exercise 1. Select a dataset with a continuous target variable.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

from pydataset import data


In [2]:
# Load the 'mpg' dataset through pydataset's data() helper and preview the first rows.
df = data('mpg')
df.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


##### Exercise 2. Be sure your data is prepared (no missing values, numeric datatypes) and split into samples.

In [3]:
# scaler
from sklearn.preprocessing import MinMaxScaler
# train test split from sklearn
from sklearn.model_selection import train_test_split

In [4]:
# Two-stage split with a fixed seed for reproducibility:
#   1. hold out 20% of all rows as the test set;
#   2. split the remaining 80% into 70% train / 30% validate.
# Note that `train` is rebound in the second line to the smaller training split.
train, test = train_test_split(df, train_size = 0.8, random_state = 1234)
train, validate = train_test_split(train, train_size = 0.7, random_state = 1234)

In [5]:
def data_scaler(train, validate, test, columns_to_scale):
    """Min-max scale selected columns of the train/validate/test splits.

    One MinMaxScaler is fit on the train split only and then applied to all
    three splits, so validate and test are scaled with parameters learned
    from train.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        The three splits. They are not modified; scaled copies are returned.
    columns_to_scale : list
        Column labels to scale. Each label must exist in every split.

    Returns
    -------
    tuple of pandas.DataFrame
        (train_scaled, validate_scaled, test_scaled): copies of the inputs in
        which `columns_to_scale` hold the scaled values.
    """
    scaler = MinMaxScaler()

    def _with_scaled_columns(split, transform):
        # Wrap the scaler's ndarray output in a frame that carries the split's
        # own row labels so the column assignment below aligns row-for-row.
        scaled_values = pd.DataFrame(
            transform(split[columns_to_scale]),
            columns=split[columns_to_scale].columns.values,
            index=split.index.values,
        )
        result = split.copy()
        result[columns_to_scale] = scaled_values
        return result

    # Order matters: the scaler must be fit on train before transform is used.
    train_scaled = _with_scaled_columns(train, scaler.fit_transform)
    validate_scaled = _with_scaled_columns(validate, scaler.transform)
    test_scaled = _with_scaled_columns(test, scaler.transform)

    return train_scaled, validate_scaled, test_scaled

In [6]:
# Scale only the cty and hwy columns; every other column is carried over as-is.
train_scaled, validate_scaled, test_scaled = data_scaler(
    train, validate, test, columns_to_scale=['cty', 'hwy']
)

In [7]:
# Preview the scaled training split to confirm cty and hwy were rescaled.
train_scaled.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
214,volkswagen,jetta,2.0,1999,4,manual(m5),f,0.5,0.53125,r,compact
179,toyota,4runner 4wd,4.7,2008,8,auto(l5),4,0.208333,0.15625,r,suv
81,ford,explorer 4wd,4.0,2008,6,auto(l5),4,0.166667,0.21875,r,suv
13,audi,a4 quattro,2.8,1999,6,manual(m5),4,0.333333,0.40625,p,compact
57,dodge,dakota pickup 4wd,5.2,1999,8,auto(l4),4,0.083333,0.09375,r,pickup


In [8]:
# For each split: X holds the two scaled feature columns (cty, hwy) and y holds
# the `displ` target, which was not among the columns passed to the scaler.
X_train_scaled, y_train_scaled = train_scaled[['cty', 'hwy']], train_scaled['displ']

X_validate_scaled, y_validate_scaled = validate_scaled[['cty', 'hwy']], validate_scaled['displ']

X_test_scaled, y_test_scaled = test_scaled[['cty', 'hwy']], test_scaled['displ']

##### Exercise 3. Work through all of the steps outlined in the lesson, from setting the baseline to selecting a model and evaluating the final model on your test data.

In [9]:
# Promote the target Series to DataFrames so that each model's predictions can
# be stored as extra columns next to the actual `displ` values.
# Despite the `_scaled` suffix, the target itself was never scaled.
y_train_scaled = pd.DataFrame(y_train_scaled)
y_validate_scaled = pd.DataFrame(y_validate_scaled)

In [10]:
# Baseline 1: predict the training-set mean of displ for every row.
displ_pred_mean = y_train_scaled['displ'].mean()
y_train_scaled['displ_pred_mean'] = displ_pred_mean
# The mean computed on train is reused for validate (nothing is learned from validate).
y_validate_scaled['displ_pred_mean'] = displ_pred_mean

In [11]:
# Baseline 2: predict the training-set median of displ for every row.
displ_pred_median = y_train_scaled['displ'].median()
y_train_scaled['displ_pred_median'] = displ_pred_median
# The median computed on train is reused for validate.
y_validate_scaled['displ_pred_median'] = displ_pred_median
y_train_scaled.head()

Unnamed: 0,displ,displ_pred_mean,displ_pred_median
214,2.0,3.496923,3.3
179,4.7,3.496923,3.3
81,4.0,3.496923,3.3
13,2.8,3.496923,3.3
57,5.2,3.496923,3.3


In [12]:
# RMSE of the mean baseline on each split.
# mean_squared_error gives the MSE, so its square root is the RMSE.
rmse_train = mean_squared_error(
    y_train_scaled['displ'], y_train_scaled['displ_pred_mean']
) ** 0.5
rmse_validate = mean_squared_error(
    y_validate_scaled['displ'], y_validate_scaled['displ_pred_mean']
) ** 0.5


In [13]:
# Report the mean-baseline RMSE for both splits, rounded to two decimals.
print("RMSE using Mean\nTrain/In-Sample: ", round(rmse_train, 2), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))

RMSE using Mean
Train/In-Sample:  1.27 
Validate/Out-of-Sample:  1.36


In [14]:
# RMSE of the median baseline on each split (square root of the MSE).
rmse_train = mean_squared_error(
    y_train_scaled['displ'], y_train_scaled['displ_pred_median']
) ** 0.5
rmse_validate = mean_squared_error(
    y_validate_scaled['displ'], y_validate_scaled['displ_pred_median']
) ** 0.5

print(
    "RMSE using Median\nTrain/In-Sample: ",
    round(rmse_train, 2),
    "\nValidate/Out-of-Sample: ",
    round(rmse_validate, 2),
)

RMSE using Median
Train/In-Sample:  1.29 
Validate/Out-of-Sample:  1.38


In [15]:
# Ordinary least squares on the two scaled features (cty, hwy).
# `normalize` is deliberately not passed: the argument was deprecated in
# scikit-learn 1.0 and removed in 1.2, where passing it raises a TypeError.
# The features are already min-max scaled, and OLS predictions do not depend
# on feature scaling, so the reported RMSEs are unaffected.
lm = LinearRegression()

# fit the model on the training split only
lm.fit(X_train_scaled, y_train_scaled.displ)

# predict train
y_train_scaled['displ_pred_lm'] = lm.predict(X_train_scaled)

# evaluate: rmse (square root of the MSE)
rmse_train = mean_squared_error(y_train_scaled.displ, y_train_scaled.displ_pred_lm)**(1/2)

# predict validate
y_validate_scaled['displ_pred_lm'] = lm.predict(X_validate_scaled)

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate_scaled.displ, y_validate_scaled.displ_pred_lm)**(1/2)

print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

RMSE for OLS using LinearRegression
Training/In-Sample:  0.8085364575814656 
Validation/Out-of-Sample:  0.7861347373102197


In [16]:
# L1-regularised regression fitted with the LARS algorithm.
# NOTE(review): the RMSEs recorded for this cell (1.2728 / 1.3611) equal the
# mean-baseline RMSEs (1.27 / 1.36), so presumably alpha=1.0 shrinks both
# coefficients to zero -- confirm via lars.coef_ and consider a smaller alpha.
lars = LassoLars(alpha=1.0)

# Fit on the training split only.
lars.fit(X_train_scaled, y_train_scaled['displ'])

# In-sample predictions and RMSE (square root of the MSE).
y_train_scaled['displ_pred_lars'] = lars.predict(X_train_scaled)
rmse_train = mean_squared_error(
    y_train_scaled['displ'], y_train_scaled['displ_pred_lars']
) ** 0.5

# Out-of-sample predictions and RMSE on the validate split.
y_validate_scaled['displ_pred_lars'] = lars.predict(X_validate_scaled)
rmse_validate = mean_squared_error(
    y_validate_scaled['displ'], y_validate_scaled['displ_pred_lars']
) ** 0.5

print(
    "RMSE for LassoLars\nTraining/In-Sample: ",
    rmse_train,
    "\nValidation/Out-of-Sample: ",
    rmse_validate,
)

RMSE for LassoLars
Training/In-Sample:  1.272788486962535 
Validation/Out-of-Sample:  1.3610759295789134


In [17]:
# Generalised linear model: power=1 selects the Poisson distribution and
# alpha=0 turns the regularisation penalty off.
glm = TweedieRegressor(power=1, alpha=0)

# Fit on the training split only.
glm.fit(X_train_scaled, y_train_scaled['displ'])

# In-sample predictions and RMSE (square root of the MSE).
y_train_scaled['displ_pred_glm'] = glm.predict(X_train_scaled)
rmse_train = mean_squared_error(
    y_train_scaled['displ'], y_train_scaled['displ_pred_glm']
) ** 0.5

# Out-of-sample predictions and RMSE on the validate split.
y_validate_scaled['displ_pred_glm'] = glm.predict(X_validate_scaled)
rmse_validate = mean_squared_error(
    y_validate_scaled['displ'], y_validate_scaled['displ_pred_glm']
) ** 0.5

print(
    "RMSE for GLM  using Tweedie\nTraining/In-Sample: ",
    rmse_train,
    "\nValidation/Out-of-Sample: ",
    rmse_validate,
)

RMSE for GLM  using Tweedie
Training/In-Sample:  0.7534198329265355 
Validation/Out-of-Sample:  0.6784919787115772


In [18]:
# Expand the two scaled features into their degree-2 polynomial terms.
pf = PolynomialFeatures(degree=2)

# Learn the feature expansion from the training split only ...
pf.fit(X_train_scaled)

# ... then apply the same fitted expansion to all three splits.
X_train_degree2 = pf.transform(X_train_scaled)
X_validate_degree2 = pf.transform(X_validate_scaled)
X_test_degree2 = pf.transform(X_test_scaled)

In [19]:
# Linear regression on the degree-2 polynomial features.
# `normalize` is deliberately not passed: the argument was deprecated in
# scikit-learn 1.0 and removed in 1.2, where passing it raises a TypeError.
# OLS predictions do not depend on feature scaling, so the reported RMSEs are
# unaffected (up to floating-point noise).
lm2 = LinearRegression()

# fit the model on the training split only
lm2.fit(X_train_degree2, y_train_scaled.displ)

# predict train
y_train_scaled['displ_pred_lm2'] = lm2.predict(X_train_degree2)

# evaluate: rmse (square root of the MSE)
rmse_train = mean_squared_error(y_train_scaled.displ, y_train_scaled.displ_pred_lm2)**(1/2)

# predict validate
y_validate_scaled['displ_pred_lm2'] = lm2.predict(X_validate_degree2)

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate_scaled.displ, y_validate_scaled.displ_pred_lm2)**(1/2)

print("RMSE for Polynomial Model, degrees=2\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

RMSE for Polynomial Model, degrees=2
Training/In-Sample:  0.7370771842459999 
Validation/Out-of-Sample:  0.6444400042250958


## Selected Model: GLM using Tweedie

### Out-of-Sample Evaluation

In [21]:
# Promote the test target Series to a DataFrame so the selected model's
# predictions can be stored next to the actual `displ` values.
y_test_scaled = pd.DataFrame(y_test_scaled)
y_test_scaled.head()

Unnamed: 0,displ
105,1.8
209,2.0
7,3.1
112,2.4
52,3.9


In [25]:
# Final check: score the selected Tweedie GLM on the held-out test split.
y_test_scaled['displ_pred_glm'] = glm.predict(X_test_scaled)

# NOTE(review): the test RMSE is stored in `rmse_validate`, overwriting the
# validation score from the previous model cell. A name such as `rmse_test`
# would be clearer; it is kept here so the notebook's variables are unchanged.
rmse_validate = mean_squared_error(
    y_test_scaled['displ'], y_test_scaled['displ_pred_glm']
) ** 0.5

print("RMSE for GLM using Tweedie\nTest set/Out-of-Sample: ", rmse_validate)

RMSE for GLM using Tweedie
Test set/Out-of-Sample:  0.7095603361532568
