# Exercises

In [1]:
import warnings
# Silence library warnings before the remaining imports run.
warnings.filterwarnings("ignore")

from math import sqrt
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pydataset import data
from sklearn.metrics import mean_squared_error
from statsmodels.formula.api import ols

## Evaluating simple linear regressions on lemonade data with other features:

1. Create a dataframe from the csv at [link](https://gist.githubusercontent.com/ryanorsinger/c303a90050d3192773288f7eea97b708/raw/536533b90bb2bf41cea27a2c96a63347cde082a6/lemonade.csv)

In [2]:
# Prefer a local copy of the CSV when one sits next to the notebook (fast,
# works offline); otherwise read straight from the URL given in step 1 so the
# notebook still runs top to bottom on a fresh checkout.
LEMONADE_CSV = 'lemonade.csv'
LEMONADE_CSV_URL = (
    'https://gist.githubusercontent.com/ryanorsinger/'
    'c303a90050d3192773288f7eea97b708/raw/'
    '536533b90bb2bf41cea27a2c96a63347cde082a6/lemonade.csv'
)

lemonade = pd.read_csv(
    LEMONADE_CSV if Path(LEMONADE_CSV).exists() else LEMONADE_CSV_URL
)

In [3]:
# Column dtypes and non-null counts, to check for missing values before modelling.
lemonade.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         365 non-null    object 
 1   Day          365 non-null    object 
 2   Temperature  365 non-null    float64
 3   Rainfall     365 non-null    float64
 4   Flyers       365 non-null    int64  
 5   Price        365 non-null    float64
 6   Sales        365 non-null    int64  
dtypes: float64(3), int64(2), object(2)
memory usage: 20.1+ KB


In [4]:
# Distinct values per column. A column with a single distinct value is constant
# and therefore cannot explain any variation in Sales.
lemonade.nunique()

Date           365
Day              7
Temperature    176
Rainfall        34
Flyers          62
Price            1
Sales           35
dtype: int64

In [5]:
# Peek at the first rows to sanity-check the parsed values.
lemonade.head()

Unnamed: 0,Date,Day,Temperature,Rainfall,Flyers,Price,Sales
0,1/1/17,Sunday,27.0,2.0,15,0.5,10
1,1/2/17,Monday,28.9,1.33,15,0.5,13
2,1/3/17,Tuesday,34.5,1.33,27,0.5,15
3,1/4/17,Wednesday,44.1,1.05,28,0.5,17
4,1/5/17,Thursday,42.4,1.0,33,0.5,18


2. Make a baseline for predicting `Sales`. (The mean of the target is a good baseline.)

In [6]:
# Baseline prediction: the average of the target column.
baseline = lemonade['Sales'].mean()
baseline

25.323287671232876

3. Create a new dataframe to hold residuals.

In [7]:
# Start from an empty frame; actuals, predictions and residuals are added to it
# one column at a time.
evaluate = pd.DataFrame()

4. Calculate the baseline residuals.

In [8]:
# Target and feature go in first so the frame has an index to broadcast the
# scalar baseline over.
evaluate['y'] = lemonade['Sales']
evaluate['x'] = lemonade['Flyers']
evaluate['baseline'] = baseline
# Baseline residual, computed as prediction minus actual.
evaluate['baseline_residual'] = evaluate['baseline'] - evaluate['y']

5. Use `ols` from `statsmodels` to create a simple linear regression (1 independent variable, 1 dependent variable) to predict `Sales` using `Flyers`.

In [9]:
# Fit ordinary least squares with Sales as the target and Flyers as the only predictor.
model = ols('Sales ~ Flyers', data=lemonade).fit()

6. Use the `.predict` method of the fitted `ols` model to produce all of our predictions. Add these predictions to the `evaluate` dataframe.

In [10]:
# Called without arguments, predict() returns the fitted values for the rows
# the model was trained on.
evaluate['yhat'] = model.predict()

7. Calculate that model's residuals.

In [11]:
# Model residual, computed as prediction minus actual (the opposite sign of the
# common actual-minus-prediction convention; the sign disappears once squared).
evaluate['model_residual'] = evaluate.yhat - evaluate.y

In [12]:
# Inspect the first rows of actuals, predictions and residuals side by side.
evaluate.head()

Unnamed: 0,y,x,baseline,baseline_residual,yhat,model_residual
0,10,15,25.323288,15.323288,14.673754,4.673754
1,13,15,25.323288,12.323288,14.673754,1.673754
2,15,27,25.323288,10.323288,19.727926,4.727926
3,17,28,25.323288,8.323288,20.149107,3.149107
4,18,33,25.323288,7.323288,22.255013,4.255013


8. Evaluate that model's performance and answer if the model is significant.

In [13]:
# Sum of squared errors for the mean baseline and for the flyers model.
baseline_sse = evaluate['baseline_residual'].pow(2).sum()
model_sse = evaluate['model_residual'].pow(2).sum()

# The model is only worth a closer look if it leaves less squared error than
# always predicting the mean.
if not (model_sse < baseline_sse):
    print("Our baseline is better than the model.")
else:
    print("Our model beats the baseline")
    print("It makes sense to evaluate this model more deeply.")

print("Baseline SSE", baseline_sse)
print("Model SSE", model_sse)

Our model beats the baseline
It makes sense to evaluate this model more deeply.
Baseline SSE 17297.85205479452
Model SSE 6083.326244705024


In [14]:
# mean_squared_error and sqrt are imported in the notebook's import cell.
# MSE is the SSE averaged over the observations; RMSE brings it back to the
# units of the target.
mse = mean_squared_error(evaluate.y, evaluate.yhat)
rmse = sqrt(mse)

print("SSE is", model_sse, " which is the sum of squared errors")
print("MSE is", mse, " which is the average squared error")
print("RMSE is", rmse, " which is the square root of the MSE")

SSE is 6083.326244705024  which is the sum sf squared errors
MSE is 16.666647245767187  which is the average squared error
RMSE is 4.082480526073233  which is the square root of the MSE


9. Evaluate that model's performance and answer if the feature is significant.

In [15]:
# R-squared: share of the variance in the target explained by the model.
r2 = model.rsquared
print('R-squared = ', round(r2, 3))

# p-value of the overall F-test. With a single predictor this test is
# equivalent to the t-test on that predictor's slope.
f_pval = model.f_pvalue
print("p-value for model significance = ", f_pval)

# Significant at the 5% level?
f_pval < 0.05

R-squared =  0.648
p-value for model significance =  2.193718738113383e-84


True

## Repetition Improves Performance!
- In the next section of your notebook, perform the steps above with the `rainfall` column as the model's feature. Does this model beat the baseline? Would you prefer the rainfall model over the `flyers` model?

In [16]:
# The baseline is the mean of the target variable. It does not depend on the
# feature, so the value computed earlier is reused and simply displayed again.
baseline # =lemonade.Sales.mean()

25.323287671232876

In [18]:
# Build the linear model: ols("target ~ feature", data=df).fit()
# `ols` is already imported in the notebook's import cell, so it is not
# imported again here.
model2 = ols('Sales ~ Rainfall', data=lemonade).fit()

# Collect actuals, baseline and model predictions for the `Rainfall` model.
# Residuals are computed as prediction minus actual.
evaluate2 = pd.DataFrame()
evaluate2['x'] = lemonade.Rainfall
evaluate2['y'] = lemonade.Sales
evaluate2['baseline'] = baseline
evaluate2['yhat'] = model2.predict()
evaluate2['baseline_residual'] = evaluate2.baseline - evaluate2.y
evaluate2['model_residual'] = evaluate2.yhat - evaluate2.y

evaluate2.head()

Unnamed: 0,x,y,baseline,yhat,baseline_residual,model_residual
0,2.0,10,25.323288,-1.599602,15.323288,-11.599602
1,1.33,13,25.323288,13.773142,12.323288,0.773142
2,1.33,15,25.323288,13.773142,10.323288,-1.226858
3,1.05,17,25.323288,20.197573,8.323288,3.197573
4,1.0,18,25.323288,21.344793,7.323288,3.344793


In [22]:
# Compare the sum of squared errors of the baseline and of the rainfall model.
baseline_sse2 = (evaluate2.baseline_residual**2).sum()
model_sse2 = (evaluate2.model_residual**2).sum()

if model_sse2 < baseline_sse2:
    print("Our model beats the baseline")
    print("It makes sense to evaluate this model more deeply.")
else:
    print("Our baseline is better than the model.")

# Labels carry the "2" suffix of the variables they report, so the printed
# lines match the output recorded for this cell.
print("Baseline SSE 2", baseline_sse2)
print("Model SSE 2", model_sse2)

Our model beats the baseline
It makes sense to evaluate this model more deeply.
Baseline SSE 2 17297.85205479452
Model SSE 2 2998.2371310300655


In [23]:
# Other error metrics for the rainfall model: MSE and RMSE.
# mean_squared_error and sqrt are already imported earlier in the notebook,
# so they are not imported again here.
mse2 = mean_squared_error(evaluate2.y, evaluate2.yhat)
rmse2 = sqrt(mse2)

print("SSE 2 is", model_sse2, " which is the sum of squared errors")
print("MSE 2 is", mse2, " which is the average squared error")
print("RMSE 2 is", rmse2, " which is the square root of the MSE")

SSE 2 is 2998.2371310300655  which is the sum sf squared errors
MSE 2 is 8.21434830419196  which is the average squared error
RMSE 2 is 2.866068440249109  which is the square root of the MSE


In [25]:
# R-squared (coefficient of determination: share of the variance in the target
# explained by the model), then the p-value of the overall F-test, which
# answers whether the model is significant.
r2_2 = model2.rsquared
print('R-squared = ', round(r2_2,3))

f_pval2 = model2.f_pvalue
print("p-value for model significance = ", f_pval2)

# Significant at the 5% level?
f_pval2 < .05

R-squared =  0.827
p-value for model significance =  3.2988846597381e-140


True

In [30]:
# Which model is best? Lower SSE is better; the leading '*' marks the rainfall model.
for label, sse in (
    ('Baseline: ', baseline_sse),
    ('Flyers Model: ', model_sse),
    ('*Rainfall Model: ', model_sse2),
):
    print(label, sse)

Baseline:  17297.85205479452
Flyers Model:  6083.326244705024
*Rainfall Model:  2998.2371310300655


- In the next section of your notebook, perform the steps above with the `log_rainfall` column as the model's feature. Does this model beat the baseline? Would you prefer the `log_rainfall` model over the `flyers` model? Would you prefer the model built with `log_rainfall` over the rainfall model from before?

- In the next section of your notebook, perform the steps above with the `temperature` column as the model's only feature. Does this model beat the baseline? Would you prefer the `rainfall`, `log_rainfall`, or the `flyers` model?

- Which of these 4 single regression models would you want to move forward with?

## Tips dataset
1. Load the `tips` dataset from pydataset or seaborn
2. Define your baseline for `tip`. Our goal will be to see if we can make a model that predicts `tip` from `total_bill` better than the baseline does.
3. Fit a linear regression model (ordinary least squares) and compute yhat, predictions of tip using `total_bill`. 

Here is some sample code to get you started:

1. Calculate the sum of squared errors, explained sum of squares, total sum of squares, mean squared error, and root mean squared error for your model.
2. Calculate the sum of squared errors, mean squared error, and root mean squared error for the baseline model (i.e. a model that always predicts the average tip amount).
3. Write python code that compares the sum of squared errors for your model against the sum of squared errors for the baseline model and outputs whether or not your model performs better than the baseline model.
4. How much of the variance in `tip` does your model explain?
5. Is your model significantly better than the baseline model?
6. Plot the residuals for the linear regression model that you made.