In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression 
from math import sqrt
import seaborn as sns
import warnings
# Silence every Python warning for the rest of the notebook to keep cell output clean.
# NOTE(review): this blanket filter also hides library deprecation warnings.
warnings.filterwarnings('ignore')

#### Regression Evaluation

Why do we evaluate models?
    
- Does this model add any value?
- Which model is better?
- How confident am I in my model's predictions?

Slides: https://docs.google.com/presentation/d/1WE9JfHrWg3IzqFhUP1eaV5-1fhv2-eRF0S7AyWqMBLc/edit#slide=id.p

TL;DR  

- RMSE: how much error the typical prediction has; same units as the target; smaller is better  
- R2: proportion of the variance in y (target) explained by X (predictor); closer to 1 is better




#### Can we predict students' final grades based on their score on the first exam?

x = score from exam1  
y = final grade

In [None]:
# Build the toy student-grade dataset: one [exam1 score, final grade] pair per student
grade_pairs = [
    [100, 96],
    [93, 90],
    [84, 89],
    [80, 85],
    [76, 80],
    [70, 68],
    [79, 75],
]
df = pd.DataFrame(np.array(grade_pairs), columns=['x', 'y'])

# Convenience handles on the predictor (x) and target (y) columns
x, y = df['x'], df['y']

df.head()

In [None]:
# Scatter plot of exam1 score (x) against final score (y)
fig, ax = plt.subplots()
ax.scatter(df['x'], df['y'])
ax.set_xlabel('x = exam1 score')
ax.set_ylabel('y = final score');

Linear Regression:  
     - Does a set of independent variables do a good job in predicting an outcome (dependent) variable  
     - Is the model good enough compared to a baseline?  
     - Is the model with features/ind variable valuable/significant compared to a model with no features?  

### Create a baseline prediction.
- Make a prediction just based on dependent variable i.e. model with no independent variable
    - mean or median of dependent variable (we will use mean for this lesson)

In [None]:
# Baseline prediction: the mean of the target, used as the guess for every student
baseline = df['y'].mean()
baseline

In [None]:
# Store the baseline (mean of the dependent variable) as a constant column
df['baseline'] = df['y'].mean()
df.head()

In [None]:
# Data points with the baseline (mean of y) drawn as a dotted horizontal line
fig, ax = plt.subplots()
ax.scatter(df['x'], df['y'])
ax.axhline(df['y'].mean(), ls=':')
ax.set_xlabel('x = exam1 score')
ax.set_ylabel('y = final score')
ax.set_title('Baseline model');

In [None]:
# Now we fit a regression model using the OLS method.
# OLS = ordinary least squares.

# Create the model object.
# NOTE: the `normalize` argument was deprecated in scikit-learn 1.0 and removed
# in 1.2, so passing it raises a TypeError on current versions. Plain OLS
# predictions do not depend on feature scaling, so the argument is simply dropped.
lm = LinearRegression()

# fit the model to the training data (X must be 2-D, hence the double brackets)
lm.fit(df[['x']], df['y'])

# make predictions for the same rows and store them next to the actual values
df['yhat'] = lm.predict(df[['x']])

df

y = Actual target  
yhat = OLS model prediction

In [None]:
# Actual points plus the OLS predictions drawn as the line of best fit
fig, ax = plt.subplots()
ax.scatter(df['x'], df['y'])
ax.plot(df['x'], df['yhat'])
ax.set_xlabel('x = exam1 score')
ax.set_ylabel('y = final score')
ax.set_title('OLS linear model');

## Regression Metrics: Measure of dispersion/distance of datapoints around the fitted line

### Regression Metrics :

- Compare to baseline
- Compare different models with different independent variables

### Vocabulary 
- Residual - error (actual minus predicted)
- SSE (sum of squared error)
- MSE (mean squared error)
- RMSE (root mean squared error)

### Residuals (error)

- actual minus predicted

In [None]:
# Preview the first rows before adding the residual columns
df.head()

In [None]:
# Residual = actual value minus predicted value, for the OLS model and for the baseline
df['residual'] = df['y'].sub(df['yhat'])
df['baseline_residual'] = df['y'].sub(df['baseline'])

In [None]:
# Display the whole table (it is small), including the residual columns
df

In [None]:
# Do we prefer higher residuals or lower residuals?

# What is the sum of residuals for these models?

### Residuals for Baseline model
![regression-2.png](attachment:regression-2.png)

### Residuals for OLS model
![regression-3.png](attachment:regression-3.png)

In [None]:
# The OLS residuals add up to zero (up to floating-point error)
df['residual'].sum()

In [None]:
# The baseline residuals add up to zero as well (up to floating-point error)
df['baseline_residual'].sum()

##### The sum of residuals is zero for both models above, so the raw sum of residuals is not helpful for finding the line of best fit
 - This is true for any line passing through the centroid 

### Residual Plots:
- Independent variable vs residual values
- dependent variable vs residual values

In [None]:
# Residual plots (y vs residual): baseline on the left, OLS model on the right

fig, axes = plt.subplots(1, 2, figsize=(11, 5))

# (residual column, panel title) for each subplot, left to right
panels = [
    (df.baseline_residual, 'Baseline Residuals'),
    (df.residual, 'OLS model residuals'),
]

for ax, (resid, title) in zip(axes, panels):
    ax.scatter(df.y, resid)
    # dotted reference line at zero error
    ax.axhline(y=0, ls=':')
    ax.set_xlabel('y')
    ax.set_ylabel('Residual')
    ax.set_title(title)

***The presence of a trend in a residual plot indicates that there is an uncaptured linear trend, meaning we can still improve our model using independent features (in this case x).***

### SSE (Sum of Squared Errors)

- sum of the squared residuals

In [None]:
# Square each residual so positive and negative errors no longer cancel out

df['residual^2'] = df['residual'].pow(2)
df['baseline_residual^2'] = df['baseline_residual'].pow(2)

df.head()

In [None]:
# SSE: add up the squared residuals for the OLS model and for the baseline
SSE = df['residual^2'].sum()
SSE_baseline = df['baseline_residual^2'].sum()

# report both to one decimal place
print('SSE =', format(SSE, '.1f'))
print("SSE Baseline =", format(SSE_baseline, '.1f'))

What are the units of the residual and of SSE?

##### Note: In regression, the 'line of best fit' is one which minimizes the SSE

### MSE (Mean Squared Error)

- average of SSE = SSE/(no. of data points)

- the average of your errors that have each been squared

In [None]:
# Two equivalent ways to get the number of data points (rows).
# Only the last expression in a cell is displayed, so just df.shape[0] is shown.
len(df)
df.shape[0]

In [None]:
# MSE: SSE divided by the number of data points
n_points = len(df)
MSE = SSE / n_points
MSE_baseline = SSE_baseline / n_points

# report both to one decimal place
print("MSE = ", format(MSE, '.1f'))
print("MSE baseline = ", format(MSE_baseline, '.1f'))

### RMSE (Root Mean Squared Error)

- Square root of MSE
- Same units as the output (y) variable

In [None]:
# RMSE is the square root of MSE, which puts the error back in the units of y.
# `sqrt` is already imported in the imports cell at the top of the notebook,
# so it is not re-imported here.
RMSE = sqrt(MSE)
RMSE_baseline = sqrt(MSE_baseline)


print("RMSE = ", "{:.1f}".format(RMSE))
print("RMSE baseline = ", "{:.1f}".format(RMSE_baseline))

### Metrics in Sklearn:
https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics


Start with MSE : (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html)

![regression.jpg](attachment:regression.jpg)

In [None]:
# MSE via scikit-learn: mean_squared_error(actual, predicted).
# mean_squared_error is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
MSE2 = mean_squared_error(df.y, df.yhat)
MSE2_baseline = mean_squared_error(df.y, df.baseline)

print("MSE", MSE2) 
print("MSE baseline" ,MSE2_baseline) 

In [None]:
# Recover SSE from sklearn's MSE: SSE = MSE * number of data points

n_obs = len(df)
SSE2 = MSE2 * n_obs
SSE2_baseline = MSE2_baseline * n_obs

print("SSE", SSE2)
print("SSE baseline", SSE2_baseline)

In [None]:
# Calculate RMSE as the square root of sklearn's MSE.
# The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so it fails on current versions.
# Taking the square root explicitly works on every version
# (scikit-learn >= 1.4 also offers sklearn.metrics.root_mean_squared_error).

RMSE2 = np.sqrt(mean_squared_error(df.y, df.yhat))
RMSE2_baseline = np.sqrt(mean_squared_error(df.y, df.baseline))

print("RMSE", RMSE2) 
print("RMSE baseline" ,RMSE2_baseline)  

### Which metric to use?

Most often used:

- RMSE - Most commonly used - Same units as the dependent variable. Average distance of each point from fitted regression line


-------------------------------------------------------------
- SSE - If outliers matter, you can use SSE - Amplifies the effect of outliers

    - Residual 5 ----> SSE 25
    - Residual 10 ---> SSE 100




### How much of variance in target variable is explained by your independent variables?

 - $R^2$ - Coefficient of determination (0 to 1)
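     - r2_score == explained_variance_score when the residuals have zero mean (true for this OLS fit); otherwise explained_variance_score ignores the systematic offset and the two differ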
 - Compares the fit of the chosen model with that of a horizontal straight line (baseline)



![regression-5.png](attachment:regression-5.png)

Distance between the point (value) and mean value (baseline) is 'Total Error'  
Total Error = 'Explained Error' (accounted by for regression) + 'Unexplained Error'  

ESS = Explained Sum of Squared Error  
TSS = Total sum of squared error

$R^2$ = ESS/TSS

$R^2$ = 1 - SSE/TSS  (since ESS + SSE = TSS)

*Note: TSS == SSE for baseline model (mean model)*

In [None]:
# R^2 by hand, built from the squared-residual columns.

# Total sum of squares: identical to the SSE of the baseline (mean) model
SSE_baseline = df['baseline_residual^2'].sum()
TSS = SSE_baseline

# Unexplained error: sum of squared errors around the regression line
SSE = df['residual^2'].sum()

# Explained error: the part of the total that the regression accounts for
ESS = TSS - SSE

# R^2 is the explained share of the total
R2 = ESS / TSS
R2

In [None]:
# Calculate R2 the easy way with scikit-learn: r2_score(actual, predicted).
# r2_score is already imported in the imports cell at the top of the notebook,
# so it is not re-imported here.

r2_score(df.y, df.yhat)

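Note: explained_variance_score == r2_score only when the residuals average to zero, as they do for an OLS model with an intercept evaluated on its training data.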