# Simple linear regression

## Import the relevant libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# We can override the default matplotlib styles with those of Seaborn
import seaborn as sns
sns.set()

In [None]:
%%capture
# See https://stackoverflow.com/a/65808542/1988855
import statsmodels.api as sm

## Load the data

In [None]:
# Load the data from the .csv file in the data/ subfolder (the path is relative to the working directory)
data = pd.read_csv('data/eduGrades.csv')

In [None]:
# Preview the first few rows to check what's inside this data frame
data.head()

In [None]:
# describe() gives descriptive statistics for the data frame.
# We don't need them right now, but they will be useful later on.
data.describe()

# Create your first regression

## Define the dependent and the independent variables

In [None]:
# Column names: pCol is the predictor (independent variable), tCol the target (dependent variable)
pCol, tCol = 'verb_SAT', 'univ_GPA'

# Our independent variable (x) is the verbal SAT score
x1 = data[pCol]
# Following the regression equation, our dependent variable (y) is the university GPA
y = data[tCol]

## Explore the data

In [None]:
# Scatter plot of the target against the predictor (horizontal axis first, then vertical axis)
plt.scatter(x=x1, y=y)
# Label both axes with the column names, using the same font size for each
axis_font = {'fontsize': 10}
plt.xlabel(pCol, **axis_font)
plt.ylabel(tCol, **axis_font)
# Render the figure
plt.show()

## Regression itself

In [None]:
%%capture
# Add a constant. Essentially, we are adding a new column (equal in length to x1), which consists only of 1s,
# so that the fitted model includes an intercept term alongside the slope for x1.
X = sm.add_constant(x1)

In [None]:
def fitOLS(X, y):
    """Fit an ordinary least squares (OLS) regression of y on X.

    Args:
        X: the independent variable(s), passed to sm.OLS as the design matrix.
        y: the dependent variable.

    Returns:
        The fitted results object produced by sm.OLS(y, X).fit().
    """
    return sm.OLS(y, X).fit()

In [None]:
# Fit the single-predictor model (constant + predictor held in X) against y
results = fitOLS(X,y)

# Print a nice summary of the regression. That's one of the strong points of statsmodels -> the summaries.
# As the last expression in the cell, the summary is displayed as the cell output.
results.summary()

In [None]:
# Create a scatter plot of the observed data
plt.scatter(x1, y)
# Define the regression equation, so we can plot it.
# The fitted parameters are in results.params, a Series labelled by column name,
# so we select them by position with .iloc: first the constant, then the slope.
# (Plain integer indexing such as params[0] on a label-indexed Series is deprecated in pandas.)
yhat = results.params.iloc[0] + results.params.iloc[1] * x1
# Plot the regression line against the independent variable
plt.plot(x1, yhat, lw=4, c='orange', label='regression line')
# Label the axes
plt.xlabel(pCol, fontsize=10)
plt.ylabel(tCol, fontsize=10)
# Draw the legend so the 'regression line' label is actually shown
plt.legend()
plt.show()

# Exercise

Clearly the verbal SAT score (obtained in "high school") is not a particularly good predictor of college GPA performance.

1. Repeat the process using the math SAT score. Does it perform any better? Give reasons for your answer.
2. Do the same with the GPA obtained in "high school".

We can add another predictor to our model, to see whether it performs better than a univariate (single predictor) model. What if we add the `math_SAT` column to the `verb_SAT` predictor?

In [None]:
# Names of the two predictor columns used in the expanded model
pCol1, pCol2 = 'verb_SAT', 'math_SAT'
# Select both predictor columns as a data frame (this rebinds X to the two-predictor version)
X = data.loc[:, [pCol1, pCol2]]

In [None]:
%%capture
# Add a constant. Essentially, we are adding a new column (equal in length to X), which consists only of 1s,
# so that the two-predictor model also includes an intercept term.
X = sm.add_constant(X)

Now we try fitting again, with the expanded (2-predictor + constant) X.

In [None]:
# Fit the model again, this time with the expanded X (constant + two predictors), and display its summary
results2 = fitOLS(X,y)
results2.summary()

Plotting is a bit more tricky than before, because we need to do a 3-D plot over a mesh of points in "predictor space". First we compute the points on the hyperplane.

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# Range of predictor values for the 3-D plot: 100 evenly spaced points between
# the observed minimum and maximum of each predictor column.
# NOTE: this rebinds x1 (previously the verb_SAT column of the data) to a grid axis.
x1 = np.linspace(data[pCol1].min(), data[pCol1].max(), 100)
x2 = np.linspace(data[pCol2].min(), data[pCol2].max(), 100)

# Mesh of predictor values for the 3-D plot
xx1, xx2 = np.meshgrid(x1, x2)

# Compute the hyperplane values by evaluating the fitted equation at every grid point.
# results2.params is a Series labelled by column name, so the coefficients are selected
# by position with .iloc (constant, then pCol1, then pCol2); plain integer indexing
# such as params[0] on a label-indexed Series is deprecated in pandas.
Z = (results2.params.iloc[0]
     + xx1 * results2.params.iloc[1]
     + xx2 * results2.params.iloc[2])

Now we set up the shared axes with a viewing direction, then plot the surface and the actual data.

In [None]:
import warnings
# Create the figure and a 3-D axes attached to it.
# add_subplot(projection='3d') registers the axes with the figure; constructing
# Axes3D(fig) directly no longer does so in recent matplotlib releases, which
# leaves the figure empty (and the warning about it was being suppressed).
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(projection='3d')
# Viewing direction: azimuth and elevation in degrees
ax.view_init(elev=15, azim=-115)

# Plot the fitted hyperplane as a semi-transparent surface
plane = ax.plot_surface(xx1, xx2, Z, cmap=plt.cm.RdBu_r, alpha=0.6, linewidth=0)

# Plot the data points - points on or above the hyperplane are white (with a black edge),
# points below it are solid black. The residual is observed minus predicted.
resid = y - results2.predict(X)
above = resid >= 0
below = resid < 0
ax.scatter(X.loc[above, pCol1], X.loc[above, pCol2], y[above],
           facecolor='white', edgecolor='black', alpha=1.0)
ax.scatter(X.loc[below, pCol1], X.loc[below, pCol2], y[below],
           color='black', alpha=1.0)

# Set axis labels
ax.set_xlabel(pCol1)
ax.set_ylabel(pCol2)
ax.set_zlabel(tCol)

# Render the figure
plt.show()