# Linear regression demo

Load packages we will use

In [1]:
# dataframes
import pandas as pd

# numerical functions (np.power is referenced inside the patsy formula below)
import numpy as np

# plotting
import matplotlib.pyplot as plt
import seaborn

# linear regression two ways
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

# for choosing covariates to include in model
from patsy import dmatrices

# Basic python

In [2]:
# basic math 
2+2

4

In [3]:
2*pi

NameError: name 'pi' is not defined

In [None]:
from numpy import pi
2 * pi

In [None]:
# lists
a = [1, 2, 3]
a

In [None]:
a.append(17)
a

In [None]:
# python uses 0-based indexing 
a[0]

In [None]:
# dictionaries 
d = {'a': 1, 'b': 2, 'c': 3}
d

In [None]:
d['b']

In [None]:
d['d']

In [None]:
d['d'] = 4
d['d']

In [None]:
# for loops
for i in range(10):
    print("hello", i)

In [None]:
# functions, if statements
def fibonacci(n):
    """Return n! = n * (n-1) * ... * 1, computed recursively.

    NOTE: despite its name, this function computes the factorial of n, not
    the n-th Fibonacci number. The name is kept so existing calls keep
    working and keep returning the same values.

    Parameters
    ----------
    n : int
        A non-negative integer.

    Returns
    -------
    int
        The factorial of n; 1 when n is 0.

    Raises
    ------
    ValueError
        If the recursion reaches a value below zero (negative or
        non-integer input). Without this guard the base case n == 0 is
        never hit and the recursion only stops with a RecursionError.
    """
    if n < 0:
        raise ValueError("n must be a non-negative integer")
    if n == 0:
        return 1
    else:
        return n * fibonacci(n - 1)

fibonacci(5)

# Jupyter workflow tips

* run a cell with Shift-Enter
* Jupyter displays the value of the last expression in a cell
* open a new cell below the current one (Alt-Enter) to see values of variables, test assumptions
* you can reorder cells using the arrows in the toolbar, or by copy-pasting cells up or down

# Electricity usage example

In [None]:
# load the electricity-usage data into a DataFrame; read_csv is called with
# its defaults, so the file is presumably comma-separated with a header
# row — TODO confirm against elec_usage.txt
usage = pd.read_csv('elec_usage.txt')
# show the first few rows as a sanity check
usage.head()

In [None]:
usage.hist()

In [None]:
usage.plot.scatter(x='temperature', y='usage')

In [None]:
seaborn.pairplot(usage)

In [None]:
# add two more predictors: nonlinear functions of original variables
# NOTE: use ** for powers; in Python ^ is bitwise XOR, not exponentiation
usage['tempsqr'] = usage['temperature'] ** 2
# year as a continuous quantity: each month adds 1/12 of a year
# (assumes months are numbered 1-12 — TODO confirm against the data)
usage['yearcts'] = usage['year'] + (usage['month'] - 1) / 12
usage.head()

In [None]:
variables = list(usage.columns)
variables

In [None]:
# every column except the one at position 2, which is used as the target
features = [name for position, name in enumerate(variables) if position != 2]
features

In [None]:
# the column at position 2 is the response variable
# (presumably 'usage' — verify against the column order of the data file)
target = variables[2]
target

In [None]:
X = usage[features]
Y = usage[target]
X.head()

In [None]:
# more concise way of collecting data: a patsy formula builds the response Y
# and the design matrix X in one call.
# Names in the formula that are not columns of `usage` (here `np`) are looked
# up in the calling namespace, so numpy must be imported as `np`.
# The leading `0 +` leaves the intercept column out of X.
Y, X = dmatrices(
    'usage ~ 0 + temperature + np.power(temperature, 2) + month + year + yearcts',
    data=usage,
    return_type='dataframe',
)
X.head()

Possible modifications to try:
    * add an intercept to the model
    * add other nonlinear terms

In [None]:
# ordinary least squares: regress Y on the columns of X
model = sm.OLS(endog=Y, exog=X).fit()

# in-sample predictions from the fitted model
Yhat = model.predict(exog=X)

# print the regression summary
print(model.summary())

In [None]:
# Plot outputs: predicted outcome against true outcome, one point per row.
# The closer the points lie to the diagonal Yhat = Y, the better the fit.
plt.scatter(Y, Yhat,  color='black')
plt.xlabel("true outcome Y")
plt.ylabel("predicted outcome Yhat")

# render the figure
plt.show()

In [None]:
# plot observed vs predicted usage as a function of temperature
# p is the object returned by the pandas scatter call; the predictions are
# drawn on that same object so both point sets appear in one figure
p = usage.plot.scatter(x='temperature', y='usage', label='observed')
# predictions as red '+' markers at the same temperature values
p.scatter(x=usage['temperature'], y=Yhat, color='r', marker="+", label='predicted')
# legend built from the two label= arguments above
plt.legend()

In [None]:
# let's make a function to plot predictions against truth
def plot_fit(Y, Yhat):
    """Scatter predicted outcomes against true outcomes and show the figure.

    Parameters
    ----------
    Y : array-like
        True outcomes, drawn on the horizontal axis.
    Yhat : array-like
        Predicted outcomes, drawn on the vertical axis.
    """
    plt.scatter(x=Y, y=Yhat, color='black')
    plt.xlabel(xlabel="true outcome Y")
    plt.ylabel(ylabel="predicted outcome Yhat")
    plt.show()


plot_fit(Y, Yhat)

In [None]:
# we could even make a function to help us visually search for a good model 

def assess_model_quality(data, formula):
    """Fit an OLS model described by a patsy formula, plot its fit, and return its summary.

    Parameters
    ----------
    data : DataFrame
        Table whose columns the formula refers to.
    formula : str
        Patsy formula such as 'usage ~ 1 + temperature'.

    Returns
    -------
    The statsmodels summary of the fitted model.
    """
    # build the response and design matrix from the formula
    response, design = dmatrices(formula, data=data, return_type='dataframe')
    fitted = sm.OLS(response, design).fit()
    # visual check: predictions against the true response
    plot_fit(response, fitted.predict(design))
    return fitted.summary()


assess_model_quality(usage, 'usage ~ 1 + temperature')

# Now let's fit the same model, using sklearn

In [None]:
lm = LinearRegression()
# fit() returns the estimator itself, so this still displays the fitted
# model. The result is deliberately NOT assigned to `model`: that name holds
# the statsmodels results object, whose .resid attribute is used in the
# assumption checks further down.
lm.fit(X, Y)

In [None]:
Yhat = lm.predict(X)
Yhat[:5]

In [None]:
# how well does the model fit? 
lm.score(X,Y) # R^2 score

In [None]:
# what about the coefficients?
lm.coef_

In [None]:
# and the intercept β_0
lm.intercept_

In [None]:
# and visualize the fit 
plot_fit(Y, Yhat)

# Test assumptions: normality, independence, homoskedasticity

In [None]:
# residuals of the fitted model
# NOTE(review): .resid is read from `model`, so `model` must be the
# statsmodels results object from sm.OLS(Y, X).fit() — confirm it has not
# been rebound to another estimator before this cell runs
resid = model.resid

In [None]:
# test mutual independence 
# lag-1 plot: x is every residual but the last, y is every residual but the
# first, so each point pairs a residual with the one that follows it;
# a visible pattern would suggest the residuals are not independent
plt.scatter(resid[:-1], resid[1:])