# DS-SF-34 | 08 | Linear Regression, Part 2 | Codealong | Starter Code

In [None]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import feature_selection, linear_model

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

In [None]:
# Read dataset-08-zillow.csv from ../datasets, using its `ID` column as the row index
dataset_path = os.path.join('..', 'datasets', 'dataset-08-zillow.csv')
df = pd.read_csv(dataset_path, index_col = 'ID')

In [None]:
df

## Part A | Multiple Linear Regression

### `SalePrice` as a function of `Size` and `LotSize`

In [None]:
# TODO

> ### Activity | Comment on the significance of each feature

Answer: TODO

### `SalePrice` as a function of `Size` and `Beds`

In [None]:
# TODO

> ### Activity | Comment on the significance of each feature

Answer: TODO

> ### Activity | Look at the coefficient for `Beds`.  How do you interpret it?  What happened?

Answer: TODO

## Part B | Multicollinearity

### `SalePrice ~ Size` (reference)

In [None]:
# Reference fit: simple linear regression of SalePrice on Size alone
model = smf.ols(formula = 'SalePrice ~ Size', data = df).fit()

# Single-argument print(...) calls produce the same output under Python 2 and Python 3
print('Size:')
print("\t- coefficient = {}".format(model.params.Size))
print("\t- std error = {}".format(model.bse.Size))
print("\t- t-value = {}".format(model.tvalues.Size))
print("\t- p-value = {}".format(model.pvalues.Size))

# conf_int() has one row per model term; the columns labelled 0 and 1 hold the two bounds
confidence_interval = model.conf_int().loc['Size']

print("\t- 95% confidence interval = [{}, {}]".format(confidence_interval[0], confidence_interval[1]))

### `SalePrice ~ Size + "same exact" Size`

In [None]:
df['Size_2'] = df.Size

In [None]:
df[ ['Size', 'Size_2'] ].corr()

In [None]:
# Regress SalePrice on Size and on its exact copy Size_2 (perfectly collinear regressors)
collinear_ols = smf.ols(formula = 'SalePrice ~ Size + Size_2', data = df)
model = collinear_ols.fit()

model.summary()

In [None]:
# Sum the coefficients, then the standard errors, of the two collinear features
# (single-argument print(...) behaves the same under Python 2 and Python 3)
print(model.params[ ['Size', 'Size_2'] ].sum())
print(model.bse[ ['Size', 'Size_2'] ].sum())

> The coefficient and standard error of the original `Size` feature have now been divided equally between the two `Size*` features, but their significance is unchanged (same `t-` and `p-values`)

In [None]:
# Seed the pseudo-random number generator so that the results below are reproducible
np.random.seed(1)

# One random draw per row of df
df['Noise'] = np.random.random(df.shape[0])

# Scale Size by (1 + 0.01 * Noise) so that Size_2 is no longer an exact copy of Size.
# Bracket assignment always writes the column, whether or not `Size_2` already exists
df['Size_2'] = df.Size * (1. + .01 * df.Noise)

In [None]:
df[ ['Size', 'Size_2'] ].corr()

In [None]:
smf.ols(formula = 'SalePrice ~ Size + Size_2', data = df).fit().summary()

> #### Activity | What happened?

Answer: TODO

## Part C | Feature Engineering

> #### Activity | Create new variables `SizeLog` and `LotSizeLog` that represent the log of `Size` and `LotSize`.  Repeat using square root, cube root, square, and cube

In [None]:
# TODO

In [None]:
df

> ### Activity | Show the correlation between the different engineered features of  `Size`

In [None]:
# TODO

### `SalePrice` as a function of `Size` and its other engineered features

In [None]:
# TODO

> #### Activity | What happened?

Answer: TODO

## Part D | Adjusted $R^2$

In [None]:
# Baseline model; the leading `0 +` removes the intercept term from the formula
formula = 'SalePrice ~ 0 + IsAStudio + Beds + Baths + Size + LotSize'

model = smf.ols(formula = formula, data = df).fit()

# Single-argument print(...) produces the same output under Python 2 and Python 3
print('R^2 = {} (original model)'.format(model.rsquared))

> Let's now add some artificial noise:

In [None]:
# Empty frame aligned on df's index; it will receive 100 columns of random values
x_df = pd.DataFrame(index = df.index)

np.random.seed(seed = 0)  # fixed seed so the noise columns are reproducible
noise_columns = ['X{}'.format(i) for i in range(100)]
for x in noise_columns:
    x_df[x] = np.random.random(len(df))

# NOTE(review): this formula also includes BuiltInYear, which the original model's
# formula does not -- confirm that the extra regressor is intentional
formula = ('SalePrice ~ 0 + IsAStudio + Beds + Baths + Size + LotSize + BuiltInYear + '
    + ' + '.join(noise_columns))

In [None]:
formula

In [None]:
# Add df's columns to the noise frame so that the formula can resolve every term.
# Only columns not already present are joined, so re-running this cell leaves x_df
# unchanged instead of raising on overlapping column names
x_df = x_df.join(df.loc[:, ~df.columns.isin(x_df.columns)])

x_model = smf.ols(formula = formula, data = x_df).fit()

In [None]:
# Report both R^2 flavours of the model fitted with the artificial noise columns
# (single-argument print(...) behaves the same under Python 2 and Python 3)
print('Model with artificial noise:')
print('-          R^2 = {}'.format(x_model.rsquared))
print('- Adjusted R^2 = {}'.format(x_model.rsquared_adj))

> #### Activity | What happened?

Answer: TODO

## Part E | The F-statistic

### SalePrice ~ Size

In [None]:
# Single-regressor model: SalePrice explained by Size only
size_ols = smf.ols(formula = 'SalePrice ~ Size', data = df)
model = size_ols.fit()

model.summary()

In [None]:
# Model-level F-test results (single-argument print(...) works under Python 2 and Python 3)
print('F-statistic        = {}'.format(model.fvalue))
print('Prob (F-statistic) = {}'.format(model.f_pvalue)) # (with a 5% significance level)

In [None]:
# p-value of the Size coefficient (single-argument print(...) works under Python 2 and Python 3)
print("Size's p-value = {}".format(model.pvalues.Size))

> #### The model F-statistic's p-value matches its unique regressor's p-value

### SalePrice ~ IsAStudio

In [None]:
# Single-regressor model: SalePrice explained by IsAStudio only
studio_ols = smf.ols(formula = 'SalePrice ~ IsAStudio', data = df)
model = studio_ols.fit()

model.summary()

In [None]:
# Model-level F-test results next to the p-value of the model's only regressor
# (single-argument print(...) works under Python 2 and Python 3)
print('F-statistic         = {}'.format(model.fvalue))
print('Prob (F-statistic)  = {}'.format(model.f_pvalue))
print("IsAStudio's p-value = {}".format(model.pvalues.IsAStudio))

## Part F | Linear Regression Modeling with `sklearn`

- [`sklearn.linear_model.LinearRegression` documentation](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)

In [None]:
def summary(X, y, model):
    """Print the score, the intercept, and every coefficient of a fitted linear model.

    Parameters
    ----------
    X : feature table; its column labels are read from `X.columns`
    y : target values
    model : fitted estimator exposing `score(X, y)`, `intercept_`, and `coef_`

    Notes
    -----
    The p-values are not derived from `model`: they come from
    `feature_selection.f_regression(X, y)`, which tests each feature
    individually against `y`.
    """
    _, f_pvalues = feature_selection.f_regression(X, y)

    # Single-argument print(...) calls give the same output under Python 2 and Python 3
    print('R^2 = {}'.format(model.score(X, y)))
    print('')  # blank separator line (a bare `print` would emit nothing under Python 3)

    print('Coefficients')
    print('- beta_0 (Intercept) = {}'.format(model.intercept_))

    # Coefficients are numbered from 1 because beta_0 is the intercept
    for i, coef in enumerate(model.coef_):
        print('- beta_{} ({}) = {} (p-value = {})'.format(i + 1, X.columns[i], coef, f_pvalues[i]))

> ### Remove samples with `NaN` in `Size`

In [None]:
# TODO

> ### SalePrice ~ Size with `sklearn`

In [None]:
# Feature table: selecting with a list of labels keeps X a one-column DataFrame
X = df[ ['Size'] ]
# Response: the SalePrice column
y = df['SalePrice']

# TODO

> #### The coefficients estimated by _statsmodels_ and _sklearn_ are identical:
> (as they should be, since both packages perform the same deterministic fit/optimization, namely OLS, for Ordinary Least Squares)

In [None]:
summary(X, y, model)

> #### Score returns the $R^2$ of the prediction:
> (`.score()` reported the accuracy when we used k-NN for classification; same method but different metric)

In [None]:
model.score(X, y)