# Linear Regression

## Fitting a Line

In [1]:
from sklearn.linear_model import LinearRegression

# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# importing it raises ImportError. Use it when it is available; otherwise
# rebuild an equivalent object (with `.data` and `.target`) from the original
# StatLib file, as recommended by the scikit-learn deprecation notice.
try:
    from sklearn.datasets import load_boston

    boston = load_boston()
except ImportError:
    import numpy as np
    import pandas as pd
    from sklearn.utils import Bunch

    # In the raw file every observation is split over two physical rows:
    # even rows hold the first 11 features, odd rows hold the remaining
    # 2 features followed by the target value.
    raw_df = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    boston = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
    )

# Keep only the first two feature columns for this simple example.
features = boston.data[:, 0:2]
target = boston.target

# Fit an ordinary least-squares linear regression.
regression = LinearRegression()
model = regression.fit(features, target)

In [2]:
# Intercept (bias term) of the fitted linear model
model.intercept_

22.485628113468223

In [3]:
# Coefficients of the fitted linear model, one per feature column
model.coef_

array([-0.35207832,  0.11610909])

## Handling Interaction Effects

In [4]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the feature matrix with interaction (product) terms only;
# no bias column and no pure powers of a single feature are added.
interaction = PolynomialFeatures(
    interaction_only=True,
    include_bias=False,
    degree=3,
)
features_interaction = interaction.fit_transform(features)

# Fit a linear regression on the expanded features.
# `fit` returns the estimator itself, so `model` and `regression`
# refer to the same fitted object.
regression = LinearRegression().fit(features_interaction, target)
model = regression

In [6]:
# Coefficients and intercept of the model fitted on the interaction features
model.coef_, model.intercept_

(array([-0.33715159,  0.08155747,  0.80662   ]), 22.07715825584366)

## Fitting a Nonlinear Relationship

In [7]:
# Use only the first feature column (the 0:1 slice keeps it two-dimensional).
features = boston.data[:, 0:1]

# Expand x into x, x^2 and x^3 (no bias column).
polynomial = PolynomialFeatures(degree=3, include_bias=False)
features_polynomial = polynomial.fit_transform(features)

# Create a fresh estimator instead of reusing whatever `regression` was last
# bound to: if a later cell (e.g. Ridge or Lasso) has been run, re-running
# this cell would otherwise silently fit that regularized model.
regression = LinearRegression()
model = regression.fit(features_polynomial, target)

In [8]:
# Intercept and coefficients of the model fitted on the polynomial features
model.intercept_, model.coef_

(25.19047936932673, array([-1.13640072e+00,  2.37848254e-02, -1.48872090e-04]))

In [10]:
# View the first observation's expanded values: x, x^2, and x^3
features_polynomial[0]

array([6.32000000e-03, 3.99424000e-05, 2.52435968e-07])

## Reducing Variance with Regularization

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

# Standardize the features before applying the penalty.
# Note: `features` here is whatever the preceding section left it as.
scaler = StandardScaler().fit(features)
features_standardized = scaler.transform(features)

# Ridge regression with regularization strength alpha = 0.5.
# `fit` returns the estimator itself, so both names point to the fitted model.
regression = Ridge(alpha=0.5).fit(features_standardized, target)
model = regression

model.coef_, model.intercept_

(array([-3.56422527]), 22.532806324110677)

Relationship between lasso and ridge:  
they differ in the penalty term of the objective function that is minimized.  
For ridge, it is $\min \sum\limits_{i=1}^{n}(y_i-\hat{y}_i)^2 + \alpha\sum\limits_{j=1}^p\hat{\beta}_j^2$.  
For lasso, it is $\min \sum\limits_{i=1}^{n}(y_i-\hat{y}_i)^2 + \alpha\sum\limits_{j=1}^p\lvert\hat{\beta}_j\rvert$.  
$\alpha$ is a hyperparameter that controls the strength of the penalty; for ridge, we can use `RidgeCV` to select it by cross-validation.

In [14]:
from sklearn.linear_model import RidgeCV

# Let RidgeCV choose the regularization strength from three candidates.
# `fit` returns the estimator itself, so both names point to the fitted model.
regr_cv = RidgeCV(alphas=[0.1, 1.0, 10.0]).fit(features_standardized, target)
model_cv = regr_cv

model_cv.coef_

array([-3.49860484])

In [15]:
# Regularization strength selected by RidgeCV from the candidate alphas
model_cv.alpha_

10.0

## Reducing Features with Lasso Regression

In [17]:
from sklearn.linear_model import Lasso

# Use every feature column this time.
features = boston.data
target = boston.target

# Standardize the features before applying the L1 penalty.
scaler = StandardScaler().fit(features)
features_standardized = scaler.transform(features)

# Lasso regression with alpha = 0.5.
# `fit` returns the estimator itself, so both names point to the fitted model.
regression = Lasso(alpha=0.5).fit(features_standardized, target)
model = regression

In [18]:
# Lasso coefficients; a coefficient of exactly zero means that feature is not used by the model
model.coef_

array([-0.11526463,  0.        , -0.        ,  0.39707879, -0.        ,
        2.97425861, -0.        , -0.17056942, -0.        , -0.        ,
       -1.59844856,  0.54313871, -3.66614361])

In [19]:
# Intercept of the fitted lasso model
model.intercept_

22.532806324110688

In [21]:
# Fit a lasso with a much larger alpha (10). As the output shows, the penalty
# is then strong enough that every coefficient is shrunk to zero, so the model
# no longer uses any feature.
regression_a10 = Lasso(alpha=10)
model_a10 = regression_a10.fit(features_standardized, target)
model_a10.coef_

array([-0.,  0., -0.,  0., -0.,  0., -0.,  0., -0., -0., -0.,  0., -0.])