In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV

In [None]:
def coef_df(fit_object, X):
    """Build a one-column data frame of a fitted linear model's coefficients.

    Parameters
    ----------
    fit_object : fitted estimator
        Must expose ``coef_`` and ``intercept_``.
    X : pandas.DataFrame
        Design matrix; only its column names are used, as row labels.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Intercept' followed by the columns of ``X``; the single
        column is named after the estimator's class.
    """
    # np.asarray lets a plain Python float intercept through as well as a
    # NumPy scalar (a bare float has no .tolist()).
    intercept = np.asarray(fit_object.intercept_).tolist()
    values = [intercept] + np.asarray(fit_object.coef_).tolist()
    labels = ['Intercept'] + list(X.columns)
    # Class name without any module path, e.g. 'Ridge'.
    name = type(fit_object).__name__
    return pd.DataFrame(values, index=labels, columns=[name])

In [None]:
# Load the Hitters data.
# The categorical columns are recoded as dummy variables in the next cell;
# that recoding relies on every categorical variable having only two levels.
hitters = pd.read_csv('Data/Hitters.csv')
hitters.head()

In [None]:
# One 0/1 indicator per two-level categorical column. get_dummies sorts the
# levels and drop_first removes the first one, so the coding does not depend
# on which level happens to appear in the first row (as factorize's does).
# NOTE(review): the resulting names are League_N, Division_W and NewLeague_N
# only if the levels are A/N and E/W -- confirm against the CSV.
hitters = pd.get_dummies(
    hitters,
    columns=['League', 'Division', 'NewLeague'],
    drop_first=True,
    dtype=int,
)
# 'Unnamed: 0' is presumably the unnamed first column of the CSV; it is not a
# predictor, so it is removed along with the original categorical columns.
hitters = hitters.drop(columns=['Unnamed: 0'])

In [None]:
# Preview the frame now that the categorical columns have been recoded.
hitters.head()

In [None]:
# Column summary before the rows with missing values are dropped.
hitters.info()

In [None]:
# Keep only the rows that have no missing values.
hitters = hitters.dropna()

In [None]:
# Column summary again, after the rows with missing values were dropped.
hitters.info()

# Analysis With All the Data
Before we break the data into training and test sets, let's first look at how changing the lambda value changes Ridge and Lasso Regression. Note that in scikit-learn, the tuning parameter is called alpha (not lambda).

### Linear Regression

In [None]:
# Design matrix: every column except the response; response: Salary.
X = hitters.drop(columns=['Salary'])
Y = hitters['Salary']

In [None]:
# Ordinary least squares on the full data set, used as the unpenalised
# baseline for the ridge and lasso fits.
# `normalize=True` is not passed: OLS coefficients reported on the original
# scale do not depend on rescaling the columns, and LinearRegression no longer
# accepts the argument from scikit-learn 1.2 on.
lm = LinearRegression()
lm.fit(X, Y)
coef_df(lm, X)

### Ridge Regression

In [None]:
# Ridge penalty grid: 50 evenly spaced values from 0 to 20 inclusive.
alphas = np.linspace(start=0, stop=20, num=50)

In [None]:
#alphas

In [None]:
# Collector for one coefficient vector per alpha, and the ridge estimator
# that is re-fitted for each of them.
# NOTE(review): `normalize` was removed from Ridge in scikit-learn 1.2 --
# confirm the installed version still accepts it.
coefs = []
ridge = Ridge(normalize=True, max_iter=100000)

In [None]:
# Refit the ridge model once per alpha and keep each coefficient vector.
# The list is rebuilt here rather than appended to, so re-running this cell
# cannot leave `coefs` longer than `alphas` (which would break the plot below).
coefs = [ridge.set_params(alpha=a).fit(X, Y).coef_ for a in alphas]

In [None]:
# Ridge coefficient paths: one line per predictor, alpha on a linear x-axis.
axes = plt.gca()
axes.plot(alphas, coefs)
axes.axis('tight')
axes.set_xlabel('alpha')
axes.set_ylabel('coefficients')

In [None]:
# Inspect the coefficients at one fixed penalty (try e.g. alpha = .01 or 50).
ridge = Ridge(alpha=50, normalize=True, max_iter=100000)
ridge.fit(X, Y)

In [None]:
# Ridge coefficients side by side with the OLS ones.
coef_df(ridge, X).join(
    coef_df(lm, X),
    how='left',
)

### Lasso Regression

In [None]:
# Lasso penalty grid: 50 log-spaced values running from 0.5 * 10^6 down to
# 0.5 * 10^-2.
alphas = 0.5 * 10 ** np.linspace(6, -2, 50)

In [None]:
#alphas

In [None]:
# Collector for one coefficient vector per alpha, and the lasso estimator
# that is re-fitted for each of them.
# NOTE(review): `normalize` was removed from Lasso in scikit-learn 1.2 --
# confirm the installed version still accepts it.
coefs = []
lasso = Lasso(normalize=True, max_iter=100000)

In [None]:
# Refit the lasso model once per alpha and keep each coefficient vector.
# The list is rebuilt here rather than appended to, so re-running this cell
# cannot leave `coefs` longer than `alphas` (which would break the plot below).
coefs = [lasso.set_params(alpha=a).fit(X, Y).coef_ for a in alphas]

In [None]:
# Sanity check: expect one row per alpha and one column per predictor.
np.shape(coefs)

In [None]:
# Lasso coefficient paths: one line per predictor, alpha on a log x-axis
# because the grid spans several orders of magnitude.
axes = plt.gca()
axes.plot(alphas, coefs)
axes.set_xscale('log')
axes.axis('tight')
axes.set_xlabel('alpha')
axes.set_ylabel('coefficients')

In [None]:
# Inspect the lasso coefficients at one fixed penalty (alpha = 3).
lasso = Lasso(alpha=3, normalize=True, max_iter=100000)
lasso.fit(X, Y)

In [None]:
# Lasso, ridge and OLS coefficients in adjacent columns.
(
    coef_df(lasso, X)
    .join(coef_df(ridge, X))
    .join(coef_df(lm, X))
)

# Fitting model using Training set

In [None]:
# Hold out half of the rows as a test set; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=1, test_size=0.5
)

In [None]:
# Size of the training half: (rows, predictors).
X_train.shape

### Linear Regression

In [None]:
# Multiple linear regression fit on the training half only.
# `normalize=True` is not passed: OLS coefficients and predictions do not
# depend on rescaling the columns, and LinearRegression no longer accepts the
# argument from scikit-learn 1.2 on.
lm = LinearRegression()
lm.fit(X_train, Y_train)

In [None]:
# Coefficients of the OLS model fit on the training set. Row labels are taken
# from X_train, the frame the model was actually fit on (same columns as X).
coef_df(lm, X_train)

In [None]:
# Predict Salary for the held-out rows with the OLS model fit on the training half.
ols_predict = lm.predict(X_test)

In [None]:
# Test-set mean squared error of the OLS predictions. Arguments are passed in
# scikit-learn's documented (y_true, y_pred) order; the value is the same
# either way because squared error is symmetric.
lm_mse = mean_squared_error(Y_test, ols_predict)
lm_mse

### Ridge Regression

#### First choose alpha with cross validation

In [None]:
# Candidate penalties (log-spaced from 0.5 * 10^6 down to 0.5 * 10^-2), then
# let RidgeCV choose among them using the training data only.
alphas = 0.5 * 10 ** np.linspace(6, -2, 50)
ridgecv = RidgeCV(normalize=True, alphas=alphas)
ridgecv.fit(X_train, Y_train)

In [None]:
# Penalty that RidgeCV selected from the candidate grid.
ridgecv.alpha_

#### Fit Ridge Regression with cross validated alpha

In [None]:
# Ridge model refit on the training set at the cross-validated penalty.
ridge_cvalpha = Ridge(normalize=True, alpha=ridgecv.alpha_)
ridge_cvalpha.fit(X_train, Y_train)

In [None]:
# Predict Salary for the held-out rows with the cross-validated ridge model.
rr_pred = ridge_cvalpha.predict(X_test)

In [None]:
# Test-set mean squared error of the ridge predictions. Arguments are passed
# in scikit-learn's documented (y_true, y_pred) order; the value is the same
# either way because squared error is symmetric.
ridge_mse = mean_squared_error(Y_test, rr_pred)
ridge_mse

In [None]:
# Coefficients of the ridge model fit at the cross-validated alpha.
coef_df(ridge_cvalpha, X_train)

### Lasso Regression

#### First choose alpha with cross validation

In [None]:
# Choose the lasso penalty by 10-fold cross-validation on the training set.
# With alphas=None, LassoCV builds its own grid of n_alphas candidates.
lassocv = LassoCV(
    n_alphas=200,
    alphas=None,
    cv=10,
    max_iter=100000,
    normalize=True,
    random_state=1,
)
lassocv.fit(X_train, Y_train)

In [None]:
# Penalty selected by the 10-fold cross-validation above.
print(lassocv.alpha_)

#### Fit Lasso Regression with cross validated alpha

In [None]:
# Lasso model refit on the training set at the cross-validated penalty.
# max_iter matches the value used for LassoCV and the other Lasso fits in this
# notebook, so this fit is not cut short by the much smaller default.
lasso_cvalpha = Lasso(alpha=lassocv.alpha_, max_iter=100000, normalize=True)
lasso_cvalpha.fit(X_train, Y_train)

In [None]:
# Predict Salary for the held-out rows with the cross-validated lasso model.
lasso_pred = lasso_cvalpha.predict(X_test)

In [None]:
# Test-set mean squared error of the lasso predictions. Arguments are passed
# in scikit-learn's documented (y_true, y_pred) order; the value is the same
# either way because squared error is symmetric.
lasso_mse = mean_squared_error(Y_test, lasso_pred)
lasso_mse

In [None]:
# Coefficients of the lasso model fit at the cross-validated alpha.
coef_df(lasso_cvalpha, X_train)

## Compare Final Coefficients and Test MSE

In [None]:
# OLS, ridge and lasso coefficients (all fit on the training set) side by side.
(
    coef_df(lm, X_train)
    .join(coef_df(ridge_cvalpha, X_train))
    .join(coef_df(lasso_cvalpha, X_train))
)

In [None]:
# Report the test-set MSE of each of the three models.
for caption, value in (
    ("mse lm = ", lm_mse),
    ("mse ridge = ", ridge_mse),
    ("mse lasso = ", lasso_mse),
):
    print(caption, value)