    Ben Christensen
    Math 402
    November 30, 2018



In [56]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from itertools import combinations
from scipy import linalg as la
from statsmodels.regression.linear_model import OLS

In [4]:
df = pd.read_csv("wages.csv")
df.head()

Unnamed: 0,wage,educ,exper,tenure,nonwhite,female,married,numdep,smsa,northcen,...,trcommpu,trade,services,profserv,profocc,clerocc,servocc,lwage,expersq,tenursq
0,3.1,11,2,0,0,1,0,2,1,0,...,0,0,0,0,0,0,0,1.131402,4,0
1,3.24,12,22,2,0,1,1,3,1,0,...,0,0,1,0,0,0,1,1.175573,484,4
2,3.0,11,2,0,0,0,0,2,0,0,...,0,1,0,0,0,0,0,1.098612,4,0
3,6.0,8,44,28,0,0,1,0,1,0,...,0,0,0,0,0,1,0,1.791759,1936,784
4,5.3,12,7,2,0,0,1,1,0,0,...,0,0,0,0,0,0,0,1.667707,49,4


## 4.18

In [9]:
def ridge(lmbda, X, Y):
    """Perform an OLS regression with L^2 regularization 
    (the ridge method)
    
    Parameters:
        lmbda (float): regularization parameter
        X ((n,d) ndarray): feature data
        Y ((n,1) ndarray): dependent var. data
    
    Returns:
        Bhat ((d,1) ndarray): minimizer of the regularized regression
    """
    U, S, Vh = la.svd(X, full_matrices=False)
    
    for i, s in enumerate(S):
        if s < 1e-5:
            S[i] = 0
        else:
            S[i] = 1/s
    
    return Vh.T @ np.diag(S) @ U.T @ Y
    

In [13]:
from scipy.optimize import minimize
def fun(beta,x,y,lam):
    return np.linalg.norm(y-x@beta)**2+lam*np.linalg.norm(beta)**2

def sol(x,y,lam):
    return minimize(fun,ridge(lam,x,y),args=(x,y,lam))['x']

X=df[['educ','tenure','nonwhite','female','numdep']]
Y=df['wage']
print(ridge(7,np.array(X),np.array(Y)))
sol(np.array(X),np.array(Y),7000)

[ 0.47588912  0.15785767 -0.25481063 -1.92333271  0.09551804]


array([ 0.37679126,  0.16958451,  0.00056444, -0.02410082,  0.02483015])

## 4.19

In [28]:
df["female*married"] = df["female"]*df["married"]
df["intercept"] = 1

In [34]:
X = df[['female','educ','exper','tenure','married','female*married','numdep','nonwhite', 'intercept']]
Y = df["wage"]

for k in range(-5, 6):
    print("k =",k)
    lmbda = 10**k
    #(i)
    Bhat1 = sol(np.array(X), np.array(Y), lmbda)
    model = sm.OLS(Y, X)
    Bhat2 = OLS.fit_regularized(model, lmbda, L1_wt=0).params
    Bhat3 = linear_model.Ridge(lmbda, fit_intercept=False).fit(X,Y).coef_
    print("Bhat1:", Bhat1)
    print("Bhat2:", Bhat2)
    print("Bhat3:", Bhat3)

k = -5
Bhat1: [-0.34413527  0.56308905  0.02091295  0.12976128  1.73557801 -2.3578156
  0.08909389 -0.2142343  -2.56638478]
Bhat2: [-0.34413527  0.56308905  0.02091295  0.12976128  1.73557801 -2.3578156
  0.08909389 -0.2142343  -2.56638478]
Bhat3: [-0.34413616  0.56308895  0.02091294  0.12976129  1.7355772  -2.35781434
  0.08909384 -0.21423432 -2.56638269]
k = -4
Bhat1: [-0.34413527  0.56308905  0.02091295  0.12976128  1.73557801 -2.3578156
  0.08909389 -0.2142343  -2.56638478]
Bhat2: [-0.34413527  0.56308905  0.02091295  0.12976128  1.73557801 -2.3578156
  0.08909389 -0.2142343  -2.56638478]
Bhat3: [-0.34414422  0.56308805  0.02091285  0.12976135  1.73556986 -2.35780296
  0.08909341 -0.2142345  -2.5663638 ]
k = -3
Bhat1: [-0.34422483  0.56307903  0.0209119   0.12976206  1.73549618 -2.35768896
  0.08908913 -0.2142364  -2.5661741 ]
Bhat2: [-0.34413527  0.56308905  0.02091295  0.12976128  1.73557801 -2.3578156
  0.08909389 -0.2142343  -2.56638478]
Bhat3: [-0.34422474  0.56307907  0.02091

At first our solution is identical to the statsmodel package. Over time the statsmodel package's answer diverges from ours and ours becomes quite identical to the scikitlearn solution.

Huzzah!

## 4.20

In [49]:
X = df[['female','educ','exper','tenure','married','female*married','numdep','nonwhite']]
Y = df["wage"]
print("""
Features that give optimal AIC:
['educ', 'exper', 'tenure', 'married', 'married*female']
""")
print("""
Features that give optimal BIC:
['educ', 'tenure', 'married', 'married*female']
""")
for k in range(-5, 6):
    print("k =", k)
    lmbda = 10**k
    model = sm.OLS(Y, X)
    B1 = OLS.fit_regularized(model,alpha=lmbda, L1_wt=1).params
    print(B1[B1!=0].index)
    


Features that give optimal AIC:
['educ', 'exper', 'tenure', 'married', 'married*female']


Features that give optimal BIC:
['educ', 'tenure', 'married', 'married*female']

k = -5
Index(['female', 'educ', 'exper', 'tenure', 'married', 'female*married',
       'numdep', 'nonwhite'],
      dtype='object')
k = -4
Index(['female', 'educ', 'exper', 'tenure', 'married', 'female*married',
       'numdep', 'nonwhite'],
      dtype='object')
k = -3
Index(['female', 'educ', 'exper', 'tenure', 'married', 'female*married',
       'numdep', 'nonwhite'],
      dtype='object')
k = -2
Index(['female', 'educ', 'exper', 'tenure', 'married', 'female*married',
       'numdep', 'nonwhite'],
      dtype='object')
k = -1
Index(['female', 'educ', 'exper', 'tenure', 'married', 'female*married'], dtype='object')
k = 0
Index(['educ', 'exper', 'tenure'], dtype='object')
k = 1
Index(['educ', 'exper'], dtype='object')
k = 2
Index(['exper'], dtype='object')
k = 3
Index([], dtype='object')
k = 4
Index([], dtype='obje

(That was an implicit comparison)

In [54]:
print("""
Features that give optimal AIC:
['educ', 'exper', 'tenure', 'married', 'married*female']
""")
print("""
Features that give optimal BIC:
['educ', 'tenure', 'married', 'married*female']
""")
features = np.array(['female','educ','exper','tenure','married','female*married','numdep','nonwhite'])
for k in range(-5, 6):
    print("k =", k)
    lmbda = 10**k
    model = sm.OLS(Y, X)
    Beta = linear_model.Lasso(lmbda).fit(X,Y).coef_
    print(features[Beta!=0])


Features that give optimal AIC:
['educ', 'exper', 'tenure', 'married', 'married*female']


Features that give optimal BIC:
['educ', 'tenure', 'married', 'married*female']

k = -5
['female' 'educ' 'exper' 'tenure' 'married' 'female*married' 'numdep'
 'nonwhite']
k = -4
['female' 'educ' 'exper' 'tenure' 'married' 'female*married' 'numdep'
 'nonwhite']
k = -3
['female' 'educ' 'exper' 'tenure' 'married' 'female*married' 'numdep'
 'nonwhite']
k = -2
['female' 'educ' 'exper' 'tenure' 'married' 'female*married' 'numdep'
 'nonwhite']
k = -1
['female' 'educ' 'exper' 'tenure' 'married' 'female*married' 'numdep']
k = 0
['educ' 'exper' 'tenure']
k = 1
[]
k = 2
[]
k = 3
[]
k = 4
[]
k = 5
[]


(That was another implicit comparison)

## 4.21

In [63]:
Y = df["wage"]
#(i)
X = df[['female','educ','exper','tenure','married','female*married','numdep','nonwhite']]
model = linear_model.LinearRegression()
print("(i):", np.mean(cross_val_score(model, X,Y,cv=7)))
#(ii)
X = df[['educ','tenure','married','female*married']]
model = linear_model.LinearRegression()
print("(ii):",np.mean(cross_val_score(model, X,Y,cv=7)))
#(iii) and (iv)
print("(iii) and (iv)")
X = df[['female','educ','exper','tenure','married','female*married','numdep','nonwhite']]
for k in range(-5, 6):
    print("k =", k)
    lmbda = 10**k
    model1 = linear_model.Ridge(lmbda)
    model2 = linear_model.Lasso(lmbda)
    print("Ridge:", np.mean(cross_val_score(model1, X,Y,cv=7)))
    print("Lasso:", np.mean(cross_val_score(model2, X,Y,cv=7)))


(i): 0.345553188165
(ii): 0.353691631076
(iii) and (iv)
k = -5
Ridge: 0.345553193654
Lasso: 0.345554228515
k = -4
Ridge: 0.345553243059
Lasso: 0.345563551126
k = -3
Ridge: 0.345553737011
Lasso: 0.345653747943
k = -2
Ridge: 0.345558666678
Lasso: 0.346323758452
k = -1
Ridge: 0.345606990018
Lasso: 0.332495976712
k = 0
Ridge: 0.346003143145
Lasso: 0.261849182236
k = 1
Ridge: 0.346220375025
Lasso: -0.0337066244705
k = 2
Ridge: 0.331856513725
Lasso: -0.0337066244705
k = 3
Ridge: 0.283452471094
Lasso: -0.0337066244705
k = 4
Ridge: 0.163145685528
Lasso: -0.0337066244705
k = 5
Ridge: 0.0223653258297
Lasso: -0.0337066244705


Once k >=1 the lasso has no parameters so the MSE we have there is nonsense. The best then appears to be Lasso for k=0.

Huzzah!

## 4.22

Now do this for your own dataset y'all!