Authors: Andreas Haupt, Alexander Quispe, Anzony Quispe, Vasilis Syrgkanis

# Penalized Linear Regressions: A Simulation Experiment

In [None]:
import matplotlib.pyplot as plt
import random
import math
import numpy as np
import matplotlib.pyplot as plt
random.seed(42)
# All simulated data in this notebook is drawn from NumPy's global generator,
# which `random.seed` does not touch; seed it too so reruns are reproducible.
np.random.seed(42)
import warnings
warnings.simplefilter('ignore')
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, ElasticNetCV
from sklearn.metrics import r2_score
import pandas as pd
from sklearn.base import BaseEstimator, clone

In [None]:
def gen_data(n, p, *, sparse=True, seed=None, n_pop=100000):
    """Simulate train, test and near-population samples from one design.

    The regression function is ``g(x) = exp(4 * x[0]) + x[1:] @ beta``.
    Regressors are drawn i.i.d. uniform on ``[-0.5, 0.5]`` and outcomes are
    ``y = g(x) + e`` with standard normal noise ``e``.

    Parameters
    ----------
    n : int
        Number of observations in the training sample and in the test sample.
    p : int
        Number of regressors. The first column enters through the
        exponential term, so ``p`` must be at least 1.
    sparse : bool, default=True
        If True, ``beta[j] = 1 / (j + 1) ** 2`` (rapidly decaying
        coefficients); otherwise ``beta`` is drawn as ``0.2 * N(0, 1)``.
    seed : int or None, default=None
        If given, all draws come from a private ``np.random.RandomState(seed)``
        so the result is reproducible and the global NumPy generator is left
        untouched. If None, the global ``np.random`` generator is used.
    n_pop : int, default=100000
        Size of the large sample used as a stand-in for the population.

    Returns
    -------
    tuple
        ``(X, y, gX, Xtest, ytest, gXtest, Xpop, ypop, gXpop)`` where each
        ``g*`` array holds the noiseless regression function values.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    if sparse:
        beta = (1 / np.arange(1, p)) ** 2
    else:
        beta = rng.normal(0, 1, p - 1) * 0.2

    def true_fn(x):
        return np.exp(4 * x[:, 0]) + (x[:, 1:] @ beta)

    def draw(size):
        # Draw order (features, then noise) is kept fixed so that a seeded
        # global generator yields the same data as before.
        features = rng.uniform(-.5, .5, size=(size, p))
        signal = true_fn(features)
        outcome = signal + rng.normal(0, 1, size)
        return features, outcome, signal

    X, y, gX = draw(n)
    Xtest, ytest, gXtest = draw(n)
    Xpop, ypop, gXpop = draw(n_pop)  # almost population limit
    return X, y, gX, Xtest, ytest, gXtest, Xpop, ypop, gXpop

## Data Generating Process: Approximately Sparse

In [None]:
# Simulation design with far more regressors than observations (p > n).
n, p = 100, 400
(X, y, gX,
 Xtest, ytest, gXtest,
 Xpop, ypop, gXpop) = gen_data(n, p, sparse=True)

In [None]:
# Training sample: noisy outcome plotted against the noiseless regression function.
plt.figure()
plt.scatter(gX, y)
plt.xlabel(r"$g(X)$")
plt.ylabel(r"$Y$")
plt.title(r"$Y$ vs. $g(X)$")
plt.show()

In [None]:
# Two estimates of the R^2 of the true regression function, both computed on
# the large near-population sample: one from the residual variance, one from
# the variance of g(X) itself.
print(f"theoretical R^2:, {1 - (ypop - gXpop).var() / ypop.var()}")
print(f"theoretical R^2:, {gXpop.var() / ypop.var()}")

Note that the `cv.glmnet` function in R **standardizes** the ***X*** data by default, so we have to standardize our data before running the sklearn estimators. The **normalize** parameter will help with this. However, `cv.glmnet` also standardizes the **Y** [variable](https://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html) and then unstandardizes the coefficients from the regression. To do this with sklearn, we will standardize the **Y** variable before fitting with the **StandardScaler** function. Finally, the R function uses 10 folds by default, so we will adjust our model to use **cv=10** folds.\
\
The parameter **l1_ratio** corresponds to **alpha** in the glmnet R package, while **alpha** corresponds to the **lambda** parameter in **glmnet**. Specifically, **l1_ratio = 1** is the lasso penalty. Currently, **l1_ratio <= 0.01** is not reliable unless you supply your own sequence of **alpha**.

In [None]:
def poly(x):
    """Expand the first column into linear, squared and cubed terms; keep the rest."""
    first = x[:, [0]]
    return np.hstack([first, first**2, first**3, x[:, 1:]])


# Fit the scaler on the training design only, then reuse it for the other samples.
scaler = StandardScaler()
X = scaler.fit_transform(poly(X))
Xtest = scaler.transform(poly(Xtest))
Xpop = scaler.transform(poly(Xpop))

In [None]:
# Regressions: Lasso, Ridge and Elastic Net, each fit on the training sample
lcv, ridge, enet = (
    estimator.fit(X, y)
    for estimator in (LassoCV(), RidgeCV(), ElasticNetCV(l1_ratio=0.5))
)

In [None]:
# Out-of-sample R^2 on the test sample for the cross-validated estimators.
r2_lcv, r2_ridge, r2_enet = (
    r2_score(ytest, model.predict(Xtest)) for model in (lcv, ridge, enet)
)
r2_lcv, r2_ridge, r2_enet

In [None]:
# R^2 on the large near-population sample; this overwrites the test-sample values.
r2_lcv, r2_ridge, r2_enet = (
    r2_score(ypop, model.predict(Xpop)) for model in (lcv, ridge, enet)
)
r2_lcv, r2_ridge, r2_enet

Here we compute the Lasso and OLS post-Lasso using plug-in choices for the penalty levels, using the package hdm.

Rlasso functionality: it searches for the right set of regressors. This function was made for the case of ***p*** regressors and ***n*** observations where ***p >>>> n***. It assumes that the error is i.i.d. The errors may be non-Gaussian or heteroscedastic.\
The post-Lasso function runs OLS on the selected ***T*** regressors.
To select them, it uses $\lambda$ as the penalty variable.\
**Fun fact: the function rlasso got its name because it is the "rigorous" Lasso.**\
We found Python code that tries to replicate the main functions of the hdm R package. It was written by [Max Huppertz](https://maxhuppertz.github.io/code/). His library is in this [repository](https://github.com/maxhuppertz/hdmpy). Download the repository and copy its folder to your site-packages folder. In my case it is located here ***C:\Python\Python38\Lib\site-packages*** .

We need to install this package ***pip install multiprocess***.

In [None]:
# We wrap the package so that it has the familiar sklearn API
import hdmpy

class RLasso(BaseEstimator):
    """sklearn-style wrapper around ``hdmpy.rlasso``.

    Parameters
    ----------
    post : bool, default=True
        Forwarded unchanged to ``hdmpy.rlasso`` as its ``post`` argument.
    """

    def __init__(self, *, post=True):
        self.post = post

    def fit(self, X, y):
        """Run ``hdmpy.rlasso`` on ``(X, y)``, store the result and return ``self``."""
        fitted = hdmpy.rlasso(X, y, post=self.post)
        self.rlasso_ = fitted
        return self

    def predict(self, X):
        """Return ``X @ beta + intercept`` using the stored estimates."""
        estimates = self.rlasso_.est
        beta = estimates['beta'].values.flatten()
        intercept = estimates['intercept'].values
        return X @ beta + intercept

In [None]:
# Plug-in penalty Lasso, first without and then with the post-Lasso option.
rlasso, rlasso_post = (
    RLasso(post=use_post).fit(X, y) for use_post in (False, True)
)

In [None]:
# Out-of-sample R^2 on the test sample for both RLasso variants.
r2_rlasso, r2_rlasso_post = (
    r2_score(ytest, model.predict(Xtest)) for model in (rlasso, rlasso_post)
)
r2_rlasso, r2_rlasso_post

In [None]:
# R^2 on the large near-population sample; this overwrites the test-sample values.
r2_rlasso, r2_rlasso_post = (
    r2_score(ypop, model.predict(Xpop)) for model in (rlasso, rlasso_post)
)
r2_rlasso, r2_rlasso_post

Now let's try the LAVA estimator

In [None]:
# We construct an sklearn API estimator that implements the LAVA method

class Lava(BaseEstimator):
    """Lava estimator: the sum of a lasso-type and a ridge-type component.

    The two components are fit by alternation: each one is refit on the
    residual left by the other.

    Parameters
    ----------
    lasso : estimator
        Unfitted sklearn-style estimator (``fit``/``predict``) for the first
        component.
    ridge : estimator
        Unfitted sklearn-style estimator (``fit``/``predict``) for the second
        component.
    iterations : int, default=5
        Total number of alternating passes; values below 1 behave like 1.

    Attributes
    ----------
    lasso_, ridge_ : estimator
        The fitted components, set by ``fit``.
    """

    def __init__(self, *, lasso, ridge, iterations=5):
        self.lasso = lasso
        self.ridge = ridge
        self.iterations = iterations

    def fit(self, X, y):
        """Fit both components on ``(X, y)`` by alternation and return ``self``."""
        # Work on clones so the estimators passed to the constructor stay unfitted.
        lasso = clone(self.lasso).fit(X, y)
        ridge = clone(self.ridge).fit(X, y - lasso.predict(X))

        for _ in range(self.iterations - 1):
            lasso = lasso.fit(X, y - ridge.predict(X))
            # Refit a fresh copy of the configured ridge estimator rather than
            # a hard-coded RidgeCV(), so custom `ridge` settings apply in
            # every pass and not only in the first one.
            ridge = clone(self.ridge).fit(X, y - lasso.predict(X))

        self.lasso_ = lasso
        self.ridge_ = ridge
        return self

    def predict(self, X):
        """Return the sum of the two fitted components' predictions."""
        return self.lasso_.predict(X) + self.ridge_.predict(X)

In [None]:
# Lava built from the plug-in Lasso (no post-Lasso) and a cross-validated Ridge.
lava = Lava(
    lasso=RLasso(post=False),
    ridge=RidgeCV(),
).fit(X, y)
lava

In [None]:
# Out-of-sample R^2 of Lava on the test sample.
r2_lava = r2_score(y_true=ytest, y_pred=lava.predict(Xtest))
r2_lava

In [None]:
# R^2 of Lava on the large near-population sample; overwrites the test-sample value.
r2_lava = r2_score(y_true=ypop, y_pred=lava.predict(Xpop))
r2_lava

In [None]:
# One row per estimator, holding the R^2 values computed on the population sample.
df = pd.Series(
    {
        'LassoCV': r2_lcv,
        'RidgeCV': r2_ridge,
        'ElasticNetCV': r2_enet,
        'RLasso': r2_rlasso,
        'RLassoOLS': r2_rlasso_post,
        'Lava': r2_lava,
    }
).to_frame(name='Population R-squared')
df

In [None]:
# Test-sample predictions plotted against the true regression function;
# the black points mark where prediction equals g(X).
plt.figure()
plt.title(r"Predictions vs. $g(X)$")
plt.scatter(gXtest, gXtest, marker='.', c='black', label=r'$g(X)$')
plt.scatter(gXtest, rlasso.predict(Xtest), marker='D', c='red', label='RLasso')
plt.scatter(gXtest, rlasso_post.predict(Xtest), marker='^', c='green', label='RLassoOLS')
plt.scatter(gXtest, lcv.predict(Xtest), marker='o', c='blue', label='LassoCV')
plt.xlabel(r"$g(X)$")
plt.ylabel("Prediction")
plt.legend(loc='lower right')
plt.show()

## Data Generating Process: Approximately Sparse + Small Dense Part

In [None]:
# Same dimensions as before, now with randomly drawn (non-decaying) coefficients.
n, p = 100, 400
(X, y, gX,
 Xtest, ytest, gXtest,
 Xpop, ypop, gXpop) = gen_data(n, p, sparse=False)

In [None]:
# Two estimates of the R^2 of the true regression function, both computed on
# the large near-population sample: one from the residual variance, one from
# the variance of g(X) itself.
print(f"theoretical R^2:, {1 - (ypop - gXpop).var() / ypop.var()}")
print(f"theoretical R^2:, {gXpop.var() / ypop.var()}")

In [None]:
def poly(x):
    """Expand the first column into linear, squared and cubed terms; keep the rest."""
    first = x[:, [0]]
    return np.hstack([first, first**2, first**3, x[:, 1:]])


# Fit the scaler on the training design only, then reuse it for the other samples.
scaler = StandardScaler()
X = scaler.fit_transform(poly(X))
Xtest = scaler.transform(poly(Xtest))
Xpop = scaler.transform(poly(Xpop))

In [None]:
# Regressions: refit every estimator on the new training sample
lcv, ridge, enet = (
    estimator.fit(X, y)
    for estimator in (LassoCV(), RidgeCV(), ElasticNetCV(l1_ratio=0.5))
)
rlasso, rlasso_post = (
    RLasso(post=use_post).fit(X, y) for use_post in (False, True)
)
lava = Lava(
    lasso=RLasso(post=False),
    ridge=RidgeCV(),
).fit(X, y)

In [None]:
# R^2 of every fitted estimator on the large near-population sample.
(r2_lcv, r2_ridge, r2_enet,
 r2_rlasso, r2_rlasso_post, r2_lava) = (
    r2_score(ypop, model.predict(Xpop))
    for model in (lcv, ridge, enet, rlasso, rlasso_post, lava)
)

In [None]:
# One row per estimator, holding the R^2 values computed on the population sample.
df = pd.Series(
    {
        'LassoCV': r2_lcv,
        'RidgeCV': r2_ridge,
        'ElasticNetCV': r2_enet,
        'RLasso': r2_rlasso,
        'RLassoOLS': r2_rlasso_post,
        'Lava': r2_lava,
    }
).to_frame(name='Population R-squared')
df

In [None]:
# Test-sample predictions plotted against the true regression function;
# the black points mark where prediction equals g(X).
plt.figure()
plt.title(r"Predictions vs. $g(X)$")
plt.scatter(gXtest, gXtest, marker='.', c='black', label=r'$g(X)$')
plt.scatter(gXtest, rlasso.predict(Xtest), marker='D', c='red', label='RLasso')
plt.scatter(gXtest, rlasso_post.predict(Xtest), marker='^', c='green', label='RLassoOLS')
plt.scatter(gXtest, lcv.predict(Xtest), marker='o', c='blue', label='LassoCV')
plt.scatter(gXtest, lava.predict(Xtest), marker='o', c='magenta', label='Lava')
plt.xlabel(r"$g(X)$")
plt.ylabel("Prediction")
plt.legend(loc='lower right')
plt.show()