<a href="https://colab.research.google.com/github/AntoineChapel/metrics1_part2_hw1/blob/main/exo5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import r2_score

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
#adjust:
DATA_PATH = r"/content/cps09mar.txt"
np.random.seed(6411)

df = pd.read_csv(DATA_PATH, sep="\t")



df.columns = ["age", "female", "hisp", "education", "earnings", "hours", "week", "union", "uncov", "region", "race", "marital"]


df = df[df['female']==1]
df = df[df['race']==4]

df['logwage'] = np.log(df['earnings'])

for i in [12, 13, 14, 15, 16, 18, 20]:
    df['educ_dum_'+str(i)] = (df['education']==i).astype(int)

df['experience/40'] = (df['age'] - df['education'])/40
for i in np.arange(2, 10):
    df['experience/40_pow_'+str(i)] = df['experience/40']**i

df['married'] = (df['marital'] <= 3).astype(int)
df['widowed'] = (df['marital'] == 4).astype(int)
df['divorced'] = (df['marital'] == 5).astype(int)
df['separated'] = (df['marital'] == 6).astype(int)
df['never_married'] = (df['marital']==7).astype(int)

for i in np.arange(1, 5):
    df['region_'+str(i)] = (df['region']==i).astype(int)

for i in range(2):
    df['union_'+str(i)] = (df['union']==i).astype(int)


#keep only relevant variables
df = df.drop(columns=["age", "female", "hisp", "earnings", "hours", "week", "union", "uncov", "region", "race", "marital"])


In [5]:
df.head()

Unnamed: 0,education,logwage,educ_dum_12,educ_dum_13,educ_dum_14,educ_dum_15,educ_dum_16,educ_dum_18,educ_dum_20,experience/40,...,widowed,divorced,separated,never_married,region_1,region_2,region_3,region_4,union_0,union_1
155,16,10.491274,0,0,0,0,1,0,0,0.4,...,0,0,0,0,1,0,0,0,1,0
407,16,9.942708,0,0,0,0,1,0,0,0.825,...,0,0,0,0,1,0,0,0,1,0
740,12,9.942708,1,0,0,0,0,0,0,0.875,...,0,0,0,0,1,0,0,0,1,0
852,12,10.203592,1,0,0,0,0,0,0,0.875,...,0,0,0,0,1,0,0,0,1,0
1036,13,10.373491,0,1,0,0,0,0,0,0.625,...,0,1,0,0,1,0,0,0,1,0


In [6]:
y = df['logwage'].to_numpy().flatten()
X = df.drop(columns=['logwage']).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

reg_results = np.empty((4, len(df.columns)))

In [7]:
#OLS
ols_reg = LinearRegression().fit(X_train, y_train)
y_pred_ols = ols_reg.predict(X_test)
r2_ols = r2_score(y_test, y_pred_ols)
print(f"R2: {r2_ols}")
print(ols_reg.coef_)

reg_results[0, :-1] = np.array(ols_reg.coef_)
reg_results[0, -1] = r2_ols

R2: 0.28514228291124444
[ 1.21727050e-02  5.66685434e-02  2.49983196e-01  4.55529099e-01
  1.04132780e-09  7.02523706e-01  8.35912753e-01  1.10708543e+00
 -1.17783539e+01  1.08688833e+02 -4.03644252e+02  8.17520330e+02
 -1.01493239e+03  8.10763291e+02 -4.13094721e+02  1.22758191e+02
 -1.61126217e+01  1.77082785e-03  6.74985013e-02  9.04560853e-02
 -1.30296434e-01 -2.94289447e-02  1.20421438e-01 -1.45563781e-01
 -2.67471797e-02  5.18894840e-02  5.76187236e-02 -5.76187180e-02]


In [8]:
#Ridge: hyperparameter tuning
parameters = {'alpha': np.logspace(0, 3, 30, base=10)/(10**5)}
ridge_reg = GridSearchCV(Ridge(), parameters, cv=5)
ridge_reg.fit(X_train, y_train)
print(ridge_reg.best_params_)

{'alpha': 0.0002807216203941177}


In [9]:
#Ridge model
final_ridge = Ridge(alpha=ridge_reg.best_params_['alpha']).fit(X_train, y_train)
r2_ridge = r2_score(y_test, final_ridge.predict(X_test))
print(final_ridge.coef_)
print(f"R2: {r2_ridge}")


reg_results[1, :-1] = np.array(final_ridge.coef_)
reg_results[1, -1] = r2_ridge

[ 1.23516669e-02  5.67158487e-02  2.48949106e-01  4.54584409e-01
  0.00000000e+00  7.00465157e-01  8.33602290e-01  1.10611972e+00
  7.32161492e+00 -1.13018515e+01 -1.09145204e+00  9.60049915e+00
  4.83997093e+00 -7.26425400e+00 -7.90651183e+00  9.71118847e+00
 -2.52833396e+00  1.47865144e-03  6.68156098e-02  9.18074177e-02
 -1.30709959e-01 -2.93917144e-02  1.21124143e-01 -1.46722774e-01
 -2.67906764e-02  5.23893040e-02  5.77050377e-02 -5.77050373e-02]
R2: 0.28509765503997053


In [10]:
#Lasso hyperparameter tuning:
parameters = {'alpha': np.logspace(0, 3, 30, base=10)/(10**5)}
lasso_reg = GridSearchCV(Lasso(tol=1e-2, max_iter=2000), parameters, cv=5)
lasso_reg.fit(X_train, y_train)
print(lasso_reg.best_params_)


{'alpha': 0.0003562247890262442}


In [11]:
#Lasso model
final_lasso = Lasso(alpha=0.1).fit(X_train, y_train)
r2_lasso = r2_score(y_test, final_lasso.predict(X_test))
print(final_lasso.coef_)
print(f"R2: {r2_lasso}")
print(np.sum(np.array(final_lasso.coef_) != 0))

reg_results[2, :-1] = np.array(final_lasso.coef_)
reg_results[2, -1] = r2_lasso

[ 0.09756184 -0.         -0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.00088011  0.
  0.          0.         -0.         -0.          0.         -0.
 -0.          0.          0.         -0.        ]
R2: 0.2558707356993879
2


In [12]:
#Elasticreg hyperparameter tuning
parameters = {'alpha': np.logspace(0, 3, 100, base=10)/(10**5),
              'l1_ratio': np.linspace(0.01, 0.99, 100)}

elastic_reg = RandomizedSearchCV(ElasticNet(), parameters, cv=5)
elastic_reg.fit(X_train, y_train)
print(elastic_reg.best_params_)

{'l1_ratio': 0.059494949494949496, 'alpha': 0.0008697490026177834}


In [13]:
final_elastic = ElasticNet(l1_ratio = elastic_reg.best_params_['l1_ratio'],
                           alpha = elastic_reg.best_params_['alpha']).fit(X_train, y_train)
r2_elastic = r2_score(y_test, final_elastic.predict(X_test))

print(final_elastic.coef_)
print(f"R2: {r2_elastic}")
print(np.sum(np.array(final_elastic.coef_) != 0))

reg_results[3, :-1] = np.array(final_elastic.coef_)
reg_results[3, -1] = r2_elastic

[ 0.03959471 -0.06787966  0.08747788  0.2581133   0.          0.4565332
  0.52262084  0.73376201  0.35461208 -0.0192919  -0.02413888  0.
 -0.03106565 -0.05601302 -0.01502453  0.00642186  0.01449364  0.
  0.06210243  0.09649829 -0.14238757 -0.05054639  0.12155413 -0.14659472
 -0.02607408  0.05109339  0.04945887 -0.0494558 ]
R2: 0.27541585618348197
25


In [14]:
reg_results2 = np.hstack((reg_results, (np.sum(reg_results != 0, axis=1).reshape(-1, 1) - 1)))

reg_results_df = pd.DataFrame(np.round(reg_results2.T, 3))

indexes = df.drop(columns=['logwage']).columns.tolist()
indexes.append("R2")
indexes.append("n_nonzero_coef")
reg_results_df.index = indexes
reg_results_df.columns = ["OLS", "Ridge", "Lasso", "Elasticnet"]


print(reg_results_df)

                          OLS   Ridge  Lasso  Elasticnet
education               0.012   0.012  0.098       0.040
educ_dum_12             0.057   0.057 -0.000      -0.068
educ_dum_13             0.250   0.249 -0.000       0.087
educ_dum_14             0.456   0.455  0.000       0.258
educ_dum_15             0.000   0.000  0.000       0.000
educ_dum_16             0.703   0.700  0.000       0.457
educ_dum_18             0.836   0.834  0.000       0.523
educ_dum_20             1.107   1.106  0.000       0.734
experience/40         -11.778   7.322  0.000       0.355
experience/40_pow_2   108.689 -11.302  0.000      -0.019
experience/40_pow_3  -403.644  -1.091  0.000      -0.024
experience/40_pow_4   817.520   9.600  0.000       0.000
experience/40_pow_5 -1014.932   4.840  0.000      -0.031
experience/40_pow_6   810.763  -7.264  0.000      -0.056
experience/40_pow_7  -413.095  -7.907  0.000      -0.015
experience/40_pow_8   122.758   9.711  0.000       0.006
experience/40_pow_9   -16.113  