In [2]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
import statsmodels.api as sm

df = pd.read_csv("Hitters.csv")
df.dropna()
df = df[df['Salary'].notna()] 
df.info()

data = pd.get_dummies(df, columns = ['League', 'Division', 'NewLeague'])

def datos(data):


    x = data.drop(columns = ['Salary'])
    y = data['Salary']
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.7, random_state=42)
    scaler = StandardScaler().fit(x_train)
    x_train_scaled = scaler.transform(x_train)
    x_test_scaled = scaler.transform(x_test)
    
    x_train_scaled = pd.DataFrame(x_train_scaled, columns=x_train.columns, index=x_train.index)
    x_test_scaled = pd.DataFrame(x_test_scaled, columns=x_test.columns, index=x_test.index)
    
    return x_train_scaled, x_test_scaled, y_train, y_test

def datos_polinomiales(data, degree=2):


    x_train_scaled, x_test_scaled, y_train, y_test = datos(data)

    poly = PolynomialFeatures(degree=degree, include_bias=False)
    x_train_poly = poly.fit_transform(x_train_scaled)
    x_test_poly = poly.transform(x_test_scaled)

    feature_names = poly.get_feature_names_out(x_train_scaled.columns)
    x_train_poly = pd.DataFrame(x_train_poly, columns=feature_names, index=x_train_scaled.index)
    x_test_poly = pd.DataFrame(x_test_poly, columns=feature_names, index=x_test_scaled.index)

    return x_train_poly, x_test_poly, y_train, y_test

def sin_penalizacion(data, degree):


    if degree == 1:
        x_train_scaled, x_test_scaled, y_train, y_test = datos(data)
        ols = sm.OLS(y_train, sm.add_constant(x_train_scaled))
        results = ols.fit()
        y_pred = results.predict(sm.add_constant(x_test_scaled))
        r2s = r2_score(y_test, y_pred)
        res = results.summary()
    else:           
        x_train_scaled_poly, x_test_scaled_poly, y_train, y_test = datos_polinomiales(data, degree)
        model = LinearRegression()
        model.fit(x_train_scaled_poly, y_train)
        y_pred_train = model.predict(x_train_scaled_poly)
        r2_train = r2_score(y_train, y_pred_train)
        y_pred = model.predict(x_test_scaled_poly)
        r2_test = r2_score(y_test, y_pred)

        res = pd.DataFrame({'Variables': x_train_scaled_poly.columns, 'Coeficientes': model.coef_})
        r2s = pd.DataFrame({'R2_train': [r2_train], 'R2_test': [r2_test]})

    return res,r2s

def ridge(data, alpha, degree):


    if degree == 1:
        x_train_scaled, x_test_scaled, y_train, y_test = datos(data)
    else:
         x_train_scaled, x_test_scaled, y_train, y_test = datos_polinomiales(data, degree)   
    ridge = Ridge(alpha=alpha)
    ridge.fit(x_train_scaled, y_train)
    y_pred_train = ridge.predict(x_train_scaled)
    r2_train = r2_score(y_train, y_pred_train)
    y_pred = ridge.predict(x_test_scaled)
    r2 = r2_score(y_test, y_pred)

    res = pd.DataFrame({'Variables': x_train_scaled.columns, 'Coeficientes': ridge.coef_})
    r2s = pd.DataFrame({'R2_train': [r2_train], 'R2_test': [r2]})
    return res, r2s

def lasso(data, alpha, degree):


    if degree == 1:
        x_train_scaled, x_test_scaled, y_train, y_test = datos(data)
    else:
         x_train_scaled, x_test_scaled, y_train, y_test = datos_polinomiales(data, degree)  
    lasso = Lasso(alpha = alpha, max_iter = 10_000)
    lasso.fit(x_train_scaled, y_train)
    y_pred_train = lasso.predict(x_train_scaled)
    r2_train = r2_score(y_train, y_pred_train)
    y_pred = lasso.predict(x_test_scaled)
    r2 = r2_score(y_test, y_pred)

    res = pd.DataFrame({'Variables': x_train_scaled.columns, 'Coeficientes': lasso.coef_})
    r2s = pd.DataFrame({'R2_train': [r2_train], 'R2_test': [r2]})
    return res, r2s

def elastic_net(data, alpha, ratio, degree):

    
    if degree == 1:
        x_train_scaled, x_test_scaled, y_train, y_test = datos(data)
    else:
         x_train_scaled, x_test_scaled, y_train, y_test = datos_polinomiales(data, degree)  
    elastic_net = ElasticNet(alpha=alpha, l1_ratio=ratio, max_iter=10_000)
    elastic_net.fit(x_train_scaled, y_train)
    y_pred_train = elastic_net.predict(x_train_scaled)
    r2_train = r2_score(y_train, y_pred_train)
    y_pred = elastic_net.predict(x_test_scaled)
    r2 = r2_score(y_test, y_pred)

    res = pd.DataFrame({'Variables': x_train_scaled.columns, 'Coeficientes': elastic_net.coef_})
    r2s = pd.DataFrame({'R2_train': [r2_train], 'R2_test': [r2]})
    return res, r2s

<class 'pandas.core.frame.DataFrame'>
Index: 263 entries, 1 to 321
Data columns (total 20 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   AtBat      263 non-null    int64  
 1   Hits       263 non-null    int64  
 2   HmRun      263 non-null    int64  
 3   Runs       263 non-null    int64  
 4   RBI        263 non-null    int64  
 5   Walks      263 non-null    int64  
 6   Years      263 non-null    int64  
 7   CAtBat     263 non-null    int64  
 8   CHits      263 non-null    int64  
 9   CHmRun     263 non-null    int64  
 10  CRuns      263 non-null    int64  
 11  CRBI       263 non-null    int64  
 12  CWalks     263 non-null    int64  
 13  League     263 non-null    object 
 14  Division   263 non-null    object 
 15  PutOuts    263 non-null    int64  
 16  Assists    263 non-null    int64  
 17  Errors     263 non-null    int64  
 18  Salary     263 non-null    float64
 19  NewLeague  263 non-null    object 
dtypes: float64(1), 

In [3]:
n = len(data)
n
a, b, b_0, b_i, x_scaled = sin_penalizacion(data, 2)
n = len(x_scaled)



X = np.column_stack((np.ones(n), x_scaled))
n2 = X.shape[1]
aux = X.T @ X + np.eye(n2)*1e-3
var_beta = np.linalg.inv(aux) #* var
std_beta = np.sqrt(np.diag(var_beta))

t_stats = np.array([b_0] + list(b_i)) / std_beta
p_values = [2 * (1 - stats.t.cdf(np.abs(t), n - n2 - 1)) for t in t_stats]

ValueError: not enough values to unpack (expected 5, got 2)