# Proyecto 1 - Laboratorio de Aprendizaje Estadístico

In [3]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
import statsmodels.api as sm

Teoria:

- Regresion lineal
- Regresion polinomial
- Ridge
- Lasso
- p values
- elastic net

### Descarga de Datos

In [4]:
df = pd.read_csv("Hitters.csv")
df.dropna()
df = df[df['Salary'].notna()] 
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 263 entries, 1 to 321
Data columns (total 20 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   AtBat      263 non-null    int64  
 1   Hits       263 non-null    int64  
 2   HmRun      263 non-null    int64  
 3   Runs       263 non-null    int64  
 4   RBI        263 non-null    int64  
 5   Walks      263 non-null    int64  
 6   Years      263 non-null    int64  
 7   CAtBat     263 non-null    int64  
 8   CHits      263 non-null    int64  
 9   CHmRun     263 non-null    int64  
 10  CRuns      263 non-null    int64  
 11  CRBI       263 non-null    int64  
 12  CWalks     263 non-null    int64  
 13  League     263 non-null    object 
 14  Division   263 non-null    object 
 15  PutOuts    263 non-null    int64  
 16  Assists    263 non-null    int64  
 17  Errors     263 non-null    int64  
 18  Salary     263 non-null    float64
 19  NewLeague  263 non-null    object 
dtypes: float64(1), 

In [5]:
data = pd.get_dummies(df, columns = ['League', 'Division', 'NewLeague'])

### Funciones

In [34]:
def datos(data):


    x = data.drop(columns = ['Salary'])
    y = data['Salary']
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.7, random_state=42)
    scaler = StandardScaler().fit(x_train)
    x_train_scaled = scaler.transform(x_train)
    x_test_scaled = scaler.transform(x_test)
    
    x_train_scaled = pd.DataFrame(x_train_scaled, columns=x_train.columns, index=x_train.index)
    x_test_scaled = pd.DataFrame(x_test_scaled, columns=x_test.columns, index=x_test.index)
    
    return x_train_scaled, x_test_scaled, y_train, y_test

def datos_polinomiales(data, degree=2):


    x_train_scaled, x_test_scaled, y_train, y_test = datos(data)

    poly = PolynomialFeatures(degree=degree, include_bias=False)
    x_train_poly = poly.fit_transform(x_train_scaled)
    x_test_poly = poly.transform(x_test_scaled)

    feature_names = poly.get_feature_names_out(x_train_scaled.columns)
    x_train_poly = pd.DataFrame(x_train_poly, columns=feature_names, index=x_train_scaled.index)
    x_test_poly = pd.DataFrame(x_test_poly, columns=feature_names, index=x_test_scaled.index)

    return x_train_poly, x_test_poly, y_train, y_test

def sin_penalizacion(data, degree):


    if degree == 1:
        x_train_scaled, x_test_scaled, y_train, y_test = datos(data)
        ols = sm.OLS(y_train, sm.add_constant(x_train_scaled))
        results = ols.fit()
        y_pred = results.predict(sm.add_constant(x_test_scaled))
        r2s = r2_score(y_test, y_pred)
        res = results.summary()
    else:           
        x_train_scaled_poly, x_test_scaled_poly, y_train, y_test = datos_polinomiales(data, degree)
        model = LinearRegression()
        model.fit(x_train_scaled_poly, y_train)
        y_pred_train = model.predict(x_train_scaled_poly)
        r2_train = r2_score(y_train, y_pred_train)
        y_pred = model.predict(x_test_scaled_poly)
        r2_test = r2_score(y_test, y_pred)

        res = pd.DataFrame({'Variables': x_train_scaled_poly.columns, 'Coeficientes': model.coef_})
        r2s = pd.DataFrame({'R2_train': [r2_train], 'R2_test': [r2_test]})

    return res,r2s

def ridge(data, alpha, degree):


    if degree == 1:
        x_train_scaled, x_test_scaled, y_train, y_test = datos(data)
    else:
         x_train_scaled, x_test_scaled, y_train, y_test = datos_polinomiales(data, degree)   
    ridge = Ridge(alpha=alpha)
    ridge.fit(x_train_scaled, y_train)
    y_pred_train = ridge.predict(x_train_scaled)
    r2_train = r2_score(y_train, y_pred_train)
    y_pred = ridge.predict(x_test_scaled)
    r2 = r2_score(y_test, y_pred)

    res = pd.DataFrame({'Variables': x_train_scaled.columns, 'Coeficientes': ridge.coef_})
    r2s = pd.DataFrame({'R2_train': [r2_train], 'R2_test': [r2]})
    return res, r2s

def lasso(data, alpha, degree):


    if degree == 1:
        x_train_scaled, x_test_scaled, y_train, y_test = datos(data)
    else:
         x_train_scaled, x_test_scaled, y_train, y_test = datos_polinomiales(data, degree)  
    lasso = Lasso(alpha = alpha, max_iter = 10_000)
    lasso.fit(x_train_scaled, y_train)
    y_pred_train = lasso.predict(x_train_scaled)
    r2_train = r2_score(y_train, y_pred_train)
    y_pred = lasso.predict(x_test_scaled)
    r2 = r2_score(y_test, y_pred)

    res = pd.DataFrame({'Variables': x_train_scaled.columns, 'Coeficientes': lasso.coef_})
    r2s = pd.DataFrame({'R2_train': [r2_train], 'R2_test': [r2]})
    return res, r2s

def elastic_net(data, alpha, ratio, degree):

    
    if degree == 1:
        x_train_scaled, x_test_scaled, y_train, y_test = datos(data)
    else:
         x_train_scaled, x_test_scaled, y_train, y_test = datos_polinomiales(data, degree)  
    elastic_net = ElasticNet(alpha=alpha, l1_ratio=ratio, max_iter=10_000)
    elastic_net.fit(x_train_scaled, y_train)
    y_pred_train = elastic_net.predict(x_train_scaled)
    r2_train = r2_score(y_train, y_pred_train)
    y_pred = elastic_net.predict(x_test_scaled)
    r2 = r2_score(y_test, y_pred)

    res = pd.DataFrame({'Variables': x_train_scaled.columns, 'Coeficientes': elastic_net.coef_})
    r2s = pd.DataFrame({'R2_train': [r2_train], 'R2_test': [r2]})
    return res, r2s



### Regresión lineal

In [12]:
n = len(data)
n

263

In [16]:
X

array([[ 1.        ,  0.29490081,  0.28256464, ...,  0.82178218,
        -0.82178218,  0.82178218],
       [ 1.        ,  1.31711123,  1.23889828, ...,  1.21686747,
        -1.21686747,  1.21686747],
       [ 1.        ,  0.53459153,  0.3991907 , ...,  1.21686747,
        -1.21686747,  1.21686747],
       ...,
       [ 1.        ,  0.7883817 ,  0.74906886, ...,  0.82178218,
        -0.82178218,  0.82178218],
       [ 1.        , -0.39597244, -0.06731352, ...,  0.82178218,
        -0.82178218,  0.82178218],
       [ 1.        ,  0.32309972, -0.16061436, ...,  0.82178218,
        -0.82178218,  0.82178218]])

In [32]:
n

184

In [23]:
X.shape[1]

276

In [36]:
sin_penalizacion(data, 1)[0]

0,1,2,3
Dep. Variable:,Salary,R-squared:,0.603
Model:,OLS,Adj. R-squared:,0.557
Method:,Least Squares,F-statistic:,13.12
Date:,"Thu, 25 Sep 2025",Prob (F-statistic):,4.36e-24
Time:,16:17:42,Log-Likelihood:,-1292.1
No. Observations:,184,AIC:,2624.0
Df Residuals:,164,BIC:,2688.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,535.8383,21.185,25.294,0.000,494.008,577.668
AtBat,-191.3439,104.702,-1.828,0.069,-398.082,15.394
Hits,297.1558,109.671,2.710,0.007,80.607,513.705
HmRun,84.9538,64.213,1.323,0.188,-41.837,211.745
Runs,-36.1422,79.797,-0.453,0.651,-193.704,121.419
RBI,-74.9421,73.741,-1.016,0.311,-220.546,70.662
Walks,81.8926,41.687,1.964,0.051,-0.420,164.205
Years,96.4996,67.973,1.420,0.158,-37.716,230.716
CAtBat,-911.7612,386.421,-2.359,0.019,-1674.764,-148.759

0,1,2,3
Omnibus:,8.366,Durbin-Watson:,1.871
Prob(Omnibus):,0.015,Jarque-Bera (JB):,10.475
Skew:,0.322,Prob(JB):,0.00531
Kurtosis:,3.976,Cond. No.,4.48e+16


In [27]:
a, b, b_0, b_i, x_scaled = sin_penalizacion(data, 2)
n = len(x_scaled)



X = np.column_stack((np.ones(n), x_scaled))
n2 = X.shape[1]
aux = X.T @ X + np.eye(n2)*1e-3
var_beta = np.linalg.inv(aux) #* var
std_beta = np.sqrt(np.diag(var_beta))

t_stats = np.array([b_0] + list(b_i)) / std_beta
p_values = [2 * (1 - stats.t.cdf(np.abs(t), n - n2 - 1)) for t in t_stats]

In [31]:
n - n2

-92

In [28]:
p_values

[np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float64(nan),
 np.float6

In [11]:
sin_penalizacion(data, 1)[1]

UnboundLocalError: local variable 'model' referenced before assignment

In [7]:
ridge(data, 1, 1)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,-199.559388
1,Hits,289.887062
2,HmRun,73.273902
3,Runs,-17.321385
4,RBI,-56.166674
5,Walks,75.59
6,Years,35.772276
7,CAtBat,-320.705418
8,CHits,140.723455
9,CHmRun,49.71939


In [8]:
ridge(data, 1, 1)[1]

Unnamed: 0,R2_train,R2_test
0,0.594157,0.403631


In [9]:
lasso(data, 1, 1)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,-214.1244
1,Hits,302.52
2,HmRun,64.47799
3,Runs,-21.95379
4,RBI,-43.02229
5,Walks,72.46259
6,Years,44.24827
7,CAtBat,-392.8808
8,CHits,24.25419
9,CHmRun,0.0


In [10]:
lasso(data, 1, 1)[1]

Unnamed: 0,R2_train,R2_test
0,0.5971,0.404124


In [11]:
elastic_net(data, 1, 0.5, 1)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,15.808969
1,Hits,51.345039
2,HmRun,22.786881
3,Runs,32.841385
4,RBI,16.239586
5,Walks,23.333458
6,Years,16.829502
7,CAtBat,24.185327
8,CHits,37.727043
9,CHmRun,27.412571


In [12]:
elastic_net(data, 1, 0.5, 1)[1]

Unnamed: 0,R2_train,R2_test
0,0.529303,0.353081


### Regresión grado 2

In [13]:
sin_penalizacion(data, 2)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,741.560388
1,Hits,-233.153075
2,HmRun,7.876847
3,Runs,-84.767595
4,RBI,-289.173229
...,...,...
270,Division_W NewLeague_A,-14.338885
271,Division_W NewLeague_N,14.338885
272,NewLeague_A^2,10.791840
273,NewLeague_A NewLeague_N,-10.791840


In [14]:
sin_penalizacion(data, 2)[1]

Unnamed: 0,R2_train,R2_test
0,1.0,-8.444412


In [15]:
ridge(data, 1, 2)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,19.727930
1,Hits,66.221184
2,HmRun,-12.020896
3,Runs,72.557081
4,RBI,-58.157291
...,...,...
270,Division_W NewLeague_A,3.623508
271,Division_W NewLeague_N,-3.623508
272,NewLeague_A^2,1.987164
273,NewLeague_A NewLeague_N,-1.987164


In [16]:
ridge(data, 1, 2)[1]

Unnamed: 0,R2_train,R2_test
0,0.974752,0.248717


In [17]:
lasso(data, 1, 2)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,0.000000e+00
1,Hits,7.426494e+01
2,HmRun,-0.000000e+00
3,Runs,2.448237e+01
4,RBI,-2.195337e+01
...,...,...
270,Division_W NewLeague_A,1.700396e-15
271,Division_W NewLeague_N,-7.729072e-16
272,NewLeague_A^2,0.000000e+00
273,NewLeague_A NewLeague_N,-0.000000e+00


In [18]:
lasso(data, 1, 2)[1]

Unnamed: 0,R2_train,R2_test
0,0.959924,0.354491


In [19]:
elastic_net(data, 1, 0.5, 2)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,20.417965
1,Hits,29.092452
2,HmRun,12.224807
3,Runs,18.861799
4,RBI,10.776028
...,...,...
270,Division_W NewLeague_A,1.286660
271,Division_W NewLeague_N,-1.286670
272,NewLeague_A^2,1.275044
273,NewLeague_A NewLeague_N,-1.275044


In [20]:
elastic_net(data, 1, 0.5, 2)[1]

Unnamed: 0,R2_train,R2_test
0,0.850219,0.546487


### Grado 3

In [21]:
sin_penalizacion(data, 3)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,1.725460
1,Hits,7.684386
2,HmRun,5.740870
3,Runs,1.180612
4,RBI,1.538413
...,...,...
2294,Division_W NewLeague_N^2,-3.715489
2295,NewLeague_A^3,-2.316836
2296,NewLeague_A^2 NewLeague_N,2.316836
2297,NewLeague_A NewLeague_N^2,-2.316836


In [22]:
sin_penalizacion(data, 3)[1]

Unnamed: 0,R2_train,R2_test
0,1.0,0.204029


In [23]:
ridge(data, 1, 3)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,1.691486
1,Hits,7.332439
2,HmRun,5.730521
3,Runs,1.664145
4,RBI,1.308949
...,...,...
2294,Division_W NewLeague_N^2,-3.489885
2295,NewLeague_A^3,-2.342504
2296,NewLeague_A^2 NewLeague_N,2.342504
2297,NewLeague_A NewLeague_N^2,-2.342504


In [24]:
ridge(data, 1, 3)[1]

Unnamed: 0,R2_train,R2_test
0,0.999942,0.255183


In [25]:
lasso(data, 1, 3)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,0.0
1,Hits,0.0
2,HmRun,0.0
3,Runs,0.0
4,RBI,0.0
...,...,...
2294,Division_W NewLeague_N^2,-0.0
2295,NewLeague_A^3,-0.0
2296,NewLeague_A^2 NewLeague_N,0.0
2297,NewLeague_A NewLeague_N^2,-0.0


In [26]:
lasso(data, 1, 3)[1]

Unnamed: 0,R2_train,R2_test
0,0.992105,0.458592


In [27]:
elastic_net(data, 1, 0.5, 3)[0]

  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,Variables,Coeficientes
0,AtBat,2.340815
1,Hits,4.503705
2,HmRun,2.826331
3,Runs,2.326564
4,RBI,0.657702
...,...,...
2294,Division_W NewLeague_N^2,-2.153634
2295,NewLeague_A^3,-2.453714
2296,NewLeague_A^2 NewLeague_N,2.453695
2297,NewLeague_A NewLeague_N^2,-2.453677


In [28]:
elastic_net(data, 1, 0.5, 3)[1]

  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,R2_train,R2_test
0,0.973776,0.420591
