# Proyecto 1 - Laboratorio de Aprendizaje Estadístico

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
import statsmodels.api as sm

Teoría:

- Regresión lineal
- Regresión polinomial
- Ridge
- Lasso
- Valores p (p-values)
- Elastic Net

### Carga de datos

In [2]:
# Load the Hitters dataset and keep only the rows with a known Salary (the
# regression target). The original cell also called `df.dropna()` without
# assigning the result, which did nothing; the single filter below is the one
# that actually takes effect.
df = pd.read_csv("Hitters.csv")
df = df.dropna(subset=['Salary'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 263 entries, 1 to 321
Data columns (total 20 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   AtBat      263 non-null    int64  
 1   Hits       263 non-null    int64  
 2   HmRun      263 non-null    int64  
 3   Runs       263 non-null    int64  
 4   RBI        263 non-null    int64  
 5   Walks      263 non-null    int64  
 6   Years      263 non-null    int64  
 7   CAtBat     263 non-null    int64  
 8   CHits      263 non-null    int64  
 9   CHmRun     263 non-null    int64  
 10  CRuns      263 non-null    int64  
 11  CRBI       263 non-null    int64  
 12  CWalks     263 non-null    int64  
 13  League     263 non-null    object 
 14  Division   263 non-null    object 
 15  PutOuts    263 non-null    int64  
 16  Assists    263 non-null    int64  
 17  Errors     263 non-null    int64  
 18  Salary     263 non-null    float64
 19  NewLeague  263 non-null    object 
dtypes: float64(1), 

In [3]:
# One-hot encode the three categorical columns; all other columns pass through.
# NOTE(review): with the default drop_first=False every level gets its own
# dummy column, so the dummies of each category sum to 1 and are collinear
# with the OLS intercept — confirm this is intended.
data = pd.get_dummies(
    df,
    columns=['League', 'Division', 'NewLeague'],
)

### Funciones

In [4]:
def datos(data):
    """Split ``data`` into train/test partitions and standardize the predictors.

    ``'Salary'`` is the target; every other column is treated as a predictor.
    70% of the rows go to the training partition (``random_state=42``). The
    ``StandardScaler`` is fitted on the training predictors only and then
    applied to both partitions, so no test statistics leak into the scaling.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``'Salary'`` column plus the predictor columns.

    Returns
    -------
    tuple
        ``(x_train_scaled, x_test_scaled, y_train, y_test)``. The scaled
        predictors are DataFrames that keep the original column names and row
        index; the targets are the unscaled ``'Salary'`` values.
    """
    target = data['Salary']
    features = data.drop(columns=['Salary'])
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, train_size=0.7, random_state=42
    )

    scaler = StandardScaler()
    scaler.fit(x_train)

    def _scaled_frame(part):
        # Re-wrap the scaled array so column names and row labels survive.
        return pd.DataFrame(scaler.transform(part), columns=part.columns, index=part.index)

    return _scaled_frame(x_train), _scaled_frame(x_test), y_train, y_test


def var_polinomiales(data, degree):
    """Return the polynomial expansion of every column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Columns to expand.
    degree : int
        Maximum degree passed to ``PolynomialFeatures``.

    Returns
    -------
    pandas.DataFrame
        Expanded features (no bias column), labelled with the names produced
        by ``get_feature_names_out`` and indexed like ``data``.
    """
    expander = PolynomialFeatures(degree=degree, include_bias=False)
    expanded = expander.fit_transform(data)
    return pd.DataFrame(
        expanded,
        columns=expander.get_feature_names_out(data.columns),
        index=data.index,
    )


def datos_polinomiales(data, degree):
    """Build a standardized train/test split over polynomial predictors.

    The predictors (every column except ``'Salary'``) are expanded once to the
    requested degree; the result is then split and standardized by ``datos``.

    The previous implementation had two defects:

    * it expanded the whole frame, target included, so terms derived from
      ``'Salary'`` (e.g. ``'Salary^2'``) stayed among the predictors and
      leaked the target into the model;
    * it expanded a second time after scaling, so the effective degree was
      ``degree ** 2`` instead of ``degree``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``'Salary'`` column plus the predictor columns.
    degree : int
        Maximum polynomial degree of the generated predictors.

    Returns
    -------
    tuple
        ``(x_train_poly, x_test_poly, y_train, y_test)`` as returned by
        ``datos`` for the expanded frame.
    """
    # Expand predictors only: the target must never take part in the expansion.
    predictors_poly = var_polinomiales(data.drop(columns=['Salary']), degree)

    # Re-attach the untouched target so `datos` can split and scale as usual.
    data_poly = pd.concat([predictors_poly, data[['Salary']]], axis=1)

    x_train_poly, x_test_poly, y_train, y_test = datos(data_poly)
    return x_train_poly, x_test_poly, y_train, y_test


def dataframe_polinomial(degree):
    """Replace the least significant predictors with their polynomial expansion.

    An unpenalized OLS model is fitted on the module-level ``data`` frame.
    Predictors whose p-value exceeds 0.2 are removed from ``data`` and replaced
    by their polynomial expansion up to ``degree`` (which includes the
    degree-1 terms, so the original columns are preserved under the same
    names). All other columns, including ``'Salary'``, are kept unchanged.

    Parameters
    ----------
    degree : int
        Maximum polynomial degree for the expanded predictors.

    Returns
    -------
    pandas.DataFrame
        The untouched columns of ``data`` followed by the expanded columns.
        If no predictor exceeds the p-value threshold, a copy of ``data``.
    """
    resultados = sin_penalizacion(data)[2]
    tabla = resultados.summary2().tables[1]
    # Per the original author's note: the cut-off is 0.2 rather than the usual
    # 0.05 so that the 8 predictors with the largest p-values are selected.
    tabla_filtrada = tabla[tabla['P>|t|'] > 0.2]

    # The coefficient table also has a row for the intercept ('const'), which
    # is not a column of `data`; keep only labels that really are columns so
    # the selection below cannot raise a KeyError.
    variables = [nombre for nombre in tabla_filtrada.index if nombre in data.columns]
    if not variables:
        # Nothing to expand; PolynomialFeatures cannot be fitted on zero columns.
        return data.copy()

    variables_poly = var_polinomiales(data[variables], degree=degree)
    data_nueva = data.drop(columns=variables)

    return pd.concat([data_nueva, variables_poly], axis=1)


def sin_penalizacion(data):
    """Fit an unpenalized OLS regression and score it on the test partition.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``'Salary'`` column plus the predictor columns; it
        is split and standardized by ``datos``.

    Returns
    -------
    tuple
        ``(res, r2s, results)`` where ``res`` is the statsmodels summary of
        the fit, ``r2s`` is a one-row DataFrame with the column ``'R2_test'``
        and ``results`` is the fitted statsmodels results object.
    """
    x_train_scaled, x_test_scaled, y_train, y_test = datos(data)

    # has_constant='add' forces the intercept column to be added. With the
    # default ('skip'), add_constant silently returns the data unchanged when
    # some column is already a non-zero constant, which can happen in the test
    # partition and would leave `predict` with one column too few.
    ols = sm.OLS(y_train, sm.add_constant(x_train_scaled, has_constant='add'))
    results = ols.fit()

    y_pred = results.predict(sm.add_constant(x_test_scaled, has_constant='add'))
    r2s = pd.DataFrame({'R2_test': [r2_score(y_test, y_pred)]})
    res = results.summary()
    return res, r2s, results


def ridge(data, alpha):
    """Fit a Ridge regression on the standardized split produced by ``datos``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``'Salary'`` column plus the predictor columns.
    alpha : float
        Regularization strength passed to ``Ridge``.

    Returns
    -------
    tuple
        ``(res, r2s)``: ``res`` lists each predictor (``'Variables'``) with
        its fitted coefficient (``'Coeficientes'``); ``r2s`` is a one-row
        DataFrame with ``'R2_train'`` and ``'R2_test'``.
    """
    x_train_scaled, x_test_scaled, y_train, y_test = datos(data)

    modelo = Ridge(alpha=alpha)
    modelo.fit(x_train_scaled, y_train)

    coeficientes = pd.DataFrame({
        'Variables': x_train_scaled.columns,
        'Coeficientes': modelo.coef_,
    })
    metricas = pd.DataFrame({
        'R2_train': [r2_score(y_train, modelo.predict(x_train_scaled))],
        'R2_test': [r2_score(y_test, modelo.predict(x_test_scaled))],
    })
    return coeficientes, metricas

def lasso(data, alpha):
    """Fit a Lasso regression on the standardized split produced by ``datos``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``'Salary'`` column plus the predictor columns.
    alpha : float
        Regularization strength passed to ``Lasso``.

    Returns
    -------
    tuple
        ``(res, r2s)``: ``res`` lists each predictor (``'Variables'``) with
        its fitted coefficient (``'Coeficientes'``); ``r2s`` is a one-row
        DataFrame with ``'R2_train'`` and ``'R2_test'``.
    """
    x_train_scaled, x_test_scaled, y_train, y_test = datos(data)

    # The iteration cap is raised to 10_000, as in the original configuration.
    modelo = Lasso(alpha=alpha, max_iter=10_000)
    modelo.fit(x_train_scaled, y_train)

    ajuste_train = r2_score(y_train, modelo.predict(x_train_scaled))
    ajuste_test = r2_score(y_test, modelo.predict(x_test_scaled))

    coeficientes = pd.DataFrame(
        {'Variables': x_train_scaled.columns, 'Coeficientes': modelo.coef_}
    )
    metricas = pd.DataFrame({'R2_train': [ajuste_train], 'R2_test': [ajuste_test]})
    return coeficientes, metricas

def elastic_net(data, alpha, ratio):
    """Fit an ElasticNet regression on the standardized split from ``datos``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``'Salary'`` column plus the predictor columns.
    alpha : float
        Overall regularization strength passed to ``ElasticNet``.
    ratio : float
        Value passed as ``l1_ratio`` (mix between the L1 and L2 penalties).

    Returns
    -------
    tuple
        ``(res, r2s)``: ``res`` lists each predictor (``'Variables'``) with
        its fitted coefficient (``'Coeficientes'``); ``r2s`` is a one-row
        DataFrame with ``'R2_train'`` and ``'R2_test'``.
    """
    x_train_scaled, x_test_scaled, y_train, y_test = datos(data)

    modelo = ElasticNet(alpha=alpha, l1_ratio=ratio, max_iter=10_000)
    modelo.fit(x_train_scaled, y_train)

    # Score each partition with the fitted model, training partition first.
    particiones = {
        'R2_train': (x_train_scaled, y_train),
        'R2_test': (x_test_scaled, y_test),
    }
    metricas = pd.DataFrame({
        nombre: [r2_score(y_real, modelo.predict(x_part))]
        for nombre, (x_part, y_real) in particiones.items()
    })

    coeficientes = pd.DataFrame({
        'Variables': x_train_scaled.columns,
        'Coeficientes': modelo.coef_,
    })
    return coeficientes, metricas



### Regresión lineal

In [5]:
sin_penalizacion(data)[0]

0,1,2,3
Dep. Variable:,Salary,R-squared:,0.603
Model:,OLS,Adj. R-squared:,0.557
Method:,Least Squares,F-statistic:,13.12
Date:,"Fri, 26 Sep 2025",Prob (F-statistic):,4.36e-24
Time:,07:40:02,Log-Likelihood:,-1292.1
No. Observations:,184,AIC:,2624.0
Df Residuals:,164,BIC:,2688.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,535.8383,21.185,25.294,0.000,494.008,577.668
AtBat,-191.3439,104.702,-1.828,0.069,-398.082,15.394
Hits,297.1558,109.671,2.710,0.007,80.607,513.705
HmRun,84.9538,64.213,1.323,0.188,-41.837,211.745
Runs,-36.1422,79.797,-0.453,0.651,-193.704,121.419
RBI,-74.9421,73.741,-1.016,0.311,-220.546,70.662
Walks,81.8926,41.687,1.964,0.051,-0.420,164.205
Years,96.4996,67.973,1.420,0.158,-37.716,230.716
CAtBat,-911.7612,386.421,-2.359,0.019,-1674.764,-148.759

0,1,2,3
Omnibus:,8.366,Durbin-Watson:,1.871
Prob(Omnibus):,0.015,Jarque-Bera (JB):,10.475
Skew:,0.322,Prob(JB):,0.00531
Kurtosis:,3.976,Cond. No.,4.48e+16


In [6]:
sin_penalizacion(data)[1]

Unnamed: 0,R2_test
0,0.380623


In [7]:
ridge(data, 1)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,-199.559388
1,Hits,289.887062
2,HmRun,73.273902
3,Runs,-17.321385
4,RBI,-56.166674
5,Walks,75.59
6,Years,35.772276
7,CAtBat,-320.705418
8,CHits,140.723455
9,CHmRun,49.71939


In [8]:
ridge(data, 1)[1]

Unnamed: 0,R2_train,R2_test
0,0.594157,0.403631


In [9]:
lasso(data, 1)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,-214.1244
1,Hits,302.52
2,HmRun,64.47799
3,Runs,-21.95379
4,RBI,-43.02229
5,Walks,72.46259
6,Years,44.24827
7,CAtBat,-392.8808
8,CHits,24.25419
9,CHmRun,0.0


In [10]:
lasso(data, 1)[1]

Unnamed: 0,R2_train,R2_test
0,0.5971,0.404124


In [11]:
elastic_net(data, 1, 0.5)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,15.808969
1,Hits,51.345039
2,HmRun,22.786881
3,Runs,32.841385
4,RBI,16.239586
5,Walks,23.333458
6,Years,16.829502
7,CAtBat,24.185327
8,CHits,37.727043
9,CHmRun,27.412571


In [12]:
elastic_net(data, 1, 0.5)[1]

Unnamed: 0,R2_train,R2_test
0,0.529303,0.353081


### Regresión grado 2

In [13]:
# Build the degree-2 and degree-3 datasets used by the sections below.
data_cuadrada, data_cubica = (dataframe_polinomial(grado) for grado in (2, 3))

In [14]:
sin_penalizacion(data_cuadrada)[0]

0,1,2,3
Dep. Variable:,Salary,R-squared:,0.861
Model:,OLS,Adj. R-squared:,0.814
Method:,Least Squares,F-statistic:,18.45
Date:,"Fri, 26 Sep 2025",Prob (F-statistic):,4.69e-40
Time:,07:40:03,Log-Likelihood:,-1195.6
No. Observations:,184,AIC:,2485.0
Df Residuals:,137,BIC:,2636.0
Df Model:,46,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,535.8383,13.718,39.060,0.000,508.711,562.965
AtBat,56.8659,81.924,0.694,0.489,-105.133,218.865
Hits,-52.0170,90.656,-0.574,0.567,-231.284,127.250
HmRun,-154.5649,58.373,-2.648,0.009,-269.993,-39.136
Walks,17.0232,31.119,0.547,0.585,-44.512,78.558
Years,3.7296,51.115,0.073,0.942,-97.347,104.807
CAtBat,-1701.8105,315.094,-5.401,0.000,-2324.887,-1078.734
CRuns,-19.2089,220.724,-0.087,0.931,-455.676,417.258
CWalks,1.1541,75.298,0.015,0.988,-147.742,150.050

0,1,2,3
Omnibus:,8.16,Durbin-Watson:,2.083
Prob(Omnibus):,0.017,Jarque-Bera (JB):,15.537
Skew:,0.029,Prob(JB):,0.000423
Kurtosis:,4.422,Cond. No.,1.27e+16


In [15]:
sin_penalizacion(data_cuadrada)[1]

Unnamed: 0,R2_test
0,0.407619


In [16]:
ridge(data_cuadrada, 1)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,-69.018819
1,Hits,140.851465
2,HmRun,-84.542601
3,Walks,48.699308
4,Years,-77.22907
5,CAtBat,-414.988389
6,CRuns,309.497035
7,CWalks,-74.573385
8,PutOuts,81.712894
9,Assists,-19.554098


In [17]:
ridge(data_cuadrada, 1)[1]

Unnamed: 0,R2_train,R2_test
0,0.799778,0.475286


In [18]:
lasso(data_cuadrada, 1)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,0.0
1,Hits,34.09458
2,HmRun,-95.48441
3,Walks,23.76546
4,Years,-36.52049
5,CAtBat,-1174.362
6,CRuns,183.7732
7,CWalks,-0.0
8,PutOuts,82.61202
9,Assists,-5.360429


In [19]:
lasso(data_cuadrada, 1)[1]

Unnamed: 0,R2_train,R2_test
0,0.825276,0.470247


In [20]:
elastic_net(data_cuadrada, 1, 0.5)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,0.854778
1,Hits,29.025143
2,HmRun,0.0
3,Walks,15.301269
4,Years,4.895961
5,CAtBat,14.887193
6,CRuns,32.643846
7,CWalks,19.960686
8,PutOuts,55.999154
9,Assists,-0.0


In [21]:
elastic_net(data_cuadrada, 1, 0.5)[1]

Unnamed: 0,R2_train,R2_test
0,0.631365,0.428992


### Regresión grado 3

In [22]:
sin_penalizacion(data_cubica)[0]

0,1,2,3
Dep. Variable:,Salary,R-squared:,0.968
Model:,OLS,Adj. R-squared:,0.904
Method:,Least Squares,F-statistic:,14.93
Date:,"Fri, 26 Sep 2025",Prob (F-statistic):,1.27e-22
Time:,07:40:03,Log-Likelihood:,-1059.4
No. Observations:,184,AIC:,2367.0
Df Residuals:,60,BIC:,2765.0
Df Model:,123,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,535.8383,9.890,54.177,0.000,516.054,555.622
AtBat,158.7750,107.340,1.479,0.144,-55.936,373.486
Hits,-197.9426,129.699,-1.526,0.132,-457.378,61.493
HmRun,14.9406,78.373,0.191,0.849,-141.829,171.711
Walks,-12.8007,44.540,-0.287,0.775,-101.894,76.293
Years,-42.0179,72.506,-0.580,0.564,-187.052,103.017
CAtBat,-2151.5475,597.148,-3.603,0.001,-3346.021,-957.074
CRuns,-295.0201,331.463,-0.890,0.377,-958.046,368.005
CWalks,136.1641,148.644,0.916,0.363,-161.168,433.496

0,1,2,3
Omnibus:,3.998,Durbin-Watson:,2.03
Prob(Omnibus):,0.135,Jarque-Bera (JB):,4.503
Skew:,-0.135,Prob(JB):,0.105
Kurtosis:,3.717,Cond. No.,1.25e+16


In [23]:
sin_penalizacion(data_cubica)[1]

Unnamed: 0,R2_test
0,-1.95472


In [24]:
ridge(data_cubica, 1)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,-17.262003
1,Hits,67.866510
2,HmRun,-69.656346
3,Walks,33.648582
4,Years,-49.775180
...,...,...
173,Errors NewLeague_N^2,30.153108
174,NewLeague_A^3,13.898360
175,NewLeague_A^2 NewLeague_N,0.000000
176,NewLeague_A NewLeague_N^2,0.000000


In [25]:
ridge(data_cubica, 1)[1]

Unnamed: 0,R2_train,R2_test
0,0.875765,0.412473


In [26]:
lasso(data_cubica, 1)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,0.474711
1,Hits,23.626147
2,HmRun,-67.675944
3,Walks,22.656311
4,Years,-19.801481
...,...,...
173,Errors NewLeague_N^2,0.000000
174,NewLeague_A^3,0.129337
175,NewLeague_A^2 NewLeague_N,0.000000
176,NewLeague_A NewLeague_N^2,0.000000


In [27]:
lasso(data_cubica, 1)[1]

Unnamed: 0,R2_train,R2_test
0,0.860321,0.417089


In [28]:
elastic_net(data_cubica, 1, 0.5)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,2.471654
1,Hits,22.359702
2,HmRun,-11.181716
3,Walks,11.313165
4,Years,2.805896
...,...,...
173,Errors NewLeague_N^2,-0.688813
174,NewLeague_A^3,-0.000000
175,NewLeague_A^2 NewLeague_N,0.000000
176,NewLeague_A NewLeague_N^2,0.000000


In [29]:
elastic_net(data_cubica, 1, 0.5)[1]

Unnamed: 0,R2_train,R2_test
0,0.72348,0.411073
