# Proyecto 1 - Laboratorio de Aprendizaje Estadístico

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
import statsmodels.api as sm

Teoria:

- Regresion lineal
- Regresion polinomial
- Ridge
- Lasso
- p values
- elastic net

### Descarga de Datos

In [2]:
df = pd.read_csv("Hitters.csv")
df.dropna()
df = df[df['Salary'].notna()] 
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 263 entries, 1 to 321
Data columns (total 20 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   AtBat      263 non-null    int64  
 1   Hits       263 non-null    int64  
 2   HmRun      263 non-null    int64  
 3   Runs       263 non-null    int64  
 4   RBI        263 non-null    int64  
 5   Walks      263 non-null    int64  
 6   Years      263 non-null    int64  
 7   CAtBat     263 non-null    int64  
 8   CHits      263 non-null    int64  
 9   CHmRun     263 non-null    int64  
 10  CRuns      263 non-null    int64  
 11  CRBI       263 non-null    int64  
 12  CWalks     263 non-null    int64  
 13  League     263 non-null    object 
 14  Division   263 non-null    object 
 15  PutOuts    263 non-null    int64  
 16  Assists    263 non-null    int64  
 17  Errors     263 non-null    int64  
 18  Salary     263 non-null    float64
 19  NewLeague  263 non-null    object 
dtypes: float64(1), 

In [3]:
data = pd.get_dummies(df, columns = ['League', 'Division', 'NewLeague'])

### Funciones

In [None]:
def datos(data):


    x = data.drop(columns = ['Salary'])
    y = data['Salary']
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.7, random_state=42)
    scaler = StandardScaler().fit(x_train)
    x_train_scaled = scaler.transform(x_train)
    x_test_scaled = scaler.transform(x_test)
    
    x_train_scaled = pd.DataFrame(x_train_scaled, columns=x_train.columns, index=x_train.index)
    x_test_scaled = pd.DataFrame(x_test_scaled, columns=x_test.columns, index=x_test.index)
    
    return x_train_scaled, x_test_scaled, y_train, y_test


def var_polinomiales(data, degree):
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    pols = poly.fit_transform(data)
    feature_names = poly.get_feature_names_out(data.columns)
    data_poly = pd.DataFrame(pols, columns=feature_names, index=data.index)
    return data_poly


def datos_polinomiales(data, degree):
    x_train_scaled, x_test_scaled, y_train, y_test = datos(var_polinomiales(data, degree))
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    
    x_train_poly = poly.fit_transform(x_train_scaled)
    x_test_poly = poly.transform(x_test_scaled)
    
    feature_names = poly.get_feature_names_out(x_train_scaled.columns)
    
    x_train_poly = pd.DataFrame(x_train_poly, columns=feature_names, index=x_train_scaled.index)
    x_test_poly = pd.DataFrame(x_test_poly, columns=feature_names, index=x_test_scaled.index)
    
    return x_train_poly, x_test_poly, y_train, y_test


def dataframe_polinomial(degree):
    resultados = sin_penalizacion(data)[2]
    tabla = resultados.summary2().tables[1]
    tabla_filtrada = tabla[tabla['P>|t|'] > 0.2]  # no tomamos mayores a 0.05, sino los 8 valores de p-value mayores
    
    variables = tabla_filtrada.index.tolist()
    data_variables = data[variables]
    
    variables_poly = var_polinomiales(data_variables, degree=degree)
    
    data_nueva = data
    data_nueva = data.drop(columns=variables)
    
    data_nueva_variables = pd.concat([data_nueva, variables_poly], axis=1)
    return data_nueva_variables


# def sin_penalizacion_polinomial(data, degree):
    
    x_train_scaled_poly, x_test_scaled_poly, y_train, y_test = datos_polinomiales(data, degree)
    
    model = LinearRegression()
    model.fit(x_train_scaled_poly, y_train)
    
    y_pred_train = model.predict(x_train_scaled_poly)
    r2_train = r2_score(y_train, y_pred_train)
    
    y_pred = model.predict(x_test_scaled_poly)
    r2_test = r2_score(y_test, y_pred)
    
    res = pd.DataFrame({
        'Variables': x_train_scaled_poly.columns,
        'Coeficientes': model.coef_
    })
    
    r2s = pd.DataFrame({
        'R2_train': [r2_train],
        'R2_test': [r2_test]
    })
    
    return res, r2s


def sin_penalizacion(data):


    x_train_scaled, x_test_scaled, y_train, y_test = datos(data)
    
    ols = sm.OLS(y_train, sm.add_constant(x_train_scaled))
    results = ols.fit()
    
    y_pred = results.predict(sm.add_constant(x_test_scaled))
    r2s = r2_score(y_test, y_pred)
    
    res = results.summary()
    return res, r2s, results

# def sin_penalizacion(data, degree):


    if degree == 1:
        return sin_penalizacion_lineal(data)
    else:
        return sin_penalizacion_polinomial(data, degree)

def ridge(data, alpha):

    x_train_scaled, x_test_scaled, y_train, y_test = datos(data) 
    ridge = Ridge(alpha=alpha)
    ridge.fit(x_train_scaled, y_train)
    y_pred_train = ridge.predict(x_train_scaled)
    r2_train = r2_score(y_train, y_pred_train)
    y_pred = ridge.predict(x_test_scaled)
    r2 = r2_score(y_test, y_pred)

    res = pd.DataFrame({'Variables': x_train_scaled.columns, 'Coeficientes': ridge.coef_})
    r2s = pd.DataFrame({'R2_train': [r2_train], 'R2_test': [r2]})
    return res, r2s

def lasso(data, alpha):


    x_train_scaled, x_test_scaled, y_train, y_test = datos(data)
    lasso = Lasso(alpha = alpha, max_iter = 10_000)
    lasso.fit(x_train_scaled, y_train)
    y_pred_train = lasso.predict(x_train_scaled)
    r2_train = r2_score(y_train, y_pred_train)
    y_pred = lasso.predict(x_test_scaled)
    r2 = r2_score(y_test, y_pred)

    res = pd.DataFrame({'Variables': x_train_scaled.columns, 'Coeficientes': lasso.coef_})
    r2s = pd.DataFrame({'R2_train': [r2_train], 'R2_test': [r2]})
    return res, r2s

def elastic_net(data, alpha, ratio, degree):

    
    x_train_scaled, x_test_scaled, y_train, y_test = datos(data)
    
    elastic_net = ElasticNet(alpha=alpha, l1_ratio=ratio, max_iter=10_000)
    elastic_net.fit(x_train_scaled, y_train)
    y_pred_train = elastic_net.predict(x_train_scaled)
    r2_train = r2_score(y_train, y_pred_train)
    y_pred = elastic_net.predict(x_test_scaled)
    r2 = r2_score(y_test, y_pred)

    res = pd.DataFrame({'Variables': x_train_scaled.columns, 'Coeficientes': elastic_net.coef_})
    r2s = pd.DataFrame({'R2_train': [r2_train], 'R2_test': [r2]})
    return res, r2s



### Regresión lineal

In [28]:
sin_penalizacion(data)[0]

0,1,2,3
Dep. Variable:,Salary,R-squared:,0.603
Model:,OLS,Adj. R-squared:,0.557
Method:,Least Squares,F-statistic:,13.12
Date:,"Thu, 25 Sep 2025",Prob (F-statistic):,4.36e-24
Time:,21:16:21,Log-Likelihood:,-1292.1
No. Observations:,184,AIC:,2624.0
Df Residuals:,164,BIC:,2688.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,535.8383,21.185,25.294,0.000,494.008,577.668
AtBat,-191.3439,104.702,-1.828,0.069,-398.082,15.394
Hits,297.1558,109.671,2.710,0.007,80.607,513.705
HmRun,84.9538,64.213,1.323,0.188,-41.837,211.745
Runs,-36.1422,79.797,-0.453,0.651,-193.704,121.419
RBI,-74.9421,73.741,-1.016,0.311,-220.546,70.662
Walks,81.8926,41.687,1.964,0.051,-0.420,164.205
Years,96.4996,67.973,1.420,0.158,-37.716,230.716
CAtBat,-911.7612,386.421,-2.359,0.019,-1674.764,-148.759

0,1,2,3
Omnibus:,8.366,Durbin-Watson:,1.871
Prob(Omnibus):,0.015,Jarque-Bera (JB):,10.475
Skew:,0.322,Prob(JB):,0.00531
Kurtosis:,3.976,Cond. No.,4.48e+16


In [29]:
datita = dataframe_polinomial(2)

In [31]:
datita3 = dataframe_polinomial(3)
datita3

Unnamed: 0,AtBat,Hits,HmRun,Walks,Years,CAtBat,CRuns,CWalks,PutOuts,Assists,...,Errors^3,Errors^2 NewLeague_A,Errors^2 NewLeague_N,Errors NewLeague_A^2,Errors NewLeague_A NewLeague_N,Errors NewLeague_N^2,NewLeague_A^3,NewLeague_A^2 NewLeague_N,NewLeague_A NewLeague_N^2,NewLeague_N^3
1,315,81,7,39,14,3449,321,375,632,43,...,1000.0,0.0,100.0,0.0,0.0,10.0,0.0,0.0,0.0,1.0
2,479,130,18,76,3,1624,224,263,880,82,...,2744.0,196.0,0.0,14.0,0.0,0.0,1.0,0.0,0.0,0.0
3,496,141,20,37,11,5628,828,354,200,11,...,27.0,0.0,9.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0
4,321,87,10,30,2,396,48,33,805,40,...,64.0,0.0,16.0,0.0,0.0,4.0,0.0,0.0,0.0,1.0
5,594,169,4,35,11,4408,501,194,282,421,...,15625.0,625.0,0.0,25.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,497,127,7,37,5,2703,379,138,325,9,...,27.0,0.0,9.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0
318,492,136,5,94,12,5511,897,875,313,381,...,8000.0,400.0,0.0,20.0,0.0,0.0,1.0,0.0,0.0,0.0
319,475,126,3,52,6,1700,217,146,37,113,...,343.0,49.0,0.0,7.0,0.0,0.0,1.0,0.0,0.0,0.0
320,573,144,9,78,8,3198,470,332,1314,131,...,1728.0,144.0,0.0,12.0,0.0,0.0,1.0,0.0,0.0,0.0


In [36]:
sin_penalizacion(data)[1]

0.3806233966612892

In [None]:
ridge(data, 1)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,-199.559388
1,Hits,289.887062
2,HmRun,73.273902
3,Runs,-17.321385
4,RBI,-56.166674
5,Walks,75.59
6,Years,35.772276
7,CAtBat,-320.705418
8,CHits,140.723455
9,CHmRun,49.71939


In [10]:
ridge(data, 1, 1)[1]

Unnamed: 0,R2_train,R2_test
0,0.594157,0.403631


In [11]:
lasso(data, 1, 1)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,-214.1244
1,Hits,302.52
2,HmRun,64.47799
3,Runs,-21.95379
4,RBI,-43.02229
5,Walks,72.46259
6,Years,44.24827
7,CAtBat,-392.8808
8,CHits,24.25419
9,CHmRun,0.0


In [12]:
lasso(data, 1, 1)[1]

Unnamed: 0,R2_train,R2_test
0,0.5971,0.404124


In [13]:
elastic_net(data, 1, 0.5, 1)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,15.808969
1,Hits,51.345039
2,HmRun,22.786881
3,Runs,32.841385
4,RBI,16.239586
5,Walks,23.333458
6,Years,16.829502
7,CAtBat,24.185327
8,CHits,37.727043
9,CHmRun,27.412571


In [14]:
elastic_net(data, 1, 0.5, 1)[1]

Unnamed: 0,R2_train,R2_test
0,0.529303,0.353081


### Regresión grado 2

In [15]:
sin_penalizacion(data, 2)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,0.148165
1,Hits,0.292380
2,HmRun,0.207748
3,Runs,0.081323
4,RBI,-0.170578
...,...,...
44844,NewLeague_A^2 NewLeague_A NewLeague_N,0.000000
44845,NewLeague_A^2 NewLeague_N^2,0.007034
44846,NewLeague_A NewLeague_N^2,0.000000
44847,NewLeague_A NewLeague_N NewLeague_N^2,0.000000


In [16]:
sin_penalizacion(data, 2)[1]

Unnamed: 0,R2_train,R2_test
0,1.0,0.807561


In [17]:
ridge(data, 1, 2)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,0.148254
1,Hits,0.296032
2,HmRun,0.207978
3,Runs,0.082006
4,RBI,-0.171160
...,...,...
44844,NewLeague_A^2 NewLeague_A NewLeague_N,0.000000
44845,NewLeague_A^2 NewLeague_N^2,0.007069
44846,NewLeague_A NewLeague_N^2,0.000000
44847,NewLeague_A NewLeague_N NewLeague_N^2,0.000000


In [18]:
ridge(data, 1, 2)[1]

Unnamed: 0,R2_train,R2_test
0,1.0,0.808121


In [19]:
lasso(data, 1, 2)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,0.0
1,Hits,0.0
2,HmRun,0.0
3,Runs,0.0
4,RBI,0.0
...,...,...
44844,NewLeague_A^2 NewLeague_A NewLeague_N,0.0
44845,NewLeague_A^2 NewLeague_N^2,0.0
44846,NewLeague_A NewLeague_N^2,0.0
44847,NewLeague_A NewLeague_N NewLeague_N^2,0.0


In [20]:
lasso(data, 1, 2)[1]

Unnamed: 0,R2_train,R2_test
0,0.999972,0.999958


In [21]:
elastic_net(data, 1, 0.5, 2)[0]

KeyboardInterrupt: 

In [None]:
elastic_net(data, 1, 0.5, 2)[1]

Unnamed: 0,R2_train,R2_test
0,0.976066,0.912329


### Grado 3

In [None]:
sin_penalizacion(data, 3)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,1.577960
1,Hits,-0.043630
2,HmRun,-0.319313
3,Runs,-1.423214
4,RBI,-2.530357
...,...,...
2593,Division_W NewLeague_N^2,-0.606660
2594,NewLeague_A^3,-0.000935
2595,NewLeague_A^2 NewLeague_N,0.000000
2596,NewLeague_A NewLeague_N^2,0.000000


In [None]:
sin_penalizacion(data, 3)[1]

Unnamed: 0,R2_train,R2_test
0,1.0,0.978878


In [None]:
ridge(data, 1, 3)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,0.658091
1,Hits,1.054175
2,HmRun,-0.608738
3,Runs,-1.066099
4,RBI,-3.138202
...,...,...
2593,Division_W NewLeague_N^2,-0.640149
2594,NewLeague_A^3,-0.130833
2595,NewLeague_A^2 NewLeague_N,0.000000
2596,NewLeague_A NewLeague_N^2,0.000000


In [None]:
ridge(data, 1, 3)[1]

Unnamed: 0,R2_train,R2_test
0,0.999969,0.974504


In [None]:
lasso(data, 1, 3)[0]

Unnamed: 0,Variables,Coeficientes
0,AtBat,-0.0
1,Hits,-0.0
2,HmRun,0.0
3,Runs,-0.0
4,RBI,-0.0
...,...,...
2593,Division_W NewLeague_N^2,-0.0
2594,NewLeague_A^3,-0.0
2595,NewLeague_A^2 NewLeague_N,0.0
2596,NewLeague_A NewLeague_N^2,0.0


In [None]:
lasso(data, 1, 3)[1]

Unnamed: 0,R2_train,R2_test
0,0.999923,0.999839


In [None]:
elastic_net(data, 1, 0.5, 3)[0]

  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,Variables,Coeficientes
0,AtBat,0.0
1,Hits,0.0
2,HmRun,0.0
3,Runs,0.0
4,RBI,-0.0
...,...,...
2593,Division_W NewLeague_N^2,-0.0
2594,NewLeague_A^3,-0.0
2595,NewLeague_A^2 NewLeague_N,0.0
2596,NewLeague_A NewLeague_N^2,0.0


In [None]:
elastic_net(data, 1, 0.5, 3)[1]

  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,R2_train,R2_test
0,0.990804,0.924081
