<a href="https://colab.research.google.com/github/Angelaestefan/concentracion/blob/master/ejemplos_tema2_regresion_lineal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Regresión lineal simple con solución cerrada (Ecuaciones normales)

$$ w = \frac{\sum_{i=1}^n x_i y_i - \frac{1}{n}\left(\sum_{i=1}^n x_i\right)\left(\sum_{i=1}^n y_i\right)}{\sum_{i=1}^n x_i^2 - \frac{1}{n}\left(\sum_{i=1}^n x_i\right)^2} $$

$$ b = \frac{1}{n}\sum_{i=1}^n y_i - \frac{w}{n}\sum_{i=1}^n x_i $$


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Seed the legacy global NumPy generator so the synthetic sample is the same on every run.
np.random.seed(0)

# Generate synthetic data: 100 points on the line y = 5 + 2x plus standard-normal noise.
x = np.random.uniform(0, 10, 100)
y = 5 + 2*x + np.random.normal(0, 1, 100)

# Closed-form least-squares estimates (normal equations for a single predictor).
n = len(x)
w = ((x*y).sum() - (1./n)*x.sum()*y.sum()) / ((x*x).sum() - (1./n)*(x.sum()**2))
b = (1./n)*y.sum() - (w/n)*x.sum()
print("Modelo: y =", b, "+", w, "* x")

# Fitted values and residuals (observed minus fitted).
y_pred = w*x + b
r = y - y_pred

# ---- Three side-by-side panels ----
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Panel 1: raw data.
axes[0].scatter(x, y)
axes[0].set_xlabel('x')
axes[0].set_ylabel('y')
axes[0].set_title('Datos')

# Panel 2: data with the fitted line. Points are sorted by x so the line is
# drawn left to right instead of zig-zagging through the unsorted sample.
idx = np.argsort(x)
axes[1].scatter(x, y)
axes[1].plot(x[idx], y_pred[idx], color='red', linewidth=2)
axes[1].set_xlabel('x')
axes[1].set_ylabel('y')
axes[1].set_title('Modelo')

# Panel 3: residuals against the FITTED values. Least-squares residuals are
# uncorrelated with the fitted values but positively correlated with the
# observed y, so plotting them against y shows an upward trend even when the
# model is correctly specified.
axes[2].scatter(y_pred, r)
axes[2].axhline(0, color='red', linestyle='--', linewidth=2)
axes[2].set_xlabel('y_pred')
axes[2].set_ylabel('Error')
axes[2].set_title('Residuos')

plt.tight_layout()
plt.show()


# Dataset diabetes

Diabetes dataset
442 muestras (filas), 10 características (columnas)

Características:
- age – Edad del paciente.
- sex – Sexo del paciente.
- bmi – Índice de masa corporal (body mass index).
- bp – Presión arterial promedio.
- s1 – Medida de colesterol sérico.
- s2 – LDL (lipoproteínas de baja densidad).
- s3 – HDL (lipoproteínas de alta densidad).
- s4 – Relación de colesterol total con HDL.
- s5 – Nivel de triglicéridos en sangre.
- s6 – Nivel de glucosa en sangre.

Variable objetivo:

Una medida cuantitativa de la progresión de la diabetes un año después de la primera observación.


### Caso 1. Entrenamiento y validación con el mismo conjunto de datos (No recomendado, sobreestima el desempeño)

La siguiente celda entrena y valida un modelo de regresión lineal utilizando el mismo conjunto de datos. Esto no es recomendado, ya que la estimación del desempeño es optimista.

In [None]:
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# Load the diabetes data set and unpack design matrix, target and column names.
diabetes = datasets.load_diabetes()
x, y = diabetes.data, diabetes.target
features = diabetes.feature_names
n_features = len(features)

print(f"# features: {n_features}")

# Fit ordinary least squares on every available sample.
regr = LinearRegression().fit(x, y)

print("Coeficientes del modelo: \n", regr.coef_)

# Score on the very same samples that were used for fitting (optimistic on purpose).
y_pred = regr.predict(x)
for label, metric in (
    ('MSE: \n', mean_squared_error),
    ('MAE: \n', mean_absolute_error),
    ("R^2: \n", r2_score),
):
    print(label, metric(y, y_pred))


### Caso 2. Entrena y evalúa en conjuntos separados (train, test)

In [None]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# 1. Load the data set
diabetes = load_diabetes()
x, y = diabetes.data, diabetes.target
features = diabetes.feature_names
n_features = len(features)
print(f"# features: {n_features}")

# 2. Hold out 20% of the samples for testing (fixed seed, so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# 3. Fit the model on the training portion only
model = LinearRegression().fit(X_train, y_train)

# 4. Predict on the held-out portion, then report parameters and metrics
y_pred = model.predict(X_test)
print("Intercepto:", model.intercept_)
for position, name in enumerate(features):
    coef = model.coef_[position]
    print(f"{name}: {coef:.4f}")
print('-'*10)
for label, metric in (
    ("MSE:", mean_squared_error),
    ("MAE:", mean_absolute_error),
    ("R²:", r2_score),
):
    print(label, metric(y_test, y_pred))

### Caso 3. Utiliza K-Fold validación cruzada en un bucle manual

In [None]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# 1. Load the data set
diabetes = load_diabetes()
x, y = diabetes.data, diabetes.target
features = diabetes.feature_names
n_features = len(features)
print(f"# features: {n_features}")

n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)

# Per-fold scores, stored in fold order and averaged after the loop.
mse_folds, mae_folds, r2_folds = [], [], []
for k, (train_index, test_index) in enumerate(kf.split(x)):
    print(f'Iteración de k-fold: {k+1}')

    # Training phase: fit a fresh model on this fold's training rows.
    x_train, y_train = x[train_index, :], y[train_index]
    regr_cv = LinearRegression().fit(x_train, y_train)

    # Test phase: predict the rows that were left out of training.
    x_test, y_test = x[test_index, :], y[test_index]
    y_pred = regr_cv.predict(x_test)

    # Compute MSE, MAE and R^2 for this fold, then report them.
    mse_i = mean_squared_error(y_test, y_pred)
    mae_i = mean_absolute_error(y_test, y_pred)
    r2_i = r2_score(y_test, y_pred)
    print('\t mse = ', mse_i)
    print('\t mae = ', mae_i)
    print('\t r^2= ', r2_i)

    mse_folds.append(mse_i)
    mae_folds.append(mae_i)
    r2_folds.append(r2_i)


print("\nMetricas promedio: \n")

# Built-in sum adds in fold order starting from 0, like a running total.
mse = sum(mse_folds)/n_folds
print('MSE = ', mse)

mae = sum(mae_folds)/n_folds
print('MAE = ', mae)

r2 = sum(r2_folds)/n_folds
print('R^2 = ', r2)



### Caso 4. Utiliza K-Fold validación cruzada y la función cross_validate

1. Investiga la función cross_validate de sklearn
2. Utilízala para evaluar el modelo

In [None]:

# Tu código aqui

### Caso 5. Utiliza la función cross_val_predict

In [None]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the diabetes data set and unpack design matrix, target and column names.
diabetes = load_diabetes()
x, y = diabetes.data, diabetes.target
features = diabetes.feature_names
n_features = len(features)
print(f"# features: {n_features}")

# Out-of-fold predictions: each sample is predicted by a model that was
# fitted on the folds that do not contain it.
regr = LinearRegression()
y_pred = cross_val_predict(regr, x, y, cv=5)

print("y_pred shape: ", y_pred.shape)

# Metrics are computed once over the pooled out-of-fold predictions.
for label, metric in (
    ('mse = ', mean_squared_error),
    ('mae = ', mean_absolute_error),
    ('r^2= ', r2_score),
):
    print(label, metric(y, y_pred))


# Selección de características utilizando métodos tipo wrapper

### Regresión lineal con selección de características secuencial

La siguiente celda:

1. Selecciona la mitad de las características usando una búsqueda secuencial de tipo codiciosa (greedy).
    - a) Selecciona la mitad de las características con SequentialFeatureSelector
    - b) Evalúa usando los datos de entrenamiento (no recomendado)
    - c) Evalúa utilizando validación cruzada (recomendado).

2. Utiliza la selección de características secuencial para decidir el número óptimo de características.



#### Reduce el número de características a la mitad (Modo incorrecto)

El problema es que hay data leakage, porque la selección de variables usa todo el dataset antes de evaluar, dando métricas demasiado optimistas.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn import linear_model
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import SequentialFeatureSelector

# Load the diabetes data set: design matrix, target vector and column names.
diabetes = datasets.load_diabetes()
x = diabetes.data
y = diabetes.target
features = diabetes.feature_names
n_features = len(features)

print("\n ----- Feature selection using 50% of predictors -----")

# NOTE: this cell is the deliberately leaky variant. The selector sees every
# sample, so the metrics printed below are optimistic.
regr = linear_model.LinearRegression()
fselection = SequentialFeatureSelector(regr, n_features_to_select = 0.5)
fselection.fit(x, y)
# x is a plain array without column names, so pass them explicitly; otherwise
# the selector reports generated placeholders (x0, x1, ...) instead of the
# real predictor names.
print("Selected features: ", fselection.get_feature_names_out(features))

# Fit the model on the reduced data set and evaluate MSE, MAE and R^2
x_transformed = fselection.transform(x)
regr.fit(x_transformed, y)
print("Model coefficients: ", regr.coef_)
print("Model intercept: ", regr.intercept_)

y_pred = regr.predict(x_transformed)
print("Evaluation using training data (not recommended): ")
print('MSE: ', mean_squared_error(y, y_pred))
print("MAE: ", mean_absolute_error(y, y_pred))
print("R^2: ", r2_score(y, y_pred))


#### Reduce el número de características a la mitad usando validación cruzada (Modo correcto)

Este es el modo correcto para evitar fuga de datos, ya que la selección de características se hace solamente con los datos de entrenamiento de cada partición. La evaluación se realiza en el conjunto de prueba correspondiente, que no participa en la selección.

In [None]:

import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn import linear_model
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import SequentialFeatureSelector

# Load the diabetes data set: design matrix, target vector and column names.
diabetes = datasets.load_diabetes()
x = diabetes.data
y = diabetes.target
features = diabetes.feature_names
n_features = len(features)

# Per-fold scores, averaged after the loop.
mse_cv = []
mae_cv = []
r2_cv = []

# Fixed seed: without it the shuffled folds, and therefore the reported
# averages, change on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

for train_index, test_index in kf.split(x):

    # Training phase
    x_train = x[train_index, :]
    y_train = y[train_index]

    regr_cv = linear_model.LinearRegression()

    # Select features using the training rows only, so the test rows never
    # influence which predictors are kept.
    fselection_cv = SequentialFeatureSelector(regr_cv, n_features_to_select=0.5)
    fselection_cv.fit(x_train, y_train)
    x_train = fselection_cv.transform(x_train)

    regr_cv.fit(x_train, y_train)

    # Test phase: apply the selection fitted on the training rows.
    x_test = fselection_cv.transform(x[test_index, :])
    y_test = y[test_index]
    y_pred = regr_cv.predict(x_test)

    mse_i = mean_squared_error(y_test, y_pred)
    mse_cv.append(mse_i)

    mae_i = mean_absolute_error(y_test, y_pred)
    mae_cv.append(mae_i)

    r2_i = r2_score(y_test, y_pred)
    r2_cv.append(r2_i)

print("Evaluation using cross-validation (recommended): ")
print('MSE:', np.average(mse_cv), '  MAE:', np.average(mae_cv),'  R^2:', np.average(r2_cv))

#### Encuentra el número óptimo de features

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn import linear_model
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import SequentialFeatureSelector

# Load the diabetes data set: design matrix, target vector and column names.
diabetes = datasets.load_diabetes()
x = diabetes.data
y = diabetes.target
features = diabetes.feature_names
n_features = len(features)

# Candidate subset sizes: 1 .. n_features - 1 (1..9 for this data set).
n_feats = list(range(1, n_features))

# One seeded splitter shared by every candidate size: all sizes are scored on
# the same folds, so differences in the averages come from the number of
# features rather than from a different random partition, and the chosen
# optimum is reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

mse_nfeat = []
mae_nfeat = []
r2_nfeat = []
for n_feat in n_feats:
    print('---- n features =', n_feat)

    mse_cv = []
    mae_cv = []
    r2_cv = []

    for train_index, test_index in kf.split(x):

        # Training phase
        x_train = x[train_index, :]
        y_train = y[train_index]

        regr_cv = linear_model.LinearRegression()

        # Selection is fitted on the training rows only (no leakage).
        fselection_cv = SequentialFeatureSelector(regr_cv, n_features_to_select=n_feat)
        fselection_cv.fit(x_train, y_train)
        x_train = fselection_cv.transform(x_train)

        regr_cv.fit(x_train, y_train)

        # Test phase
        x_test = fselection_cv.transform(x[test_index, :])
        y_test = y[test_index]
        y_pred = regr_cv.predict(x_test)

        mse_i = mean_squared_error(y_test, y_pred)
        mse_cv.append(mse_i)

        mae_i = mean_absolute_error(y_test, y_pred)
        mae_cv.append(mae_i)

        r2_i = r2_score(y_test, y_pred)
        r2_cv.append(r2_i)


    mse = np.average(mse_cv)
    mse_nfeat.append(mse)

    mae = np.average(mae_cv)
    mae_nfeat.append(mae)

    r2 = np.average(r2_cv)
    r2_nfeat.append(r2)

    print('MSE:', mse, '  MAE:', mae,'  R^2:', r2)

# The optimum is the subset size with the lowest cross-validated MSE.
opt_index = np.argmin(mse_nfeat)
opt_features = n_feats[opt_index]
print("Optimal number of features: ", opt_features)

fig, axs = plt.subplots(1, 3, figsize=(15, 4), tight_layout=True)

axs[0].plot(n_feats, mse_nfeat)
axs[0].set_xlabel("features")
axs[0].set_ylabel("MSE")

axs[1].plot(n_feats, mae_nfeat)
axs[1].set_xlabel("features")
axs[1].set_ylabel("MAE")

axs[2].plot(n_feats, r2_nfeat)
axs[2].set_xlabel("features")
axs[2].set_ylabel("r^2")

plt.show()

# Fit the final model on all samples with the optimal number of features
regr = linear_model.LinearRegression()
fselection = SequentialFeatureSelector(regr, n_features_to_select = opt_features)
fselection.fit(x, y)

# x has no column names, so pass them explicitly to get the real predictor
# names instead of generated placeholders (x0, x1, ...).
print("Selected features: ", fselection.get_feature_names_out(features))

x_transformed = fselection.transform(x)
regr.fit(x_transformed, y)
print("Model coefficients: ", regr.coef_)
print("Model intercept: ", regr.intercept_)


### Regresión lineal con selección de características recursiva (RFE)

1. Empieza con todas las características.
2. Entrena un modelo (por ejemplo, LinearRegression, RandomForest, SVC, etc.).
3. Calcula la importancia de cada característica (según los coeficientes o atributos del modelo).
4. Elimina la(s) característica(s) menos importante(s).
5. Repite el proceso recursivamente hasta que queden solo las características deseadas (n_features_to_select).



In [None]:

import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn import linear_model
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import RFE

# Import Diabetes dataset
diabetes = datasets.load_diabetes()
x = diabetes.data
y = diabetes.target
features = diabetes.feature_names
n_features = len(features)

# Evaluate model using cross validation; per-fold scores are averaged at the end.
mse_cv = []
mae_cv = []
r2_cv = []

# Fixed seed: without it the shuffled folds, and therefore the reported
# averages, change on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

for train_index, test_index in kf.split(x):

    # Training phase
    x_train = x[train_index, :]
    y_train = y[train_index]

    regr_cv = linear_model.LinearRegression()

    # Recursive feature elimination keeping half of the predictors, fitted on
    # the training rows only so the test rows do not influence the selection.
    fselection_cv = RFE(regr_cv, n_features_to_select=0.5)
    fselection_cv.fit(x_train, y_train)
    x_train = fselection_cv.transform(x_train)

    regr_cv.fit(x_train, y_train)

    # Test phase: apply the selection fitted on the training rows.
    x_test = fselection_cv.transform(x[test_index, :])
    y_test = y[test_index]
    y_pred = regr_cv.predict(x_test)

    mse_i = mean_squared_error(y_test, y_pred)
    mse_cv.append(mse_i)

    mae_i = mean_absolute_error(y_test, y_pred)
    mae_cv.append(mae_i)

    r2_i = r2_score(y_test, y_pred)
    r2_cv.append(r2_i)

print('MSE:', np.average(mse_cv), '  MAE:', np.average(mae_cv),'  R^2:', np.average(r2_cv))

Encontrar el número óptimo de features usando método recursivo

In [None]:

import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn import linear_model
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import RFE

# Import Diabetes dataset
diabetes = datasets.load_diabetes()
x = diabetes.data
y = diabetes.target
features = diabetes.feature_names
n_features = len(features)

# Candidate subset sizes: 1 .. n_features - 1 (1..9 for this data set).
n_feats = list(range(1, n_features))

# One seeded splitter shared by every candidate size: all sizes are scored on
# the same folds, so differences in the averages come from the number of
# features rather than from a different random partition, and the chosen
# optimum is reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

mse_nfeat = []
mae_nfeat = []
r2_nfeat = []
for n_feat in n_feats:
    print('---- n features =', n_feat)

    mse_cv = []
    mae_cv = []
    r2_cv = []

    for train_index, test_index in kf.split(x):

        # Training phase
        x_train = x[train_index, :]
        y_train = y[train_index]

        regr_cv = linear_model.LinearRegression()

        # Recursive elimination is fitted on the training rows only (no leakage).
        fselection_cv = RFE(regr_cv, n_features_to_select=n_feat)
        fselection_cv.fit(x_train, y_train)
        x_train = fselection_cv.transform(x_train)

        regr_cv.fit(x_train, y_train)

        # Test phase
        x_test = fselection_cv.transform(x[test_index, :])
        y_test = y[test_index]
        y_pred = regr_cv.predict(x_test)

        mse_i = mean_squared_error(y_test, y_pred)
        mse_cv.append(mse_i)

        mae_i = mean_absolute_error(y_test, y_pred)
        mae_cv.append(mae_i)

        r2_i = r2_score(y_test, y_pred)
        r2_cv.append(r2_i)


    mse = np.average(mse_cv)
    mse_nfeat.append(mse)

    mae = np.average(mae_cv)
    mae_nfeat.append(mae)

    r2 = np.average(r2_cv)
    r2_nfeat.append(r2)

    print('MSE:', mse, '  MAE:', mae,'  R^2:', r2)

# The optimum is the subset size with the lowest cross-validated MSE.
opt_index = np.argmin(mse_nfeat)
opt_features = n_feats[opt_index]
print("Optimal number of features: ", opt_features)

fig, axs = plt.subplots(1, 3, figsize=(15, 4), tight_layout=True)
axs[0].plot(n_feats, mse_nfeat)
axs[0].set_xlabel("features")
axs[0].set_ylabel("MSE")

axs[1].plot(n_feats, mae_nfeat)
axs[1].set_xlabel("features")
axs[1].set_ylabel("MAE")

axs[2].plot(n_feats, r2_nfeat)
axs[2].set_xlabel("features")
axs[2].set_ylabel("r^2")

plt.show()

# Fit the final model on all samples with the optimal number of features
regr = linear_model.LinearRegression()
fselection = RFE(regr, n_features_to_select = opt_features)
fselection.fit(x, y)

# x has no column names, so pass them explicitly to get the real predictor
# names instead of generated placeholders (x0, x1, ...).
print("Selected features: ", fselection.get_feature_names_out(features))

x_transformed = fselection.transform(x)
regr.fit(x_transformed, y)
print("Model coefficients: ", regr.coef_)
print("Model intercept: ", regr.intercept_)


### Regresión lineal con selección de características tipo filter

1. Uso de un método filter que utiliza la correlación de Pearson para calcular las 5 mejores características bajo este criterio.
2. Encontrar el número óptimo de características utilizando el método filter.


In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn import linear_model
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import SelectKBest, r_regression

# Import Diabetes dataset
diabetes = datasets.load_diabetes()
x = diabetes.data
y = diabetes.target
features = diabetes.feature_names
n_features = len(features)

# Candidate values of k: 1 .. n_features - 1 (1..9 for this data set).
n_feats = list(range(1, n_features))


def abs_pearson(X, y):
    """Score every column of X by the magnitude of its Pearson correlation with y.

    r_regression returns the signed coefficient and SelectKBest keeps the
    highest scores, so using it directly would rank strongly negatively
    correlated predictors last. The absolute value ranks by strength of
    linear association regardless of sign.
    """
    return np.abs(r_regression(X, y))


# One seeded splitter shared by every k: all values are scored on the same
# folds, so the comparison is paired and the chosen optimum is reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

mse_nfeat = []
mae_nfeat = []
r2_nfeat = []
for n_feat in n_feats:
    print('---- n features =', n_feat)

    mse_cv = []
    mae_cv = []
    r2_cv = []

    for train_index, test_index in kf.split(x):

        # Training phase
        x_train = x[train_index, :]
        y_train = y[train_index]

        # The filter is fitted on the training rows only (no leakage).
        fselection_cv = SelectKBest(abs_pearson, k = n_feat)
        fselection_cv.fit(x_train, y_train)
        x_train = fselection_cv.transform(x_train)

        regr_cv = linear_model.LinearRegression()
        regr_cv.fit(x_train, y_train)

        # Test phase
        x_test = fselection_cv.transform(x[test_index, :])
        y_test = y[test_index]
        y_pred = regr_cv.predict(x_test)

        mse_i = mean_squared_error(y_test, y_pred)
        mse_cv.append(mse_i)

        mae_i = mean_absolute_error(y_test, y_pred)
        mae_cv.append(mae_i)

        r2_i = r2_score(y_test, y_pred)
        r2_cv.append(r2_i)


    mse = np.average(mse_cv)
    mse_nfeat.append(mse)

    mae = np.average(mae_cv)
    mae_nfeat.append(mae)

    r2 = np.average(r2_cv)
    r2_nfeat.append(r2)

    print('MSE:', mse, '  MAE:', mae,'  R^2:', r2)

# The optimum is the k with the lowest cross-validated MSE.
opt_index = np.argmin(mse_nfeat)
opt_features = n_feats[opt_index]
print("Optimal number of features: ", opt_features)

fig, axs = plt.subplots(1, 3, figsize=(15, 4), tight_layout=True)
axs[0].plot(n_feats, mse_nfeat)
axs[0].set_xlabel("k")
axs[0].set_ylabel("MSE")

axs[1].plot(n_feats, mae_nfeat)
axs[1].set_xlabel("k")
axs[1].set_ylabel("MAE")

axs[2].plot(n_feats, r2_nfeat)
axs[2].set_xlabel("k")
axs[2].set_ylabel("r^2")

plt.show()

# Fit the final model on all samples with the optimal number of features
print("\n ----- Final model with optimal selection of number of features -----")

regr = linear_model.LinearRegression()
fselection = SelectKBest(abs_pearson, k = opt_features)
fselection.fit(x, y)

# x has no column names, so pass them explicitly to get the real predictor
# names instead of generated placeholders (x0, x1, ...).
print("Selected features: ", fselection.get_feature_names_out(features))

x_transformed = fselection.transform(x)
regr.fit(x_transformed, y)
print("Model coefficients: ", regr.coef_)
print("Model intercept: ", regr.intercept_)
