In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.stats import spearmanr
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_squared_error

import numpy as np

## 1.1

### Cargar el dataset

In [47]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data"
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
df = pd.read_csv(url, header=None, delimiter=r"\s+", names=column_names)

# Muestra las primeras filas del DataFrames
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


### Dividir el dataset

In [48]:
# split the df into 80 20 using train_test_split
X = df.drop('MEDV', axis=1)
y = df['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Modelo 1

In [49]:
# Estandarizar las variables
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Convertir a DataFrame
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)

### Distancia de Mahalanobis
Usando el vector de medianas y la matriz de correlación de Spearman para calcular la Distancia de Mahalanobis.

In [50]:
def mahalanobis_distance(x, medians, inv_cov_matrix):
    delta = x - medians
    return np.sqrt(np.dot(np.dot(delta, inv_cov_matrix), delta))


In [51]:
# matriz de correlación de Spearman
spearman_corr = spearmanr(X_train_scaled).correlation

# Invertir la matriz de correlación para usar en la fórmula de la distancia de Mahalanobis
inv_cov_matrix = np.linalg.inv(spearman_corr)

# Calcular el vector de medianas
medians = X_train_scaled.median().values

# Aplicar la función mahalanobis_distance a cada fila del DataFrame
X_train_scaled['mahalanobis'] = X_train_scaled.apply(mahalanobis_distance, axis=1, args=(medians, inv_cov_matrix))

X_train_scaled.head()


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,mahalanobis
0,-0.372574,-0.499608,-0.704925,3.664502,-0.424879,0.935678,0.693669,-0.437218,-0.162242,-0.561656,-0.484638,0.371691,-0.411,4.359109
1,-0.397099,-0.499608,-0.044878,-0.272888,-1.241859,-0.491181,-1.835528,0.730055,-0.624648,-0.573376,0.336491,0.205012,-0.387681,3.694088
2,-0.402693,0.771168,-0.88676,-0.272888,-1.11149,0.857849,-0.524621,1.234769,-0.393445,-0.602677,-0.849584,0.366609,-0.181919,3.285733
3,-0.405769,0.029882,-0.465819,-0.272888,-0.277127,-0.417676,-0.086464,0.861527,-0.509046,-0.538216,-1.48824,0.415732,-0.037886,3.422002
4,2.774932,-0.499608,0.998884,-0.272888,1.070021,-1.438097,0.715042,-1.021528,1.687378,1.542121,0.792674,-2.613393,1.911362,5.19149


In [52]:
X_train_scaled.shape
y_train.shape

(404,)

In [53]:
# Filtrar atípicos
threshold = np.percentile(X_train_scaled['mahalanobis'], 95)
X_train_clean = X_train_scaled[X_train_scaled['mahalanobis'] < threshold]

# Filtrar los valores de y_train correspondientes a los outliers
outliers = X_train_scaled[X_train_scaled['mahalanobis'] >= threshold].index
all_positions = set(range(len(y_train)))
positions_to_keep = list(all_positions - set(outliers))
y_train_clean = y_train.iloc[positions_to_keep]

In [54]:
X_train_clean = X_train_clean.drop('mahalanobis', axis=1)
X_train_clean.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.372574,-0.499608,-0.704925,3.664502,-0.424879,0.935678,0.693669,-0.437218,-0.162242,-0.561656,-0.484638,0.371691,-0.411
1,-0.397099,-0.499608,-0.044878,-0.272888,-1.241859,-0.491181,-1.835528,0.730055,-0.624648,-0.573376,0.336491,0.205012,-0.387681
2,-0.402693,0.771168,-0.88676,-0.272888,-1.11149,0.857849,-0.524621,1.234769,-0.393445,-0.602677,-0.849584,0.366609,-0.181919
3,-0.405769,0.029882,-0.465819,-0.272888,-0.277127,-0.417676,-0.086464,0.861527,-0.509046,-0.538216,-1.48824,0.415732,-0.037886
4,2.774932,-0.499608,0.998884,-0.272888,1.070021,-1.438097,0.715042,-1.021528,1.687378,1.542121,0.792674,-2.613393,1.911362


### Modelo Elastic-Net

In [55]:
# Ranggos de hiperparámetros
l1_ratios = np.linspace(0.1, 0.9, 9)
alphas = np.logspace(-4, 0, 10)

In [56]:
#modelo ElasticNetCV
elastic_net_cv = ElasticNetCV(l1_ratio=l1_ratios, alphas=alphas, cv=5, random_state=0)
elastic_net_cv.fit(X_train_clean, y_train_clean)

In [57]:
# Mejores parámetros encontrados
print("Mejor l1_ratio:", elastic_net_cv.l1_ratio_)
print("Mejor alpha:", elastic_net_cv.alpha_)

Mejor l1_ratio: 0.1
Mejor alpha: 0.046415888336127774


#### Ecuación Modelo 1

In [61]:
# Coeficientes del modelo
coefficients = elastic_net_cv.coef_

# Intercepción
intercept = elastic_net_cv.intercept_

# Nombres de las columnas (variables)
features = X_train.columns

# Construir la ecuación en formato de texto
regression_eq = f"MEDV = {intercept:.4f}"
for coef, feature in zip(coefficients, features):
    regression_eq += f" + ({coef:.4f})*{feature}"

print("Ecuación de regresión:")
print(regression_eq)


Ecuación de regresión:
MEDV = 22.5676 + (-0.6722)*CRIM + (0.5963)*ZN + (-0.1327)*INDUS + (0.8066)*CHAS + (-1.3970)*NOX + (3.2617)*RM + (-0.2837)*AGE + (-2.3649)*DIS + (1.0197)*RAD + (-1.0033)*TAX + (-2.0776)*PTRATIO + (0.7784)*B + (-3.2308)*LSTAT


### Evaluar el Modelo 1

In [58]:
# Estandarizar el conjunto de prueba
X_test_scaled = scaler.transform(X_test)

# Realizar predicciones con el modelo ajustado
y_pred = elastic_net_cv.predict(X_test_scaled)



In [59]:
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("RMSE en el conjunto de prueba:", rmse)

RMSE en el conjunto de prueba: 5.8309678417663315


- El modelo 1 tiene un RMSE de **5.83**
- El RMSE de 5.79 es menor que la desviación estándar de MEDV (9.07). Esto es una buena señal y sugiere que tu modelo está realizando predicciones razonablemente precisas. Generalmente, si el RMSE es menor que la desviación estándar, se considera que el modelo tiene una precisión aceptable.
- El RMSE es aproximadamente el 26% del valor medio de MEDV (5.79 / 22.22 ≈ 0.26). Esto nos da una idea de que el error promedio de las predicciones es una cuarta parte del valor medio de las propiedades, lo cual puede ser aceptable dependiendo de tus requisitos de precisión.

In [60]:
# get average and scale of MEDV from the test set
average_MEDV = y_test.mean()
scale_MEDV = y_test.std()
print("Average MEDV:", average_MEDV)
print("Scale MEDV:", scale_MEDV)

Average MEDV: 22.21960784313725
Scale MEDV: 9.068332880459835
