# Análisis de Datos — Regresión

**Objetivo:** análisis exploratorio y modelo de regresión.

**Dataset:** `fetch_california_housing` (si no está disponible, usar `make_regression`).


In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing, make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Load the California Housing data as a single DataFrame whose last column is
# 'target'. If loading fails for any reason, fall back to a synthetic
# regression problem so the remaining cells can still run.
try:
    data = fetch_california_housing(as_frame=True)
    df = pd.concat([data.data, data.target.rename('target')], axis=1)
except Exception as exc:
    # Best-effort fallback, but never silent: every metric and plot below
    # would otherwise be read as if it described the real dataset.
    print(
        f'Aviso: no se pudo cargar California Housing '
        f'({type(exc).__name__}: {exc}); se usan datos sintéticos de make_regression.'
    )
    # Fixed seed so the synthetic data is reproducible across runs.
    X, y = make_regression(n_samples=2000, n_features=8, noise=15, random_state=42)
    cols = [f'x{i}' for i in range(X.shape[1])]
    df = pd.DataFrame(X, columns=cols)
    df['target'] = y

# Last expression of the cell: the notebook displays the first rows.
df.head()


In [None]:
# Separate the response column from the predictor columns.
y = df['target']
X = df.drop(columns=['target'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# Fit the linear model on the scaled training features and predict the
# held-out rows.
reg = LinearRegression().fit(X_train_s, y_train)
y_pred = reg.predict(X_test_s)

# Score the predictions against the held-out targets.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print('MSE:', mse, 'R2:', r2)


In [None]:
# Scatter plot of y_test (x-axis) against y_pred (y-axis).
plt.figure()
# Semi-transparent markers keep overlapping points distinguishable.
plt.scatter(y_test, y_pred, alpha=0.5)
plt.xlabel('Real')
plt.ylabel('Predicho')
plt.title('Real vs Predicho')
plt.show()


In [None]:
# Closing prompt shown to the reader: what to discuss after reviewing the results.
closing_note = 'Conclusión: analiza el R2, posibles variables relevantes y mejoras (regularización, no-linealidad, etc.).'
print(closing_note)