In [None]:
# 1. Librerías
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing

# 2. Load dataset
data = fetch_california_housing(as_frame=True)
df = data.frame
# Only the last expression of a cell is rendered automatically, so a bare
# `df.head()` in the middle of the cell would be silently discarded.
# display() (provided as a builtin by the IPython kernel) renders it explicitly.
display(df.head())

# 3. General information
df.info()
# Same reason as above: wrap in display() so the summary table is actually shown.
display(df.describe())

# 4. Missing values
print("Valores nulos por columna:")
print(df.isnull().sum())

# 5. Variable distributions
df.hist(figsize=(14,10), bins=30)
plt.tight_layout()
plt.show()

# 6. Boxplots
# NOTE(review): all columns share one axis here; columns with a much larger
# range may visually flatten the others -- confirm this is the intended view.
plt.figure(figsize=(16,8))
sns.boxplot(data=df, orient="h")
plt.title("Boxplots de las variables")
plt.show()

# 7. Correlation
plt.figure(figsize=(10,6))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.title("Mapa de calor de correlaciones")
plt.show()



In [None]:
# 1. Librerías
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# 2. Load data
data = fetch_california_housing(as_frame=True)
df = data.frame

X = df.drop('MedHouseVal', axis=1)
y = df['MedHouseVal']

# 3. Train/test split
# Split BEFORE any data-dependent step (feature ranking, scaling) so that no
# information from the held-out rows leaks into the training pipeline.
# The row partition depends only on the number of samples and the seed, so it
# is the same partition the previous "scale first, split later" order produced.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4. Mutual information, estimated on the training rows only.
# random_state is fixed so the ranking (and therefore the selected columns)
# is reproducible across runs.
mi = mutual_info_regression(X_train_raw, y_train, random_state=42)
mi_series = pd.Series(mi, index=X.columns).sort_values(ascending=False)

# 5. Visualization
mi_series.plot(kind='bar', figsize=(10,6), title="Información mutua con el target")
plt.show()

# 6. Keep the six highest-ranked features
top_features = mi_series.head(6).index
X_selected = X[top_features]

# 7. Scaling
# Fit the scaler on the training rows only, then apply the same transform to
# the test rows; fitting on the full dataset would leak test-set statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw[top_features])
X_test = scaler.transform(X_test_raw[top_features])
# Kept so that code relying on the `X_scaled` name still works; it now uses
# the train-fitted scaler instead of one fitted on every row.
X_scaled = scaler.transform(X_selected)

# 8. Save datasets
pd.DataFrame(X_train, columns=top_features).to_csv("X_train.csv", index=False)
pd.DataFrame(X_test, columns=top_features).to_csv("X_test.csv", index=False)
y_train.to_csv("y_train.csv", index=False)
y_test.to_csv("y_test.csv", index=False)


In [None]:
# 1. Librerías
import pandas as pd
!pip install lazypredict -q
from lazypredict.Supervised import LazyRegressor
import mlflow

# 2. Cargar datos
X_train = pd.read_csv("X_train.csv")
X_test = pd.read_csv("X_test.csv")
y_train = pd.read_csv("y_train.csv").squeeze()
y_test = pd.read_csv("y_test.csv").squeeze()

# 3. LazyPredict
reg = LazyRegressor(verbose=0, ignore_warnings=True)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

# 4. Resultados
print(models)
models.to_csv("model_comparison.csv")


In [None]:
# 1. Librerías
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import mlflow
import mlflow.sklearn

# 2. Load the train/test splits from the CSV files in the working directory
X_train, X_test = (pd.read_csv(path) for path in ("X_train.csv", "X_test.csv"))
# squeeze() collapses each target frame (expected to hold one column) to a Series
y_train, y_test = (
    pd.read_csv(path).squeeze() for path in ("y_train.csv", "y_test.csv")
)

# 3. Final model: fit on the training split, predict on the held-out split
model = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# 4. Metrics computed on the held-out split
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

for label, value in (("Mean Squared Error:", mse), ("Mean Absolute Error:", mae)):
    print(label, value)

# 5. MLflow logging: one run holding the model name, both metrics and the model
with mlflow.start_run():
    mlflow.log_param("model", "GradientBoostingRegressor")
    for metric_name, metric_value in (("MSE", mse), ("MAE", mae)):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(model, "gradient_boosting_model")

print("Modelo registrado en MLflow correctamente")
