In [1]:
## Import required libraries
import warnings

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score, classification_report
)
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.svm import SVC, SVR

In [2]:
## Ignorar warnings para una salida más limpia
warnings.filterwarnings('ignore')

In [3]:
# Load the California Housing dataset
california = fetch_california_housing()
y = pd.Series(california.target, name='MedHouseVal')  # house-value outcome

# Feature table with the outcome appended as its last column
X = pd.DataFrame(california.data, columns=california.feature_names).assign(MedHouseVal=y)

In [4]:
# Preview the first rows of the combined table (features plus MedHouseVal)
print(X.head())

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  


In [5]:
## Drop the rows whose outcome (MedHouseVal) is missing
data_clean = X.dropna(subset=['MedHouseVal'])

# Dropping rows is not the best way to deal with missing values; consider imputation techniques instead.
# NOTE(review): the following cells build `features` from X rather than from data_clean — confirm which one is intended.

In [6]:
# Predictors kept for modelling; every other column of X is left out
selected_columns = ['MedInc', 'HouseAge', 'AveRooms', 'Population', 'AveOccup']
features = X.loc[:, selected_columns]

In [7]:
# Guard against missing values in the predictors

# True for every row with at least one NaN among the selected features
X_isna = features.isna().any(axis=1)

# Keep only the complete rows, applying the same row mask to predictors and outcome
features = features.loc[~X_isna]
y = y.loc[~X_isna]

In [8]:
# Data preprocessing: split the feature columns by dtype
# (`selector` is make_column_selector, imported in the notebook's import cell)

numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

# Apply each selector to the feature table to obtain the matching columns
numerical_columns = numerical_columns_selector(features)
categorical_columns = categorical_columns_selector(features)

In [9]:
# One preprocessor per variable type
# (OneHotEncoder, StandardScaler and ColumnTransformer come from the notebook's import cell)

# Categories unseen during fit are ignored at transform time instead of raising
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

# Route each group of columns to its own preprocessor
preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('standard_scaler', numerical_preprocessor, numerical_columns)])

In [10]:
## 70 / 30 train/test split
# random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features, y, test_size=0.30, random_state=86)

In [11]:
## Classification setup: binarise the continuous outcome
# Two quantile bins with ordinal encoding, so the labels are 0 and 1
discretizer = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='quantile')

# The bin edges are learned from the training targets only and reused for the test targets
y_train_class = (
    discretizer
    .fit_transform(y_train.to_numpy().reshape(-1, 1))
    .astype(int)
    .ravel()
)
y_test_class = (
    discretizer
    .transform(y_test.to_numpy().reshape(-1, 1))
    .astype(int)
    .ravel()
)

In [12]:
## Helper that prints the classification metrics of one model
def print_classification_metrics(y_true, y_pred, model_name):
    """Print a text summary of classification quality for one model.

    Writes a header containing ``model_name``, then accuracy, precision,
    recall and F1 (the last three computed with ``average='weighted'``),
    each formatted to four decimals, followed by the full classification
    report and a blank-line separator.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Labels predicted by the model, in the same order as ``y_true``.
        model_name: Text shown in the header line.

    Returns:
        None. Everything is written to standard output.
    """
    # Metrics that share the same call signature and averaging mode
    weighted_scorers = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-Score", f1_score),
    )

    print(f"--- {model_name} ---")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    for label, scorer in weighted_scorers:
        print(f"{label}: {scorer(y_true, y_pred, average='weighted'):.4f}")
    print("Classification Report:")
    print(classification_report(y_true, y_pred))
    # Two newlines in total, separating this report from the next one
    print(end="\n\n")

In [13]:
# Baseline for comparison: logistic regression

# Pipeline: the column preprocessor followed by the classifier
model1 = make_pipeline(preprocessor, LogisticRegression(max_iter=500))
# Bare expression so the notebook displays the pipeline
model1

In [14]:
# Logistic regression: fit on the training split, evaluate on the held-out split

model1.fit(X_train, y_train_class)
logistic = model1  # Pipeline.fit returns the pipeline itself, so this is the fitted model
y_pred1 = model1.predict(X_test)
print_classification_metrics(y_true=y_test_class, y_pred=y_pred1, model_name="LR Clasificación")

--- LR Clasificación ---
Accuracy: 0.7980
Precision: 0.7981
Recall: 0.7980
F1-Score: 0.7979
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.81      0.80      3111
           1       0.80      0.78      0.79      3081

    accuracy                           0.80      6192
   macro avg       0.80      0.80      0.80      6192
weighted avg       0.80      0.80      0.80      6192





In [15]:
# Naive Bayes
# Pipeline: the column preprocessor followed by a Gaussian Naive Bayes classifier
model2 = make_pipeline(preprocessor, GaussianNB())
# Bare expression so the notebook displays the pipeline
model2

In [16]:
# Naive Bayes: fit on the training split, evaluate on the held-out split
model2.fit(X_train, y_train_class)
nb = model2  # Pipeline.fit returns the pipeline itself, so this is the fitted model
y_pred2 = model2.predict(X_test)
print_classification_metrics(y_true=y_test_class, y_pred=y_pred2, model_name="Naive Bayes Clasificación")

--- Naive Bayes Clasificación ---
Accuracy: 0.6852
Precision: 0.7650
Recall: 0.6852
F1-Score: 0.6590
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.96      0.75      3111
           1       0.91      0.41      0.56      3081

    accuracy                           0.69      6192
   macro avg       0.77      0.68      0.66      6192
weighted avg       0.77      0.69      0.66      6192





In [17]:
# SVM
# RBF-kernel support vector classifier behind the column preprocessor.
# probability=True is intentionally not set: this model is only used through
# .predict(), and enabling probability estimates makes fit() run an additional
# internal 5-fold cross-validation. Re-enable it if predict_proba is needed.
model3 = make_pipeline(preprocessor, SVC(kernel='rbf'))
# Bare expression so the notebook displays the pipeline
model3

In [18]:
# SVM: fit on the training split, evaluate on the held-out split
model3.fit(X_train, y_train_class)
svc = model3  # Pipeline.fit returns the pipeline itself, so this is the fitted model
y_pred3 = model3.predict(X_test)
print_classification_metrics(y_true=y_test_class, y_pred=y_pred3, model_name="SVM Clasificación")

--- SVM Clasificación ---
Accuracy: 0.8182
Precision: 0.8185
Recall: 0.8182
F1-Score: 0.8181
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.84      0.82      3111
           1       0.83      0.80      0.81      3081

    accuracy                           0.82      6192
   macro avg       0.82      0.82      0.82      6192
weighted avg       0.82      0.82      0.82      6192





In [19]:
# Random Forest
# Pipeline: the column preprocessor followed by a 200-tree forest; random_state is fixed for reproducibility
model4 = make_pipeline(preprocessor, RandomForestClassifier(n_estimators=200, random_state=86))
# Bare expression so the notebook displays the pipeline
model4

In [20]:
# Random Forest: fit on the training split, evaluate on the held-out split
model4.fit(X_train, y_train_class)
rforest = model4  # Pipeline.fit returns the pipeline itself, so this is the fitted model
y_pred4 = model4.predict(X_test)
print_classification_metrics(y_true=y_test_class, y_pred=y_pred4, model_name="RForest Clasificación")

--- RForest Clasificación ---
Accuracy: 0.8246
Precision: 0.8251
Recall: 0.8246
F1-Score: 0.8245
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.84      0.83      3111
           1       0.84      0.80      0.82      3081

    accuracy                           0.82      6192
   macro avg       0.83      0.82      0.82      6192
weighted avg       0.83      0.82      0.82      6192





In [None]:
# XGBoost — TODO: model not implemented yet; this cell is still empty

**Comentario.** El código se ejecuta correctamente y las métricas obtenidas también son buenas.