In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

In [2]:
# Cargar datos
data = pd.read_csv('train.csv')

# Crear variable categórica basada en rangos de precios
bins = [0, 150000, 300000, float('inf')]
labels = ['Económica', 'Intermedia', 'Cara']
data['PriceCategory'] = pd.cut(data['SalePrice'], bins=bins, labels=labels)

# Crear variables dicotómicas
data['is_expensive'] = (data['PriceCategory'] == 'Cara').astype(int)
data['is_medium'] = (data['PriceCategory'] == 'Intermedia').astype(int)
data['is_economic'] = (data['PriceCategory'] == 'Económica').astype(int)

# Separar características y objetivo
X = data.drop(['SalePrice', 'PriceCategory', 'is_expensive', 'is_medium', 'is_economic'], axis=1)
y = data['PriceCategory']  # Para modelo multiclase
y_binary = data['is_expensive']  # Para modelo binario

# Preprocesamiento
# Convertir variables categóricas a dummy variables
X = pd.get_dummies(X)

# Manejar valores faltantes
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Dividir datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# Estandarizar características
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train_binary = scaler.fit_transform(X_train_binary)
X_test_binary = scaler.transform(X_test_binary)

In [3]:
# Crear y entrenar modelo
log_reg_binary = LogisticRegression(max_iter=1000, random_state=42)
log_reg_binary.fit(X_train_binary, y_train_binary)

# Validación cruzada
cv_scores = cross_val_score(log_reg_binary, X_train_binary, y_train_binary, cv=5)
print(f"Puntuaciones de validación cruzada: {cv_scores}")
print(f"Precisión media: {cv_scores.mean():.4f}")

# Evaluar en conjunto de prueba
y_pred_binary = log_reg_binary.predict(X_test_binary)
y_proba_binary = log_reg_binary.predict_proba(X_test_binary)[:, 1]

# Métricas de evaluación
print("\nMatriz de Confusión:")
print(confusion_matrix(y_test_binary, y_pred_binary))
print("\nReporte de Clasificación:")
print(classification_report(y_test_binary, y_pred_binary))
print(f"AUC-ROC: {roc_auc_score(y_test_binary, y_proba_binary):.4f}")

# Análisis de coeficientes para identificar variables importantes
coef_df = pd.DataFrame({'Variable': X.columns, 'Coeficiente': log_reg_binary.coef_[0]})
coef_df = coef_df.sort_values(by='Coeficiente', ascending=False)
print("\nVariables más importantes:")
print(coef_df.head(10))

Puntuaciones de validación cruzada: [0.95726496 0.95299145 0.96581197 0.9527897  0.95708155]
Precisión media: 0.9572

Matriz de Confusión:
[[260   8]
 [  4  20]]

Reporte de Clasificación:
              precision    recall  f1-score   support

           0       0.98      0.97      0.98       268
           1       0.71      0.83      0.77        24

    accuracy                           0.96       292
   macro avg       0.85      0.90      0.87       292
weighted avg       0.96      0.96      0.96       292

AUC-ROC: 0.9684

Variables más importantes:
                 Variable  Coeficiente
4             OverallQual     1.232959
27             GarageArea     0.811316
16              GrLivArea     0.722236
17           BsmtFullBath     0.711864
51        LandContour_HLS     0.711052
167          ExterQual_Ex     0.676272
86   Neighborhood_StoneBr     0.664958
53        LandContour_Lvl     0.646876
19               FullBath     0.617422
13               1stFlrSF     0.589312
