In [3]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [4]:
# Carga y vista preliminar de los datos
train = pd.read_csv('../Data/train.csv')
test  = pd.read_csv('../Data/test.csv')

print(train.head())
print(train.info())
print(train.isnull().sum())


   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [5]:
# Limpieza y transformación de los datos

# 3.1 Separar características y objetivo
x = train.drop(['SalePrice','Id'], axis=1)
y = train['SalePrice']

# 3.2 Identificar columnas numéricas y categóricas
numeric_features     = x.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_features = x.select_dtypes(include=['object']).columns.tolist()

# 3.3 Definir pipelines
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# 3.4 Combinar en ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# 3.5 Ajustar y transformar
x_processed    = preprocessor.fit_transform(x)
test_processed = preprocessor.transform(test.drop('Id', axis=1))

print("x_processed.shape:", x_processed.shape)
print("test_processed.shape:", test_processed.shape)

x_processed.shape: (1460, 287)
test_processed.shape: (1459, 287)


In [6]:
# División en train/validation y entrenamiento inicial
x_train, x_val, y_train, y_val = train_test_split(
    x_processed, y,
    test_size=0.2,
    random_state=42
)

model = RandomForestRegressor(random_state=42)
model.fit(x_train, y_train)

print("R² en validación inicial:", model.score(x_val, y_val))


R² en validación inicial: 0.8912261819155116


In [9]:
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
rand_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=20,  
    cv=3,
    scoring='r2',
    n_jobs=-1,
    verbose=2,
    random_state=42
)
rand_search.fit(x_processed, y)
print("Mejores parámetros:", rand_search.best_params_)
print("Mejor R² (CV):", rand_search.best_score_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Mejores parámetros: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None}
Mejor R² (CV): 0.8557121290747342


In [11]:
# Predicción sobre test y creación del archivo de envío
predictions = rand_search.predict(test_processed)

submission = pd.DataFrame({
    'Id': test['Id'],
    'SalePrice': predictions
})

submission.to_csv('submission.csv', index=False)
print("Archivo 'submission.csv' generado.")

Archivo 'submission.csv' generado.
