In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import joblib

# Load data (download from Kaggle)
df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')  # For later submission

# Simple preprocessing: drop the identifier and a few selected columns, then fill NaNs.
# errors='ignore' lets the drop succeed even if some of these columns are absent.
df = df.drop(['Id', 'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'], axis=1, errors='ignore')

# Replacement value for missing entries in each column: the column median for
# LotFrontage, a literal 'None' category for the string columns, and 0 for the
# remaining numeric columns.
fill_values = {
    'LotFrontage': df['LotFrontage'].median(),
    'GarageType': 'None',
    'GarageYrBlt': 0,
    'MasVnrType': 'None',
    'MasVnrArea': 0,
    'BsmtFinType2': 'None',
}
# Assign the filled column back instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form acts on a
# temporary column object, triggers a FutureWarning on recent pandas, and
# leaves `df` unchanged once Copy-on-Write is enabled.
for col, value in fill_values.items():
    df[col] = df[col].fillna(value)

# Encode categoricals
# Every object-dtype column gets its own LabelEncoder. A single encoder refit
# in the loop would only remember the classes of the last column, so the
# mapping for all other columns could not be reproduced later.
encoders = {}
# `le` always ends up bound to the encoder of the last encoded column (or to
# an unfitted encoder when there are no object columns), matching what a
# single shared encoder would hold after the loop.
le = LabelEncoder()
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    le = LabelEncoder()
    # astype(str) turns missing values into the string 'nan', which is then
    # encoded as a regular category.
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le

# Persist the complete column -> encoder mapping so the same encoding can be
# re-applied to new data.
joblib.dump(encoders, 'label_encoders.pkl')

# Model the natural log of the sale price rather than the raw value
df['SalePrice'] = np.log(df['SalePrice'])

# Separate the target from the predictors, then hold out 20% of the rows
# for evaluation (fixed seed so the split is reproducible)
y = df['SalePrice']
X = df.drop(columns=['SalePrice'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest with a fixed seed
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Root-mean-squared error on the held-out rows; because the target was
# log-transformed above, this error is in log-price units
squared_errors = (model.predict(X_test) - y_test) ** 2
print("Test RMSE:", np.sqrt(squared_errors.mean()))

# Write the fitted model and the label encoder to disk.
# NOTE: `le` holds only the fit for the last categorical column encoded above.
joblib.dump(model, 'house_model.pkl')
joblib.dump(le, 'label_encoder.pkl')

Test RMSE: 0.1457553971751057


['label_encoder.pkl']