In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Build a reproducible synthetic house-price dataset.
# The legacy global NumPy RNG is seeded once; the draws below are consumed in a
# fixed order (size, bedrooms, bathrooms, age, location, noise), so reordering
# them changes every number produced further down.
n_samples = 3000
np.random.seed(42)

# Living area: normal draw (mean 2000, sd 500) limited to the 800-4000 sqft band.
size_sqft = np.random.normal(loc=2000, scale=500, size=n_samples).clip(800, 4000)

# Room counts are categorical draws with explicit probabilities.
bedroom_options, bedroom_probs = [2, 3, 4, 5], [0.2, 0.4, 0.3, 0.1]
bathroom_options, bathroom_probs = [1, 2, 3, 4], [0.1, 0.5, 0.3, 0.1]
num_bedrooms = np.random.choice(bedroom_options, size=n_samples, p=bedroom_probs)
num_bathrooms = np.random.choice(bathroom_options, size=n_samples, p=bathroom_probs)

# Age (0-50 years) and location score (1-10) are uniform draws.
age_years = np.random.uniform(low=0, high=50, size=n_samples)
location_score = np.random.uniform(low=1, high=10, size=n_samples)

# Price is a linear combination of the features; each term is named so the
# dollar contribution of every feature is visible on its own line.
size_value = size_sqft * 150              # $150 per sqft
bedroom_value = num_bedrooms * 15000      # $15k per bedroom
bathroom_value = num_bathrooms * 10000    # $10k per bathroom
age_value = (50 - age_years) * 2000       # newer houses are worth more
location_value = location_score * 8000    # location premium
base_price = size_value + bedroom_value + bathroom_value + age_value + location_value

# Gaussian noise (sd $25k) on top of the deterministic price, then clamp the
# result to the $100k-$800k range.
price_noise = np.random.normal(loc=0, scale=25000, size=n_samples)
price_usd = (base_price + price_noise).clip(100000, 800000)

# Assemble the frame; the column order follows the order of the pairs below.
column_names = [
    'size_sqft', 'num_bedrooms', 'num_bathrooms',
    'age_years', 'location_score', 'price_usd',
]
column_values = [
    size_sqft, num_bedrooms, num_bathrooms,
    age_years, location_score, price_usd,
]
data = pd.DataFrame(dict(zip(column_names, column_values)))

# Quick sanity summary of what was generated.
price_summary = data['price_usd'].describe()
print("Dataset shape:", data.shape)
print("Price statistics:")
print(price_summary)

# Separate the predictors from the target column, then hold out 30% of the
# rows for evaluation (seeded so the split is repeatable).
target_column = 'price_usd'
X = data.drop(columns=target_column)
y = data[target_column]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a seeded 100-tree random forest on the training rows; fit() hands back
# the estimator itself, so construction and training are chained.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict prices for the held-out rows.
y_pred = rf_model.predict(X_test)

# Score the held-out predictions with the usual regression error metrics.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root puts the error back on the target's scale

# MAPE: mean of |actual - predicted| / actual, expressed as a percentage.
relative_errors = np.abs((y_test - y_pred) / y_test)
mape = np.mean(relative_errors) * 100

# Build the whole report first and write it out in one go.
# NOTE: MSE is a squared quantity, so the "$" on that line is only a loose
# label; it is kept as-is so the printed report is unchanged.
report_lines = [
    "\nRegression Model Performance:",
    f"Mean Absolute Error (MAE): ${mae:,.2f}",
    f"Mean Squared Error (MSE): ${mse:,.2f}",
    f"Root Mean Squared Error (RMSE): ${rmse:,.2f}",
    f"R-squared (R²): {r2:.4f}",
    f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%",
]
print("\n".join(report_lines))

# Pair each input column with the fitted forest's feature_importances_ score
# and list them from most to least important.
importance_by_feature = {
    'feature': X.columns,
    'importance': rf_model.feature_importances_,
}
feature_importance = pd.DataFrame(importance_by_feature).sort_values(
    by='importance',
    ascending=False,
)

print("\nFeature Importance:")
print(feature_importance)

Dataset shape: (3000, 6)
Price statistics:
count      3000.000000
mean     469514.664897
std       85672.544915
min      211815.972417
25%      409658.061360
50%      468933.653633
75%      527952.964961
max      741442.730408
Name: price_usd, dtype: float64

Regression Model Performance:
Mean Absolute Error (MAE): $22,524.01
Mean Squared Error (MSE): $824,707,844.76
Root Mean Squared Error (RMSE): $28,717.73
R-squared (R²): 0.8836
Mean Absolute Percentage Error (MAPE): 4.99%

Feature Importance:
          feature  importance
0       size_sqft    0.746981
3       age_years    0.143051
4  location_score    0.082270
1    num_bedrooms    0.017988
2   num_bathrooms    0.009710
