In [29]:
import pandas as pd

# Load your dataset
df = pd.read_csv("house_prices.csv")

# Quick look
print(df.shape)       # rows, columns
print(df.head())      # first rows

(21613, 21)
           id             date     price  bedrooms  bathrooms  sqft_living  \
0  7129300520  20141013T000000  221900.0         3       1.00         1180   
1  6414100192  20141209T000000  538000.0         3       2.25         2570   
2  5631500400  20150225T000000  180000.0         2       1.00          770   
3  2487200875  20141209T000000  604000.0         4       3.00         1960   
4  1954400510  20150218T000000  510000.0         3       2.00         1680   

   sqft_lot  floors waterfront  view  ... grade  sqft_above  sqft_basement  \
0      5650     1.0          N     0  ...     7        1180              0   
1      7242     2.0          N     0  ...     7        2170            400   
2     10000     1.0          N     0  ...     6         770              0   
3      5000     1.0          N     0  ...     7        1050            910   
4      8080     1.0          N     0  ...     8        1680              0   

   yr_built  yr_renovated  zipcode      lat     lo

In [30]:
X = df.drop("price", axis=1)  # features
Y = df["price"]               # target

In [31]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)

Numeric features: ['id', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'view', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15']
Categorical features: ['date', 'waterfront', 'condition']


In [32]:
from sklearn.impute import SimpleImputer

num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="most_frequent")

In [33]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Categorical pipeline: fill missing values → encode
cat_pipeline = Pipeline(steps=[
    ("imputer", cat_imputer),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_imputer, numeric_features),
        ("cat", cat_pipeline, categorical_features)
    ]
)

In [34]:
from sklearn.tree import DecisionTreeRegressor

dt_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", DecisionTreeRegressor(
        random_state=1,
        max_depth=5,        # limit depth
        min_samples_leaf=4  # minimum samples per leaf
    ))
])

In [35]:
from sklearn.model_selection import train_test_split

# Split data (80% train, 20% test is standard)
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1
)

In [36]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Fit on training data
dt_pipeline.fit(X_train, y_train)

# Predict
y_pred = dt_pipeline.predict(X_test)

# Evaluate
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("Decision Tree MSE:", mse)
print("Decision Tree R²:", r2)
print("Decision Tree MAE:", mae)

Decision Tree MSE: 50239239881.09809
Decision Tree R²: 0.7089284053841449
Decision Tree MAE: 115436.12636967892


In [37]:
from sklearn.ensemble import RandomForestRegressor

# Random Forest pipeline
rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(
        n_estimators=100,    # number of trees
        random_state=1,
        n_jobs=-1            # use all CPU cores
    ))
])
# Fit and predict
rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)

# Evaluate
print("Random Forest MSE:", mean_squared_error(y_test, y_pred_rf))
print("Random Forest R²:", r2_score(y_test, y_pred_rf))
print("Random Forest MAE:", mean_absolute_error(y_test, y_pred_rf))

Random Forest MSE: 24236123248.269356
Random Forest R²: 0.8595829264559731
Random Forest MAE: 72590.71894286375


In [39]:
print(Y.mean())

540088.1417665294


In [42]:
import matplotlib.pyplot as plt

# Colors
actual_color = 'black'
dt_color = 'blue'
rf_color = 'green'

plt.figure(figsize=(8,6))

# Plot perfect prediction line
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2, label='Perfect Prediction')

# Scatter plots
plt.scatter(y_test, y_test, color=actual_color, alpha=0.3, label='Actual Prices')  # actuals along y=x line
plt.scatter(y_test, y_pred, color=dt_color, alpha=0.6, label='Decision Tree')
plt.scatter(y_test, y_pred_rf, color=rf_color, alpha=0.6, label='Random Forest')

# Labels and title
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.title('Predicted vs Actual Prices: Decision Tree vs Random Forest')
plt.legend()
plt.grid(True)

# Save figure
plt.savefig('actual_dt_rf_pred_vs_actual.png')
plt.close()