In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [2]:
# Read the raw property listings into a DataFrame. The path is relative, so it
# resolves against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer="../notebook/data/property_listings.csv")


In [3]:
# Build the modelling target: ROI = yearly rent as a percentage of the price.
# Rows with a non-positive price are removed first so the division is
# meaningful. The explicit .copy() gives the filtered frame its own data, so
# the two column assignments below do not write into a slice of the frame that
# was loaded (which emits SettingWithCopyWarning on pandas < 3).
df = df[df['price'] > 0].copy()
df['annual_rent_income'] = df['rentZestimate'] * 12
df['roi'] = (df['annual_rent_income'] / df['price']) * 100

# Infinite values anywhere in the frame become NaN, then rows whose ROI is
# missing (e.g. because rentZestimate was missing) are dropped.
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna(subset=['roi'])

# Only rows with an ROI strictly below 100 % are kept.
# NOTE(review): presumably a guard against outliers / bad listings -- confirm
# the threshold with whoever owns the data.
df = df[df['roi'] < 100]

In [4]:
# Remove columns that are not used as model inputs. errors='ignore' means a
# name that is absent from the frame is skipped instead of raising KeyError.
df = df.drop(
    [
        'zpid', 'streetAddress', 'state', 'county', 'url',
        'datePosted', 'dateSold', 'lastUpdated', 'livingAreaUnits',
    ],
    axis='columns',
    errors='ignore',
)

In [5]:
def parse_time_on_zillow(x):
    """Convert a "time on Zillow" label into a whole number of hours.

    Parameters
    ----------
    x : str or missing value
        Label whose first whitespace-separated token is an integer count and
        which mentions "hour" or "day", e.g. ``"5 hours"`` or ``"3 days"``.

    Returns
    -------
    int or float
        The count as an ``int`` for hour labels, the count times 24 for day
        labels, and ``np.nan`` when the value is missing, is not a string,
        mentions neither unit, or does not start with an integer.
    """
    # Anything that is not text (None, NaN, a stray number) cannot be parsed.
    # Checking the type up front also avoids the TypeError that a substring
    # test on a non-string would raise.
    if not isinstance(x, str):
        return np.nan

    if 'hour' in x:
        hours_per_unit = 1
    elif 'day' in x:
        hours_per_unit = 24
    else:
        return np.nan

    # A label such as "about 5 hours" has no leading integer; treat it as
    # unknown instead of letting one bad cell abort the whole column.
    try:
        count = int(x.split()[0])
    except (IndexError, ValueError):
        return np.nan
    return count * hours_per_unit

# When the raw text column is present, add its numeric equivalent
# ('timeOnZillowHours') and remove the text column.
if 'timeOnZillow' in df.columns:
    df = (
        df.assign(timeOnZillowHours=df['timeOnZillow'].apply(parse_time_on_zillow))
          .drop(columns=['timeOnZillow'])
    )

In [6]:
# Discard, in place, every row that is missing a value in any of the listed
# columns.
df.dropna(
    axis='index',
    how='any',
    subset=['price', 'rentZestimate', 'livingArea', 'bathrooms', 'bedrooms'],
    inplace=True,
)


In [7]:
# Fields to one-hot encode; names that are not in the frame are skipped.
categorical_cols = ['homeStatus', 'homeType', 'city', 'zipcode']
present_categoricals = [name for name in categorical_cols if name in df.columns]

# drop_first=True leaves out the first level's indicator column for each
# encoded field.
df = pd.get_dummies(df, columns=present_categoricals, drop_first=True)

In [8]:
# Target is the ROI column; every other column of the frame is a feature.
# NOTE(review): the features still include 'price', 'rentZestimate' and
# 'annual_rent_income', the values 'roi' was calculated from, so the model can
# reconstruct the target from its inputs -- confirm this is intended before
# trusting the evaluation scores.
y = df['roi']
X = df.drop(columns='roi')

# Fill remaining gaps with each column's mean.
# NOTE(review): the means are taken over all rows, i.e. before the train/test
# split, so held-out rows contribute to the imputed values.
X = X.fillna(value=X.mean())

In [9]:
# Hold out 20 % of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [10]:
# Fit a gradient-boosted regressor with 100 boosting stages on the training
# rows; the seed is fixed so repeated runs build the same model.
model = GradientBoostingRegressor(random_state=42, n_estimators=100)
model.fit(X=X_train, y=y_train)

In [11]:
# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
# RMSE is the square root of the mean squared error.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

In [12]:
# Report the three held-out metrics, one per line, rounded to two decimals.
print(
    f"📈 MAE:  {mae:.2f}",
    f"📉 RMSE: {rmse:.2f}",
    f"🎯 R²:   {r2:.2f}",
    sep="\n",
)

📈 MAE:  0.30
📉 RMSE: 0.73
🎯 R²:   0.98


In [13]:
# Persist the fitted model, and separately the ordered list of feature column
# names it was trained on, to the current working directory.
joblib.dump(value=model, filename='gradient_boosting_roi_model.pkl')
joblib.dump(value=X.columns.tolist(), filename='model_features.pkl')

['model_features.pkl']