In [3]:
import os
import sys
os.chdir('/app')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

try:
    import xgboost as xgb
    HAS_XGBOOST = True
except ImportError:
    HAS_XGBOOST = False

pd.set_option('display.float_format', lambda x: f'{x:,.2f}')
plt.style.use('seaborn-v0_8-whitegrid')

In [5]:
# Load and prepare data
df = pd.read_csv('data/clean_listings.csv')
features = [
    'bedrooms', 'bathrooms', 'size_sqft',
    'amenity_score', 'has_parking', 'has_pool',
    'has_gym', 'has_security', 'has_garden',
    'is_land', 'location', 'property_type'
]
target = 'price_kes'

df_model = df[features + [target]].dropna().copy()
print(f"Records for modeling: {len(df_model)}")

le_location = LabelEncoder()
le_type = LabelEncoder()
df_model['location_enc'] = le_location.fit_transform(df_model['location'].astype(str))
df_model['property_type_enc'] = le_type.fit_transform(df_model['property_type'].astype(str))

feature_cols = [
    'bedrooms', 'bathrooms', 'size_sqft',
    'amenity_score', 'has_parking', 'has_pool',
    'has_gym', 'has_security', 'has_garden',
    'is_land', 'location_enc', 'property_type_enc'
]

X = df_model[feature_cols]
y = df_model[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Records for modeling: 614


In [7]:
# Train models
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = np.clip(lr.predict(X_test_scaled), 0, None)

rf = RandomForestRegressor(n_estimators=200, max_depth=12, min_samples_leaf=5, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = np.clip(rf.predict(X_test), 0, None)

models = {'Linear Regression': (lr, y_pred_lr, True), 'Random Forest': (rf, y_pred_rf, False)}

if HAS_XGBOOST:
    xgb_model = xgb.XGBRegressor(n_estimators=200, max_depth=6, learning_rate=0.1, random_state=42)
    xgb_model.fit(X_train, y_train)
    y_pred_xgb = np.clip(xgb_model.predict(X_test), 0, None)
    models['XGBoost'] = (xgb_model, y_pred_xgb, False)

In [8]:
# Model comparison
def eval_model(name, y_pred):
    return {
        'Model': name,
        'MAE': mean_absolute_error(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'R2': r2_score(y_test, y_pred)
    }

results = [eval_model(name, pred) for name, (_, pred, _) in models.item()]
comparison_df = pd.DataFrame(results)
comparison_df['MAE_KES'] = comparison_df['MAE'].apply(lambda x: f'{x:,.0f}')
comparison_df['RMSE_KES'] = comparison_df['RMSE'].apply(lambda x: f'{x:,.0f}')

print("\n" + "-" * 60)
print("MODEL COMPARISON")
print("-" * 60)
print(comparison_df[['Model', 'MAE_KES', 'RMSE_KES', 'R2']].to_string(index=False))

best_idx = comparison_df['R2'].idxmax()
best_model_name = comparison_df.loc[best_idx, 'Model']
print(f"\nBest model (by RÂ²): {best_model_name}")

os.makedirs('data', exist_ok=True)
comparison_df.to_csv('data/model_comparison.csv', index=False)
print("Saved: data/model_comparison.csv")


AttributeError: 'dict' object has no attribute 'item'