In [3]:
# Notebook setup: working directory, imports, optional XGBoost, display defaults.
import os
import sys
# Make /app the working directory; relative paths used after this point
# resolve against it.
# NOTE(review): hard-coded absolute path — presumably the project/container
# root; confirm before running elsewhere.
os.chdir('/app')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import warnings
# Suppress every warning for the rest of the session, including any raised by
# the libraries imported below.
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# XGBoost is optional: HAS_XGBOOST records whether the import succeeded so
# code can check the flag instead of failing when the package is missing.
# When the import fails, the name `xgb` is not defined.
try:
    import xgboost as xgb
    HAS_XGBOOST = True
except ImportError:
    HAS_XGBOOST = False

# Show floats with thousands separators and two decimals (e.g. 1,234,567.89).
pd.set_option('display.float_format', lambda x: f'{x:,.2f}')
# Apply the 'seaborn-v0_8-whitegrid' matplotlib style to subsequent figures.
# NOTE(review): this style name presumably exists only in newer matplotlib
# releases — confirm against the pinned version.
plt.style.use('seaborn-v0_8-whitegrid')

In [None]:
# Read the cleaned listings and keep only the rows usable for modeling.
target = 'price_kes'
features = [
    'bedrooms',
    'bathrooms',
    'size_sqft',
    'amenity_score',
    'has_parking',
    'has_pool',
    'has_gym',
    'has_security',
    'has_garden',
    'is_land',
    'location',
    'property_type',
]
df = pd.read_csv('data/clean_listings.csv')

# Restrict to predictors + target, then discard any row with a missing value
# in one of those columns. The explicit copy keeps later column assignments
# on df_model from touching df.
df_model = df.loc[:, [*features, target]].dropna().copy()
print(f"Records for modeling: {df_model.shape[0]}")

# Integer-encode the two categorical predictors.
# Each column gets its own fitted LabelEncoder (le_location / le_type) so the
# category -> code mapping stays available afterwards through .transform()
# and .inverse_transform(). Values are cast to str before fitting so every
# category is handled as a string label.
# Fix: the source column is 'location' (as listed in `features`); the previous
# 'lacation' key does not exist in df_model and raised KeyError.
le_location = LabelEncoder()
le_type = LabelEncoder()
df_model['location_enc'] = le_location.fit_transform(df_model['location'].astype(str))
df_model['property_type_enc'] = le_type.fit_transform(df_model['property_type'].astype(str))

# Model inputs: the numeric / flag columns plus the two encoded categoricals
# (the raw 'location' and 'property_type' text columns are left out).
feature_cols = [
    'bedrooms',
    'bathrooms',
    'size_sqft',
    'amenity_score',
    'has_parking',
    'has_pool',
    'has_gym',
    'has_security',
    'has_garden',
    'is_land',
    'location_enc',
    'property_type_enc',
]

X, y = df_model[feature_cols], df_model[target]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same
# transformation to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)