In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import joblib

# 1. Load & Prepare Data
df = pd.read_csv('lagos_houses_prices.csv')

# Keep only listings whose price lies in [10M, 2B] (both bounds inclusive)
# and that report at least one bedroom. `.copy()` detaches the result from
# `df` so later column work never touches the raw frame.
price_in_range = df['cleaned_price'].between(10_000_000, 2_000_000_000)
has_bedroom = df['Bedrooms'] >= 1
df_clean = df[price_in_range & has_bedroom].copy()

# Log Transform Target
# The models are trained on log1p(price); predictions are mapped back with
# expm1 at evaluation time.
y_log = np.log1p(df_clean['cleaned_price'])

# Drop the target and the columns that are not used as model inputs.
# errors='ignore' tolerates any of these columns being absent from the CSV.
non_feature_cols = ['Title', 'Location', 'Full_Location', 'cleaned_price', 'lat', 'lon']
features = df_clean.drop(columns=non_feature_cols, errors='ignore')

# One-hot encode the property type; drop_first removes one level so the
# dummy columns are not perfectly collinear.
X = pd.get_dummies(features, columns=['Prop_Type'], drop_first=True)

# Split
# 80/20 hold-out with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)

# THE CONTESTANTS

# Settings shared by both tree ensembles: same number of trees, same seed.
ensemble_kwargs = {"n_estimators": 200, "random_state": 42}

# Candidate regressors. The keys double as the labels printed in the results
# table, and dict insertion order is the order in which models are evaluated.
models = {
    "1. Linear Regression": LinearRegression(),
    "2. Random Forest": RandomForestRegressor(n_jobs=-1, **ensemble_kwargs),
    "3. XGBoost (Gradient Boost)": XGBRegressor(learning_rate=0.1, **ensemble_kwargs),
}

print(f"{'Model Name':<30} | {'R² Score':<10} | {'MAE (Error)'}")
print("-" * 65)

best_model_name = ""
best_score = -np.inf

# The held-out targets are the same for every model, so convert them from
# log scale back to Naira once instead of on every iteration.
y_test_naira = np.expm1(y_test)

# NOTE(review): the winner is chosen by its R² on the same test split that is
# reported, so the winning score is an optimistic estimate. Consider a
# separate validation split or cross-validation for model selection.
for name, model in models.items():
    # Train (target is on the log1p scale)
    model.fit(X_train, y_train)

    # Predict (Log Scale)
    preds_log = model.predict(X_test)

    # Convert back to Naira so both metrics are in currency units
    preds_naira = np.expm1(preds_log)

    # Evaluate
    r2 = r2_score(y_test_naira, preds_naira)
    mae = mean_absolute_error(y_test_naira, preds_naira)

    # Pad R² via the format spec (width 10, matching the header column) so the
    # table stays aligned even when R² is negative.
    print(f"{name:<30} | {r2:<10.4f} | ₦{mae:,.0f}")

    # Strict '>' means the earliest model wins a tie.
    if r2 > best_score:
        best_score = r2
        best_model_name = name

print("-" * 65)
print(f"WINNER: {best_model_name}")



# 1. SAVE WINNING MODEL
# We grab the actual model object that won. It was fitted on log1p(price), so
# code that loads it must apply np.expm1 to its predictions to get Naira.
best_model_object = models[best_model_name]

print(f"Saving {best_model_name} to 'lagos_model.pkl'...")
joblib.dump(best_model_object, 'lagos_model.pkl')

# 2. SAVE FEATURE COLUMNS
# Persist the post-get_dummies column layout so inference code can rebuild a
# frame with the same columns in the same order.
# NOTE(review): this pickles a pandas Index; loading it requires pandas.
# Confirm with the consumer before switching to a plain list.
joblib.dump(X.columns, 'model_features.pkl')

# 3. SAVE METRICS TO CSV
# The true prices do not depend on the model: convert them from log scale
# once, outside the loop.
y_test_naira = np.expm1(y_test)

metrics_list = []
for name, model in models.items():
    # Quick re-eval of the already-fitted model to grab numbers for the CSV
    p_log = model.predict(X_test)
    p_naira = np.expm1(p_log)

    metrics_list.append({
        "Model": name,
        "R2": r2_score(y_test_naira, p_naira),
        "MAE": mean_absolute_error(y_test_naira, p_naira)
    })

pd.DataFrame(metrics_list).to_csv('model_metrics.csv', index=False)

Model Name                     | R² Score   | MAE (Error)
-----------------------------------------------------------------
1. Linear Regression           | 0.2987     | ₦179,035,947
2. Random Forest               | 0.3900     | ₦160,339,389
3. XGBoost (Gradient Boost)    | 0.3760     | ₦161,019,309
-----------------------------------------------------------------
WINNER: 2. Random Forest
-----------------------------------------------------------------
WINNER: 2. Random Forest
Saving 2. Random Forest to 'lagos_model.pkl'...
