In [None]:
from datasets import load_dataset

# Fetch the dataset and show its split layout followed by the
# column schema of the 'train' split.
dataset = load_dataset("mistag/real-estate-image-dataset.csv")
print(dataset, dataset['train'].features, sep="\n")

In [None]:
import pandas as pd

# Materialise the 'train' split as a pandas DataFrame for tabular work.
train_split = dataset['train']
df = train_split.to_pandas()

# List the available columns, then preview the first rows (rich display).
print(df.columns)
df.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Tabular predictors and the regression target.
features = ['bedrooms', 'bathrooms', 'area']  # Update to match your dataset columns
target = 'price'

# Keep only rows where every predictor, the target and the image are present.
required_cols = [*features, target, 'image']
df = df.dropna(subset=required_cols)

X_tab, y, images = df[features], df[target], df['image']

# A single train_test_split call keeps the tabular rows, the targets and the
# images partitioned identically (80/20, fixed seed).
(
    X_train_tab, X_test_tab,
    y_train, y_test,
    img_train, img_test,
) = train_test_split(X_tab, y, images, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train_tab)
X_train_tab_scaled = scaler.transform(X_train_tab)
X_test_tab_scaled = scaler.transform(X_test_tab)

In [None]:
import torch
from torchvision import models, transforms
from PIL import Image
import numpy as np

# Preprocessing applied to every image before it is fed to the network:
# resize to 224x224, convert to a tensor, then normalise each channel with
# the mean/std values torchvision documents for its ImageNet-pretrained models.
img_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Load ImageNet-pretrained ResNet-18. The `pretrained=True` flag is deprecated
# in torchvision >= 0.13 in favour of the `weights=` argument; IMAGENET1K_V1 is
# the checkpoint the old flag selected. Fall back to the legacy flag when the
# installed torchvision predates the weights enum.
if hasattr(models, "ResNet18_Weights"):
    resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
else:
    resnet = models.resnet18(pretrained=True)

# Drop the last child module (the classification head) so the network emits
# pooled features instead of class logits.
resnet = torch.nn.Sequential(*(list(resnet.children())[:-1]))
# Inference mode: fixes batch-norm statistics for deterministic features.
resnet.eval()

def extract_img_feature(img):
    """Return the pooled ResNet embedding of a single image as a flat array.

    Accepted inputs:
      * ``str`` / ``os.PathLike`` -- a file path opened with PIL.
      * ``bytes`` / ``bytearray`` -- the encoded contents of an image file.
      * ``dict`` with ``'bytes'`` and/or ``'path'`` keys -- the record layout
        Hugging Face ``datasets`` uses for Image columns (presumably what
        ``to_pandas()`` yields for the ``image`` column here -- verify).
        ``'bytes'`` is preferred when it is not ``None``.
      * ``PIL.Image.Image`` -- used directly.
      * anything else is handed to ``Image.fromarray``.

    Every image is converted to RGB before preprocessing so that grayscale or
    RGBA inputs match the three-channel normalisation in ``img_transform``.

    Raises:
        ValueError: if a dict record carries neither bytes nor a path.
        Exception: whatever PIL / torch raise for unreadable input.
    """
    import io
    import os

    if isinstance(img, dict):
        payload = img.get('bytes')
        if payload is None:
            payload = img.get('path')
        if payload is None:
            raise ValueError("image record has neither 'bytes' nor 'path'")
        img = payload

    if isinstance(img, (bytes, bytearray)):
        img = Image.open(io.BytesIO(img))
    elif isinstance(img, (str, os.PathLike)):
        img = Image.open(img)
    elif not isinstance(img, Image.Image):
        img = Image.fromarray(img)

    # Normalise the colour mode regardless of where the image came from.
    img = img.convert('RGB')

    # Add a leading batch dimension of size 1 for the network.
    img_t = img_transform(img).unsqueeze(0)
    with torch.no_grad():  # inference only; no autograd bookkeeping needed
        feat = resnet(img_t)
    return feat.flatten().numpy()

def batch_img_features(img_series, feat_dim=512):
    """Embed every image in ``img_series`` and stack the results row-wise.

    Extraction is best-effort: an image whose embedding raises is replaced by
    a zero vector of length ``feat_dim`` so the output stays row-aligned with
    the input. Unlike a silent fallback, the number of failures (and the first
    error seen) is reported through a single ``warnings.warn`` call, so a
    systematic problem does not go unnoticed.

    Args:
        img_series: iterable of images in any form ``extract_img_feature``
            accepts.
        feat_dim: length of the zero placeholder vector; should equal the
            embedding length produced by ``extract_img_feature``.

    Returns:
        2-D ``numpy.ndarray`` with one row per input image. For empty input
        the shape is ``(0, feat_dim)`` so that column-wise concatenation with
        other feature matrices still works.
    """
    import warnings

    feats = []
    n_failed = 0
    first_error = None
    for img in img_series:
        try:
            feats.append(extract_img_feature(img))
        except Exception as exc:  # deliberate best-effort; reported below
            n_failed += 1
            if first_error is None:
                first_error = exc
            feats.append(np.zeros(feat_dim))

    if n_failed:
        warnings.warn(
            f"{n_failed} of {len(feats)} images could not be embedded and were "
            f"replaced by zero vectors (first error: {first_error!r})",
            RuntimeWarning,
        )

    if not feats:
        # np.array([]) would be 1-D and break axis=1 concatenation downstream.
        return np.zeros((0, feat_dim))
    return np.array(feats)

# Embed the image column of each partition (train first, then test); rows stay
# in the same order as the corresponding tabular partitions.
X_train_img_feats, X_test_img_feats = map(batch_img_features, (img_train, img_test))

In [None]:
# Build the final design matrices: scaled tabular columns on the left,
# image embedding columns on the right, joined along the column axis.
X_train_all, X_test_all = (
    np.concatenate(parts, axis=1)
    for parts in (
        [X_train_tab_scaled, X_train_img_feats],
        [X_test_tab_scaled, X_test_img_feats],
    )
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Fit a 100-tree random forest (fixed seed) on the combined features.
# `fit` returns the estimator itself, so construction and training chain.
reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_all, y_train)

# Score the held-out partition.
y_pred = reg.predict(X_test_all)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # root of the MSE

# Report both error metrics, one per line.
print(f"MAE: {mae:.2f}\nRMSE: {rmse:.2f}")

In [None]:
import joblib
# The regressor was trained on StandardScaler-transformed tabular columns, so
# the fitted scaler must be persisted as well; without it the saved model
# cannot be given correctly scaled inputs at inference time.
joblib.dump(scaler, "house_price_scaler.joblib")
# Persist the trained regressor (kept last so the cell output is unchanged).
joblib.dump(reg, "house_price_rf.joblib")

In [None]:
import matplotlib.pyplot as plt
# Scatter of predicted against actual prices on a square canvas.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')
ax.set_title('Predicted vs Actual Prices')

# Dashed red diagonal spanning the observed price range: points on this line
# are perfect predictions.
price_span = [y_test.min(), y_test.max()]
ax.plot(price_span, price_span, 'r--')
plt.show()