In [4]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.20.0-cp313-cp313-win_amd64.whl.metadata (4.6 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.12.19-py2.py3-none-any.whl.metadata (1.0 kB)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.7.0-py3-none-any.whl.metadata (1.5 kB)
Collecting google_pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-win_amd64.whl.metadata (5.3 kB)
Collecting opt_einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting termcolor>=1.1.0 (from tensorflow)
  Downloading termcolor-3.3.0

In [2]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.2-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   - -------------------------------------- 2.6/72.0 MB 25.9 MB/s eta 0:00:03
   ---- ----------------------------------- 8.1/72.0 MB 27.3 MB/s eta 0:00:03
   ------- -------------------------------- 12.8/72.0 MB 25.0 MB/s eta 0:00:03
   ---------- ----------------------------- 18.6/72.0 MB 25.5 MB/s eta 0:00:03
   ------------- -------------------------- 24.4/72.0 MB 25.8 MB/s eta 0:00:02
   ---------------- ----------------------- 29.4/72.0 MB 25.8 MB/s eta 0:00:02
   ------------------- -------------------- 34.3/72.0 MB 25.1 MB/s eta 0:00:02
   ---------------------- ----------------- 40.9/72.0 MB 25.8 MB/s eta 0:00:02
   ------------------------- -------------- 45.9/72.0 MB 25.4 MB/s eta 0:00:02
   ---------------------------- ----------- 50.9/72.0 MB 25.5 MB/s eta 0:0

In [1]:
import numpy as np
import os
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tqdm import tqdm
import pandas as pd
from xgboost import XGBRegressor

  if not hasattr(np, "object"):


In [2]:
# Locations of the training table and of the per-listing image files.
train_image_dir = "data/images/train/"
train_path = "data/train.csv"

# Training table; its "id" column is what the image file names are built from.
train = pd.read_csv(train_path)

In [2]:
# ImageNet-pretrained ResNet50 used purely as a feature extractor: the
# classification head is dropped (include_top=False) and the final feature map
# is average-pooled into one vector per image (the saved embedding matrix has
# 2048 columns).
model = ResNet50(weights="imagenet", include_top=False, pooling="avg")

In [3]:
def get_embedding(img_path):
    """Return the ResNet50 embedding of a single image file.

    The image is loaded at 224x224, converted to an array, given a leading
    batch axis, and run through ``preprocess_input`` before being passed to
    the module-level ``model``.

    Args:
        img_path: Path of the image file to embed.

    Returns:
        A 1-D NumPy array: the model output for the one-image batch.
    """
    img = load_img(img_path, target_size=(224, 224))
    batch = np.expand_dims(img_to_array(img), axis=0)
    batch = preprocess_input(batch)
    # Call the model directly instead of Model.predict: predict() is built for
    # large batched inputs and adds per-call overhead when it is invoked once
    # per image inside a loop. training=False keeps layers such as
    # BatchNormalization in inference mode.
    features = model(batch, training=False)
    return np.asarray(features)[0]

In [4]:
import os

# Every other cell reads and writes under the relative "data/" directory, so
# create that one rather than "/data" at the filesystem root.
os.makedirs("data", exist_ok=True)

In [5]:
# File the embedding matrix is persisted to by the np.save cell further down.
train_emb_cache = "data/train_embeddings.npy"

# Ids whose image exists on disk, in the order (and with any repeats) of
# train["id"], so that row i of the embedding matrix belongs to valid_ids[i].
valid_ids = []
image_paths = []
for pid in train["id"]:
    path = os.path.join(train_image_dir, f"{pid}.png")
    if os.path.exists(path):
        valid_ids.append(pid)
        image_paths.append(path)

# Reuse previously saved embeddings when they have exactly one row per image
# found above; the recorded full extraction took about 2h20m.
train_embeddings = None
if os.path.exists(train_emb_cache):
    cached = np.load(train_emb_cache)
    if cached.shape[0] == len(valid_ids):
        train_embeddings = cached

if train_embeddings is None:
    train_embeddings = np.array([get_embedding(p) for p in tqdm(image_paths)])



100%|██████████| 16209/16209 [2:21:23<00:00,  1.91it/s]  


In [25]:
# Persist the training embedding matrix so later sessions can np.load it
# instead of re-running the extraction loop.
np.save("data/train_embeddings.npy", train_embeddings)

In [3]:
# Reload the saved embeddings and confirm the matrix shape
# (one row per embedded image, one column per ResNet50 feature).
train_embeddings = np.load("data/train_embeddings.npy")
print(train_embeddings.shape) 

(16209, 2048)


In [4]:
import os
# Sanity check: show what is currently stored under the relative data/ folder.
print(os.listdir("data"))

['.ipynb_checkpoints', 'images', 'test.csv', 'test_embeddings.npy', 'train.csv', 'train_embeddings.npy']


In [5]:
# Display (n_images, n_features) of the loaded embedding matrix.
train_embeddings.shape

(16209, 2048)

In [6]:
import os

# Rebuild the list of ids that have an image on disk, in train["id"] order.
# os.path.join is used (as in the extraction loop) because train_image_dir
# already ends with a separator and plain string concatenation would produce
# a doubled slash.
valid_ids = [
    pid
    for pid in train["id"]
    if os.path.exists(os.path.join(train_image_dir, f"{pid}.png"))
]

# The two numbers must be equal for tabular rows and embeddings to line up.
len(valid_ids), train_embeddings.shape[0]

(16209, 16209)

In [7]:
# Image feature matrix used for modelling (same file as train_embeddings).
img_feat = np.load("data/train_embeddings.npy")

In [8]:
# Rows of `train` that have an image, kept in file order. valid_ids was built
# by walking train["id"] in that same order, so selecting rows with a mask
# keeps every row's own features and price. The previous
# drop_duplicates + .loc[valid_ids] approach gave every repeated id the values
# of its first occurrence.
train2 = train[train["id"].isin(valid_ids)].set_index("id")

rows_match = len(train2) == len(valid_ids) and bool((train2.index == valid_ids).all())
if not rows_match:
    # valid_ids is not in table order: fall back to label-based realignment,
    # which needs a unique index (first occurrence of each id is kept).
    train2 = train2[~train2.index.duplicated(keep="first")].loc[valid_ids]

In [9]:
# Tabular columns fed to the models, in the order they appear in the
# design matrix.
features = [
    "bedrooms",
    "bathrooms",
    "sqft_living",
    "sqft_lot",
    "floors",
    "waterfront",
    "view",
    "condition",
    "grade",
    "lat",
    "long",
]

In [10]:
from sklearn.preprocessing import StandardScaler

# Tabular design matrix and regression target, in the row order of train2.
X_tab = train2[features].values
y = train2["price"].values

# NOTE(review): the scaler is fit on every row here, before the
# train/validation split performed in a later cell, so validation rows
# contribute to the fitted mean/std. Consider fitting on the training part only.
scaler = StandardScaler()
X_tab_scaled = scaler.fit_transform(X_tab)

In [11]:
from sklearn.decomposition import PCA

# Compress the image embeddings to 256 principal components.
# NOTE(review): like the scaler, the PCA is fit on all rows before the
# train/validation split done in a later cell.
pca = PCA(n_components=256, random_state=42)
img_feat_pca = pca.fit_transform(img_feat)

# Fraction of the embedding variance kept by the 256 components.
print("PCA variance retained:", pca.explained_variance_ratio_.sum())

PCA variance retained: 0.9018703


In [12]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split. Passing the tabular features, the PCA image features
# and the target in one call applies the same shuffle to all three, so the
# rows stay aligned across the returned arrays.
X_tab_train, X_tab_val, img_train, img_val, y_train, y_val = train_test_split(
    X_tab_scaled, img_feat_pca, y, test_size=0.2, random_state=42
)

In [13]:
# Fusion inputs: tabular columns followed by the image columns, joined
# column-wise for the training and validation parts respectively.
X_fusion_train = np.concatenate((X_tab_train, img_train), axis=1)
X_fusion_val = np.concatenate((X_tab_val, img_val), axis=1)

In [14]:
# Aliases for the tabular-only baseline; they refer to the same arrays as
# X_tab_train / X_tab_val at the time this cell runs.
X_tab_train_only = X_tab_train
X_tab_val_only   = X_tab_val

In [15]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Hyperparameters of the tabular-only baseline model.
tab_params = {
    "n_estimators": 900,
    "max_depth": 7,
    "learning_rate": 0.03,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "reg:squarederror",
    "random_state": 42,
    "n_jobs": -1,
}
xgb_tab = XGBRegressor(**tab_params)
xgb_tab.fit(X_tab_train_only, y_train)

# Score the baseline on the held-out validation rows.
pred_tab = xgb_tab.predict(X_tab_val_only)
mse_tab = mean_squared_error(y_val, pred_tab)
rmse_tab = np.sqrt(mse_tab)
r2_tab = r2_score(y_val, pred_tab)

print("XGB Tabular RMSE:", rmse_tab)
print("XGB Tabular R2:", r2_tab)

XGB Tabular RMSE: 117049.87627503072
XGB Tabular R2: 0.8907175064086914


In [16]:
# Compare ids as strings on both sides so the membership test below cannot
# miss because of a dtype mismatch.
train["id"] = train["id"].astype(str)
valid_ids = [str(v) for v in valid_ids]

# Load from the relative data/ folder the embeddings were saved to; the
# previous "/data/..." path pointed at the filesystem root instead.
img_feat = np.load("data/train_embeddings.npy")

# Rows of `train` that have an image, kept in file order so each row keeps its
# own features and price even when an id occurs more than once.
train2 = train[train["id"].isin(valid_ids)].set_index("id")

rows_match = len(train2) == len(valid_ids) and bool((train2.index == valid_ids).all())
if not rows_match:
    # valid_ids is not in table order: fall back to label-based realignment,
    # which needs a unique index (first occurrence of each id is kept).
    train2 = train2[~train2.index.duplicated(keep="first")].loc[valid_ids]

X_tab = train2[features].values
y = train2["price"].values

# Both counts must agree for tabular rows and image rows to be paired.
print("Tabular rows:", X_tab.shape[0])
print("Image rows:  ", img_feat.shape[0])

Tabular rows: 16209
Image rows:   16209


In [17]:
from sklearn.preprocessing import StandardScaler

# Refit the scaler on the rebuilt X_tab; this rebinds `scaler`, which the
# prediction cell later uses to transform the test features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_tab)

# NOTE(review): this rebinds X_tab_train / X_tab_val / img_train / img_val /
# y_train / y_val. Here the image part is the raw `img_feat`, not the PCA
# features, while X_fusion_train / X_fusion_val were built in an earlier cell
# and are not rebuilt here. train_test_split itself is imported in an earlier cell.
X_tab_train, X_tab_val, img_train, img_val, y_train, y_val = train_test_split(
    X_scaled, img_feat, y, test_size=0.2, random_state=42
)

In [18]:
# Folder holding the test images, named "<id>.png" like the training ones.
test_image_dir = "data/images/test/"

In [16]:
test_path = "data/test.csv"
test = pd.read_csv(test_path)

# File the test embedding matrix is persisted to by the np.save cell below.
test_emb_cache = "data/test_embeddings.npy"

# Ids whose image exists on disk, in test["id"] order, so that row i of the
# embedding matrix belongs to test_ids[i].
test_ids = []
test_image_paths = []
for pid in test["id"]:
    path = os.path.join(test_image_dir, f"{pid}.png")
    if os.path.exists(path):
        test_ids.append(pid)
        test_image_paths.append(path)

# Reuse previously saved embeddings when they have exactly one row per image
# found above; the recorded full extraction took about 57 minutes.
test_embeddings = None
if os.path.exists(test_emb_cache):
    cached = np.load(test_emb_cache)
    if cached.shape[0] == len(test_ids):
        test_embeddings = cached

if test_embeddings is None:
    test_embeddings = np.array([get_embedding(p) for p in tqdm(test_image_paths)])


100%|██████████| 5404/5404 [56:45<00:00,  1.59it/s] 


In [27]:
# Persist the test embedding matrix next to the training one.
np.save("data/test_embeddings.npy", test_embeddings)

In [28]:
# Confirm that both embedding files are now present under data/.
print(os.listdir("data"))

['.ipynb_checkpoints', 'images', 'test.csv', 'test_embeddings.npy', 'train.csv', 'train_embeddings.npy']


In [19]:

# Hyperparameters of the fusion model (same values as the tabular baseline).
fusion_params = {
    "n_estimators": 900,
    "max_depth": 7,
    "learning_rate": 0.03,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "reg:squarederror",
    "random_state": 42,
    "n_jobs": -1,
}
xgb_fusion = XGBRegressor(**fusion_params)

# X_fusion_train / X_fusion_val come from the cell that stacks the tabular and
# image columns; mean_squared_error and r2_score are imported in the
# tabular-model cell.
xgb_fusion.fit(X_fusion_train, y_train)

# Score the fusion model on the held-out validation rows.
pred_fusion = xgb_fusion.predict(X_fusion_val)
mse_fusion = mean_squared_error(y_val, pred_fusion)
rmse_fusion = np.sqrt(mse_fusion)
r2_fusion = r2_score(y_val, pred_fusion)

print("XGB Fusion RMSE:", rmse_fusion)
print("XGB Fusion R2:", r2_fusion)

XGB Fusion RMSE: 126529.65502205402
XGB Fusion R2: 0.8722992539405823


In [20]:
# Load the test table (the same file is also read in the test-embedding cell).
test_path = "data/test.csv"
test = pd.read_csv(test_path)

In [21]:
# Scale the test features with the scaler fitted on the training table.
X_test = test[features].values
X_test_scaled = scaler.transform(X_test)

# Predictions come from the tabular-only model, which had the lower
# validation RMSE in the recorded run.
test_preds = xgb_tab.predict(X_test_scaled)

# One row per test listing: its id and the predicted price.
submission = test[["id"]].assign(predicted_price=test_preds)

submission.to_csv("predictions.csv", index=False)
print("Saved predictions.csv")

Saved predictions.csv
