In [4]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.20.0-cp313-cp313-win_amd64.whl.metadata (4.6 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.12.19-py2.py3-none-any.whl.metadata (1.0 kB)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.7.0-py3-none-any.whl.metadata (1.5 kB)
Collecting google_pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-win_amd64.whl.metadata (5.3 kB)
Collecting opt_einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting termcolor>=1.1.0 (from tensorflow)
  Downloading termcolor-3.3.0

In [1]:
import numpy as np
import os
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tqdm import tqdm
import pandas as pd

train_path = "data/train.csv"
train = pd.read_csv(train_path)
train_image_dir = "data/images/train/"

  if not hasattr(np, "object"):


In [2]:
model = ResNet50(weights="imagenet", include_top=False, pooling="avg")

In [3]:
def get_embedding(img_path):
    img = load_img(img_path, target_size=(224,224))
    img = img_to_array(img)
    img = np.expand_dims(img, axis=0)
    img = preprocess_input(img)
    return model.predict(img, verbose=0)[0]

In [4]:
import os
os.makedirs("/data", exist_ok=True)

In [5]:
train_embeddings = []
valid_ids = []

for pid in tqdm(train["id"]):
    path = os.path.join(train_image_dir, f"{pid}.png")
    if os.path.exists(path):
        emb = get_embedding(path)
        train_embeddings.append(emb)
        valid_ids.append(pid)

train_embeddings = np.array(train_embeddings)



100%|██████████| 16209/16209 [2:21:23<00:00,  1.91it/s]  


In [25]:
np.save("data/train_embeddings.npy", train_embeddings)

In [26]:
import os
print(os.listdir("data"))

['.ipynb_checkpoints', 'images', 'test.csv', 'train.csv', 'train_embeddings.npy']


In [8]:
train_embeddings.shape

(16209, 2048)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

features = [
 "bedrooms","bathrooms","sqft_living","sqft_lot",
 "floors","waterfront","view","condition","grade"
]

train2 = train[train["id"].isin(valid_ids)]
X = train2[features]
y = train2["price"]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42)

rf = RandomForestRegressor()
rf.fit(X_train, y_train)

pred = rf.predict(X_val)

mse = mean_squared_error(y_val, pred)
rmse = np.sqrt(mse)

print("Baseline RMSE =", rmse)
print("Baseline R2   =", r2_score(y_val,pred))

Baseline RMSE = 200856.20334557572
Baseline R2   = 0.6785109475215683


In [11]:
train["id"] = train["id"].astype(str)
valid_ids = [str(v) for v in valid_ids]

img_feat = np.load("/data/train_embeddings.npy")

train2 = train[train["id"].isin(valid_ids)].copy()

train2 = train2.drop_duplicates(subset="id")
train2 = train2.set_index("id").loc[valid_ids]

X_tab = train2[features].values
y = train2["price"].values

print("Tabular rows:", X_tab.shape[0])
print("Image rows:  ", img_feat.shape[0])

Tabular rows: 16209
Image rows:   16209


In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_tab)

X_tab_train, X_tab_val, img_train, img_val, y_train, y_val = train_test_split(
    X_scaled, img_feat, y, test_size=0.2, random_state=42
)

In [13]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Concatenate
from tensorflow.keras.models import Model

tab_input = Input(shape=(len(features),))
img_input = Input(shape=(2048,))

t = Dense(64, activation="relu")(tab_input)
i = Dense(256, activation="relu")(img_input)

combined = Concatenate()([t,i])
x = Dense(128, activation="relu")(combined)
x = Dense(64, activation="relu")(x)

output = Dense(1)(x)

fusion_model = Model(inputs=[tab_input,img_input], outputs=output)
fusion_model.compile(optimizer="adam", loss="mse", metrics=["mae"])

fusion_model.summary()

In [14]:
fusion_model.fit(
    [X_tab_train, img_train],
    y_train,
    validation_data=([X_tab_val,img_val], y_val),
    epochs=20,
    batch_size=32
)

Epoch 1/20
[1m406/406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 14ms/step - loss: 198546767872.0000 - mae: 300114.5938 - val_loss: 120575721472.0000 - val_mae: 223274.2812
Epoch 2/20
[1m406/406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 15ms/step - loss: 117048033280.0000 - mae: 213570.2969 - val_loss: 101990817792.0000 - val_mae: 202426.6562
Epoch 3/20
[1m406/406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - loss: 95754862592.0000 - mae: 187250.1875 - val_loss: 79404974080.0000 - val_mae: 179149.9375
Epoch 4/20
[1m406/406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - loss: 70547963904.0000 - mae: 159568.2344 - val_loss: 55618551808.0000 - val_mae: 152620.3594
Epoch 5/20
[1m406/406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - loss: 52564905984.0000 - mae: 141813.4844 - val_loss: 46252412928.0000 - val_mae: 140636.1875
Epoch 6/20
[1m406/406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m

<keras.src.callbacks.history.History at 0x199cef05010>

In [15]:
test_image_dir = "data/images/test/"

In [16]:
test_path = "data/test.csv"
test = pd.read_csv(test_path)

test_embeddings = []
test_ids = []

for pid in tqdm(test["id"]):
    path = os.path.join(test_image_dir, f"{pid}.png")
    if os.path.exists(path):
        emb = get_embedding(path)
        test_embeddings.append(emb)
        test_ids.append(pid)

test_embeddings = np.array(test_embeddings)


100%|██████████| 5404/5404 [56:45<00:00,  1.59it/s] 


In [27]:
np.save("data/test_embeddings.npy", test_embeddings)

In [28]:
print(os.listdir("data"))

['.ipynb_checkpoints', 'images', 'test.csv', 'test_embeddings.npy', 'train.csv', 'train_embeddings.npy']


In [18]:
test2 = test[test["id"].isin(test_ids)]
test_tab = test2[features].values
test_img = test_embeddings

pred_test = fusion_model.predict([test_tab,test_img])

[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


In [19]:
fusion_val_pred = fusion_model.predict([X_tab_val, img_val]).flatten()

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [20]:
baseline_mse = mean_squared_error(y_val, pred)
baseline_rmse = np.sqrt(baseline_mse)
baseline_r2 = r2_score(y_val, pred)

fusion_mse = mean_squared_error(y_val, fusion_val_pred)
fusion_rmse = np.sqrt(fusion_mse)
fusion_r2 = r2_score(y_val, fusion_val_pred)

results = pd.DataFrame({
    "Model": ["Baseline (Tabular Only)", "Fusion (Tabular + Images)"],
    "RMSE": [baseline_rmse, fusion_rmse],
    "R2 Score": [baseline_r2, fusion_r2]
})

print(results)

                       Model           RMSE  R2 Score
0    Baseline (Tabular Only)  200380.100754  0.679729
1  Fusion (Tabular + Images)  188038.948816  0.717964


In [29]:
import numpy as np
test_embeddings = np.load("data/test_embeddings.npy")

In [30]:
test["id"] = test["id"].astype(str)
test_ids = list(test["id"])

test2 = test.drop_duplicates(subset="id").set_index("id").loc[test_ids]

test_tab = test2[features].values
test_img = test_embeddings

In [31]:
pred_test = fusion_model.predict([test_tab, test_img]).flatten()

[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


In [32]:
os.makedirs("outputs", exist_ok=True)

submission = pd.DataFrame({
    "id": test_ids,
    "predicted_price": pred_test
})

submission.to_csv("outputs/final_submission.csv", index=False)
print("FINAL SUBMISSION SAVED")

FINAL SUBMISSION SAVED
