In [2]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split


In [3]:
# Read the CSV into `df`, then print its shape and its first rows.
features_csv = "dependables/feature_CNN.csv"
df = pd.read_csv(features_csv)

print("Dataset shape:", df.shape)
print(df.head())


Dataset shape: (16209, 149)
    price  bedrooms  bathrooms  sqft_living  sqft_lot  floors  waterfront  \
0  268643         4       2.25         1810      9240     2.0           0   
1  245000         3       2.50         1600      2788     2.0           0   
2  200000         4       2.50         1720      8638     2.0           0   
3  352499         2       2.25         1240       705     2.0           0   
4  232000         3       2.00         1280     13356     1.0           0   

   view  condition  grade  ...   cnn_118   cnn_119       cnn_120   cnn_121  \
0     0          3      7  ...  0.000028  0.000043  7.063262e-07  0.008919   
1     0          4      7  ...  0.000171  0.000112  1.084903e-06  0.008133   
2     0          3      8  ...  0.000106  0.000489  0.000000e+00  0.003679   
3     0          3      7  ...  0.000230  0.000027  0.000000e+00  0.027600   
4     0          3      7  ...  0.000231  0.000888  2.108937e-06  0.007695   

   cnn_122  cnn_123   cnn_124   cnn_125 

In [4]:
# Collect, in column order, the names of all columns carrying the "cnn_" prefix.
is_cnn = df.columns.str.startswith("cnn_")
cnn_cols = df.columns[is_cnn].tolist()

print("Number of CNN features:", len(cnn_cols))
print("Sample CNN columns:", cnn_cols[:5])


Number of CNN features: 128
Sample CNN columns: ['cnn_0', 'cnn_1', 'cnn_2', 'cnn_3', 'cnn_4']


In [5]:
# The target is the `price` column; `price` and `imageid` are excluded from
# the feature matrix (errors="ignore" skips either name if it is absent).
y = df["price"]
non_feature_cols = ["price", "imageid"]
X = df.drop(columns=non_feature_cols, errors="ignore")

print("Total features:", X.shape[1])

# Hold out 20% of the rows for validation; the fixed seed makes the split
# repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

Total features: 147


In [6]:
# Hyperparameters for the regressor trained on the full feature matrix
# (tabular columns plus the cnn_* columns).
xgb_params = dict(
    objective="reg:squarederror",
    n_estimators=600,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)
model = xgb.XGBRegressor(**xgb_params)

# Fit on the training split. Leaving the call as the cell's last expression
# keeps its return value as the cell output.
model.fit(X_train, y_train)


In [7]:
# Score the model trained on all features against the validation split.
y_pred = model.predict(X_val)

mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as `price`
r2 = r2_score(y_val, y_pred)

print("CNN + Tabular R²:", r2)
print("CNN + Tabular RMSE:", rmse)


CNN + Tabular R²: 0.8886303305625916
CNN + Tabular RMSE: 118218.53770031162


In [8]:
# Ablation baseline: train the same regressor configuration on the feature
# matrix with the cnn_* columns removed, to measure what those columns add.
X_tab = X.drop(columns=cnn_cols)

# Same test_size and random_state as the split used for the combined model,
# applied to a frame with the same rows, so both models are scored on the
# same validation rows.
X_train_t, X_val_t, y_train_t, y_val_t = train_test_split(
    X_tab,
    y,
    test_size=0.2,
    random_state=42
)

# Hyperparameters intentionally mirror the combined model so that the feature
# set is the only difference between the two runs.
model_tab = xgb.XGBRegressor(
    n_estimators=600,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1
)

model_tab.fit(X_train_t, y_train_t)

y_pred_tab = model_tab.predict(X_val_t)

# Report both metrics that are reported for the combined model, so the two
# runs can be compared on R² and on RMSE rather than on R² alone.
r2_tab = r2_score(y_val_t, y_pred_tab)
rmse_tab = np.sqrt(mean_squared_error(y_val_t, y_pred_tab))

print("Tabular-only R²:", r2_tab)
print("Tabular-only RMSE:", rmse_tab)


Tabular-only R²: 0.8963997960090637


In [9]:
# Persist the fitted combined (CNN + tabular) model with joblib; `joblib` is
# imported in the notebook's top import cell.
# NOTE(review): XGBoost's Model IO guide recommends save_model() for storage
# that must survive library upgrades. The pickle format is kept here because
# the consumers of this file are not visible from this notebook.
joblib.dump(model, "xgb_cnn_tabular.pkl")


['xgb_cnn_tabular.pkl']

In [10]:
# Rank features by the fitted combined model's importance scores, labelled
# with the feature-matrix column names. pandas is already imported in the
# notebook's first cell, so it is not re-imported here.
importances = pd.Series(
    model.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

print(importances.head(15))


grade            0.243388
sqft_living      0.091571
waterfront       0.081800
lat              0.050086
long             0.034101
cnn_82           0.033431
view             0.031881
bathrooms        0.020224
sqft_living15    0.016747
cnn_71           0.013572
cnn_25           0.013504
cnn_61           0.011457
cnn_37           0.011130
cnn_10           0.010825
zipcode          0.010421
dtype: float32


In [11]:
# Build a per-row report for the validation split: the feature columns plus
# the actual and the predicted price.
pred_df = X_val.copy()
pred_df["actual_price"] = y_val.values  # .values assigns positionally, without index alignment
pred_df["predicted_price"] = model.predict(X_val)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
predictions_path = Path("data/XGB_CNN_Combined predictions.csv")
predictions_path.parent.mkdir(parents=True, exist_ok=True)
pred_df.to_csv(predictions_path, index=False)
