In [None]:
# =====================================================
# BASELINE TRAINING ON TABULAR DATA (Training1.py)
# =====================================================
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# ------------------------------------------------------
# Features and log-transformed target
# ------------------------------------------------------
# The model is trained on log1p(target); predictions are mapped back with
# expm1 further down when reporting on the original scale.
X = train_df[TABULAR_FEATURES]
target_raw = train_df[TARGET_COL]
y = np.log1p(target_raw)

# ------------------------------------------------------
# Hold out 20% of the rows for validation (fixed seed)
# ------------------------------------------------------
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# ------------------------------------------------------
# Tabular baseline: XGBoost regressor
# ------------------------------------------------------
tab_xgb_params = dict(
    objective="reg:squarederror",
    n_estimators=400,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
tab_model = XGBRegressor(**tab_xgb_params)
tab_model.fit(X_train, y_train)

# ------------------------------------------------------
# Validation predictions (still in log space)
# ------------------------------------------------------
val_preds_log = tab_model.predict(X_val)

# ------------------------------------------------------
# Log-scale metrics
# ------------------------------------------------------
mse_log = mean_squared_error(y_val, val_preds_log)
rmse_log = np.sqrt(mse_log)
r2_log = r2_score(y_val, val_preds_log)

print(
    f"Tabular Model RMSE (log scale): {rmse_log:.4f}",
    f"Tabular Model R² (log scale):   {r2_log:.4f}",
    sep="\n",
)

# ------------------------------------------------------
# Metrics after undoing the log1p transform
# ------------------------------------------------------
val_preds_price, y_val_price = np.expm1(val_preds_log), np.expm1(y_val)

mse_price = mean_squared_error(y_val_price, val_preds_price)
rmse_price = np.sqrt(mse_price)

print(f"Tabular Model RMSE (price scale): {rmse_price:.2f}")


In [None]:
# =====================================================
# TRAINING ON TABULAR + VISUAL DATA (MULTIMODAL) (Training2.py)
# =====================================================
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# ======================================================
# LOAD CNN IMAGE EMBEDDINGS
# ======================================================
# Embeddings are read from a .npy file on disk.
# NOTE(review): presumably written by an earlier embedding-extraction step,
# with row i belonging to row i of train_df — confirm; only the shape can be
# checked from this cell.
image_embeddings = np.load("/kaggle/working/train_image_embeddings.npy")

print("Image embeddings shape:", image_embeddings.shape)

# ======================================================
# PREPARE TABULAR FEATURES & TARGET
# ======================================================
X_tab = train_df[TABULAR_FEATURES].values
# Train on log1p(target); expm1 below maps predictions back to the original scale.
y = np.log1p(train_df[TARGET_COL])

print("Tabular features shape:", X_tab.shape)
print("Target shape:", y.shape)

# ======================================================
# FUSE TABULAR + IMAGE FEATURES
# ======================================================
# Fail early with an actionable message instead of an opaque concatenate
# error when the embeddings file is stale or was built from a different frame.
if image_embeddings.ndim != 2:
    raise ValueError(
        "Expected a 2-D image embedding matrix (n_samples, n_dims), "
        f"got shape {image_embeddings.shape}"
    )
if image_embeddings.shape[0] != X_tab.shape[0]:
    raise ValueError(
        f"Row count mismatch: {image_embeddings.shape[0]} image embeddings vs "
        f"{X_tab.shape[0]} tabular rows; regenerate the embeddings for the "
        "current train_df before fusing"
    )

# Column-wise concatenation: tabular columns first, embedding dimensions after.
X_fused = np.hstack([X_tab, image_embeddings])

print("Fused feature matrix shape:", X_fused.shape)

# ======================================================
# TRAIN / VALIDATION SPLIT
# ======================================================
# Same test_size/random_state as the tabular baseline cell, so — provided
# train_df is unchanged between the two cells — both models are validated on
# the same rows.
# NOTE: this reuses the names X_train/X_val/y_train/y_val (and the metric
# variables below), so the baseline cell's values are overwritten here.
X_train, X_val, y_train, y_val = train_test_split(
    X_fused,
    y,
    test_size=0.2,
    random_state=42
)

# ======================================================
# MULTIMODAL REGRESSION MODEL
# ======================================================
multi_model = XGBRegressor(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42
)

multi_model.fit(X_train, y_train)

# ======================================================
# EVALUATION (LOG SCALE)
# ======================================================
val_preds_log = multi_model.predict(X_val)

mse_log = mean_squared_error(y_val, val_preds_log)
rmse_log = np.sqrt(mse_log)
r2_log = r2_score(y_val, val_preds_log)

print(f"Multimodal RMSE (log scale): {rmse_log:.4f}")
print(f"Multimodal R² (log scale):   {r2_log:.4f}")

# ======================================================
# EVALUATION ON ORIGINAL PRICE SCALE
# ======================================================
# Invert the log1p transform on both predictions and ground truth.
val_preds_price = np.expm1(val_preds_log)
y_val_price = np.expm1(y_val)

mse_price = mean_squared_error(y_val_price, val_preds_price)
rmse_price = np.sqrt(mse_price)

print(f"Multimodal RMSE (price scale): {rmse_price:.2f}")
