# Make Final Test Predictions

This notebook generates the final predictions for the test set. It applies the same tabular preprocessing pipeline used elsewhere in the project, loads the 4-band satellite image for each house, and runs the trained fusion model, which predicts a log-residual correction to the XGBoost baseline price. The final price is computed as `xg_boost_price * exp(predicted_log_residual)` and written to the submission file `final_submission.csv`.


In [2]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms 
import rasterio
from tqdm import tqdm

# --- CONFIG ---
# Inputs: tabular test features, XGBoost baseline predictions, image folder, checkpoint.
TEST_TABULAR_PATH = "test_tabular.csv"
XGB_TEST_PATH = "xg_boost_test.csv"
# NOTE: double check whether this should be 'test_640' or 'train_640' for your folder structure.
IMG_DIR = "naip_images/test_640"
MODEL_PATH = "sota_fusion_best.pth"

# Output: CSV written at the end of the run.
OUTPUT_PATH = "final_submission.csv"

# Runtime: inference batch size and compute device (GPU when available, else CPU).
BATCH_SIZE = 128
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# ==========================================
# 1. PREPROCESSING (FIXED: Drops raw cols)
# ==========================================
def preprocess_tabular_features(df, ref_year=2025):
    """Derive date/age features and drop the raw year columns.

    The input frame is never modified; a transformed copy is returned.

    Derived columns (each only when its source column is present):
      * ``year_sold`` / ``month_sold`` / ``day_sold`` from ``date``
        (``date`` itself is kept, converted to datetime).
      * ``house_age`` = ``ref_year - yr_built``.
      * ``was_renovated`` = 1 where ``yr_renovated > 0``, else 0.
      * ``years_since_update`` = ``ref_year`` minus the renovation year, or
        minus ``yr_built`` for rows that were never renovated. Needs both
        ``yr_renovated`` and ``yr_built``.

    ``yr_built`` and ``yr_renovated`` are dropped afterwards so the feature
    count matches the training dimensions.

    Args:
        df: Tabular frame, one row per house.
        ref_year: Reference year for the age features. Keep the default
            (2025) unless the model was trained with a different value.

    Returns:
        A new DataFrame with the derived columns added and the raw year
        columns removed.
    """
    df = df.copy()

    # 1. Date handling
    if 'date' in df.columns:
        df['date'] = pd.to_datetime(df['date'])
        df['year_sold'] = df['date'].dt.year
        df['month_sold'] = df['date'].dt.month
        df['day_sold'] = df['date'].dt.day

    # 2. Feature engineering
    has_built = 'yr_built' in df.columns
    if has_built:
        df['house_age'] = ref_year - df['yr_built']

    if 'yr_renovated' in df.columns:
        # One mask drives both features, so a missing (NaN) or non-positive
        # renovation year is consistently treated as "never renovated".
        renovated = df['yr_renovated'] > 0
        df['was_renovated'] = renovated.astype(int)

        # The fallback needs yr_built; skip the feature rather than raising
        # KeyError, mirroring how house_age is skipped when yr_built is absent.
        if has_built:
            last_update = df['yr_renovated'].where(renovated, df['yr_built'])
            df['years_since_update'] = ref_year - last_update

    # Drop the raw columns to match the training dimensions.
    return df.drop(columns=['yr_built', 'yr_renovated'], errors='ignore')

# ==========================================
# 2. DATASET DEFINITION
# ==========================================
class TestMultimodalDataset(Dataset):
    """Inference dataset yielding ``(image, tabular_features, id)`` per row.

    Tabular side: every column of the frame that is not listed in
    ``EXCLUDED_COLS`` is used as a feature, in column order. When
    ``xg_boost_price`` is present its natural log is added as the
    ``xgb_pred_log`` feature. Features are standardised to
    ``(x - mean) / std``.

    Image side: ``<img_dir>/<id>.tif`` is read as a 4-band raster, resized to
    ``IMG_SIZE`` x ``IMG_SIZE`` if necessary, scaled by 1/255 and normalised
    per channel. An image that cannot be read is replaced by an all-zero
    image and a warning is emitted.

    Args:
        tabular_df: Preprocessed frame; must contain an ``id`` column and all
            feature columns must be numeric. It is copied, never modified.
        img_dir: Directory holding the ``<id>.tif`` images.
        tab_mean: Optional per-feature means, shape ``(n_features,)``.
            Defaults to the means of ``tabular_df`` itself.
        tab_std: Optional per-feature standard deviations, shape
            ``(n_features,)``, used as given (no epsilon is added). Defaults
            to the standard deviations of ``tabular_df`` plus 1e-6.

    NOTE(review): the defaults standardise with *test-set* statistics. If the
    model was trained on features standardised with training-set statistics
    (to be confirmed against the training code), pass those via
    ``tab_mean`` / ``tab_std`` instead.
    """

    # Columns that are never fed to the model as tabular features.
    EXCLUDED_COLS = (
        'id', 'date', 'price', 'log_price', 'price_pred_xgb', 'xg_boost_price',
        'residual', 'residual_log', 'target_residual', 'abs_residual',
        'error_category', 'alpha', 'log_price_pred', 'log_xgb',
    )

    # Spatial size (height and width) of the image tensors returned.
    IMG_SIZE = 224

    def __init__(self, tabular_df, img_dir, tab_mean=None, tab_std=None):
        self.img_dir = img_dir
        self._init_tabular(tabular_df, tab_mean, tab_std)

        # Per-channel normalisation for the 4-band image. The first three
        # values are the usual ImageNet RGB statistics; the fourth band
        # reuses the first band's values.
        self.normalize = transforms.Normalize(
            mean=[0.485, 0.456, 0.406, 0.485],
            std=[0.229, 0.224, 0.225, 0.229]
        )

    @staticmethod
    def _as_feature_vector(values, n_features, name):
        """Return ``values`` as a float32 vector, checking it has one entry per feature."""
        vec = np.asarray(values, dtype=np.float32)
        if vec.shape != (n_features,):
            raise ValueError(
                f"{name} must have shape ({n_features},), got {vec.shape}"
            )
        return vec

    def _init_tabular(self, tabular_df, tab_mean=None, tab_std=None):
        """Select the feature columns, standardise them and cache the ids."""
        # Work on a private copy so the derived column added below never
        # leaks into the caller's DataFrame.
        self.df = tabular_df.copy()

        # The log of the XGBoost prediction is itself a model feature.
        if 'xg_boost_price' in self.df.columns:
            self.df['xgb_pred_log'] = np.log(self.df['xg_boost_price'])

        # Every non-excluded column is a feature; 'xgb_pred_log' is picked up
        # here automatically because it is not in EXCLUDED_COLS.
        self.features = [c for c in self.df.columns if c not in self.EXCLUDED_COLS]
        print(f"Inference Features ({len(self.features)}): {self.features}")

        raw = self.df[self.features].values.astype(np.float32)
        n_features = len(self.features)

        if tab_mean is None:
            self.tab_mean = raw.mean(axis=0)
        else:
            self.tab_mean = self._as_feature_vector(tab_mean, n_features, 'tab_mean')

        if tab_std is None:
            # Epsilon guards against division by zero for constant columns.
            self.tab_std = raw.std(axis=0) + 1e-6
        else:
            self.tab_std = self._as_feature_vector(tab_std, n_features, 'tab_std')

        self.tab_data = (raw - self.tab_mean) / self.tab_std
        self.ids = self.df['id'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        size = self.IMG_SIZE
        img_id = self.ids[idx]
        img_path = os.path.join(self.img_dir, f"{img_id}.tif")

        try:
            with rasterio.open(img_path) as src:
                # Bands 1-4 of the raster (presumably R, G, B, NIR for NAIP
                # imagery -- confirm against the data source).
                image = torch.from_numpy(src.read([1, 2, 3, 4])).float()
        except Exception as exc:
            # Deliberate best-effort: a missing or unreadable image must not
            # abort inference, but the fallback should be visible to the user.
            import warnings
            warnings.warn(
                f"Could not read image '{img_path}' ({exc!r}); "
                "using an all-zero image instead.",
                RuntimeWarning,
            )
            image = torch.zeros((4, size, size), dtype=torch.float32)

        # Check both height and width so every sample has the same shape.
        if tuple(image.shape[-2:]) != (size, size):
            image = torch.nn.functional.interpolate(
                image.unsqueeze(0), size=(size, size), mode='bilinear', align_corners=False
            ).squeeze(0)

        image = image / 255.0
        image = self.normalize(image)

        tab = torch.tensor(self.tab_data[idx], dtype=torch.float32)

        return image, tab, img_id

# ==========================================
# 3. MODEL ARCHITECTURE (Concatenation Style)
# ==========================================
class FusionModel(nn.Module):
    """Image + tabular fusion regressor (concatenation style).

    A ResNet-50 backbone with a 4-channel first convolution and its
    classifier removed produces a 2048-d image embedding. That embedding is
    compressed to 64 dimensions, concatenated with the tabular feature vector
    and mapped to a single scalar per sample by a linear head.

    The backbone is created with ``weights=None``, i.e. no pretrained weights
    are loaded here; parameters are expected to come from a checkpoint via
    ``load_state_dict``.

    Args:
        tab_input_dim: Number of tabular features per sample.
    """

    def __init__(self, tab_input_dim):
        super().__init__()

        # Backbone: swap the stock 3-channel stem for a 4-channel one and
        # replace the classifier with identity so the pooled 2048-d features
        # are returned.
        self.cnn = models.resnet50(weights=None)
        self.cnn.conv1 = nn.Conv2d(4, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.cnn.fc = nn.Identity()

        # Compress the visual embedding: 2048 -> 512 -> 64.
        self.vis_compression = nn.Sequential(
            nn.Linear(2048, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 64),
            nn.ReLU(),
            nn.Dropout(0.3)
        )

        self.tab_dim = tab_input_dim
        self.head = nn.Linear(64 + self.tab_dim, 1)

    def forward(self, img, tab):
        """Return one prediction per sample, shape ``(batch,)``.

        Args:
            img: Image batch with 4 channels, ``(batch, 4, H, W)``.
            tab: Tabular batch, ``(batch, tab_input_dim)``.
        """
        vis_feat = self.cnn(img)
        vis_feat = self.vis_compression(vis_feat)
        combined = torch.cat((vis_feat, tab), dim=1)
        # Squeeze only the trailing size-1 dimension: a bare .squeeze() would
        # also drop the batch dimension when the batch holds a single sample,
        # yielding a 0-d tensor that callers cannot iterate over.
        return self.head(combined).squeeze(-1)

# ==========================================
# 4. MAIN INFERENCE LOOP
# ==========================================
def main():
    """Run end-to-end inference and write the submission CSV.

    Steps:
      1. Read the tabular test set and the XGBoost baseline predictions,
         merge them on ``id`` and preprocess the result.
      2. Build the fusion model and load its weights from ``MODEL_PATH``.
      3. Predict one log-residual per test row.
      4. Combine with the baseline, ``price = xg_boost_price * exp(residual)``,
         and write ``id`` / ``predicted_price`` to ``OUTPUT_PATH``.

    Raises:
        ValueError: If any test row lacks a strictly positive XGBoost
            baseline price after the merge.
    """
    print("--- 1. Loading and Merging Data ---")

    test_df = pd.read_csv(TEST_TABULAR_PATH)
    xgb_df = pd.read_csv(XGB_TEST_PATH)

    if 'price' in xgb_df.columns:
        xgb_df = xgb_df.rename(columns={'price': 'xg_boost_price'})

    # NOTE(review): if ids repeat in both files this merge yields extra rows;
    # verify the inputs have unique ids (or align them by position upstream).
    full_test_df = pd.merge(test_df, xgb_df[['id', 'xg_boost_price']], on='id', how='left')

    # A single missing or non-positive baseline becomes NaN/-inf after the
    # log, which poisons the feature standardisation and therefore every
    # prediction -- fail loudly instead of writing a NaN submission.
    invalid = ~(full_test_df['xg_boost_price'] > 0)
    n_invalid = int(invalid.sum())
    if n_invalid:
        raise ValueError(
            f"{n_invalid} test row(s) have no strictly positive 'xg_boost_price' "
            f"after merging with {XGB_TEST_PATH}"
        )

    # Preprocess (drops 'yr_built', 'yr_renovated' to match training dim)
    full_test_df = preprocess_tabular_features(full_test_df)

    # Init dataset; shuffle=False keeps predictions in the frame's row order.
    test_dataset = TestMultimodalDataset(full_test_df, IMG_DIR)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

    print(f"Features in Test: {len(test_dataset.features)} (Should correspond to model input)")

    print("--- 2. Loading Model ---")
    model = FusionModel(tab_input_dim=len(test_dataset.features))

    # The load is strict: every key and tensor shape must match the
    # checkpoint. weights_only=True restricts unpickling to tensors and plain
    # containers, which is all a state_dict needs.
    state_dict = torch.load(MODEL_PATH, map_location=DEVICE, weights_only=True)
    model.load_state_dict(state_dict)

    model.to(DEVICE)
    model.eval()

    print("--- 3. Running Inference ---")
    all_ids = []
    all_log_residuals = []

    with torch.no_grad():
        for imgs, tabs, ids in tqdm(test_loader, desc="Predicting"):
            imgs = imgs.to(DEVICE)
            tabs = tabs.to(DEVICE)

            preds = model(imgs, tabs)

            # reshape(-1) keeps this iterable even if the model returns a
            # 0-d tensor for a final batch that holds a single sample.
            all_log_residuals.extend(preds.cpu().numpy().reshape(-1))
            all_ids.extend(ids.numpy().reshape(-1))

    print("--- 4. Fusing Scores & Saving ---")

    final_df = pd.DataFrame({
        'id': all_ids,
        'pred_log_residual': all_log_residuals
    })

    # Predictions come back in row order (unshuffled loader), so attach the
    # baseline by position. Re-merging on 'id' would multiply rows whenever
    # an id occurs more than once.
    final_df['xg_boost_price'] = full_test_df['xg_boost_price'].to_numpy()

    # Formula: Price = XGB * exp(residual)
    final_df['predicted_alpha'] = np.exp(final_df['pred_log_residual'])
    final_df['predicted_price'] = final_df['xg_boost_price'] * final_df['predicted_alpha']

    submission = final_df[['id', 'predicted_price']]
    submission.to_csv(OUTPUT_PATH, index=False)

    print(f"Success! Predictions saved to: {OUTPUT_PATH}")
    print(f"Sample:\n{submission.head()}")

if __name__ == "__main__":
    main()

--- 1. Loading and Merging Data ---
Inference Features (23): ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'year_sold', 'month_sold', 'day_sold', 'house_age', 'was_renovated', 'years_since_update', 'xgb_pred_log']
Features in Test: 23 (Should correspond to model input)
--- 2. Loading Model ---


  state_dict = torch.load(MODEL_PATH, map_location=DEVICE)


--- 3. Running Inference ---


Predicting: 100%|██████████| 43/43 [01:58<00:00,  2.75s/it]

--- 4. Fusing Scores & Saving ---
Success! Predictions saved to: final_submission.csv
Sample:
           id  predicted_price
0  2591820310     3.766912e+05
1  7974200820     8.695827e+05
2  7701450110     1.134790e+06
3  9522300010     2.001476e+06
4  9510861140     7.223207e+05



