In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
# Folder holding the three CSV tables, relative to the notebook's working directory.
DATA_PATH = "../data/nexkey_synthetic_dataset_v1"

# Load the tables in one pass; each name maps to `<DATA_PATH>/<name>.csv`.
queries, properties, interactions = (
    pd.read_csv(f"{DATA_PATH}/{table}.csv")
    for table in ("queries", "properties", "interactions")
)

In [3]:
# Numeric columns taken from the properties table.
PROPERTY_FEATURES = [
    "beds", "baths", "sqft",
    "purchase_price", "arv", "entry_fee",
    "estimated_monthly_payment",
]

# Numeric columns taken from the queries table. The list has the same length
# as PROPERTY_FEATURES and its entries pair up by name with the property
# columns (e.g. "beds_min" with "beds", "purchase_price_max" with
# "purchase_price").
QUERY_FEATURES = [
    "beds_min", "baths_min", "sqft_min",
    "purchase_price_max", "arv_min", "entry_fee_max",
    "monthly_payment_max",
]

In [4]:
# Attach the query-side and property-side columns to every interaction row.
# `validate="many_to_one"` makes pandas raise a MergeError if `queries` or
# `properties` contains a duplicated key; without it a duplicate would
# silently multiply interaction rows (and their labels).
data = (
    interactions
    .merge(
        queries[["query_id"] + QUERY_FEATURES],
        on="query_id",
        how="left",
        validate="many_to_one",
    )
    .merge(
        properties[["property_id"] + PROPERTY_FEATURES],
        on="property_id",
        how="left",
        validate="many_to_one",
    )
)

# Target variable
labels = data["relevance"].values.astype(np.float32)

# Final feature matrix: query columns first, then property columns
features = data[QUERY_FEATURES + PROPERTY_FEATURES].values.astype(np.float32)

# A left join fills unmatched ids with NaN, and NaN may also come straight
# from the CSVs. Either way it would flow into scaling and training and turn
# the loss into NaN, so stop here with an explicit message instead.
n_incomplete = int(np.isnan(features).any(axis=1).sum())
n_unlabeled = int(np.isnan(labels).sum())
if n_incomplete or n_unlabeled:
    raise ValueError(
        f"{n_incomplete} interaction rows have missing feature values and "
        f"{n_unlabeled} have a missing relevance label; check for unmatched "
        "query_id/property_id values or NaN entries in the input tables."
    )

print("Feature shape:", features.shape)
print("Labels shape:", labels.shape)

Feature shape: (480000, 14)
Labels shape: (480000,)


In [5]:
from sklearn.preprocessing import StandardScaler
import joblib
import os

# Single place that names the scaler artifact on disk.
scaler_path = "../models/checkpoints/numeric_scaler.joblib"

# 1) Fit the scaler and apply it to the same matrix.
#    Note: the fit uses the full `features` array; this notebook does not
#    hold out a validation split before scaling.
scaler = StandardScaler()
scaler.fit(features)
features_scaled = scaler.transform(features).astype(np.float32)

# 2) Persist the fitted scaler so a later step (the original note calls it
#    "Step 3") can apply the SAME scaling.
os.makedirs(os.path.dirname(scaler_path), exist_ok=True)
joblib.dump(scaler, scaler_path)

print("Scaled features shape:", features_scaled.shape)
print(f"Scaler saved to {scaler_path}")

Scaled features shape: (480000, 14)
Scaler saved to ../models/checkpoints/numeric_scaler.joblib


In [6]:
class DealMatchDataset(Dataset):
    """In-memory dataset pairing feature rows with their target values.

    Args:
        X: Array-like of features; converted to a float32 tensor (copied).
        y: Array-like of targets, one per row of ``X``; converted to a
            float32 tensor (copied).

    Raises:
        ValueError: If ``X`` and ``y`` do not have the same number of rows.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

        # __len__ is driven by X alone, so a shorter y would only fail with an
        # IndexError part-way through an epoch and a longer y would silently
        # drop labels. Reject the mismatch up front instead.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same length, got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]

In [7]:
# Wrap the scaled feature matrix and the relevance labels as a torch Dataset.
dataset = DealMatchDataset(X=features_scaled, y=labels)

# Serve mini-batches of 256 rows, reshuffling the row order on every pass.
loader = DataLoader(dataset=dataset, shuffle=True, batch_size=256)

In [8]:
class DealRanker(nn.Module):
    """Small feed-forward scorer: ``input_dim -> 64 -> 32 -> 1`` with ReLU.

    Args:
        input_dim: Size of the last dimension of the input tensor.
    """

    def __init__(self, input_dim):
        super().__init__()

        # Keep this exact Sequential layout: the parameter names stored in a
        # saved state_dict ("net.0.weight", "net.2.weight", ...) depend on it.
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)  # one score
        )

    def forward(self, x):
        """Score each feature vector.

        Args:
            x: Tensor whose last dimension equals ``input_dim``, e.g.
                ``(batch, input_dim)``.

        Returns:
            Tensor of scores with the trailing size-1 dimension removed, i.e.
            shape ``x.shape[:-1]`` (``(batch,)`` for 2-D input).
        """
        # Squeeze the last axis rather than axis 1: identical for 2-D batches,
        # but also correct for a single 1-D feature vector and for inputs with
        # extra leading dimensions, where axis 1 is not the score axis.
        return self.net(x).squeeze(-1)

In [9]:
# Seed torch's global RNG before anything random happens so that
# Restart & Run All yields the same initial weights. The DataLoader was
# created without its own generator, so its shuffling is expected to follow
# this seed as well (see the PyTorch reproducibility notes).
SEED = 42
torch.manual_seed(SEED)

# One input unit per feature column.
model = DealRanker(input_dim=features.shape[1])

# Regress the relevance label directly with mean squared error.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [10]:
EPOCHS = 5

# Be explicit about the mode in case an earlier interactive run left the
# model in eval mode.
model.train()

for epoch in range(EPOCHS):
    total_loss = 0.0   # sum of per-sample losses seen this epoch
    samples_seen = 0

    for X_batch, y_batch in loader:
        # 1. Forward pass
        preds = model(X_batch)

        # 2. Compute loss
        loss = criterion(preds, y_batch)

        # 3. Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # `criterion` is nn.MSELoss() with its default mean reduction, so
        # `loss` is a per-batch mean. Weight it by the batch size so that a
        # short final batch does not count as much as a full one.
        batch_size = len(y_batch)
        total_loss += loss.item() * batch_size
        samples_seen += batch_size

    # Per-sample average over the epoch (equals the old per-batch average
    # whenever all batches have the same size).
    avg_loss = total_loss / samples_seen
    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {avg_loss:.4f}")

Epoch 1/5 - Loss: 1.0439
Epoch 2/5 - Loss: 0.9990
Epoch 3/5 - Loss: 0.9813
Epoch 4/5 - Loss: 0.9658
Epoch 5/5 - Loss: 0.9557


In [11]:
# Destination of the trained weights; its parent folder is created on demand.
import os

model_path = "../models/checkpoints/numeric_ranker.pt"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Persist only the parameters (state_dict), not the whole module object.
torch.save(model.state_dict(), model_path)

print("Model saved successfully!")

Model saved successfully!
